diff --git a/MG5aMC/mg5amcnlo b/MG5aMC/mg5amcnlo
index 49c93e01b8..d8c1613ccf 160000
--- a/MG5aMC/mg5amcnlo
+++ b/MG5aMC/mg5amcnlo
@@ -1 +1 @@
-Subproject commit 49c93e01b8596cbdb4e65f628601de1e6f08c744
+Subproject commit d8c1613ccf638b5b078a64379e385def5649622c
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/__init__.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/__init__.py
index a0cd9dbfb3..82661c6c66 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/__init__.py
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/__init__.py
@@ -55,6 +55,6 @@
 __author__ = 'Andrea Valassi'
 __email__ = 'andrea.valassi@cern.ch'
 __version__ = (1,0,0)
-minimal_mg5amcnlo_version = (3,5,1)
+minimal_mg5amcnlo_version = (3,5,2)
 maximal_mg5amcnlo_version = (1000,1000,1000)
-latest_validated_version = (3,5,1)
+latest_validated_version = (3,5,2)
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py
index c9d1c7706a..0b849330ef 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py
@@ -57,7 +57,7 @@ def reset_simd(self, old_value, new_value, name):
             return
         Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source')
         subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
-
+
     def plugin_input(self, finput):
         return

@@ -79,7 +79,7 @@ def check_validity(self):
             self['sde_strategy'] = 1
         if self['hel_recycling']:
             self['hel_recycling'] = False
-
+
 class GPURunCard(CPPRunCard):
     def default_setup(self):
         super(CPPRunCard, self).default_setup()
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py
index 8961036fb1..5b557e832a 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py
@@ -200,9 +200,9 @@ def convert_model(self, model, wanted_lorentz=[], wanted_coupling=[]):
     # AV (default from OM's tutorial) - add a debug printout
     def finalize(self, matrix_element, cmdhistory, MG5options, outputflag):
         """Typically creating jpeg/HTML output/ compilation/...
-        cmdhistory is the list of command used so far.
-        MG5options are all the options of the main interface
-        outputflags is a list of options provided when doing the output command"""
+        cmdhistory is the list of command used so far.
+        MG5options are all the options of the main interface
+        outputflags is a list of options provided when doing the output command"""
         misc.sprint('Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self))
         if self.in_madevent_mode:
             self.add_input_for_banner()
@@ -214,7 +214,7 @@ def finalize(self, matrix_element, cmdhistory, MG5options, outputflag):
         #if os.system(path + os.sep + 'patchMad.sh ' + self.dir_path + ' PROD ' + patchlevel) != 0:
         # logger.debug("####### \n stdout is \n %s", stdout)
         # logger.info("####### \n stderr is \n %s", stderr)
-        # raise Exception('ERROR! 
the O/S call to patchMad.sh failed') # OLD implementation (SH PR #762) #if os.system(PLUGINDIR + os.sep + 'patchMad.sh ' + self.dir_path + ' PROD ' + patchlevel) != 0: # logger.debug("####### \n stdout is \n %s", stdout) @@ -267,7 +267,7 @@ def add_madevent_plugin_fct(self): which contains a series of functions and one dictionary variable TO_OVERWRITE that will be used to have temporary overwrite of all the key variable passed as string by their value. all variable that are file related should be called as madgraph.dir.file.variable - """ + """ plugin_path = os.path.dirname(os.path.realpath( __file__ )) files.cp(pjoin(plugin_path, 'launch_plugin.py'), pjoin(self.dir_path, 'bin', 'internal')) files.ln(pjoin(self.dir_path, 'lib'), pjoin(self.dir_path, 'SubProcesses')) @@ -283,10 +283,10 @@ def change_output_args(args, cmd): if 'vector_size' not in ''.join(args): args.append('--vector_size=16') return args - + #------------------------------------------------------------------------------------ -class GPU_ProcessExporter(PLUGIN_ProcessExporter): +class GPU_ProcessExporter(PLUGIN_ProcessExporter): def change_output_args(args, cmd): """ """ cmd._export_format = "madevent" @@ -295,7 +295,7 @@ def change_output_args(args, cmd): if 'vector_size' not in ''.join(args): args.append('--vector_size=16384') return args - + def finalize(self, matrix_element, cmdhistory, MG5options, outputflag): misc.sprint("enter dedicated function") out = super().finalize(matrix_element, cmdhistory, MG5options, outputflag) diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index 16a5e3cdc9..e6546f684c 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -14,7 +14,7 @@ Running MG5 in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect 2023-08-08 * +* VERSION 3.5.2_lo_vect 2023-11-08 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005366802215576172  +DEBUG: model prefixing takes 0.005372047424316406  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -154,7 +154,7 @@ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Process has 2 diagrams -1 processes with 2 diagrams generated in 0.005 s +1 processes with 2 diagrams generated in 0.004 s Total: 1 processes with 2 diagrams output madevent ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -174,7 +174,7 @@ INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 INFO: Creating files in directory P1_epem_mupmum DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -183,27 +183,27 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  WARNING: vector code for lepton pdf not implemented. We removed the option to run dressed lepton  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group epem_mupmum Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s -Wrote files for 8 helas calls in 0.102 s +Wrote files for 8 helas calls in 0.098 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.201 s +ALOHA: aloha creates 3 routines in 0.200 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 7 routines in 0.258 s +ALOHA: aloha creates 7 routines in 0.255 s FFV1 FFV1 FFV2 @@ -232,6 +232,7 @@ patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile patching file bin/internal/gen_ximprove.py +Hunk #1 succeeded at 391 (offset 6 lines). patching file bin/internal/madevent_interface.py DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/SubProcesses/P1_epem_mupmum; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file auto_dsig1.f @@ -248,9 +249,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m1.973s -user 0m1.681s -sys 0m0.231s +real 0m4.853s +user 0m1.653s +sys 0m0.201s ************************************************************ * * * W E L C O M E to * @@ -263,7 +264,7 @@ sys 0m0.231s * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect * +* VERSION 3.5.2_lo_vect * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -297,7 +298,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect * +* VERSION 3.5.2_lo_vect * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat index b9e01f684b..618adbca06 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.5.1_lo_vect 2023-08-08 * +#* VERSION 3.5.2_lo_vect 2023-11-08 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/ee_mumu.mad/MGMEVersion.txt b/epochX/cudacpp/ee_mumu.mad/MGMEVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/ee_mumu.mad/MGMEVersion.txt +++ b/epochX/cudacpp/ee_mumu.mad/MGMEVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MGVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc index 0af629d3a8..fc293da1de 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h index f2ef5c1e14..77b610753c 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig.f index f78f7c102e..02520466e6 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f index fcf2e4dec5..4188745070 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -225,7 +225,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f index 21e300b33e..1991a72bb9 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -319,7 +319,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/genps.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/genps.f index fe9c61504b..c00e33d954 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/genps.f @@ -1877,12 +1877,12 @@ double precision function get_channel_cut(p, config) d1 = iforest(1, -i, config) d2 = iforest(2, -i, config) do j=0,3 - if (d1.gt.0.and.d1.le.2) then + if (d1.gt.0.and.d1.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d1) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d1) endif - if (d2.gt.0.and.d2.le.2) then + if (d2.gt.0.and.d2.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d2) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d2) diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/__init__.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/__init__.py index 16e60d8182..0d17042f0d 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/__init__.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/__init__.py @@ -26,6 +26,7 @@ class aMCatNLOError(MadGraph5Error): import os import logging import time +pjoin = os.path.join #Look for basic file position MG5DIR and MG4DIR MG5DIR = os.path.realpath(os.path.join(os.path.dirname(__file__), diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py index ef1bf58979..3995ce8109 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py @@ -2704,7 +2704,8 @@ def __new__(cls, finput=None, **opt): except Exception as error: import launch_plugin target_class = launch_plugin.RunCard - + elif issubclass(finput, RunCard): + target_class = finput else: return None @@ -2968,11 +2969,12 @@ def write(self, output_file, template=None, python_template=False, if python_template and not to_write: import string if self.blocks: - text = string.Template(text) mapping = {} for b in self.blocks: mapping[b.name] = b.get_template(self) - text = text.substitute(mapping) + if "$%s" % b.name not in text: + text += "\n$%s\n" % b.name + text = string.Template(text).substitute(mapping) if not self.list_parameter: text = text % self @@ -4875,6 +4877,9 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): continue break + if proc_characteristic['ninitial'] == 1: + self['SDE_strategy'] =1 + if 'MLM' in proc_characteristic['limitations']: if self['dynamical_scale_choice'] == -1: self['dynamical_scale_choice'] = 3 @@ -5940,7 +5945,7 @@ def default_setup(self): self.add_param("CheckCycle", 3) self.add_param("MaxAttempts", 10) self.add_param("ZeroThres", 1e-9) - self.add_param("OSThres", 1.0e-13) + self.add_param("OSThres", 1.0e-8) self.add_param("DoubleCheckHelicityFilter", True) self.add_param("WriteOutFilters", True) self.add_param("UseLoopFilter", False) diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/common_run_interface.py index 14c7f310dc..87cb4b88df 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/common_run_interface.py @@ -4906,6 +4906,7 @@ def __init__(self, question, cards=[], from_banner=None, banner=None, mode='auto self.load_default() self.define_paths(**opt) self.last_editline_pos = 0 + self.update_dependent_done = False if 'allow_arg' not in opt or not opt['allow_arg']: # add some mininal 
content for this: @@ -6585,7 +6586,9 @@ def postcmd(self, stop, line): self.check_card_consistency() if self.param_consistency: try: - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) + self.update_dependent_done = False except MadGraph5Error as error: if 'Missing block:' in str(error): self.fail_due_to_format +=1 @@ -6638,6 +6641,8 @@ def do_update(self, line, timer=0): self.update_dependent(self.mother_interface, self.me_dir, self.param_card, self.paths['param'], timer, run_card=self.run_card, lhapdfconfig=self.lhapdf) + self.update_dependent_done = True + elif args[0] == 'missing': self.update_missing() @@ -6717,12 +6722,13 @@ class TimeOutError(Exception): def handle_alarm(signum, frame): raise TimeOutError signal.signal(signal.SIGALRM, handle_alarm) + if timer: - signal.alarm(timer) log_level=30 else: log_level=20 + if run_card: as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, @@ -6781,6 +6787,10 @@ def handle_alarm(signum, frame): logger.log(log_level, "update the strong coupling value (alpha_s) to the value from the pdf selected: %s", as_for_pdf[pdlabel]) modify = True + if timer: + signal.alarm(timer) + + # Try to load the model in the limited amount of time allowed try: model = mecmd.get_model() @@ -6909,7 +6919,8 @@ def check_block(self, blockname): def check_answer_consistency(self): """function called if the code reads a file""" self.check_card_consistency() - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) def help_set(self): '''help message for set''' diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_ximprove.py index a88d60b282..5fd170d18d 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_ximprove.py @@ -157,12 +157,16 @@ def get_helicity(self, to_submit=True, clean=True): (stdout, _) = p.communicate(''.encode()) stdout = stdout.decode('ascii',errors='ignore') - try: + if stdout: nb_channel = max([math.floor(float(d)) for d in stdout.split()]) - except Exception as error: - misc.sprint(stdout, 'no channel or error for %s' % Pdir) - continue - + else: + for matrix_file in misc.glob('matrix*orig.f', Pdir): + files.cp(matrix_file, matrix_file.replace('orig','optim')) + P_zero_result.append(Pdir) + if os.path.exists(pjoin(self.me_dir, 'error')): + os.remove(pjoin(self.me_dir, 'error')) + continue # bypass bad process + self.cmd.compile(['madevent_forhel'], cwd=Pdir) if not os.path.exists(pjoin(Pdir, 'madevent_forhel')): raise Exception('Error make madevent_forhel not successful') @@ -183,11 +187,13 @@ def get_helicity(self, to_submit=True, clean=True): #sym_input = "%(points)d %(iterations)d %(accuracy)f \n" % self.opts (stdout, _) = p.communicate(" ".encode()) stdout = stdout.decode('ascii',errors='ignore') - if os.path.exists(pjoin(self.me_dir,'error')): + if os.path.exists(pjoin(self.me_dir, 'error')): raise Exception(pjoin(self.me_dir,'error')) # note a continue is not enough here, we have in top to link # the matrixX_optim.f to matrixX_orig.f to let the code to work # after this error. 
+ # for matrix_file in misc.glob('matrix*orig.f', Pdir): + # files.cp(matrix_file, matrix_file.replace('orig','optim')) if 'no events passed cuts' in stdout: raise Exception diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py index c9d1c7706a..0b849330ef 100644 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py @@ -57,7 +57,7 @@ def reset_simd(self, old_value, new_value, name): return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) - + def plugin_input(self, finput): return @@ -79,7 +79,7 @@ def check_validity(self): self['sde_strategy'] = 1 if self['hel_recycling']: self['hel_recycling'] = False - + class GPURunCard(CPPRunCard): def default_setup(self): super(CPPRunCard, self).default_setup() diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py index d722702891..853aabc98a 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py @@ -1,4 +1,4 @@ -################################################################################ +############################################################################### # # Copyright (c) 2011 The MadGraph5_aMC@NLO Development team and Contributors # @@ -3675,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_combine_iteration(self, line): + def do_comine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3703,7 +3703,7 @@ def do_combine_iteration(self, line): ############################################################################ def do_combine_events(self, line): """Advanced commands: Launch combine events""" - + start=time.time() args = self.split_arg(line) start = time.time() # Check argument's validity @@ -3798,9 +3798,7 @@ def do_combine_events(self, line): self.correct_bias() elif self.run_card['custom_fcts']: self.correct_bias() - - logger.info("combine events done in %s", time.time()-start) - + logger.info("combination of events done in %s s ", time.time()-start) self.to_store.append('event') @@ -7368,7 +7366,7 @@ def wait_monitoring(Idle, Running, Done): import optparse # Get the directory of the script real path (bin) # and add it to the current PYTHONPATH - root_path = os.path.dirname(os.path.dirname(os.path.realpath( __file__ ))) + #root_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath( __file__ )))) sys.path.insert(0, root_path) class MyOptParser(optparse.OptionParser): @@ -7411,7 +7409,13 @@ def error(self, msg=''): import logging.config # Set logging level according to the logging level given by options #logging.basicConfig(level=vars(logging)[options.logging]) + import internal import internal.coloring_logging + # internal.file = XXX/bin/internal/__init__.py + # => need three dirname to get XXX + # we use internal to have any issue with pythonpath finding the wrong file + me_dir = os.path.dirname(os.path.dirname(os.path.dirname(internal.__file__))) + print("me_dir is", me_dir) try: if __debug__ and options.logging == 'INFO': options.logging = 'DEBUG' @@ -7419,7 +7423,8 @@ def error(self, 
msg=''): level = int(options.logging) else: level = eval('logging.' + options.logging) - logging.config.fileConfig(os.path.join(root_path, 'internal', 'me5_logging.conf')) + log_path = os.path.join(me_dir, 'bin', 'internal', 'me5_logging.conf') + logging.config.fileConfig(log_path) logging.root.setLevel(level) logging.getLogger('madgraph').setLevel(level) except: @@ -7433,9 +7438,9 @@ def error(self, msg=''): if '--web' in args: i = args.index('--web') args.pop(i) - cmd_line = MadEventCmd(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmd(me_dir, force_run=True) else: - cmd_line = MadEventCmdShell(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmdShell(me_dir, force_run=True) if not hasattr(cmd_line, 'do_%s' % args[0]): if parser_error: print(parser_error) diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/misc.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/misc.py index d3fed3baa2..91cd3e5c22 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/misc.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/misc.py @@ -347,7 +347,7 @@ def tell(msg): if dependency=='ninja': if cmd.options['ninja'] in ['None',None,''] or\ (cmd.options['ninja'] == './HEPTools/lib' and not MG5dir is None and\ - which_lib(pjoin(MG5dir,cmd.options['ninja'],'libninja.a')) is None): + which_lib(pjoin(MG5dir,cmd.options['ninja'],'lib','libninja.a')) is None): tell("Installing ninja...") cmd.do_install('ninja') diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/shower_card.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/shower_card.py index c6d3948cc4..c344ea1b15 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/shower_card.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/shower_card.py @@ -45,7 +45,9 @@ class ShowerCard(dict): false = ['.false.', 'f', 'false', '0'] logical_vars = ['ue_enabled', 'hadronize', 'b_stable', 'pi_stable', 'wp_stable', 'wm_stable', 'z_stable', 'h_stable', 'tap_stable', 'tam_stable', - 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td'] + 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td', + 'space_shower_me_corrections', 'time_shower_me_corrections', + 'time_shower_me_extended', 'time_shower_me_after_first'] string_vars = ['extralibs', 'extrapaths', 'includepaths', 'analyse'] for i in range(1,100): string_vars.append('dm_'+str(i)) @@ -82,7 +84,11 @@ class ShowerCard(dict): 'b_mass' : {'HERWIG6':'b_mass', 'PYTHIA6': 'b_mass', 'HERWIGPP': 'b_mass', 'PYTHIA8': 'b_mass'}, 'analyse' : {'HERWIG6':'hwuti', 'PYTHIA6':'pyuti', 'HERWIGPP':'hwpputi', 'PYTHIA8':'py8uti'}, 'qcut' : {'PYTHIA8':'qcut'}, - 'njmax' : {'PYTHIA8':'njmax'}} + 'njmax' : {'PYTHIA8':'njmax'}, + 'space_shower_me_corrections' : {'PYTHIA8':'space_shower_me_corrections'}, + 'time_shower_me_corrections' : {'PYTHIA8':'time_shower_me_corrections'}, + 'time_shower_me_extended' : {'PYTHIA8':'time_shower_me_extended'}, + 'time_shower_me_after_first' : {'PYTHIA8':'time_shower_me_after_first'}} stdhep_dict = {'HERWIG6':'mcatnlo_hwan_stdhep.o', 'PYTHIA6':'mcatnlo_pyan_stdhep.o'} diff --git a/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h b/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h index 19819e2451..9fa30cfd7f 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 
3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc index 31f620c44e..0b4be4d5ed 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h index 521831ce4a..64d0b8e761 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index d48a5c4d44..8cb80f0d38 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -14,7 +14,7 @@ Running MG5 in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect 2023-08-08 * +* VERSION 3.5.2_lo_vect 2023-11-08 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005533456802368164  +DEBUG: model prefixing takes 0.005633831024169922  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -174,14 +174,14 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. 
-Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s +Generated helas calls for 1 subprocesses (2 diagrams) in 0.003 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.271 s +ALOHA: aloha creates 4 routines in 0.267 s FFV1 FFV1 FFV2 @@ -201,6 +201,6 @@ INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  quit -real 0m0.669s -user 0m0.609s -sys 0m0.053s +real 0m3.653s +user 0m0.601s +sys 0m0.049s diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc index c0ab4edb92..684bd53bf5 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h index f2ef5c1e14..77b610753c 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h b/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h index 19819e2451..9fa30cfd7f 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc index 31f620c44e..0b4be4d5ed 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h index 521831ce4a..64d0b8e761 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 2460cf072a..a1fa47508f 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -14,7 +14,7 @@ Running MG5 in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect 2023-08-08 * +* VERSION 3.5.2_lo_vect 2023-11-08 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005415439605712891  +DEBUG: model prefixing takes 0.005694150924682617  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -184,23 +184,23 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.106 s +Wrote files for 10 helas calls in 0.101 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.148 s +ALOHA: aloha creates 2 routines in 0.145 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.134 s +ALOHA: aloha creates 4 routines in 0.132 s VVV1 FFV1 FFV1 @@ -225,6 +225,7 @@ patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile patching file bin/internal/gen_ximprove.py +Hunk #1 succeeded at 391 (offset 6 lines). patching file bin/internal/madevent_interface.py DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/SubProcesses/P1_gg_ttx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file auto_dsig1.f @@ -237,9 +238,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.735s -user 0m1.507s -sys 0m0.213s +real 0m4.772s +user 0m1.470s +sys 0m0.223s ************************************************************ * * * W E L C O M E to * @@ -252,7 +253,7 @@ sys 0m0.213s * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect * +* VERSION 3.5.2_lo_vect * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -285,7 +286,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect * +* VERSION 3.5.2_lo_vect * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat index 2a2fd25453..4c14989a3f 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.5.1_lo_vect 2023-08-08 * +#* VERSION 3.5.2_lo_vect 2023-11-08 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. 
* #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/gg_tt.mad/MGMEVersion.txt b/epochX/cudacpp/gg_tt.mad/MGMEVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/gg_tt.mad/MGMEVersion.txt +++ b/epochX/cudacpp/gg_tt.mad/MGMEVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gg_tt.mad/SubProcesses/MGVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index 02f655f48c..d2e7a3c91d 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index 0c2d2b0687..3ebd92c038 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f index fe184caddf..d80d770784 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f index 5a3da931f2..9346ee4c6a 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -216,7 +216,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f index daea73a6df..0c2ce6ec40 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -301,7 +301,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/genps.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/genps.f index fe9c61504b..c00e33d954 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/genps.f @@ -1877,12 +1877,12 @@ double precision function get_channel_cut(p, config) d1 = iforest(1, -i, config) d2 = iforest(2, -i, config) do j=0,3 - if (d1.gt.0.and.d1.le.2) then + if (d1.gt.0.and.d1.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d1) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d1) endif - if (d2.gt.0.and.d2.le.2) then + if (d2.gt.0.and.d2.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d2) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d2) diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/__init__.py b/epochX/cudacpp/gg_tt.mad/bin/internal/__init__.py index 16e60d8182..0d17042f0d 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/__init__.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/__init__.py @@ -26,6 +26,7 @@ class aMCatNLOError(MadGraph5Error): import os import logging import time +pjoin = os.path.join #Look for basic file position MG5DIR and MG4DIR MG5DIR = os.path.realpath(os.path.join(os.path.dirname(__file__), diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py b/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py index ef1bf58979..3995ce8109 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py @@ -2704,7 +2704,8 @@ def __new__(cls, finput=None, **opt): except Exception as error: import launch_plugin target_class = launch_plugin.RunCard - + elif issubclass(finput, RunCard): + target_class = finput else: return None @@ -2968,11 +2969,12 @@ def write(self, output_file, template=None, python_template=False, if python_template and not to_write: import string if self.blocks: - text = string.Template(text) mapping = {} for b in self.blocks: mapping[b.name] = b.get_template(self) - text = text.substitute(mapping) + if "$%s" % b.name not in text: + text += "\n$%s\n" % b.name + text = string.Template(text).substitute(mapping) if not self.list_parameter: text = text % self @@ -4875,6 +4877,9 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): continue break + if proc_characteristic['ninitial'] == 1: + self['SDE_strategy'] =1 + if 'MLM' in proc_characteristic['limitations']: if self['dynamical_scale_choice'] == -1: self['dynamical_scale_choice'] = 3 @@ -5940,7 +5945,7 @@ def default_setup(self): self.add_param("CheckCycle", 3) self.add_param("MaxAttempts", 10) self.add_param("ZeroThres", 1e-9) - self.add_param("OSThres", 1.0e-13) + self.add_param("OSThres", 1.0e-8) self.add_param("DoubleCheckHelicityFilter", True) self.add_param("WriteOutFilters", True) self.add_param("UseLoopFilter", False) diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_tt.mad/bin/internal/common_run_interface.py index 14c7f310dc..87cb4b88df 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/common_run_interface.py @@ -4906,6 +4906,7 @@ def __init__(self, question, cards=[], from_banner=None, banner=None, mode='auto self.load_default() self.define_paths(**opt) self.last_editline_pos = 0 + self.update_dependent_done = False if 'allow_arg' not in opt or not opt['allow_arg']: # add some mininal content for this: @@ -6585,7 +6586,9 
@@ def postcmd(self, stop, line): self.check_card_consistency() if self.param_consistency: try: - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) + self.update_dependent_done = False except MadGraph5Error as error: if 'Missing block:' in str(error): self.fail_due_to_format +=1 @@ -6638,6 +6641,8 @@ def do_update(self, line, timer=0): self.update_dependent(self.mother_interface, self.me_dir, self.param_card, self.paths['param'], timer, run_card=self.run_card, lhapdfconfig=self.lhapdf) + self.update_dependent_done = True + elif args[0] == 'missing': self.update_missing() @@ -6717,12 +6722,13 @@ class TimeOutError(Exception): def handle_alarm(signum, frame): raise TimeOutError signal.signal(signal.SIGALRM, handle_alarm) + if timer: - signal.alarm(timer) log_level=30 else: log_level=20 + if run_card: as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, @@ -6781,6 +6787,10 @@ def handle_alarm(signum, frame): logger.log(log_level, "update the strong coupling value (alpha_s) to the value from the pdf selected: %s", as_for_pdf[pdlabel]) modify = True + if timer: + signal.alarm(timer) + + # Try to load the model in the limited amount of time allowed try: model = mecmd.get_model() @@ -6909,7 +6919,8 @@ def check_block(self, blockname): def check_answer_consistency(self): """function called if the code reads a file""" self.check_card_consistency() - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) def help_set(self): '''help message for set''' diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gg_tt.mad/bin/internal/gen_ximprove.py index a88d60b282..5fd170d18d 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/gen_ximprove.py @@ -157,12 +157,16 @@ def get_helicity(self, to_submit=True, clean=True): (stdout, _) = p.communicate(''.encode()) stdout = stdout.decode('ascii',errors='ignore') - try: + if stdout: nb_channel = max([math.floor(float(d)) for d in stdout.split()]) - except Exception as error: - misc.sprint(stdout, 'no channel or error for %s' % Pdir) - continue - + else: + for matrix_file in misc.glob('matrix*orig.f', Pdir): + files.cp(matrix_file, matrix_file.replace('orig','optim')) + P_zero_result.append(Pdir) + if os.path.exists(pjoin(self.me_dir, 'error')): + os.remove(pjoin(self.me_dir, 'error')) + continue # bypass bad process + self.cmd.compile(['madevent_forhel'], cwd=Pdir) if not os.path.exists(pjoin(Pdir, 'madevent_forhel')): raise Exception('Error make madevent_forhel not successful') @@ -183,11 +187,13 @@ def get_helicity(self, to_submit=True, clean=True): #sym_input = "%(points)d %(iterations)d %(accuracy)f \n" % self.opts (stdout, _) = p.communicate(" ".encode()) stdout = stdout.decode('ascii',errors='ignore') - if os.path.exists(pjoin(self.me_dir,'error')): + if os.path.exists(pjoin(self.me_dir, 'error')): raise Exception(pjoin(self.me_dir,'error')) # note a continue is not enough here, we have in top to link # the matrixX_optim.f to matrixX_orig.f to let the code to work # after this error. 
+ # for matrix_file in misc.glob('matrix*orig.f', Pdir): + # files.cp(matrix_file, matrix_file.replace('orig','optim')) if 'no events passed cuts' in stdout: raise Exception diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py index c9d1c7706a..0b849330ef 100644 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py @@ -57,7 +57,7 @@ def reset_simd(self, old_value, new_value, name): return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) - + def plugin_input(self, finput): return @@ -79,7 +79,7 @@ def check_validity(self): self['sde_strategy'] = 1 if self['hel_recycling']: self['hel_recycling'] = False - + class GPURunCard(CPPRunCard): def default_setup(self): super(CPPRunCard, self).default_setup() diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py index d722702891..853aabc98a 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py @@ -1,4 +1,4 @@ -################################################################################ +############################################################################### # # Copyright (c) 2011 The MadGraph5_aMC@NLO Development team and Contributors # @@ -3675,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_combine_iteration(self, line): + def do_comine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3703,7 +3703,7 @@ def do_combine_iteration(self, line): ############################################################################ def do_combine_events(self, line): """Advanced commands: Launch combine events""" - + start=time.time() args = self.split_arg(line) start = time.time() # Check argument's validity @@ -3798,9 +3798,7 @@ def do_combine_events(self, line): self.correct_bias() elif self.run_card['custom_fcts']: self.correct_bias() - - logger.info("combine events done in %s", time.time()-start) - + logger.info("combination of events done in %s s ", time.time()-start) self.to_store.append('event') @@ -7368,7 +7366,7 @@ def wait_monitoring(Idle, Running, Done): import optparse # Get the directory of the script real path (bin) # and add it to the current PYTHONPATH - root_path = os.path.dirname(os.path.dirname(os.path.realpath( __file__ ))) + #root_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath( __file__ )))) sys.path.insert(0, root_path) class MyOptParser(optparse.OptionParser): @@ -7411,7 +7409,13 @@ def error(self, msg=''): import logging.config # Set logging level according to the logging level given by options #logging.basicConfig(level=vars(logging)[options.logging]) + import internal import internal.coloring_logging + # internal.file = XXX/bin/internal/__init__.py + # => need three dirname to get XXX + # we use internal to have any issue with pythonpath finding the wrong file + me_dir = os.path.dirname(os.path.dirname(os.path.dirname(internal.__file__))) + print("me_dir is", me_dir) try: if __debug__ and options.logging == 'INFO': options.logging = 'DEBUG' @@ -7419,7 +7423,8 @@ def error(self, msg=''): level = 
int(options.logging) else: level = eval('logging.' + options.logging) - logging.config.fileConfig(os.path.join(root_path, 'internal', 'me5_logging.conf')) + log_path = os.path.join(me_dir, 'bin', 'internal', 'me5_logging.conf') + logging.config.fileConfig(log_path) logging.root.setLevel(level) logging.getLogger('madgraph').setLevel(level) except: @@ -7433,9 +7438,9 @@ def error(self, msg=''): if '--web' in args: i = args.index('--web') args.pop(i) - cmd_line = MadEventCmd(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmd(me_dir, force_run=True) else: - cmd_line = MadEventCmdShell(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmdShell(me_dir, force_run=True) if not hasattr(cmd_line, 'do_%s' % args[0]): if parser_error: print(parser_error) diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/misc.py b/epochX/cudacpp/gg_tt.mad/bin/internal/misc.py index d3fed3baa2..91cd3e5c22 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/misc.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/misc.py @@ -347,7 +347,7 @@ def tell(msg): if dependency=='ninja': if cmd.options['ninja'] in ['None',None,''] or\ (cmd.options['ninja'] == './HEPTools/lib' and not MG5dir is None and\ - which_lib(pjoin(MG5dir,cmd.options['ninja'],'libninja.a')) is None): + which_lib(pjoin(MG5dir,cmd.options['ninja'],'lib','libninja.a')) is None): tell("Installing ninja...") cmd.do_install('ninja') diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/shower_card.py b/epochX/cudacpp/gg_tt.mad/bin/internal/shower_card.py index c6d3948cc4..c344ea1b15 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/shower_card.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/shower_card.py @@ -45,7 +45,9 @@ class ShowerCard(dict): false = ['.false.', 'f', 'false', '0'] logical_vars = ['ue_enabled', 'hadronize', 'b_stable', 'pi_stable', 'wp_stable', 'wm_stable', 'z_stable', 'h_stable', 'tap_stable', 'tam_stable', - 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td'] + 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td', + 'space_shower_me_corrections', 'time_shower_me_corrections', + 'time_shower_me_extended', 'time_shower_me_after_first'] string_vars = ['extralibs', 'extrapaths', 'includepaths', 'analyse'] for i in range(1,100): string_vars.append('dm_'+str(i)) @@ -82,7 +84,11 @@ class ShowerCard(dict): 'b_mass' : {'HERWIG6':'b_mass', 'PYTHIA6': 'b_mass', 'HERWIGPP': 'b_mass', 'PYTHIA8': 'b_mass'}, 'analyse' : {'HERWIG6':'hwuti', 'PYTHIA6':'pyuti', 'HERWIGPP':'hwpputi', 'PYTHIA8':'py8uti'}, 'qcut' : {'PYTHIA8':'qcut'}, - 'njmax' : {'PYTHIA8':'njmax'}} + 'njmax' : {'PYTHIA8':'njmax'}, + 'space_shower_me_corrections' : {'PYTHIA8':'space_shower_me_corrections'}, + 'time_shower_me_corrections' : {'PYTHIA8':'time_shower_me_corrections'}, + 'time_shower_me_extended' : {'PYTHIA8':'time_shower_me_extended'}, + 'time_shower_me_after_first' : {'PYTHIA8':'time_shower_me_after_first'}} stdhep_dict = {'HERWIG6':'mcatnlo_hwan_stdhep.o', 'PYTHIA6':'mcatnlo_pyan_stdhep.o'} diff --git a/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h index 07d0bfa887..55f43bb43a 100644 --- a/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc index 3452d1e8da..a9bc93ff98 100644 --- a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h index 4f6f322ed9..932f123fea 100644 --- a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index f9425b6b07..805df19bd9 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -14,7 +14,7 @@ Running MG5 in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect 2023-08-08 * +* VERSION 3.5.2_lo_vect 2023-11-08 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0057506561279296875  +DEBUG: model prefixing takes 0.00567626953125  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
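Note on the shower_card.py hunk earlier in this diff (gg_tt.mad/bin/internal/shower_card.py): four PYTHIA8-only switches (space_shower_me_corrections, time_shower_me_corrections, time_shower_me_extended, time_shower_me_after_first) are added both to logical_vars and to the per-shower name map. The sketch below only illustrates how such Fortran-style logical entries can be normalised to Python booleans; the parse_logical helper and the TRUE_TOKENS list are assumptions, only the false-token list appears in the diff.

    # Hypothetical sketch, not the plugin's parser: normalise shower-card logicals.
    FALSE_TOKENS = {'.false.', 'f', 'false', '0'}   # as listed in ShowerCard.false
    TRUE_TOKENS = {'.true.', 't', 'true', '1'}      # assumed symmetric counterpart

    def parse_logical(value):
        """Map a Fortran-style logical string from the shower card to a bool."""
        token = value.strip().lower()
        if token in TRUE_TOKENS:
            return True
        if token in FALSE_TOKENS:
            return False
        raise ValueError('not a logical value: %r' % value)

    # The new PYTHIA8-only switches introduced by this patch, with example values.
    card = {'space_shower_me_corrections': 'T', 'time_shower_me_after_first': '.false.'}
    flags = {key: parse_logical(val) for key, val in card.items()}
    assert flags == {'space_shower_me_corrections': True, 'time_shower_me_after_first': False}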
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -180,7 +180,7 @@ Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.148 s +ALOHA: aloha creates 2 routines in 0.143 s VVV1 FFV1 FFV1 @@ -196,6 +196,6 @@ INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/s DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  quit -real 0m0.684s -user 0m0.481s -sys 0m0.057s +real 0m3.529s +user 0m0.478s +sys 0m0.048s diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc index 141d1f24ac..0e44ef42c3 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h index 0c2d2b0687..3ebd92c038 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h index 07d0bfa887..55f43bb43a 100644 --- a/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc index 3452d1e8da..a9bc93ff98 100644 --- a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h index 4f6f322ed9..932f123fea 100644 --- a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt index 2db08eff10..9d4dbd85f0 100644 --- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt +++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt @@ -14,7 +14,7 @@ Running MG5 in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect 2023-08-08 * +* VERSION 3.5.2_lo_vect 2023-11-08 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005487680435180664  +DEBUG: model prefixing takes 0.005400419235229492  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -163,7 +163,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @2 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.020 s +1 processes with 16 diagrams generated in 0.019 s Total: 2 processes with 19 diagrams output madevent ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -185,7 +185,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P2_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -194,15 +194,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -211,21 +211,21 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3] [export_cpp.py at line 711]  DEBUG: subproc_number =  1 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -Generated helas calls for 2 subprocesses (19 diagrams) in 0.044 s -Wrote files for 46 helas calls in 0.247 s +Generated helas calls for 2 subprocesses (19 diagrams) in 0.042 s +Wrote files for 46 helas calls in 0.242 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.331 s +ALOHA: aloha creates 5 routines in 0.324 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -233,7 +233,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.314 s +ALOHA: aloha creates 10 routines in 0.308 s VVV1 VVV1 FFV1 @@ -263,6 +263,7 @@ patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile patching file bin/internal/gen_ximprove.py +Hunk #1 succeeded at 391 (offset 6 lines). patching file bin/internal/madevent_interface.py DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses/P1_gg_ttx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file auto_dsig1.f @@ -283,9 +284,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m2.331s -user 0m2.084s -sys 0m0.240s +real 0m5.282s +user 0m2.049s +sys 0m0.227s ************************************************************ * * * W E L C O M E to * @@ -298,7 +299,7 @@ sys 0m0.240s * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect * +* VERSION 3.5.2_lo_vect * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -331,7 +332,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect * +* VERSION 3.5.2_lo_vect * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat index cdb64729b1..d0845f65f5 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.5.1_lo_vect 2023-08-08 * +#* VERSION 3.5.2_lo_vect 2023-11-08 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/gg_tt01g.mad/MGMEVersion.txt b/epochX/cudacpp/gg_tt01g.mad/MGMEVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/gg_tt01g.mad/MGMEVersion.txt +++ b/epochX/cudacpp/gg_tt01g.mad/MGMEVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MGVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index 02f655f48c..d2e7a3c91d 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index 0c2d2b0687..3ebd92c038 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig.f index fe184caddf..d80d770784 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f index 5a3da931f2..9346ee4c6a 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -216,7 +216,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/matrix1.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/matrix1.f index daea73a6df..0c2ce6ec40 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/matrix1.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -301,7 +301,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc index ce1badffca..1e24c2819d 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. 
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h index 248ed1ec9e..3901ddcb20 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig.f index f751e9f14a..53ca75eaf4 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig1.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig1.f index 6eb0fa0827..d6c6f42c9e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig1.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -216,7 +216,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/matrix1.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/matrix1.f index 02f406668c..5c91f2448c 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/matrix1.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -317,7 +317,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/genps.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/genps.f index fe9c61504b..c00e33d954 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/genps.f @@ -1877,12 +1877,12 @@ double precision function get_channel_cut(p, config) d1 = iforest(1, -i, config) d2 = iforest(2, -i, config) do j=0,3 - if (d1.gt.0.and.d1.le.2) then + if (d1.gt.0.and.d1.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d1) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d1) endif - if (d2.gt.0.and.d2.le.2) then + if (d2.gt.0.and.d2.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d2) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d2) diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/__init__.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/__init__.py index 16e60d8182..0d17042f0d 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/__init__.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/__init__.py @@ -26,6 +26,7 @@ class aMCatNLOError(MadGraph5Error): import os import logging import time +pjoin = os.path.join #Look for basic file position MG5DIR and MG4DIR MG5DIR = os.path.realpath(os.path.join(os.path.dirname(__file__), diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py index ef1bf58979..3995ce8109 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py @@ -2704,7 +2704,8 @@ def __new__(cls, finput=None, **opt): except Exception as error: import launch_plugin target_class = launch_plugin.RunCard - + elif issubclass(finput, RunCard): + target_class = finput else: return None @@ -2968,11 +2969,12 @@ def write(self, output_file, template=None, python_template=False, if python_template and not to_write: import string if self.blocks: - text = string.Template(text) mapping = {} for b in self.blocks: mapping[b.name] = b.get_template(self) - text = text.substitute(mapping) + if "$%s" % b.name not in text: + text += "\n$%s\n" % b.name + text = string.Template(text).substitute(mapping) if not self.list_parameter: text = text % self @@ -4875,6 +4877,9 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): continue break + if proc_characteristic['ninitial'] == 1: + self['SDE_strategy'] =1 + if 'MLM' in proc_characteristic['limitations']: if 
self['dynamical_scale_choice'] == -1: self['dynamical_scale_choice'] = 3 @@ -5940,7 +5945,7 @@ def default_setup(self): self.add_param("CheckCycle", 3) self.add_param("MaxAttempts", 10) self.add_param("ZeroThres", 1e-9) - self.add_param("OSThres", 1.0e-13) + self.add_param("OSThres", 1.0e-8) self.add_param("DoubleCheckHelicityFilter", True) self.add_param("WriteOutFilters", True) self.add_param("UseLoopFilter", False) diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/common_run_interface.py index 14c7f310dc..87cb4b88df 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/common_run_interface.py @@ -4906,6 +4906,7 @@ def __init__(self, question, cards=[], from_banner=None, banner=None, mode='auto self.load_default() self.define_paths(**opt) self.last_editline_pos = 0 + self.update_dependent_done = False if 'allow_arg' not in opt or not opt['allow_arg']: # add some mininal content for this: @@ -6585,7 +6586,9 @@ def postcmd(self, stop, line): self.check_card_consistency() if self.param_consistency: try: - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) + self.update_dependent_done = False except MadGraph5Error as error: if 'Missing block:' in str(error): self.fail_due_to_format +=1 @@ -6638,6 +6641,8 @@ def do_update(self, line, timer=0): self.update_dependent(self.mother_interface, self.me_dir, self.param_card, self.paths['param'], timer, run_card=self.run_card, lhapdfconfig=self.lhapdf) + self.update_dependent_done = True + elif args[0] == 'missing': self.update_missing() @@ -6717,12 +6722,13 @@ class TimeOutError(Exception): def handle_alarm(signum, frame): raise TimeOutError signal.signal(signal.SIGALRM, handle_alarm) + if timer: - signal.alarm(timer) log_level=30 else: log_level=20 + if run_card: as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, @@ -6781,6 +6787,10 @@ def handle_alarm(signum, frame): logger.log(log_level, "update the strong coupling value (alpha_s) to the value from the pdf selected: %s", as_for_pdf[pdlabel]) modify = True + if timer: + signal.alarm(timer) + + # Try to load the model in the limited amount of time allowed try: model = mecmd.get_model() @@ -6909,7 +6919,8 @@ def check_block(self, blockname): def check_answer_consistency(self): """function called if the code reads a file""" self.check_card_consistency() - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) def help_set(self): '''help message for set''' diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_ximprove.py index a88d60b282..5fd170d18d 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_ximprove.py @@ -157,12 +157,16 @@ def get_helicity(self, to_submit=True, clean=True): (stdout, _) = p.communicate(''.encode()) stdout = stdout.decode('ascii',errors='ignore') - try: + if stdout: nb_channel = max([math.floor(float(d)) for d in stdout.split()]) - except Exception as error: - misc.sprint(stdout, 'no channel or error for %s' % Pdir) - continue - + else: + for matrix_file in misc.glob('matrix*orig.f', Pdir): + files.cp(matrix_file, matrix_file.replace('orig','optim')) + P_zero_result.append(Pdir) + if os.path.exists(pjoin(self.me_dir, 'error')): + os.remove(pjoin(self.me_dir, 'error')) + continue # bypass 
bad process + self.cmd.compile(['madevent_forhel'], cwd=Pdir) if not os.path.exists(pjoin(Pdir, 'madevent_forhel')): raise Exception('Error make madevent_forhel not successful') @@ -183,11 +187,13 @@ def get_helicity(self, to_submit=True, clean=True): #sym_input = "%(points)d %(iterations)d %(accuracy)f \n" % self.opts (stdout, _) = p.communicate(" ".encode()) stdout = stdout.decode('ascii',errors='ignore') - if os.path.exists(pjoin(self.me_dir,'error')): + if os.path.exists(pjoin(self.me_dir, 'error')): raise Exception(pjoin(self.me_dir,'error')) # note a continue is not enough here, we have in top to link # the matrixX_optim.f to matrixX_orig.f to let the code to work # after this error. + # for matrix_file in misc.glob('matrix*orig.f', Pdir): + # files.cp(matrix_file, matrix_file.replace('orig','optim')) if 'no events passed cuts' in stdout: raise Exception diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py index c9d1c7706a..0b849330ef 100644 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py @@ -57,7 +57,7 @@ def reset_simd(self, old_value, new_value, name): return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) - + def plugin_input(self, finput): return @@ -79,7 +79,7 @@ def check_validity(self): self['sde_strategy'] = 1 if self['hel_recycling']: self['hel_recycling'] = False - + class GPURunCard(CPPRunCard): def default_setup(self): super(CPPRunCard, self).default_setup() diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py index d722702891..853aabc98a 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py @@ -1,4 +1,4 @@ -################################################################################ +############################################################################### # # Copyright (c) 2011 The MadGraph5_aMC@NLO Development team and Contributors # @@ -3675,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_combine_iteration(self, line): + def do_comine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3703,7 +3703,7 @@ def do_combine_iteration(self, line): ############################################################################ def do_combine_events(self, line): """Advanced commands: Launch combine events""" - + start=time.time() args = self.split_arg(line) start = time.time() # Check argument's validity @@ -3798,9 +3798,7 @@ def do_combine_events(self, line): self.correct_bias() elif self.run_card['custom_fcts']: self.correct_bias() - - logger.info("combine events done in %s", time.time()-start) - + logger.info("combination of events done in %s s ", time.time()-start) self.to_store.append('event') @@ -7368,7 +7366,7 @@ def wait_monitoring(Idle, Running, Done): import optparse # Get the directory of the script real path (bin) # and add it to the current PYTHONPATH - root_path = os.path.dirname(os.path.dirname(os.path.realpath( __file__ ))) + #root_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath( __file__ )))) 
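Note on the banner.py hunk above (RunCard.write): instead of substituting the template directly, the patched code first appends a "$<blockname>" placeholder for any block missing from the template text, then performs a single string.Template substitution. A minimal standalone illustration of that behaviour follows; the block names and contents are invented for the example.

    from string import Template

    # Illustration only: mimic the append-missing-placeholder logic from banner.py.
    text = "run settings\n$cuts\n"
    mapping = {'cuts': 'ptj = 20', 'heavymass': 'mt = 172.5'}  # 'heavymass' has no placeholder yet

    for name in mapping:
        if "$%s" % name not in text:
            text += "\n$%s\n" % name      # add the missing placeholder before substituting

    text = Template(text).substitute(mapping)
    print(text)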
sys.path.insert(0, root_path) class MyOptParser(optparse.OptionParser): @@ -7411,7 +7409,13 @@ def error(self, msg=''): import logging.config # Set logging level according to the logging level given by options #logging.basicConfig(level=vars(logging)[options.logging]) + import internal import internal.coloring_logging + # internal.file = XXX/bin/internal/__init__.py + # => need three dirname to get XXX + # we use internal to have any issue with pythonpath finding the wrong file + me_dir = os.path.dirname(os.path.dirname(os.path.dirname(internal.__file__))) + print("me_dir is", me_dir) try: if __debug__ and options.logging == 'INFO': options.logging = 'DEBUG' @@ -7419,7 +7423,8 @@ def error(self, msg=''): level = int(options.logging) else: level = eval('logging.' + options.logging) - logging.config.fileConfig(os.path.join(root_path, 'internal', 'me5_logging.conf')) + log_path = os.path.join(me_dir, 'bin', 'internal', 'me5_logging.conf') + logging.config.fileConfig(log_path) logging.root.setLevel(level) logging.getLogger('madgraph').setLevel(level) except: @@ -7433,9 +7438,9 @@ def error(self, msg=''): if '--web' in args: i = args.index('--web') args.pop(i) - cmd_line = MadEventCmd(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmd(me_dir, force_run=True) else: - cmd_line = MadEventCmdShell(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmdShell(me_dir, force_run=True) if not hasattr(cmd_line, 'do_%s' % args[0]): if parser_error: print(parser_error) diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/misc.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/misc.py index d3fed3baa2..91cd3e5c22 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/misc.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/misc.py @@ -347,7 +347,7 @@ def tell(msg): if dependency=='ninja': if cmd.options['ninja'] in ['None',None,''] or\ (cmd.options['ninja'] == './HEPTools/lib' and not MG5dir is None and\ - which_lib(pjoin(MG5dir,cmd.options['ninja'],'libninja.a')) is None): + which_lib(pjoin(MG5dir,cmd.options['ninja'],'lib','libninja.a')) is None): tell("Installing ninja...") cmd.do_install('ninja') diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/shower_card.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/shower_card.py index c6d3948cc4..c344ea1b15 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/shower_card.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/shower_card.py @@ -45,7 +45,9 @@ class ShowerCard(dict): false = ['.false.', 'f', 'false', '0'] logical_vars = ['ue_enabled', 'hadronize', 'b_stable', 'pi_stable', 'wp_stable', 'wm_stable', 'z_stable', 'h_stable', 'tap_stable', 'tam_stable', - 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td'] + 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td', + 'space_shower_me_corrections', 'time_shower_me_corrections', + 'time_shower_me_extended', 'time_shower_me_after_first'] string_vars = ['extralibs', 'extrapaths', 'includepaths', 'analyse'] for i in range(1,100): string_vars.append('dm_'+str(i)) @@ -82,7 +84,11 @@ class ShowerCard(dict): 'b_mass' : {'HERWIG6':'b_mass', 'PYTHIA6': 'b_mass', 'HERWIGPP': 'b_mass', 'PYTHIA8': 'b_mass'}, 'analyse' : {'HERWIG6':'hwuti', 'PYTHIA6':'pyuti', 'HERWIGPP':'hwpputi', 'PYTHIA8':'py8uti'}, 'qcut' : {'PYTHIA8':'qcut'}, - 'njmax' : {'PYTHIA8':'njmax'}} + 'njmax' : {'PYTHIA8':'njmax'}, + 'space_shower_me_corrections' : {'PYTHIA8':'space_shower_me_corrections'}, + 'time_shower_me_corrections' : {'PYTHIA8':'time_shower_me_corrections'}, + 'time_shower_me_extended' : 
{'PYTHIA8':'time_shower_me_extended'}, + 'time_shower_me_after_first' : {'PYTHIA8':'time_shower_me_after_first'}} stdhep_dict = {'HERWIG6':'mcatnlo_hwan_stdhep.o', 'PYTHIA6':'mcatnlo_pyan_stdhep.o'} diff --git a/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h index 8995b15c82..361b488401 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc index 9d09eb6b62..64fc3fea62 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h index 6b32c66b9b..b6568d3761 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index 5643c4439c..68afa8d9b0 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -14,7 +14,7 @@ Running MG5 in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect 2023-08-08 * +* VERSION 3.5.2_lo_vect 2023-11-08 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005489349365234375  +DEBUG: model prefixing takes 0.005378007888793945  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.022 s +1 processes with 16 diagrams generated in 0.021 s Total: 1 processes with 16 diagrams output madevent ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -184,21 +184,21 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg Generated helas calls for 1 subprocesses (16 diagrams) in 0.038 s -Wrote files for 36 helas calls in 0.151 s +Wrote files for 36 helas calls in 0.148 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.330 s +ALOHA: aloha creates 5 routines in 0.323 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -206,7 +206,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.314 s +ALOHA: aloha creates 10 routines in 0.309 s VVV1 VVV1 FFV1 @@ -236,6 +236,7 @@ patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile patching file bin/internal/gen_ximprove.py +Hunk #1 succeeded at 391 (offset 6 lines). 
patching file bin/internal/madevent_interface.py DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/SubProcesses/P1_gg_ttxg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file auto_dsig1.f @@ -252,9 +253,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.275s -user 0m1.969s -sys 0m0.229s +real 0m5.147s +user 0m1.924s +sys 0m0.225s ************************************************************ * * * W E L C O M E to * @@ -267,7 +268,7 @@ sys 0m0.229s * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect * +* VERSION 3.5.2_lo_vect * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -300,7 +301,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect * +* VERSION 3.5.2_lo_vect * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat index 3af4991f01..a0ffbbc219 100644 --- a/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.5.1_lo_vect 2023-08-08 * +#* VERSION 3.5.2_lo_vect 2023-11-08 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/gg_ttg.mad/MGMEVersion.txt b/epochX/cudacpp/gg_ttg.mad/MGMEVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/gg_ttg.mad/MGMEVersion.txt +++ b/epochX/cudacpp/gg_ttg.mad/MGMEVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MGVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index f7f5899260..5e2bf0d19a 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h index 9f559fe3ae..37d6ebe981 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. 
Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f index d528b1d2f0..dd4cd3a0c2 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f index 110e204c24..e28575ead8 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -216,7 +216,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f index bf665ff6e0..a885b7fde3 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -317,7 +317,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/genps.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/genps.f index fe9c61504b..c00e33d954 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/genps.f @@ -1877,12 +1877,12 @@ double precision function get_channel_cut(p, config) d1 = iforest(1, -i, config) d2 = iforest(2, -i, config) do j=0,3 - if (d1.gt.0.and.d1.le.2) then + if (d1.gt.0.and.d1.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d1) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d1) endif - if (d2.gt.0.and.d2.le.2) then + if (d2.gt.0.and.d2.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d2) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d2) diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/__init__.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/__init__.py index 16e60d8182..0d17042f0d 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/__init__.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/__init__.py @@ -26,6 +26,7 @@ class aMCatNLOError(MadGraph5Error): import os import logging import time +pjoin = os.path.join #Look for basic file position MG5DIR and MG4DIR MG5DIR = os.path.realpath(os.path.join(os.path.dirname(__file__), diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py index ef1bf58979..3995ce8109 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py @@ -2704,7 +2704,8 @@ def __new__(cls, finput=None, **opt): except Exception as error: import launch_plugin target_class = launch_plugin.RunCard - + elif issubclass(finput, RunCard): + target_class = finput else: return None @@ -2968,11 +2969,12 @@ def write(self, output_file, template=None, python_template=False, if python_template and not to_write: import string if self.blocks: - text = string.Template(text) mapping = {} for b in self.blocks: mapping[b.name] = b.get_template(self) - text = text.substitute(mapping) + if "$%s" % b.name not in text: + text += "\n$%s\n" % b.name + text = string.Template(text).substitute(mapping) if not self.list_parameter: text = text % self @@ -4875,6 +4877,9 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): continue break + if proc_characteristic['ninitial'] == 1: + self['SDE_strategy'] =1 + if 'MLM' in proc_characteristic['limitations']: if self['dynamical_scale_choice'] == -1: self['dynamical_scale_choice'] = 3 @@ -5940,7 +5945,7 @@ def default_setup(self): self.add_param("CheckCycle", 3) self.add_param("MaxAttempts", 10) self.add_param("ZeroThres", 1e-9) - self.add_param("OSThres", 1.0e-13) + self.add_param("OSThres", 1.0e-8) self.add_param("DoubleCheckHelicityFilter", True) self.add_param("WriteOutFilters", True) self.add_param("UseLoopFilter", False) diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/common_run_interface.py index 14c7f310dc..87cb4b88df 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/common_run_interface.py @@ -4906,6 +4906,7 @@ def __init__(self, question, cards=[], from_banner=None, banner=None, mode='auto self.load_default() self.define_paths(**opt) self.last_editline_pos = 0 + self.update_dependent_done = False if 'allow_arg' not in opt or not opt['allow_arg']: # add some mininal content for this: @@ 
-6585,7 +6586,9 @@ def postcmd(self, stop, line): self.check_card_consistency() if self.param_consistency: try: - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) + self.update_dependent_done = False except MadGraph5Error as error: if 'Missing block:' in str(error): self.fail_due_to_format +=1 @@ -6638,6 +6641,8 @@ def do_update(self, line, timer=0): self.update_dependent(self.mother_interface, self.me_dir, self.param_card, self.paths['param'], timer, run_card=self.run_card, lhapdfconfig=self.lhapdf) + self.update_dependent_done = True + elif args[0] == 'missing': self.update_missing() @@ -6717,12 +6722,13 @@ class TimeOutError(Exception): def handle_alarm(signum, frame): raise TimeOutError signal.signal(signal.SIGALRM, handle_alarm) + if timer: - signal.alarm(timer) log_level=30 else: log_level=20 + if run_card: as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, @@ -6781,6 +6787,10 @@ def handle_alarm(signum, frame): logger.log(log_level, "update the strong coupling value (alpha_s) to the value from the pdf selected: %s", as_for_pdf[pdlabel]) modify = True + if timer: + signal.alarm(timer) + + # Try to load the model in the limited amount of time allowed try: model = mecmd.get_model() @@ -6909,7 +6919,8 @@ def check_block(self, blockname): def check_answer_consistency(self): """function called if the code reads a file""" self.check_card_consistency() - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) def help_set(self): '''help message for set''' diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_ximprove.py index a88d60b282..5fd170d18d 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_ximprove.py @@ -157,12 +157,16 @@ def get_helicity(self, to_submit=True, clean=True): (stdout, _) = p.communicate(''.encode()) stdout = stdout.decode('ascii',errors='ignore') - try: + if stdout: nb_channel = max([math.floor(float(d)) for d in stdout.split()]) - except Exception as error: - misc.sprint(stdout, 'no channel or error for %s' % Pdir) - continue - + else: + for matrix_file in misc.glob('matrix*orig.f', Pdir): + files.cp(matrix_file, matrix_file.replace('orig','optim')) + P_zero_result.append(Pdir) + if os.path.exists(pjoin(self.me_dir, 'error')): + os.remove(pjoin(self.me_dir, 'error')) + continue # bypass bad process + self.cmd.compile(['madevent_forhel'], cwd=Pdir) if not os.path.exists(pjoin(Pdir, 'madevent_forhel')): raise Exception('Error make madevent_forhel not successful') @@ -183,11 +187,13 @@ def get_helicity(self, to_submit=True, clean=True): #sym_input = "%(points)d %(iterations)d %(accuracy)f \n" % self.opts (stdout, _) = p.communicate(" ".encode()) stdout = stdout.decode('ascii',errors='ignore') - if os.path.exists(pjoin(self.me_dir,'error')): + if os.path.exists(pjoin(self.me_dir, 'error')): raise Exception(pjoin(self.me_dir,'error')) # note a continue is not enough here, we have in top to link # the matrixX_optim.f to matrixX_orig.f to let the code to work # after this error. 
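Note (illustration only, not applied by this patch): the common_run_interface.py hunks above introduce an update_dependent_done flag so that the expensive do_update('dependent') pass runs at most once per card-editing session instead of on every postcmd and again in check_answer_consistency. A minimal sketch of that run-once pattern, with hypothetical class and method names:

    class CardEditor:
        # illustrative stand-in for the patched question/answer loop
        def __init__(self):
            self.update_dependent_done = False

        def do_update_dependent(self):
            # placeholder for the expensive dependent-parameter recomputation
            self.update_dependent_done = True

        def postcmd(self):
            # same guard as the patched postcmd: skip the update if it already ran,
            # then clear the flag so the next command re-checks
            if not self.update_dependent_done:
                self.do_update_dependent()
            self.update_dependent_done = False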
+ # for matrix_file in misc.glob('matrix*orig.f', Pdir): + # files.cp(matrix_file, matrix_file.replace('orig','optim')) if 'no events passed cuts' in stdout: raise Exception diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py index c9d1c7706a..0b849330ef 100644 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py @@ -57,7 +57,7 @@ def reset_simd(self, old_value, new_value, name): return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) - + def plugin_input(self, finput): return @@ -79,7 +79,7 @@ def check_validity(self): self['sde_strategy'] = 1 if self['hel_recycling']: self['hel_recycling'] = False - + class GPURunCard(CPPRunCard): def default_setup(self): super(CPPRunCard, self).default_setup() diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py index d722702891..853aabc98a 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py @@ -1,4 +1,4 @@ -################################################################################ +############################################################################### # # Copyright (c) 2011 The MadGraph5_aMC@NLO Development team and Contributors # @@ -3675,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_combine_iteration(self, line): + def do_comine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3703,7 +3703,7 @@ def do_combine_iteration(self, line): ############################################################################ def do_combine_events(self, line): """Advanced commands: Launch combine events""" - + start=time.time() args = self.split_arg(line) start = time.time() # Check argument's validity @@ -3798,9 +3798,7 @@ def do_combine_events(self, line): self.correct_bias() elif self.run_card['custom_fcts']: self.correct_bias() - - logger.info("combine events done in %s", time.time()-start) - + logger.info("combination of events done in %s s ", time.time()-start) self.to_store.append('event') @@ -7368,7 +7366,7 @@ def wait_monitoring(Idle, Running, Done): import optparse # Get the directory of the script real path (bin) # and add it to the current PYTHONPATH - root_path = os.path.dirname(os.path.dirname(os.path.realpath( __file__ ))) + #root_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath( __file__ )))) sys.path.insert(0, root_path) class MyOptParser(optparse.OptionParser): @@ -7411,7 +7409,13 @@ def error(self, msg=''): import logging.config # Set logging level according to the logging level given by options #logging.basicConfig(level=vars(logging)[options.logging]) + import internal import internal.coloring_logging + # internal.file = XXX/bin/internal/__init__.py + # => need three dirname to get XXX + # we use internal to have any issue with pythonpath finding the wrong file + me_dir = os.path.dirname(os.path.dirname(os.path.dirname(internal.__file__))) + print("me_dir is", me_dir) try: if __debug__ and options.logging == 'INFO': options.logging = 'DEBUG' @@ -7419,7 +7423,8 @@ def error(self, msg=''): 
level = int(options.logging) else: level = eval('logging.' + options.logging) - logging.config.fileConfig(os.path.join(root_path, 'internal', 'me5_logging.conf')) + log_path = os.path.join(me_dir, 'bin', 'internal', 'me5_logging.conf') + logging.config.fileConfig(log_path) logging.root.setLevel(level) logging.getLogger('madgraph').setLevel(level) except: @@ -7433,9 +7438,9 @@ def error(self, msg=''): if '--web' in args: i = args.index('--web') args.pop(i) - cmd_line = MadEventCmd(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmd(me_dir, force_run=True) else: - cmd_line = MadEventCmdShell(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmdShell(me_dir, force_run=True) if not hasattr(cmd_line, 'do_%s' % args[0]): if parser_error: print(parser_error) diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/misc.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/misc.py index d3fed3baa2..91cd3e5c22 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/misc.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/misc.py @@ -347,7 +347,7 @@ def tell(msg): if dependency=='ninja': if cmd.options['ninja'] in ['None',None,''] or\ (cmd.options['ninja'] == './HEPTools/lib' and not MG5dir is None and\ - which_lib(pjoin(MG5dir,cmd.options['ninja'],'libninja.a')) is None): + which_lib(pjoin(MG5dir,cmd.options['ninja'],'lib','libninja.a')) is None): tell("Installing ninja...") cmd.do_install('ninja') diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/shower_card.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/shower_card.py index c6d3948cc4..c344ea1b15 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/shower_card.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/shower_card.py @@ -45,7 +45,9 @@ class ShowerCard(dict): false = ['.false.', 'f', 'false', '0'] logical_vars = ['ue_enabled', 'hadronize', 'b_stable', 'pi_stable', 'wp_stable', 'wm_stable', 'z_stable', 'h_stable', 'tap_stable', 'tam_stable', - 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td'] + 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td', + 'space_shower_me_corrections', 'time_shower_me_corrections', + 'time_shower_me_extended', 'time_shower_me_after_first'] string_vars = ['extralibs', 'extrapaths', 'includepaths', 'analyse'] for i in range(1,100): string_vars.append('dm_'+str(i)) @@ -82,7 +84,11 @@ class ShowerCard(dict): 'b_mass' : {'HERWIG6':'b_mass', 'PYTHIA6': 'b_mass', 'HERWIGPP': 'b_mass', 'PYTHIA8': 'b_mass'}, 'analyse' : {'HERWIG6':'hwuti', 'PYTHIA6':'pyuti', 'HERWIGPP':'hwpputi', 'PYTHIA8':'py8uti'}, 'qcut' : {'PYTHIA8':'qcut'}, - 'njmax' : {'PYTHIA8':'njmax'}} + 'njmax' : {'PYTHIA8':'njmax'}, + 'space_shower_me_corrections' : {'PYTHIA8':'space_shower_me_corrections'}, + 'time_shower_me_corrections' : {'PYTHIA8':'time_shower_me_corrections'}, + 'time_shower_me_extended' : {'PYTHIA8':'time_shower_me_extended'}, + 'time_shower_me_after_first' : {'PYTHIA8':'time_shower_me_after_first'}} stdhep_dict = {'HERWIG6':'mcatnlo_hwan_stdhep.o', 'PYTHIA6':'mcatnlo_pyan_stdhep.o'} diff --git a/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h index 8995b15c82..361b488401 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 
3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc index 9d09eb6b62..64fc3fea62 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h index 6b32c66b9b..b6568d3761 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index 6c3bb7fa30..97056958fe 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -14,7 +14,7 @@ Running MG5 in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect 2023-08-08 * +* VERSION 3.5.2_lo_vect 2023-11-08 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00565791130065918  +DEBUG: model prefixing takes 0.005817890167236328  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
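Note (illustration only, not applied by this patch): the genps.f hunk near the top of this file set replaces the hard-coded upper bound 2 with nincoming in get_channel_cut, so incoming legs are identified correctly also for 1->N decay processes. A rough Python transcription of the per-daughter rule, with placeholder data structures:

    def fold_daughter(ptemp, i, d, nincoming):
        # illustrative only: ptemp[(j, k)] holds component j of the momentum of leg k;
        # negative k labels the internal propagator being accumulated
        for j in range(4):
            if 0 < d <= nincoming:
                ptemp[(j, -i)] -= ptemp[(j, d)]   # incoming leg: subtract its momentum
            else:
                ptemp[(j, -i)] += ptemp[(j, d)]   # outgoing leg or propagator: add it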
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -183,7 +183,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.331 s +ALOHA: aloha creates 5 routines in 0.326 s VVV1 VVV1 FFV1 @@ -204,6 +204,6 @@ INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/ DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  quit -real 0m0.836s -user 0m0.731s -sys 0m0.060s +real 0m3.779s +user 0m0.713s +sys 0m0.062s diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc index 9393033e26..7f5e51681d 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h index 9f559fe3ae..37d6ebe981 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h index 8995b15c82..361b488401 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc index 9d09eb6b62..64fc3fea62 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h index 6b32c66b9b..b6568d3761 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index 2401636ea2..eacd7a356a 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -14,7 +14,7 @@ Running MG5 in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect 2023-08-08 * +* VERSION 3.5.2_lo_vect 2023-11-08 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005415916442871094  +DEBUG: model prefixing takes 0.0053293704986572266  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.160 s +1 processes with 123 diagrams generated in 0.157 s Total: 1 processes with 123 diagrams output madevent ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 INFO: Creating files in directory P1_gg_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -184,21 +184,21 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 0, 46, 47, 48, 49, 50, 51, 52, 53, 54, 0, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 0, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 0, 88, 89, 90, 91, 92, 93, 0, 94, 95, 96, 97, 98, 99, 0, 100, 101, 102, 103, 104, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -Generated helas calls for 1 subprocesses (123 diagrams) in 0.446 s -Wrote files for 222 helas calls in 0.728 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.425 s +Wrote files for 222 helas calls in 0.691 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.337 s +ALOHA: aloha creates 5 routines in 0.333 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -206,7 +206,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.317 s +ALOHA: aloha creates 10 
routines in 0.316 s VVV1 VVV1 FFV1 @@ -239,6 +239,7 @@ patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile patching file bin/internal/gen_ximprove.py +Hunk #1 succeeded at 391 (offset 6 lines). patching file bin/internal/madevent_interface.py DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/SubProcesses/P1_gg_ttxgg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file auto_dsig1.f @@ -255,9 +256,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m3.354s -user 0m3.128s -sys 0m0.221s +real 0m6.262s +user 0m3.028s +sys 0m0.232s ************************************************************ * * * W E L C O M E to * @@ -270,7 +271,7 @@ sys 0m0.221s * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect * +* VERSION 3.5.2_lo_vect * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -303,7 +304,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect * +* VERSION 3.5.2_lo_vect * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat index e4d3fe550f..b7568d1a73 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.5.1_lo_vect 2023-08-08 * +#* VERSION 3.5.2_lo_vect 2023-11-08 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/gg_ttgg.mad/MGMEVersion.txt b/epochX/cudacpp/gg_ttgg.mad/MGMEVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/gg_ttgg.mad/MGMEVersion.txt +++ b/epochX/cudacpp/gg_ttgg.mad/MGMEVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MGVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc index 896d64343e..57dd4aed47 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h index d681eb7504..04f7c62976 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig.f index 9d747e6dc1..adf0afbe05 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f index 043887bde3..e4e527260c 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -216,7 +216,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f index df931e07c4..272c6bd97d 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -349,7 +349,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/genps.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/genps.f index fe9c61504b..c00e33d954 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/genps.f @@ -1877,12 +1877,12 @@ double precision function get_channel_cut(p, config) d1 = iforest(1, -i, config) d2 = iforest(2, -i, config) do j=0,3 - if (d1.gt.0.and.d1.le.2) then + if (d1.gt.0.and.d1.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d1) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d1) endif - if (d2.gt.0.and.d2.le.2) then + if (d2.gt.0.and.d2.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d2) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d2) diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/__init__.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/__init__.py index 16e60d8182..0d17042f0d 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/__init__.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/__init__.py @@ -26,6 +26,7 @@ class aMCatNLOError(MadGraph5Error): import os import logging import time +pjoin = os.path.join #Look for basic file position MG5DIR and MG4DIR MG5DIR = os.path.realpath(os.path.join(os.path.dirname(__file__), diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py index ef1bf58979..3995ce8109 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py @@ -2704,7 +2704,8 @@ def __new__(cls, finput=None, **opt): except Exception as error: import launch_plugin target_class = launch_plugin.RunCard - + elif issubclass(finput, RunCard): + target_class = finput else: return None @@ -2968,11 +2969,12 @@ def write(self, output_file, template=None, python_template=False, if python_template and not to_write: import string if self.blocks: - text = string.Template(text) mapping = {} for b in self.blocks: mapping[b.name] = b.get_template(self) - text = text.substitute(mapping) + if "$%s" % b.name not in text: + text += "\n$%s\n" % b.name + text = string.Template(text).substitute(mapping) if not self.list_parameter: text = text % self @@ -4875,6 +4877,9 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): continue break + if proc_characteristic['ninitial'] == 1: + self['SDE_strategy'] =1 + if 'MLM' in proc_characteristic['limitations']: if self['dynamical_scale_choice'] == -1: self['dynamical_scale_choice'] = 3 @@ -5940,7 +5945,7 @@ def default_setup(self): self.add_param("CheckCycle", 3) self.add_param("MaxAttempts", 10) self.add_param("ZeroThres", 1e-9) - self.add_param("OSThres", 1.0e-13) + self.add_param("OSThres", 1.0e-8) self.add_param("DoubleCheckHelicityFilter", True) self.add_param("WriteOutFilters", True) self.add_param("UseLoopFilter", False) diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/common_run_interface.py index 14c7f310dc..87cb4b88df 100755 --- 
a/epochX/cudacpp/gg_ttgg.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/common_run_interface.py @@ -4906,6 +4906,7 @@ def __init__(self, question, cards=[], from_banner=None, banner=None, mode='auto self.load_default() self.define_paths(**opt) self.last_editline_pos = 0 + self.update_dependent_done = False if 'allow_arg' not in opt or not opt['allow_arg']: # add some mininal content for this: @@ -6585,7 +6586,9 @@ def postcmd(self, stop, line): self.check_card_consistency() if self.param_consistency: try: - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) + self.update_dependent_done = False except MadGraph5Error as error: if 'Missing block:' in str(error): self.fail_due_to_format +=1 @@ -6638,6 +6641,8 @@ def do_update(self, line, timer=0): self.update_dependent(self.mother_interface, self.me_dir, self.param_card, self.paths['param'], timer, run_card=self.run_card, lhapdfconfig=self.lhapdf) + self.update_dependent_done = True + elif args[0] == 'missing': self.update_missing() @@ -6717,12 +6722,13 @@ class TimeOutError(Exception): def handle_alarm(signum, frame): raise TimeOutError signal.signal(signal.SIGALRM, handle_alarm) + if timer: - signal.alarm(timer) log_level=30 else: log_level=20 + if run_card: as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, @@ -6781,6 +6787,10 @@ def handle_alarm(signum, frame): logger.log(log_level, "update the strong coupling value (alpha_s) to the value from the pdf selected: %s", as_for_pdf[pdlabel]) modify = True + if timer: + signal.alarm(timer) + + # Try to load the model in the limited amount of time allowed try: model = mecmd.get_model() @@ -6909,7 +6919,8 @@ def check_block(self, blockname): def check_answer_consistency(self): """function called if the code reads a file""" self.check_card_consistency() - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) def help_set(self): '''help message for set''' diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_ximprove.py index a88d60b282..5fd170d18d 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_ximprove.py @@ -157,12 +157,16 @@ def get_helicity(self, to_submit=True, clean=True): (stdout, _) = p.communicate(''.encode()) stdout = stdout.decode('ascii',errors='ignore') - try: + if stdout: nb_channel = max([math.floor(float(d)) for d in stdout.split()]) - except Exception as error: - misc.sprint(stdout, 'no channel or error for %s' % Pdir) - continue - + else: + for matrix_file in misc.glob('matrix*orig.f', Pdir): + files.cp(matrix_file, matrix_file.replace('orig','optim')) + P_zero_result.append(Pdir) + if os.path.exists(pjoin(self.me_dir, 'error')): + os.remove(pjoin(self.me_dir, 'error')) + continue # bypass bad process + self.cmd.compile(['madevent_forhel'], cwd=Pdir) if not os.path.exists(pjoin(Pdir, 'madevent_forhel')): raise Exception('Error make madevent_forhel not successful') @@ -183,11 +187,13 @@ def get_helicity(self, to_submit=True, clean=True): #sym_input = "%(points)d %(iterations)d %(accuracy)f \n" % self.opts (stdout, _) = p.communicate(" ".encode()) stdout = stdout.decode('ascii',errors='ignore') - if os.path.exists(pjoin(self.me_dir,'error')): + if os.path.exists(pjoin(self.me_dir, 'error')): raise Exception(pjoin(self.me_dir,'error')) # note a continue is not enough here, we have in top to 
link # the matrixX_optim.f to matrixX_orig.f to let the code to work # after this error. + # for matrix_file in misc.glob('matrix*orig.f', Pdir): + # files.cp(matrix_file, matrix_file.replace('orig','optim')) if 'no events passed cuts' in stdout: raise Exception diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py index c9d1c7706a..0b849330ef 100644 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py @@ -57,7 +57,7 @@ def reset_simd(self, old_value, new_value, name): return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) - + def plugin_input(self, finput): return @@ -79,7 +79,7 @@ def check_validity(self): self['sde_strategy'] = 1 if self['hel_recycling']: self['hel_recycling'] = False - + class GPURunCard(CPPRunCard): def default_setup(self): super(CPPRunCard, self).default_setup() diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py index d722702891..853aabc98a 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py @@ -1,4 +1,4 @@ -################################################################################ +############################################################################### # # Copyright (c) 2011 The MadGraph5_aMC@NLO Development team and Contributors # @@ -3675,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_combine_iteration(self, line): + def do_comine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3703,7 +3703,7 @@ def do_combine_iteration(self, line): ############################################################################ def do_combine_events(self, line): """Advanced commands: Launch combine events""" - + start=time.time() args = self.split_arg(line) start = time.time() # Check argument's validity @@ -3798,9 +3798,7 @@ def do_combine_events(self, line): self.correct_bias() elif self.run_card['custom_fcts']: self.correct_bias() - - logger.info("combine events done in %s", time.time()-start) - + logger.info("combination of events done in %s s ", time.time()-start) self.to_store.append('event') @@ -7368,7 +7366,7 @@ def wait_monitoring(Idle, Running, Done): import optparse # Get the directory of the script real path (bin) # and add it to the current PYTHONPATH - root_path = os.path.dirname(os.path.dirname(os.path.realpath( __file__ ))) + #root_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath( __file__ )))) sys.path.insert(0, root_path) class MyOptParser(optparse.OptionParser): @@ -7411,7 +7409,13 @@ def error(self, msg=''): import logging.config # Set logging level according to the logging level given by options #logging.basicConfig(level=vars(logging)[options.logging]) + import internal import internal.coloring_logging + # internal.file = XXX/bin/internal/__init__.py + # => need three dirname to get XXX + # we use internal to have any issue with pythonpath finding the wrong file + me_dir = os.path.dirname(os.path.dirname(os.path.dirname(internal.__file__))) + print("me_dir is", me_dir) try: if __debug__ and 
options.logging == 'INFO': options.logging = 'DEBUG' @@ -7419,7 +7423,8 @@ def error(self, msg=''): level = int(options.logging) else: level = eval('logging.' + options.logging) - logging.config.fileConfig(os.path.join(root_path, 'internal', 'me5_logging.conf')) + log_path = os.path.join(me_dir, 'bin', 'internal', 'me5_logging.conf') + logging.config.fileConfig(log_path) logging.root.setLevel(level) logging.getLogger('madgraph').setLevel(level) except: @@ -7433,9 +7438,9 @@ def error(self, msg=''): if '--web' in args: i = args.index('--web') args.pop(i) - cmd_line = MadEventCmd(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmd(me_dir, force_run=True) else: - cmd_line = MadEventCmdShell(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmdShell(me_dir, force_run=True) if not hasattr(cmd_line, 'do_%s' % args[0]): if parser_error: print(parser_error) diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/misc.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/misc.py index d3fed3baa2..91cd3e5c22 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/misc.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/misc.py @@ -347,7 +347,7 @@ def tell(msg): if dependency=='ninja': if cmd.options['ninja'] in ['None',None,''] or\ (cmd.options['ninja'] == './HEPTools/lib' and not MG5dir is None and\ - which_lib(pjoin(MG5dir,cmd.options['ninja'],'libninja.a')) is None): + which_lib(pjoin(MG5dir,cmd.options['ninja'],'lib','libninja.a')) is None): tell("Installing ninja...") cmd.do_install('ninja') diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/shower_card.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/shower_card.py index c6d3948cc4..c344ea1b15 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/shower_card.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/shower_card.py @@ -45,7 +45,9 @@ class ShowerCard(dict): false = ['.false.', 'f', 'false', '0'] logical_vars = ['ue_enabled', 'hadronize', 'b_stable', 'pi_stable', 'wp_stable', 'wm_stable', 'z_stable', 'h_stable', 'tap_stable', 'tam_stable', - 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td'] + 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td', + 'space_shower_me_corrections', 'time_shower_me_corrections', + 'time_shower_me_extended', 'time_shower_me_after_first'] string_vars = ['extralibs', 'extrapaths', 'includepaths', 'analyse'] for i in range(1,100): string_vars.append('dm_'+str(i)) @@ -82,7 +84,11 @@ class ShowerCard(dict): 'b_mass' : {'HERWIG6':'b_mass', 'PYTHIA6': 'b_mass', 'HERWIGPP': 'b_mass', 'PYTHIA8': 'b_mass'}, 'analyse' : {'HERWIG6':'hwuti', 'PYTHIA6':'pyuti', 'HERWIGPP':'hwpputi', 'PYTHIA8':'py8uti'}, 'qcut' : {'PYTHIA8':'qcut'}, - 'njmax' : {'PYTHIA8':'njmax'}} + 'njmax' : {'PYTHIA8':'njmax'}, + 'space_shower_me_corrections' : {'PYTHIA8':'space_shower_me_corrections'}, + 'time_shower_me_corrections' : {'PYTHIA8':'time_shower_me_corrections'}, + 'time_shower_me_extended' : {'PYTHIA8':'time_shower_me_extended'}, + 'time_shower_me_after_first' : {'PYTHIA8':'time_shower_me_after_first'}} stdhep_dict = {'HERWIG6':'mcatnlo_hwan_stdhep.o', 'PYTHIA6':'mcatnlo_pyan_stdhep.o'} diff --git a/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h index 9b946c21e1..8df465ad6d 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
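Note (illustration only, not applied by this patch): the madevent_interface.py __main__ hunks above stop deriving paths from root_path and instead locate the process directory from bin/internal/__init__.py (three dirname calls), then load bin/internal/me5_logging.conf relative to it. The path arithmetic, sketched with a hypothetical install location:

    import os

    internal_init = "/work/PROC_example/bin/internal/__init__.py"  # hypothetical internal.__file__
    # __init__.py -> internal -> bin -> process directory
    me_dir = os.path.dirname(os.path.dirname(os.path.dirname(internal_init)))
    log_path = os.path.join(me_dir, "bin", "internal", "me5_logging.conf")
    assert me_dir == "/work/PROC_example"
    assert log_path == "/work/PROC_example/bin/internal/me5_logging.conf"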
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc index 9d09eb6b62..64fc3fea62 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h index 6b32c66b9b..b6568d3761 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index d29fe4c726..80631c94bf 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -14,7 +14,7 @@ Running MG5 in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect 2023-08-08 * +* VERSION 3.5.2_lo_vect 2023-11-08 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005518913269042969  +DEBUG: model prefixing takes 0.00567317008972168  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.160 s +1 processes with 123 diagrams generated in 0.157 s Total: 1 processes with 123 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttgg Load PLUGIN.CUDACPP_OUTPUT @@ -175,7 +175,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. -Generated helas calls for 1 subprocesses (123 diagrams) in 0.430 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.423 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -183,7 +183,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.324 s +ALOHA: aloha creates 5 routines in 0.318 s VVV1 VVV1 FFV1 @@ -207,6 +207,6 @@ INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  quit -real 0m1.541s -user 0m1.392s -sys 0m0.062s +real 0m4.435s +user 0m1.373s +sys 0m0.056s diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc index 927a19a802..204439a1dc 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h index d681eb7504..04f7c62976 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h index 9b946c21e1..8df465ad6d 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc index 9d09eb6b62..64fc3fea62 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h index 6b32c66b9b..b6568d3761 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index cd9806264d..ab3974344c 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -14,7 +14,7 @@ Running MG5 in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect 2023-08-08 * +* VERSION 3.5.2_lo_vect 2023-11-08 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005460023880004883  +DEBUG: model prefixing takes 0.005319833755493164  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.902 s +1 processes with 1240 diagrams generated in 1.855 s Total: 1 processes with 1240 diagrams output madevent ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -177,7 +177,7 @@ INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] INFO: Color-Flow passed to 1630 term in 8s. Introduce 3030 contraction DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -186,21 +186,21 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 0, 3, 4, 0, 5, 6, 0, 0, 0, 0, 0, 7, 8, 9, 0, 10, 11, 12, 0, 13, 14, 15, 0, 16, 17, 18, 19, 20, 21, 0, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 0, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 0, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 0, 67, 68, 69, 70, 71, 72, 73, 74, 75, 0, 76, 77, 78, 79, 80, 81, 82, 83, 84, 0, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 0, 0, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 0, 121, 122, 0, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 0, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 0, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 0, 197, 198, 199, 200, 201, 202, 0, 203, 204, 205, 206, 207, 208, 0, 209, 210, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 0, 226, 227, 0, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 0, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 0, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 0, 302, 303, 304, 305, 306, 307, 0, 308, 309, 310, 311, 312, 313, 0, 314, 315, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 316, 317, 318, 319, 320, 321, 0, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 0, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 0, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 0, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 0, 378, 379, 0, 380, 381, 0, 0, 0, 0, 0, 382, 383, 384, 385, 386, 387, 388, 389, 390, 0, 391, 392, 393, 394, 395, 396, 397, 398, 399, 0, 400, 401, 402, 403, 404, 405, 406, 407, 408, 0, 409, 410, 411, 412, 413, 414, 0, 415, 416, 417, 418, 419, 420, 0, 0, 0, 421, 422, 423, 424, 425, 426, 0, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 0, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 0, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 0, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 
482, 0, 483, 484, 0, 485, 486, 0, 0, 0, 0, 0, 487, 488, 489, 490, 491, 492, 493, 494, 495, 0, 496, 497, 498, 499, 500, 501, 502, 503, 504, 0, 505, 506, 507, 508, 509, 510, 511, 512, 513, 0, 514, 515, 516, 517, 518, 519, 0, 520, 521, 522, 523, 524, 525, 0, 0, 0, 526, 527, 528, 529, 530, 531, 0, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 0, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 0, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 0, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 0, 588, 589, 0, 590, 591, 0, 0, 0, 0, 0, 592, 593, 594, 595, 596, 597, 598, 599, 600, 0, 601, 602, 603, 604, 605, 606, 607, 608, 609, 0, 610, 611, 612, 613, 614, 615, 616, 617, 618, 0, 619, 620, 621, 622, 623, 624, 0, 625, 626, 627, 628, 629, 630, 0, 0, 0, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 0, 664, 665, 666, 667, 668, 669, 0, 670, 671, 672, 673, 674, 675, 0, 0, 0, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 0, 709, 710, 711, 712, 713, 714, 0, 715, 716, 717, 718, 719, 720, 0, 0, 0, 721, 722, 0, 723, 724, 0, 725, 726, 0, 0, 0, 0, 0, 727, 728, 729, 730, 731, 732, 733, 734, 735, 0, 736, 737, 738, 739, 740, 741, 742, 743, 744, 0, 745, 746, 747, 748, 749, 750, 751, 752, 753, 0, 754, 755, 756, 757, 758, 759, 0, 760, 761, 762, 763, 764, 765, 766, 767, 0, 768, 769, 0, 770, 771, 0, 0, 0, 0, 0, 772, 773, 774, 775, 776, 777, 778, 779, 780, 0, 781, 782, 783, 784, 785, 786, 787, 788, 789, 0, 790, 791, 792, 793, 794, 795, 796, 797, 798, 0, 799, 800, 801, 802, 803, 804, 0, 805, 806, 807, 808, 809, 810, 811, 812, 0, 813, 814, 0, 815, 816, 0, 0, 0, 0, 0, 817, 818, 819, 820, 821, 822, 823, 824, 825, 0, 826, 827, 828, 829, 830, 831, 832, 833, 834, 0, 835, 836, 837, 838, 839, 840, 841, 842, 843, 0, 844, 845, 846, 847, 848, 849, 0, 850, 851, 852, 853, 854, 855, 856, 857, 0, 858, 859, 0, 860, 861, 0, 0, 0, 0, 862, 863, 0, 864, 865, 0, 866, 867, 0, 0, 0, 0, 868, 869, 0, 870, 871, 0, 872, 873, 0, 0, 0, 0, 0, 0, 0, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890, 891, 0, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 0, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 0, 928, 929, 930, 931, 932, 933, 0, 934, 935, 936, 937, 938, 939, 0, 940, 941, 942, 943, 944, 945, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, 
subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -Generated helas calls for 1 subprocesses (1240 diagrams) in 6.662 s -Wrote files for 2281 helas calls in 18.810 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.574 s +Wrote files for 2281 helas calls in 18.431 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.319 s +ALOHA: aloha creates 5 routines in 0.335 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -208,7 +208,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.314 s +ALOHA: aloha creates 10 routines in 0.313 s VVV1 VVV1 FFV1 @@ -241,6 +241,7 @@ patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile patching file bin/internal/gen_ximprove.py +Hunk #1 succeeded at 391 (offset 6 lines). patching file bin/internal/madevent_interface.py DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses/P1_gg_ttxggg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file auto_dsig1.f @@ -257,9 +258,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m29.634s -user 0m29.131s -sys 0m0.396s +real 0m32.103s +user 0m28.586s +sys 0m0.412s ************************************************************ * * * W E L C O M E to * @@ -272,7 +273,7 @@ sys 0m0.396s * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect * +* VERSION 3.5.2_lo_vect * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -305,7 +306,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect * +* VERSION 3.5.2_lo_vect * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat index 05d11d495d..2f92ecc4ba 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.5.1_lo_vect 2023-08-08 * +#* VERSION 3.5.2_lo_vect 2023-11-08 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. 
* #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/gg_ttggg.mad/MGMEVersion.txt b/epochX/cudacpp/gg_ttggg.mad/MGMEVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/gg_ttggg.mad/MGMEVersion.txt +++ b/epochX/cudacpp/gg_ttggg.mad/MGMEVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MGVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc index a525c4ba3f..59033d7b2f 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h index dc41720ca6..2565923dde 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig.f index 2d3c5725be..d2a61fa2ac 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f index 51b8d47520..f22dfbf5e6 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -216,7 +216,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f index b8a6a894de..41dbc97183 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -413,7 +413,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -17540,7 +17540,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) TMP_JAMP(2914) = TMP_JAMP(2351) + TMP_JAMP(1665) ! used 2 times TMP_JAMP(2913) = TMP_JAMP(2310) + TMP_JAMP(2134) ! used 2 times TMP_JAMP(2912) = TMP_JAMP(2073) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(1483) ! used 2 times + $ ,1.000000000000000D+00)) * AMP(1481) ! used 2 times TMP_JAMP(3030) = TMP_JAMP(2935) + ((0.000000000000000D+00, $ -1.000000000000000D+00)) * TMP_JAMP(1044) ! used 2 times TMP_JAMP(3029) = TMP_JAMP(2934) - TMP_JAMP(329) ! 
used 2 times @@ -17688,7 +17688,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ +TMP_JAMP(360)+TMP_JAMP(485)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(558)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(576)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*AMP(1489)+(-1.000000000000000D+00) + $ ,1.000000000000000D+00))*AMP(1485)+(-1.000000000000000D+00) $ *TMP_JAMP(2911)+(-1.000000000000000D+00)*TMP_JAMP(2916)+( $ -1.000000000000000D+00)*TMP_JAMP(2971)+TMP_JAMP(2994) JAMP(2,1) = (-1.000000000000000D+00)*AMP(242)+( @@ -17698,7 +17698,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(557)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(576)+((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(1580)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(1480)+TMP_JAMP(2655)+(-1.000000000000000D+00) + $ *AMP(1476)+TMP_JAMP(2655)+(-1.000000000000000D+00) $ *TMP_JAMP(2913)+(-1.000000000000000D+00)*TMP_JAMP(2940) JAMP(3,1) = (-1.000000000000000D+00)*AMP(250)+( $ -1.000000000000000D+00)*TMP_JAMP(484)+((0.000000000000000D+00 @@ -17715,7 +17715,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ -1.000000000000000D+00))*TMP_JAMP(575)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(1589)+TMP_JAMP(1693) $ +TMP_JAMP(2050)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(1471)+(-1.000000000000000D+00)*TMP_JAMP(2353) + $ *AMP(1467)+(-1.000000000000000D+00)*TMP_JAMP(2353) $ +TMP_JAMP(2659)+TMP_JAMP(2905)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(2955)+TMP_JAMP(2960) JAMP(5,1) = (-1.000000000000000D+00)*AMP(241) @@ -17919,7 +17919,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) JAMP(25,1) = (-1.000000000000000D+00)*TMP_JAMP(360) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(454) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(517) - $ +(-1.000000000000000D+00)*AMP(976)+(-1.000000000000000D+00) + $ +(-1.000000000000000D+00)*AMP(974)+(-1.000000000000000D+00) $ *TMP_JAMP(1843)+TMP_JAMP(1859)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(2085)+TMP_JAMP(2104)+( $ -1.000000000000000D+00)*TMP_JAMP(2662)+TMP_JAMP(2851) @@ -17929,7 +17929,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ -1.000000000000000D+00))*TMP_JAMP(518)+(-1.000000000000000D+00) $ *TMP_JAMP(834)+(-1.000000000000000D+00)*TMP_JAMP(1019) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1374) - $ +(-1.000000000000000D+00)*AMP(967)+(-1.000000000000000D+00) + $ +(-1.000000000000000D+00)*AMP(965)+(-1.000000000000000D+00) $ *TMP_JAMP(1479)+TMP_JAMP(1842)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(2085)+(-1.000000000000000D+00) $ *TMP_JAMP(2129)+(-1.000000000000000D+00)*TMP_JAMP(2648) @@ -17940,7 +17940,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(513)+(-1.000000000000000D+00)*TMP_JAMP(809)+( $ -1.000000000000000D+00)*TMP_JAMP(1028)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(1373)+(-1.000000000000000D+00) - $ *AMP(975)+(-1.000000000000000D+00)*TMP_JAMP(1963)+TMP_JAMP(2060) + $ *AMP(973)+(-1.000000000000000D+00)*TMP_JAMP(1963)+TMP_JAMP(2060) $ +(-1.000000000000000D+00)*TMP_JAMP(2104)+TMP_JAMP(2317) $ +TMP_JAMP(2387)+TMP_JAMP(2567)+(-1.000000000000000D+00) $ *TMP_JAMP(2604)+TMP_JAMP(2796)+TMP_JAMP(2811)+( @@ -17950,7 +17950,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(470)+((0.000000000000000D+00,1.000000000000000D+00)) $ 
*TMP_JAMP(514)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(735)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1392)+(-1.000000000000000D+00)*AMP(958)+TMP_JAMP(1448) + $ *TMP_JAMP(1392)+(-1.000000000000000D+00)*AMP(956)+TMP_JAMP(1448) $ +(-1.000000000000000D+00)*TMP_JAMP(1839)+((0.000000000000000D $ +00,1.000000000000000D+00))*TMP_JAMP(1846)+(-1.000000000000000D $ +00)*TMP_JAMP(1919)+TMP_JAMP(1963)+(-1.000000000000000D+00) @@ -17960,13 +17960,13 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) JAMP(29,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(314)+((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(462)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(717)+(-1.000000000000000D+00)*AMP(966)+TMP_JAMP(1709) + $ *TMP_JAMP(717)+(-1.000000000000000D+00)*AMP(964)+TMP_JAMP(1709) $ +(-1.000000000000000D+00)*TMP_JAMP(1874)+TMP_JAMP(2061) - $ +TMP_JAMP(2129)+AMP(1642)+TMP_JAMP(2445)+(-1.000000000000000D + $ +TMP_JAMP(2129)+AMP(1638)+TMP_JAMP(2445)+(-1.000000000000000D $ +00)*TMP_JAMP(2493)+TMP_JAMP(2647)+TMP_JAMP(2985)+TMP_JAMP(2996) JAMP(30,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(320)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(520)+(-1.000000000000000D+00)*AMP(957)+( + $ *TMP_JAMP(520)+(-1.000000000000000D+00)*AMP(955)+( $ -1.000000000000000D+00)*TMP_JAMP(1840)+TMP_JAMP(1874) $ +TMP_JAMP(1919)+TMP_JAMP(1966)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(2064)+TMP_JAMP(2250)+( @@ -17974,7 +17974,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ +TMP_JAMP(3000)+TMP_JAMP(3007) JAMP(31,1) = TMP_JAMP(804)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(1391)+(-1.000000000000000D+00) - $ *AMP(979)+TMP_JAMP(1857)+TMP_JAMP(1894)+TMP_JAMP(2130) + $ *AMP(977)+TMP_JAMP(1857)+TMP_JAMP(1894)+TMP_JAMP(2130) $ +TMP_JAMP(2609)+(-1.000000000000000D+00)*TMP_JAMP(2816) $ +TMP_JAMP(2825)+(-1.000000000000000D+00)*TMP_JAMP(2863)+( $ -1.000000000000000D+00)*TMP_JAMP(3018) @@ -17982,7 +17982,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(949)+((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(1147)+TMP_JAMP(1280)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(1374)+(-1.000000000000000D+00) - $ *AMP(970)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *AMP(968)+((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(2067)+(-1.000000000000000D+00)*TMP_JAMP(2130) $ +TMP_JAMP(2333)+(-1.000000000000000D+00)*TMP_JAMP(2542) $ +TMP_JAMP(2713)+(-1.000000000000000D+00)*TMP_JAMP(2763) @@ -17991,7 +17991,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) JAMP(33,1) = (-1.000000000000000D+00)*TMP_JAMP(1102)+( $ -1.000000000000000D+00)*TMP_JAMP(1256)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(1391)+(-1.000000000000000D+00) - $ *AMP(977)+(-1.000000000000000D+00)*TMP_JAMP(1688)+( + $ *AMP(975)+(-1.000000000000000D+00)*TMP_JAMP(1688)+( $ -1.000000000000000D+00)*TMP_JAMP(2556)+TMP_JAMP(2811) $ +TMP_JAMP(2817)+TMP_JAMP(2882)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(2976)+(-1.000000000000000D+00) @@ -18009,7 +18009,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(1033)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(1152)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(1155)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1224)+(-1.000000000000000D+00)*AMP(968)+TMP_JAMP(1582) + $ 
*TMP_JAMP(1224)+(-1.000000000000000D+00)*AMP(966)+TMP_JAMP(1582) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2006) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2105) $ +TMP_JAMP(2514)+TMP_JAMP(2546)+TMP_JAMP(2695)+( @@ -18029,7 +18029,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ -1.000000000000000D+00))*TMP_JAMP(910)+TMP_JAMP(1277) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1346) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1373) - $ +(-1.000000000000000D+00)*AMP(980)+TMP_JAMP(1883) + $ +(-1.000000000000000D+00)*AMP(978)+TMP_JAMP(1883) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2066) $ +TMP_JAMP(2128)+TMP_JAMP(2609)+(-1.000000000000000D+00) $ *TMP_JAMP(2846)+(-1.000000000000000D+00)*TMP_JAMP(2899)+( @@ -18040,7 +18040,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ -1.000000000000000D+00))*TMP_JAMP(1143)+((0.000000000000000D+00 $ ,-1.000000000000000D+00))*TMP_JAMP(1148)+((0.000000000000000D $ +00,-1.000000000000000D+00))*TMP_JAMP(1392)+( - $ -1.000000000000000D+00)*AMP(961)+(-1.000000000000000D+00) + $ -1.000000000000000D+00)*AMP(959)+(-1.000000000000000D+00) $ *TMP_JAMP(2128)+((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(2138)+TMP_JAMP(2296)+(-1.000000000000000D+00) $ *TMP_JAMP(2483)+(-1.000000000000000D+00)*TMP_JAMP(2535)+( @@ -18050,7 +18050,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ -1.000000000000000D+00)*TMP_JAMP(1020)+(-1.000000000000000D+00) $ *TMP_JAMP(1039)+TMP_JAMP(1100)+(-1.000000000000000D+00) $ *TMP_JAMP(1255)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1346)+(-1.000000000000000D+00)*AMP(978)+TMP_JAMP(1686) + $ *TMP_JAMP(1346)+(-1.000000000000000D+00)*AMP(976)+TMP_JAMP(1686) $ +(-1.000000000000000D+00)*TMP_JAMP(1799)+((0.000000000000000D $ +00,1.000000000000000D+00))*TMP_JAMP(1988)+(-1.000000000000000D $ +00)*TMP_JAMP(2497)+TMP_JAMP(2591)+(-1.000000000000000D+00) @@ -18072,7 +18072,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ ,1.000000000000000D+00))*TMP_JAMP(1159)+((0.000000000000000D+00 $ ,-1.000000000000000D+00))*TMP_JAMP(1211)+(-1.000000000000000D $ +00)*TMP_JAMP(1270)+((0.000000000000000D+00,-1.000000000000000D - $ +00))*TMP_JAMP(1311)+(-1.000000000000000D+00)*AMP(959) + $ +00))*TMP_JAMP(1311)+(-1.000000000000000D+00)*AMP(957) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1784) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1868) $ +(-1.000000000000000D+00)*TMP_JAMP(1939)+((0.000000000000000D @@ -18094,11 +18094,11 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) JAMP(43,1) = TMP_JAMP(678)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(688)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(949)+TMP_JAMP(1387)+( - $ -1.000000000000000D+00)*AMP(971)+TMP_JAMP(2125)+TMP_JAMP(2127) + $ -1.000000000000000D+00)*AMP(969)+TMP_JAMP(2125)+TMP_JAMP(2127) $ +(-1.000000000000000D+00)*TMP_JAMP(2481)+TMP_JAMP(2497)+( $ -1.000000000000000D+00)*TMP_JAMP(2722)+(-1.000000000000000D+00) $ *TMP_JAMP(2897)+(-1.000000000000000D+00)*TMP_JAMP(2996) - JAMP(44,1) = TMP_JAMP(1384)+(-1.000000000000000D+00)*AMP(962)+( + JAMP(44,1) = TMP_JAMP(1384)+(-1.000000000000000D+00)*AMP(960)+( $ -1.000000000000000D+00)*TMP_JAMP(2126)+(-1.000000000000000D+00) $ *TMP_JAMP(2127)+(-1.000000000000000D+00)*TMP_JAMP(2535) $ +TMP_JAMP(2556)+(-1.000000000000000D+00)*TMP_JAMP(2730)+( @@ -18107,7 +18107,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, 
IVEC) JAMP(45,1) = ((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(728)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(874)+TMP_JAMP(1382)+(-1.000000000000000D+00) - $ *TMP_JAMP(1387)+(-1.000000000000000D+00)*AMP(969)+TMP_JAMP(1824) + $ *TMP_JAMP(1387)+(-1.000000000000000D+00)*AMP(967)+TMP_JAMP(1824) $ +(-1.000000000000000D+00)*TMP_JAMP(2088)+((0.000000000000000D $ +00,1.000000000000000D+00))*TMP_JAMP(2105)+(-1.000000000000000D $ +00)*TMP_JAMP(2327)+(-1.000000000000000D+00)*TMP_JAMP(2608) @@ -18127,7 +18127,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) JAMP(47,1) = TMP_JAMP(1129)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(1158)+((0.000000000000000D+00 $ ,-1.000000000000000D+00))*TMP_JAMP(1303)+(-1.000000000000000D - $ +00)*TMP_JAMP(1384)+(-1.000000000000000D+00)*AMP(960) + $ +00)*TMP_JAMP(1384)+(-1.000000000000000D+00)*AMP(958) $ +TMP_JAMP(1563)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(2086)+(-1.000000000000000D+00)*TMP_JAMP(2089)+( $ -1.000000000000000D+00)*TMP_JAMP(2364)+TMP_JAMP(2466)+( @@ -18146,21 +18146,21 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(1900)+TMP_JAMP(1972)+TMP_JAMP(2677)+( $ -1.000000000000000D+00)*TMP_JAMP(2897)+TMP_JAMP(2954) JAMP(49,1) = ((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1393)+(-1.000000000000000D+00)*AMP(1405) + $ *TMP_JAMP(1393)+(-1.000000000000000D+00)*AMP(1403) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1746) $ +TMP_JAMP(1892)+(-1.000000000000000D+00)*TMP_JAMP(1939) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2136) $ +TMP_JAMP(2579)+TMP_JAMP(2630)+(-1.000000000000000D+00) $ *TMP_JAMP(2836)+TMP_JAMP(2837)+TMP_JAMP(2860)+TMP_JAMP(2990) JAMP(50,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1405)+(-1.000000000000000D+00)*AMP(1399)+( + $ *TMP_JAMP(1405)+(-1.000000000000000D+00)*AMP(1397)+( $ -1.000000000000000D+00)*TMP_JAMP(1892)+TMP_JAMP(1938) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1977) $ +TMP_JAMP(2026)+(-1.000000000000000D+00)*TMP_JAMP(2620) $ +TMP_JAMP(2731)+TMP_JAMP(2783)+TMP_JAMP(2938)+TMP_JAMP(2986) JAMP(51,1) = ((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(1394)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1397)+(-1.000000000000000D+00)*AMP(1404) + $ *TMP_JAMP(1397)+(-1.000000000000000D+00)*AMP(1402) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1737) $ +TMP_JAMP(1891)+TMP_JAMP(1937)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(2136)+TMP_JAMP(2575) @@ -18168,11 +18168,11 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ -1.000000000000000D+00)*TMP_JAMP(2895) JAMP(52,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(1176)+TMP_JAMP(1385)+(-1.000000000000000D+00) - $ *AMP(1020)+(-1.000000000000000D+00)*TMP_JAMP(1619)+( + $ *AMP(1018)+(-1.000000000000000D+00)*TMP_JAMP(1619)+( $ -1.000000000000000D+00)*TMP_JAMP(1891)+TMP_JAMP(2145)+( $ -1.000000000000000D+00)*TMP_JAMP(2531)+(-1.000000000000000D+00) $ *TMP_JAMP(2853)+TMP_JAMP(2938)+TMP_JAMP(2988)+TMP_JAMP(3009) - JAMP(53,1) = TMP_JAMP(1415)+(-1.000000000000000D+00)*AMP(1398) + JAMP(53,1) = TMP_JAMP(1415)+(-1.000000000000000D+00)*AMP(1396) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1744) $ +(-1.000000000000000D+00)*TMP_JAMP(1811)+TMP_JAMP(1890) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1977) @@ -18184,7 +18184,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ 
*TMP_JAMP(721)+(-1.000000000000000D+00)*TMP_JAMP(1263) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1295) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1375) - $ +(-1.000000000000000D+00)*AMP(1019)+(-1.000000000000000D+00) + $ +(-1.000000000000000D+00)*AMP(1017)+(-1.000000000000000D+00) $ *TMP_JAMP(1655)+(-1.000000000000000D+00)*TMP_JAMP(1890) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1986) $ +(-1.000000000000000D+00)*TMP_JAMP(2145)+TMP_JAMP(2492) @@ -18194,7 +18194,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) JAMP(55,1) = ((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(1063)+TMP_JAMP(1141)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(1177)+(-1.000000000000000D+00) - $ *AMP(1408)+(-1.000000000000000D+00)*TMP_JAMP(1894)+( + $ *AMP(1406)+(-1.000000000000000D+00)*TMP_JAMP(1894)+( $ -1.000000000000000D+00)*TMP_JAMP(2075)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(2108)+(-1.000000000000000D+00) $ *TMP_JAMP(2578)+TMP_JAMP(2821)+(-1.000000000000000D+00) @@ -18203,7 +18203,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) JAMP(56,1) = TMP_JAMP(647)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(1168)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(1205)+(-1.000000000000000D+00) - $ *AMP(1402)+TMP_JAMP(2047)+((0.000000000000000D+00, + $ *AMP(1400)+TMP_JAMP(2047)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(2108)+(-1.000000000000000D+00) $ *TMP_JAMP(2452)+TMP_JAMP(2814)+(-1.000000000000000D+00) $ *TMP_JAMP(2940)+(-1.000000000000000D+00)*TMP_JAMP(2957)+( @@ -18213,7 +18213,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(1172)+TMP_JAMP(1257)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(1301)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(1340)+(-1.000000000000000D+00) - $ *AMP(1406)+TMP_JAMP(1677)+((0.000000000000000D+00 + $ *AMP(1404)+TMP_JAMP(1677)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(2142)+(-1.000000000000000D+00) $ *TMP_JAMP(2820)+TMP_JAMP(2832)+(-1.000000000000000D+00) $ *TMP_JAMP(2909)+((0.000000000000000D+00,-1.000000000000000D+00)) @@ -18233,7 +18233,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(893) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1169) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1209) - $ +TMP_JAMP(1377)+(-1.000000000000000D+00)*AMP(1400) + $ +TMP_JAMP(1377)+(-1.000000000000000D+00)*AMP(1398) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1776) $ +(-1.000000000000000D+00)*TMP_JAMP(2149)+TMP_JAMP(2729)+( $ -1.000000000000000D+00)*TMP_JAMP(2819)+(-1.000000000000000D+00) @@ -18251,7 +18251,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ -1.000000000000000D+00)*TMP_JAMP(2879)+(-1.000000000000000D+00) $ *TMP_JAMP(2983) JAMP(61,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1394)+(-1.000000000000000D+00)*AMP(1409) + $ *TMP_JAMP(1394)+(-1.000000000000000D+00)*AMP(1407) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2106) $ +(-1.000000000000000D+00)*TMP_JAMP(2319)+(-1.000000000000000D $ +00)*TMP_JAMP(2805)+(-1.000000000000000D+00)*TMP_JAMP(2881) @@ -18261,14 +18261,14 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ ,1.000000000000000D+00))*TMP_JAMP(1231)+((0.000000000000000D+00 $ ,-1.000000000000000D+00))*TMP_JAMP(1288)+((0.000000000000000D $ 
+00,1.000000000000000D+00))*TMP_JAMP(1342)+(-1.000000000000000D - $ +00)*AMP(1022)+((0.000000000000000D+00,1.000000000000000D+00)) + $ +00)*AMP(1020)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(2106)+(-1.000000000000000D+00)*TMP_JAMP(2146)+( $ -1.000000000000000D+00)*TMP_JAMP(2271)+TMP_JAMP(2363) $ +TMP_JAMP(2437)+TMP_JAMP(2562)+(-1.000000000000000D+00) $ *TMP_JAMP(2745)+(-1.000000000000000D+00)*TMP_JAMP(2988)+( $ -1.000000000000000D+00)*TMP_JAMP(3022) JAMP(63,1) = (-1.000000000000000D+00)*TMP_JAMP(1380)+( - $ -1.000000000000000D+00)*AMP(1407)+TMP_JAMP(1952) + $ -1.000000000000000D+00)*AMP(1405)+TMP_JAMP(1952) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2142) $ +(-1.000000000000000D+00)*TMP_JAMP(2341)+TMP_JAMP(2452)+( $ -1.000000000000000D+00)*TMP_JAMP(2687)+(-1.000000000000000D+00) @@ -18278,7 +18278,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(311) $ +(-1.000000000000000D+00)*TMP_JAMP(421)+((0.000000000000000D+00 $ ,-1.000000000000000D+00))*TMP_JAMP(501)+TMP_JAMP(1380)+( - $ -1.000000000000000D+00)*AMP(947)+((0.000000000000000D+00, + $ -1.000000000000000D+00)*AMP(945)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(1544)+TMP_JAMP(1683) $ +TMP_JAMP(1801)+(-1.000000000000000D+00)*TMP_JAMP(2450) $ +TMP_JAMP(2586)+TMP_JAMP(2720)+TMP_JAMP(2869) @@ -18287,7 +18287,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) JAMP(65,1) = TMP_JAMP(579)+(-1.000000000000000D+00) $ *TMP_JAMP(1008)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(1049)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1218)+(-1.000000000000000D+00)*AMP(1017) + $ *TMP_JAMP(1218)+(-1.000000000000000D+00)*AMP(1015) $ +TMP_JAMP(1611)+TMP_JAMP(1862)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(1901)+TMP_JAMP(2273)+( $ -1.000000000000000D+00)*TMP_JAMP(2441)+TMP_JAMP(3022) @@ -18304,7 +18304,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ -1.000000000000000D+00)*TMP_JAMP(2584)+TMP_JAMP(2887)+( $ -1.000000000000000D+00)*TMP_JAMP(2914)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(2975) - JAMP(67,1) = (-1.000000000000000D+00)*AMP(1403)+( + JAMP(67,1) = (-1.000000000000000D+00)*AMP(1401)+( $ -1.000000000000000D+00)*TMP_JAMP(1626)+(-1.000000000000000D+00) $ *TMP_JAMP(2144)+(-1.000000000000000D+00)*TMP_JAMP(2452)+( $ -1.000000000000000D+00)*TMP_JAMP(2678)+TMP_JAMP(2768) @@ -18314,13 +18314,13 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(1055)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(1058)+TMP_JAMP(1275)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(1342)+(-1.000000000000000D+00) - $ *AMP(1021)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *AMP(1019)+((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(2116)+TMP_JAMP(2144)+TMP_JAMP(2297)+( $ -1.000000000000000D+00)*TMP_JAMP(2341)+TMP_JAMP(2426)+( $ -1.000000000000000D+00)*TMP_JAMP(2486)+TMP_JAMP(2794)+( $ -1.000000000000000D+00)*TMP_JAMP(2999)+TMP_JAMP(3016) JAMP(69,1) = (-1.000000000000000D+00)*TMP_JAMP(1413)+( - $ -1.000000000000000D+00)*AMP(1401)+TMP_JAMP(2042)+TMP_JAMP(2149) + $ -1.000000000000000D+00)*AMP(1399)+TMP_JAMP(2042)+TMP_JAMP(2149) $ +TMP_JAMP(2578)+TMP_JAMP(2679)+TMP_JAMP(2731)+( $ -1.000000000000000D+00)*TMP_JAMP(2800)+(-1.000000000000000D+00) $ *TMP_JAMP(2883)+TMP_JAMP(3004) @@ -18337,7 +18337,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ 
-1.000000000000000D+00)*TMP_JAMP(2961) JAMP(71,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(1176)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1296)+(-1.000000000000000D+00)*AMP(1018) + $ *TMP_JAMP(1296)+(-1.000000000000000D+00)*AMP(1016) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2091) $ +TMP_JAMP(2343)+(-1.000000000000000D+00)*TMP_JAMP(2800)+( $ -1.000000000000000D+00)*TMP_JAMP(2945)+(-1.000000000000000D+00) @@ -18359,11 +18359,11 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ ,1.000000000000000D+00))*TMP_JAMP(1761)+((0.000000000000000D+00 $ ,-1.000000000000000D+00))*TMP_JAMP(1764)+TMP_JAMP(1895)+( $ -1.000000000000000D+00)*TMP_JAMP(1932)+(-1.000000000000000D+00) - $ *AMP(1428)+TMP_JAMP(2569)+(-1.000000000000000D+00) + $ *AMP(1424)+TMP_JAMP(2569)+(-1.000000000000000D+00) $ *TMP_JAMP(2652)+TMP_JAMP(2683)+TMP_JAMP(2786)+TMP_JAMP(2796) $ +TMP_JAMP(2902) JAMP(74,1) = TMP_JAMP(2027)+TMP_JAMP(2042)+(-1.000000000000000D - $ +00)*AMP(1422)+TMP_JAMP(2383)+TMP_JAMP(2580)+( + $ +00)*AMP(1418)+TMP_JAMP(2383)+TMP_JAMP(2580)+( $ -1.000000000000000D+00)*TMP_JAMP(2683)+TMP_JAMP(2735)+( $ -1.000000000000000D+00)*TMP_JAMP(2798)+(-1.000000000000000D+00) $ *TMP_JAMP(2932)+TMP_JAMP(2942)+TMP_JAMP(3008) @@ -18372,14 +18372,14 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(1383)+(-1.000000000000000D+00)*TMP_JAMP(1386) $ +TMP_JAMP(1860)+(-1.000000000000000D+00)*TMP_JAMP(1863)+( $ -1.000000000000000D+00)*TMP_JAMP(1895)+TMP_JAMP(1899)+( - $ -1.000000000000000D+00)*AMP(1427)+TMP_JAMP(2627)+TMP_JAMP(2780) + $ -1.000000000000000D+00)*AMP(1423)+TMP_JAMP(2627)+TMP_JAMP(2780) $ +(-1.000000000000000D+00)*TMP_JAMP(2895)+(-1.000000000000000D $ +00)*TMP_JAMP(2936) JAMP(76,1) = (-1.000000000000000D+00)*TMP_JAMP(1038)+( $ -1.000000000000000D+00)*TMP_JAMP(1107)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(1185)+((0.000000000000000D+00 $ ,-1.000000000000000D+00))*TMP_JAMP(1203)+(-1.000000000000000D - $ +00)*AMP(1029)+(-1.000000000000000D+00)*TMP_JAMP(1899) + $ +00)*AMP(1027)+(-1.000000000000000D+00)*TMP_JAMP(1899) $ +TMP_JAMP(2043)+(-1.000000000000000D+00)*TMP_JAMP(2095)+( $ -1.000000000000000D+00)*TMP_JAMP(2328)+TMP_JAMP(2458)+( $ -1.000000000000000D+00)*TMP_JAMP(2611)+TMP_JAMP(2649)+( @@ -18388,13 +18388,13 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ +TMP_JAMP(3009) JAMP(77,1) = (-1.000000000000000D+00)*TMP_JAMP(800) $ +TMP_JAMP(1631)+(-1.000000000000000D+00)*TMP_JAMP(1812) - $ +TMP_JAMP(1898)+(-1.000000000000000D+00)*AMP(1421)+( + $ +TMP_JAMP(1898)+(-1.000000000000000D+00)*AMP(1417)+( $ -1.000000000000000D+00)*TMP_JAMP(2332)+TMP_JAMP(2537) $ +TMP_JAMP(2932)+(-1.000000000000000D+00)*TMP_JAMP(2936)+( $ -1.000000000000000D+00)*TMP_JAMP(2972)+TMP_JAMP(3023) JAMP(78,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(1216)+(-1.000000000000000D+00)*TMP_JAMP(1264)+( - $ -1.000000000000000D+00)*AMP(1028)+(-1.000000000000000D+00) + $ -1.000000000000000D+00)*AMP(1026)+(-1.000000000000000D+00) $ *TMP_JAMP(1494)+(-1.000000000000000D+00)*TMP_JAMP(1633) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1764) $ +(-1.000000000000000D+00)*TMP_JAMP(1898)+TMP_JAMP(2095)+( @@ -18408,7 +18408,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(1200)+TMP_JAMP(1626)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(1849)+(-1.000000000000000D+00) $ *TMP_JAMP(1883)+(-1.000000000000000D+00)*TMP_JAMP(2036)+( - $ 
-1.000000000000000D+00)*AMP(1431)+TMP_JAMP(2489)+( + $ -1.000000000000000D+00)*AMP(1427)+TMP_JAMP(2489)+( $ -1.000000000000000D+00)*TMP_JAMP(2505)+(-1.000000000000000D+00) $ *TMP_JAMP(2570)+(-1.000000000000000D+00)*TMP_JAMP(2630) $ +TMP_JAMP(2645)+TMP_JAMP(2686)+(-1.000000000000000D+00) @@ -18417,7 +18417,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ ,1.000000000000000D+00))*TMP_JAMP(1207)+((0.000000000000000D+00 $ ,-1.000000000000000D+00))*TMP_JAMP(1291)+TMP_JAMP(2037) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2138) - $ +(-1.000000000000000D+00)*AMP(1425)+(-1.000000000000000D+00) + $ +(-1.000000000000000D+00)*AMP(1421)+(-1.000000000000000D+00) $ *TMP_JAMP(2250)+(-1.000000000000000D+00)*TMP_JAMP(2381)+( $ -1.000000000000000D+00)*TMP_JAMP(2686)+(-1.000000000000000D+00) $ *TMP_JAMP(2699)+TMP_JAMP(2905)+TMP_JAMP(2987)+( @@ -18428,7 +18428,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ ,1.000000000000000D+00))*TMP_JAMP(1349)+((0.000000000000000D+00 $ ,-1.000000000000000D+00))*TMP_JAMP(1987)+TMP_JAMP(2020)+( $ -1.000000000000000D+00)*TMP_JAMP(2141)+(-1.000000000000000D+00) - $ *AMP(1429)+(-1.000000000000000D+00)*TMP_JAMP(2773) + $ *AMP(1425)+(-1.000000000000000D+00)*TMP_JAMP(2773) $ +TMP_JAMP(2864)+(-1.000000000000000D+00)*TMP_JAMP(2909) $ +TMP_JAMP(3011) JAMP(82,1) = (-1.000000000000000D+00)*AMP(404) @@ -18448,7 +18448,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(1212)+TMP_JAMP(1268)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(1868)+((0.000000000000000D+00 $ ,-1.000000000000000D+00))*TMP_JAMP(2011)+(-1.000000000000000D - $ +00)*AMP(1423)+TMP_JAMP(2451)+TMP_JAMP(2699)+( + $ +00)*AMP(1419)+TMP_JAMP(2451)+TMP_JAMP(2699)+( $ -1.000000000000000D+00)*TMP_JAMP(2772)+TMP_JAMP(2917)+( $ -1.000000000000000D+00)*TMP_JAMP(2939)+(-1.000000000000000D+00) $ *TMP_JAMP(2965) @@ -18465,7 +18465,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ -1.000000000000000D+00)*TMP_JAMP(2761)+(-1.000000000000000D+00) $ *TMP_JAMP(2880)+(-1.000000000000000D+00)*TMP_JAMP(2922) $ +TMP_JAMP(2965) - JAMP(85,1) = TMP_JAMP(1386)+(-1.000000000000000D+00)*AMP(1432)+( + JAMP(85,1) = TMP_JAMP(1386)+(-1.000000000000000D+00)*AMP(1428)+( $ -1.000000000000000D+00)*TMP_JAMP(2372)+TMP_JAMP(2387) $ +TMP_JAMP(2393)+TMP_JAMP(2427)+(-1.000000000000000D+00) $ *TMP_JAMP(2467)+(-1.000000000000000D+00)*TMP_JAMP(2505)+( @@ -18478,14 +18478,14 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(983)+TMP_JAMP(1107)+TMP_JAMP(1127) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1204) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1290) - $ +(-1.000000000000000D+00)*AMP(1031)+TMP_JAMP(2146)+( + $ +(-1.000000000000000D+00)*AMP(1029)+TMP_JAMP(2146)+( $ -1.000000000000000D+00)*TMP_JAMP(2480)+TMP_JAMP(2499)+( $ -1.000000000000000D+00)*TMP_JAMP(2721)+(-1.000000000000000D+00) $ *TMP_JAMP(2896)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(2977)+(-1.000000000000000D+00)*TMP_JAMP(2995) JAMP(87,1) = (-1.000000000000000D+00)*TMP_JAMP(1379)+( $ -1.000000000000000D+00)*TMP_JAMP(1953)+TMP_JAMP(2141)+( - $ -1.000000000000000D+00)*AMP(1430)+TMP_JAMP(2247)+TMP_JAMP(2403) + $ -1.000000000000000D+00)*AMP(1426)+TMP_JAMP(2247)+TMP_JAMP(2403) $ +TMP_JAMP(2882)+TMP_JAMP(2902)+(-1.000000000000000D+00) $ *TMP_JAMP(2929)+TMP_JAMP(3005) JAMP(88,1) = (-1.000000000000000D+00)*AMP(405)+( @@ -18504,7 +18504,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ 
,1.000000000000000D+00))*TMP_JAMP(476)+TMP_JAMP(1007) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1052) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1219) - $ +(-1.000000000000000D+00)*AMP(1026)+TMP_JAMP(1696)+( + $ +(-1.000000000000000D+00)*AMP(1024)+TMP_JAMP(1696)+( $ -1.000000000000000D+00)*TMP_JAMP(1722)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(1858)+((0.000000000000000D+00 $ ,-1.000000000000000D+00))*TMP_JAMP(1901)+(-1.000000000000000D @@ -18526,22 +18526,22 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(3003) JAMP(91,1) = TMP_JAMP(647)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(2113)+(-1.000000000000000D+00) - $ *AMP(1426)+TMP_JAMP(2369)+TMP_JAMP(2502)+(-1.000000000000000D + $ *AMP(1422)+TMP_JAMP(2369)+TMP_JAMP(2502)+(-1.000000000000000D $ +00)*TMP_JAMP(2941)+(-1.000000000000000D+00)*TMP_JAMP(3023)+( $ -1.000000000000000D+00)*TMP_JAMP(3024) JAMP(92,1) = ((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(985)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(1204)+TMP_JAMP(1261)+TMP_JAMP(1280) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1350) - $ +(-1.000000000000000D+00)*AMP(1030)+((0.000000000000000D+00 + $ +(-1.000000000000000D+00)*AMP(1028)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(2113)+(-1.000000000000000D+00) $ *TMP_JAMP(2143)+TMP_JAMP(2334)+(-1.000000000000000D+00) $ *TMP_JAMP(2545)+TMP_JAMP(2714)+(-1.000000000000000D+00) $ *TMP_JAMP(2762)+TMP_JAMP(2857)+(-1.000000000000000D+00) $ *TMP_JAMP(3002) JAMP(93,1) = ((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1769)+(-1.000000000000000D+00)*AMP(1424)+( - $ -1.000000000000000D+00)*AMP(1893)+TMP_JAMP(2465)+TMP_JAMP(2476) + $ *TMP_JAMP(1769)+(-1.000000000000000D+00)*AMP(1420)+( + $ -1.000000000000000D+00)*AMP(1889)+TMP_JAMP(2465)+TMP_JAMP(2476) $ +(-1.000000000000000D+00)*TMP_JAMP(2625)+(-1.000000000000000D $ +00)*TMP_JAMP(2917)+TMP_JAMP(2928)+(-1.000000000000000D+00) $ *TMP_JAMP(2931)+TMP_JAMP(2950)+TMP_JAMP(3024) @@ -18558,7 +18558,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ +00))*TMP_JAMP(237)+(-1.000000000000000D+00)*TMP_JAMP(1043) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1250) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1350) - $ +(-1.000000000000000D+00)*AMP(1027)+TMP_JAMP(2135) + $ +(-1.000000000000000D+00)*AMP(1025)+TMP_JAMP(2135) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2148) $ +(-1.000000000000000D+00)*TMP_JAMP(2355)+(-1.000000000000000D $ +00)*TMP_JAMP(2381)+TMP_JAMP(2757)+TMP_JAMP(2779)+( @@ -18578,13 +18578,13 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(1399)+(-1.000000000000000D+00)*TMP_JAMP(1953)+( $ -1.000000000000000D+00)*TMP_JAMP(2025)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(2121)+(-1.000000000000000D+00) - $ *AMP(1449)+TMP_JAMP(2234)+TMP_JAMP(2634)+(-1.000000000000000D + $ *AMP(1445)+TMP_JAMP(2234)+TMP_JAMP(2634)+(-1.000000000000000D $ +00)*TMP_JAMP(2671)+TMP_JAMP(2689)+TMP_JAMP(2727)+TMP_JAMP(2866) $ +TMP_JAMP(3012) JAMP(98,1) = ((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(1401)+TMP_JAMP(1952)+(-1.000000000000000D+00) $ *TMP_JAMP(2022)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2118)+(-1.000000000000000D+00)*AMP(1443) + $ *TMP_JAMP(2118)+(-1.000000000000000D+00)*AMP(1439) $ +TMP_JAMP(2390)+(-1.000000000000000D+00)*TMP_JAMP(2408) $ +TMP_JAMP(2456)+(-1.000000000000000D+00)*TMP_JAMP(2689) $ 
+TMP_JAMP(2841)+TMP_JAMP(2908)+(-1.000000000000000D+00) @@ -18593,13 +18593,13 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(1018)+TMP_JAMP(1376)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(1378)+TMP_JAMP(1913) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2121) - $ +TMP_JAMP(2124)+(-1.000000000000000D+00)*AMP(1448)+( + $ +TMP_JAMP(2124)+(-1.000000000000000D+00)*AMP(1444)+( $ -1.000000000000000D+00)*TMP_JAMP(2490)+(-1.000000000000000D+00) $ *TMP_JAMP(2638)+TMP_JAMP(2765)+(-1.000000000000000D+00) $ *TMP_JAMP(2843)+TMP_JAMP(2901) JAMP(100,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(746)+(-1.000000000000000D+00)*TMP_JAMP(1278)+( - $ -1.000000000000000D+00)*AMP(1038)+(-1.000000000000000D+00) + $ -1.000000000000000D+00)*AMP(1036)+(-1.000000000000000D+00) $ *TMP_JAMP(1913)+((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(2012)+((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(2119)+(-1.000000000000000D+00)*TMP_JAMP(2499) @@ -18608,13 +18608,13 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ -1.000000000000000D+00)*TMP_JAMP(2952)+TMP_JAMP(3020) JAMP(101,1) = TMP_JAMP(1910)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(2118)+TMP_JAMP(2124)+( - $ -1.000000000000000D+00)*AMP(1442)+AMP(1813)+TMP_JAMP(2342)+( + $ -1.000000000000000D+00)*AMP(1438)+AMP(1809)+TMP_JAMP(2342)+( $ -1.000000000000000D+00)*TMP_JAMP(2549)+(-1.000000000000000D+00) $ *TMP_JAMP(2842)+(-1.000000000000000D+00)*TMP_JAMP(2867) $ +TMP_JAMP(2984)+TMP_JAMP(3014) JAMP(102,1) = (-1.000000000000000D+00)*TMP_JAMP(1030) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1404) - $ +(-1.000000000000000D+00)*AMP(1037)+(-1.000000000000000D+00) + $ +(-1.000000000000000D+00)*AMP(1035)+(-1.000000000000000D+00) $ *TMP_JAMP(1809)+(-1.000000000000000D+00)*TMP_JAMP(1910) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2018) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2119) @@ -18624,7 +18624,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) JAMP(103,1) = ((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(1252)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(1388)+(-1.000000000000000D+00)*TMP_JAMP(2125)+( - $ -1.000000000000000D+00)*AMP(1452)+TMP_JAMP(2430)+( + $ -1.000000000000000D+00)*AMP(1448)+TMP_JAMP(2430)+( $ -1.000000000000000D+00)*TMP_JAMP(2447)+(-1.000000000000000D+00) $ *TMP_JAMP(2478)+(-1.000000000000000D+00)*TMP_JAMP(2633) $ +TMP_JAMP(2664)+(-1.000000000000000D+00)*TMP_JAMP(2848) @@ -18634,7 +18634,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ ,1.000000000000000D+00))*TMP_JAMP(845)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(962)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(1228)+TMP_JAMP(2126)+( - $ -1.000000000000000D+00)*AMP(1446)+(-1.000000000000000D+00) + $ -1.000000000000000D+00)*AMP(1442)+(-1.000000000000000D+00) $ *TMP_JAMP(2440)+(-1.000000000000000D+00)*TMP_JAMP(2457)+( $ -1.000000000000000D+00)*TMP_JAMP(2580)+TMP_JAMP(2739)+( $ -1.000000000000000D+00)*TMP_JAMP(2830)+(-1.000000000000000D+00) @@ -18644,7 +18644,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(989)+((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(1388)+(-1.000000000000000D+00)*TMP_JAMP(1670) $ +TMP_JAMP(2088)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2117)+(-1.000000000000000D+00)*AMP(1450) + $ *TMP_JAMP(2117)+(-1.000000000000000D+00)*AMP(1446) $ 
+TMP_JAMP(2901)+(-1.000000000000000D+00)*TMP_JAMP(2937)+( $ -1.000000000000000D+00)*TMP_JAMP(2944)+(-1.000000000000000D+00) $ *TMP_JAMP(3026) @@ -18666,7 +18666,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1304) $ +(-1.000000000000000D+00)*TMP_JAMP(1914)+TMP_JAMP(2089) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2137) - $ +(-1.000000000000000D+00)*AMP(1444)+TMP_JAMP(2576) + $ +(-1.000000000000000D+00)*AMP(1440)+TMP_JAMP(2576) $ +TMP_JAMP(2828)+(-1.000000000000000D+00)*TMP_JAMP(2939)+( $ -1.000000000000000D+00)*TMP_JAMP(3026) JAMP(108,1) = (-1.000000000000000D+00)*AMP(411) @@ -18674,7 +18674,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(301) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(334) $ +(-1.000000000000000D+00)*TMP_JAMP(437)+TMP_JAMP(440) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*AMP(596)+( + $ +((0.000000000000000D+00,-1.000000000000000D+00))*AMP(594)+( $ -1.000000000000000D+00)*TMP_JAMP(781)+(-1.000000000000000D+00) $ *TMP_JAMP(817)+TMP_JAMP(846)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(977)+((0.000000000000000D+00, @@ -18689,7 +18689,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1378) $ +(-1.000000000000000D+00)*TMP_JAMP(1884)+(-1.000000000000000D $ +00)*TMP_JAMP(2039)+((0.000000000000000D+00,-1.000000000000000D - $ +00))*TMP_JAMP(2068)+(-1.000000000000000D+00)*AMP(1453)+( + $ +00))*TMP_JAMP(2068)+(-1.000000000000000D+00)*AMP(1449)+( $ -1.000000000000000D+00)*TMP_JAMP(2357)+TMP_JAMP(2523)+( $ -1.000000000000000D+00)*TMP_JAMP(2573)+TMP_JAMP(2678)+( $ -1.000000000000000D+00)*TMP_JAMP(2766)+TMP_JAMP(2775)+( @@ -18697,7 +18697,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) JAMP(110,1) = ((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(990)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(1248)+TMP_JAMP(1277)+(-1.000000000000000D+00) - $ *AMP(1040)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *AMP(1038)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(1852)+TMP_JAMP(1884)+TMP_JAMP(2040) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2116) $ +(-1.000000000000000D+00)*TMP_JAMP(2338)+(-1.000000000000000D @@ -18705,7 +18705,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ +TMP_JAMP(3015)+(-1.000000000000000D+00)*TMP_JAMP(3020) JAMP(111,1) = TMP_JAMP(1516)+(-1.000000000000000D+00) $ *TMP_JAMP(1932)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2117)+(-1.000000000000000D+00)*AMP(1451)+( + $ *TMP_JAMP(2117)+(-1.000000000000000D+00)*AMP(1447)+( $ -1.000000000000000D+00)*TMP_JAMP(2371)+TMP_JAMP(2519) $ +TMP_JAMP(2572)+(-1.000000000000000D+00)*TMP_JAMP(2679) $ +TMP_JAMP(2695)+TMP_JAMP(2787)+((0.000000000000000D+00 @@ -18724,7 +18724,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(78)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(321)+((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(739)+(-1.000000000000000D+00)*TMP_JAMP(1272)+( - $ -1.000000000000000D+00)*AMP(1035)+(-1.000000000000000D+00) + $ -1.000000000000000D+00)*AMP(1033)+(-1.000000000000000D+00) $ *TMP_JAMP(1810)+((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(2091)+TMP_JAMP(2803)+(-1.000000000000000D+00) $ *TMP_JAMP(2933)+TMP_JAMP(2991)+(-1.000000000000000D+00) @@ -18745,15 
+18745,15 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(2915)+(-1.000000000000000D+00)*TMP_JAMP(2991) JAMP(115,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(589)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2122)+(-1.000000000000000D+00)*AMP(1447)+( + $ *TMP_JAMP(2122)+(-1.000000000000000D+00)*AMP(1443)+( $ -1.000000000000000D+00)*TMP_JAMP(2373)+TMP_JAMP(2550)+( $ -1.000000000000000D+00)*TMP_JAMP(2574)+(-1.000000000000000D+00) $ *TMP_JAMP(2582)+(-1.000000000000000D+00)*TMP_JAMP(2626) $ +TMP_JAMP(2629)+TMP_JAMP(2941)+(-1.000000000000000D+00) $ *TMP_JAMP(3014) - JAMP(116,1) = TMP_JAMP(1279)+(-1.000000000000000D+00)*AMP(1039) + JAMP(116,1) = TMP_JAMP(1279)+(-1.000000000000000D+00)*AMP(1037) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2122) - $ +TMP_JAMP(2143)+AMP(1669)+(-1.000000000000000D+00) + $ +TMP_JAMP(2143)+AMP(1665)+(-1.000000000000000D+00) $ *TMP_JAMP(2371)+(-1.000000000000000D+00)*TMP_JAMP(2619)+( $ -1.000000000000000D+00)*TMP_JAMP(2823)+TMP_JAMP(2853)+( $ -1.000000000000000D+00)*TMP_JAMP(2989)+(-1.000000000000000D+00) @@ -18761,7 +18761,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) JAMP(117,1) = ((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(589)+(-1.000000000000000D+00)*TMP_JAMP(1658) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2137) - $ +(-1.000000000000000D+00)*AMP(1445)+AMP(1519)+TMP_JAMP(2596) + $ +(-1.000000000000000D+00)*AMP(1441)+AMP(1515)+TMP_JAMP(2596) $ +TMP_JAMP(2624)+TMP_JAMP(2633)+TMP_JAMP(2884)+TMP_JAMP(2908)+( $ -1.000000000000000D+00)*TMP_JAMP(2928)+TMP_JAMP(2959) JAMP(118,1) = ((0.000000000000000D+00,1.000000000000000D+00)) @@ -18777,7 +18777,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(2858)+TMP_JAMP(2918)+(-1.000000000000000D+00) $ *TMP_JAMP(2959) JAMP(119,1) = (-1.000000000000000D+00)*TMP_JAMP(1041)+( - $ -1.000000000000000D+00)*AMP(1036)+TMP_JAMP(1608) + $ -1.000000000000000D+00)*AMP(1034)+TMP_JAMP(1608) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2148) $ +(-1.000000000000000D+00)*TMP_JAMP(2614)+TMP_JAMP(2635) $ +TMP_JAMP(2933)+TMP_JAMP(2992)+TMP_JAMP(3019) @@ -18790,7 +18790,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ +TMP_JAMP(531)+(-1.000000000000000D+00)*TMP_JAMP(1418)+( $ -1.000000000000000D+00)*TMP_JAMP(1673)+TMP_JAMP(1724) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1797) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*AMP(1462) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*AMP(1458) $ +TMP_JAMP(2619)+(-1.000000000000000D+00)*TMP_JAMP(2634) $ +TMP_JAMP(2670)+(-1.000000000000000D+00)*TMP_JAMP(2916)+( $ -1.000000000000000D+00)*TMP_JAMP(2992) diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/genps.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/genps.f index fe9c61504b..c00e33d954 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/genps.f @@ -1877,12 +1877,12 @@ double precision function get_channel_cut(p, config) d1 = iforest(1, -i, config) d2 = iforest(2, -i, config) do j=0,3 - if (d1.gt.0.and.d1.le.2) then + if (d1.gt.0.and.d1.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d1) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d1) endif - if (d2.gt.0.and.d2.le.2) then + if (d2.gt.0.and.d2.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d2) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d2) diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/__init__.py 
b/epochX/cudacpp/gg_ttggg.mad/bin/internal/__init__.py index 16e60d8182..0d17042f0d 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/__init__.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/__init__.py @@ -26,6 +26,7 @@ class aMCatNLOError(MadGraph5Error): import os import logging import time +pjoin = os.path.join #Look for basic file position MG5DIR and MG4DIR MG5DIR = os.path.realpath(os.path.join(os.path.dirname(__file__), diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py index ef1bf58979..3995ce8109 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py @@ -2704,7 +2704,8 @@ def __new__(cls, finput=None, **opt): except Exception as error: import launch_plugin target_class = launch_plugin.RunCard - + elif issubclass(finput, RunCard): + target_class = finput else: return None @@ -2968,11 +2969,12 @@ def write(self, output_file, template=None, python_template=False, if python_template and not to_write: import string if self.blocks: - text = string.Template(text) mapping = {} for b in self.blocks: mapping[b.name] = b.get_template(self) - text = text.substitute(mapping) + if "$%s" % b.name not in text: + text += "\n$%s\n" % b.name + text = string.Template(text).substitute(mapping) if not self.list_parameter: text = text % self @@ -4875,6 +4877,9 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): continue break + if proc_characteristic['ninitial'] == 1: + self['SDE_strategy'] =1 + if 'MLM' in proc_characteristic['limitations']: if self['dynamical_scale_choice'] == -1: self['dynamical_scale_choice'] = 3 @@ -5940,7 +5945,7 @@ def default_setup(self): self.add_param("CheckCycle", 3) self.add_param("MaxAttempts", 10) self.add_param("ZeroThres", 1e-9) - self.add_param("OSThres", 1.0e-13) + self.add_param("OSThres", 1.0e-8) self.add_param("DoubleCheckHelicityFilter", True) self.add_param("WriteOutFilters", True) self.add_param("UseLoopFilter", False) diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/common_run_interface.py index 14c7f310dc..87cb4b88df 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/common_run_interface.py @@ -4906,6 +4906,7 @@ def __init__(self, question, cards=[], from_banner=None, banner=None, mode='auto self.load_default() self.define_paths(**opt) self.last_editline_pos = 0 + self.update_dependent_done = False if 'allow_arg' not in opt or not opt['allow_arg']: # add some mininal content for this: @@ -6585,7 +6586,9 @@ def postcmd(self, stop, line): self.check_card_consistency() if self.param_consistency: try: - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) + self.update_dependent_done = False except MadGraph5Error as error: if 'Missing block:' in str(error): self.fail_due_to_format +=1 @@ -6638,6 +6641,8 @@ def do_update(self, line, timer=0): self.update_dependent(self.mother_interface, self.me_dir, self.param_card, self.paths['param'], timer, run_card=self.run_card, lhapdfconfig=self.lhapdf) + self.update_dependent_done = True + elif args[0] == 'missing': self.update_missing() @@ -6717,12 +6722,13 @@ class TimeOutError(Exception): def handle_alarm(signum, frame): raise TimeOutError signal.signal(signal.SIGALRM, handle_alarm) + if timer: - signal.alarm(timer) log_level=30 else: log_level=20 + if 
run_card: as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, @@ -6781,6 +6787,10 @@ def handle_alarm(signum, frame): logger.log(log_level, "update the strong coupling value (alpha_s) to the value from the pdf selected: %s", as_for_pdf[pdlabel]) modify = True + if timer: + signal.alarm(timer) + + # Try to load the model in the limited amount of time allowed try: model = mecmd.get_model() @@ -6909,7 +6919,8 @@ def check_block(self, blockname): def check_answer_consistency(self): """function called if the code reads a file""" self.check_card_consistency() - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) def help_set(self): '''help message for set''' diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_ximprove.py index a88d60b282..5fd170d18d 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_ximprove.py @@ -157,12 +157,16 @@ def get_helicity(self, to_submit=True, clean=True): (stdout, _) = p.communicate(''.encode()) stdout = stdout.decode('ascii',errors='ignore') - try: + if stdout: nb_channel = max([math.floor(float(d)) for d in stdout.split()]) - except Exception as error: - misc.sprint(stdout, 'no channel or error for %s' % Pdir) - continue - + else: + for matrix_file in misc.glob('matrix*orig.f', Pdir): + files.cp(matrix_file, matrix_file.replace('orig','optim')) + P_zero_result.append(Pdir) + if os.path.exists(pjoin(self.me_dir, 'error')): + os.remove(pjoin(self.me_dir, 'error')) + continue # bypass bad process + self.cmd.compile(['madevent_forhel'], cwd=Pdir) if not os.path.exists(pjoin(Pdir, 'madevent_forhel')): raise Exception('Error make madevent_forhel not successful') @@ -183,11 +187,13 @@ def get_helicity(self, to_submit=True, clean=True): #sym_input = "%(points)d %(iterations)d %(accuracy)f \n" % self.opts (stdout, _) = p.communicate(" ".encode()) stdout = stdout.decode('ascii',errors='ignore') - if os.path.exists(pjoin(self.me_dir,'error')): + if os.path.exists(pjoin(self.me_dir, 'error')): raise Exception(pjoin(self.me_dir,'error')) # note a continue is not enough here, we have in top to link # the matrixX_optim.f to matrixX_orig.f to let the code to work # after this error. 
+ # for matrix_file in misc.glob('matrix*orig.f', Pdir): + # files.cp(matrix_file, matrix_file.replace('orig','optim')) if 'no events passed cuts' in stdout: raise Exception diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py index c9d1c7706a..0b849330ef 100644 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py @@ -57,7 +57,7 @@ def reset_simd(self, old_value, new_value, name): return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) - + def plugin_input(self, finput): return @@ -79,7 +79,7 @@ def check_validity(self): self['sde_strategy'] = 1 if self['hel_recycling']: self['hel_recycling'] = False - + class GPURunCard(CPPRunCard): def default_setup(self): super(CPPRunCard, self).default_setup() diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py index d722702891..853aabc98a 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py @@ -1,4 +1,4 @@ -################################################################################ +############################################################################### # # Copyright (c) 2011 The MadGraph5_aMC@NLO Development team and Contributors # @@ -3675,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_combine_iteration(self, line): + def do_comine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3703,7 +3703,7 @@ def do_combine_iteration(self, line): ############################################################################ def do_combine_events(self, line): """Advanced commands: Launch combine events""" - + start=time.time() args = self.split_arg(line) start = time.time() # Check argument's validity @@ -3798,9 +3798,7 @@ def do_combine_events(self, line): self.correct_bias() elif self.run_card['custom_fcts']: self.correct_bias() - - logger.info("combine events done in %s", time.time()-start) - + logger.info("combination of events done in %s s ", time.time()-start) self.to_store.append('event') @@ -7368,7 +7366,7 @@ def wait_monitoring(Idle, Running, Done): import optparse # Get the directory of the script real path (bin) # and add it to the current PYTHONPATH - root_path = os.path.dirname(os.path.dirname(os.path.realpath( __file__ ))) + #root_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath( __file__ )))) sys.path.insert(0, root_path) class MyOptParser(optparse.OptionParser): @@ -7411,7 +7409,13 @@ def error(self, msg=''): import logging.config # Set logging level according to the logging level given by options #logging.basicConfig(level=vars(logging)[options.logging]) + import internal import internal.coloring_logging + # internal.file = XXX/bin/internal/__init__.py + # => need three dirname to get XXX + # we use internal to have any issue with pythonpath finding the wrong file + me_dir = os.path.dirname(os.path.dirname(os.path.dirname(internal.__file__))) + print("me_dir is", me_dir) try: if __debug__ and options.logging == 'INFO': options.logging = 'DEBUG' @@ -7419,7 +7423,8 @@ def 
error(self, msg=''): level = int(options.logging) else: level = eval('logging.' + options.logging) - logging.config.fileConfig(os.path.join(root_path, 'internal', 'me5_logging.conf')) + log_path = os.path.join(me_dir, 'bin', 'internal', 'me5_logging.conf') + logging.config.fileConfig(log_path) logging.root.setLevel(level) logging.getLogger('madgraph').setLevel(level) except: @@ -7433,9 +7438,9 @@ def error(self, msg=''): if '--web' in args: i = args.index('--web') args.pop(i) - cmd_line = MadEventCmd(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmd(me_dir, force_run=True) else: - cmd_line = MadEventCmdShell(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmdShell(me_dir, force_run=True) if not hasattr(cmd_line, 'do_%s' % args[0]): if parser_error: print(parser_error) diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/misc.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/misc.py index d3fed3baa2..91cd3e5c22 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/misc.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/misc.py @@ -347,7 +347,7 @@ def tell(msg): if dependency=='ninja': if cmd.options['ninja'] in ['None',None,''] or\ (cmd.options['ninja'] == './HEPTools/lib' and not MG5dir is None and\ - which_lib(pjoin(MG5dir,cmd.options['ninja'],'libninja.a')) is None): + which_lib(pjoin(MG5dir,cmd.options['ninja'],'lib','libninja.a')) is None): tell("Installing ninja...") cmd.do_install('ninja') diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/shower_card.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/shower_card.py index c6d3948cc4..c344ea1b15 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/shower_card.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/shower_card.py @@ -45,7 +45,9 @@ class ShowerCard(dict): false = ['.false.', 'f', 'false', '0'] logical_vars = ['ue_enabled', 'hadronize', 'b_stable', 'pi_stable', 'wp_stable', 'wm_stable', 'z_stable', 'h_stable', 'tap_stable', 'tam_stable', - 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td'] + 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td', + 'space_shower_me_corrections', 'time_shower_me_corrections', + 'time_shower_me_extended', 'time_shower_me_after_first'] string_vars = ['extralibs', 'extrapaths', 'includepaths', 'analyse'] for i in range(1,100): string_vars.append('dm_'+str(i)) @@ -82,7 +84,11 @@ class ShowerCard(dict): 'b_mass' : {'HERWIG6':'b_mass', 'PYTHIA6': 'b_mass', 'HERWIGPP': 'b_mass', 'PYTHIA8': 'b_mass'}, 'analyse' : {'HERWIG6':'hwuti', 'PYTHIA6':'pyuti', 'HERWIGPP':'hwpputi', 'PYTHIA8':'py8uti'}, 'qcut' : {'PYTHIA8':'qcut'}, - 'njmax' : {'PYTHIA8':'njmax'}} + 'njmax' : {'PYTHIA8':'njmax'}, + 'space_shower_me_corrections' : {'PYTHIA8':'space_shower_me_corrections'}, + 'time_shower_me_corrections' : {'PYTHIA8':'time_shower_me_corrections'}, + 'time_shower_me_extended' : {'PYTHIA8':'time_shower_me_extended'}, + 'time_shower_me_after_first' : {'PYTHIA8':'time_shower_me_after_first'}} stdhep_dict = {'HERWIG6':'mcatnlo_hwan_stdhep.o', 'PYTHIA6':'mcatnlo_pyan_stdhep.o'} diff --git a/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h index 9b946c21e1..8df465ad6d 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc index 9d09eb6b62..64fc3fea62 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h index 6b32c66b9b..b6568d3761 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index e8d8232be5..33bae20142 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -14,7 +14,7 @@ Running MG5 in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect 2023-08-08 * +* VERSION 3.5.2_lo_vect 2023-11-08 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005519866943359375  +DEBUG: model prefixing takes 0.005532503128051758  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.893 s +1 processes with 1240 diagrams generated in 1.880 s Total: 1 processes with 1240 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttggg Load PLUGIN.CUDACPP_OUTPUT @@ -175,7 +175,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. -Generated helas calls for 1 subprocesses (1240 diagrams) in 6.604 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.540 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -183,7 +183,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.356 s +ALOHA: aloha creates 5 routines in 0.351 s VVV1 VVV1 FFV1 @@ -207,6 +207,6 @@ INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  quit -real 0m13.085s -user 0m12.921s -sys 0m0.106s +real 0m15.959s +user 0m12.810s +sys 0m0.102s diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc index a67b74e5b7..30acce4afc 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h index dc41720ca6..2565923dde 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h index 9b946c21e1..8df465ad6d 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc index 9d09eb6b62..64fc3fea62 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h index 6b32c66b9b..b6568d3761 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt index 2338d395b7..89cb2749b0 100644 --- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt @@ -14,7 +14,7 @@ Running MG5 in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect 2023-08-08 * +* VERSION 3.5.2_lo_vect 2023-11-08 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005399465560913086  +DEBUG: model prefixing takes 0.0057373046875  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,7 +170,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. 
INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. -8 processes with 40 diagrams generated in 0.079 s +8 processes with 40 diagrams generated in 0.078 s Total: 8 processes with 40 diagrams output madevent ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -198,7 +198,7 @@ INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -207,15 +207,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -224,23 +224,23 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5] [export_cpp.py at line 711]  DEBUG: subproc_number =  1 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s -Wrote files for 32 helas calls in 0.222 s +Wrote files for 32 helas calls in 0.217 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.147 s +ALOHA: aloha creates 2 routines in 0.144 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 4 routines in 0.133 s +ALOHA: aloha creates 4 routines in 0.132 s FFV1 FFV1 FFV1 @@ -266,6 +266,7 @@ patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile patching file bin/internal/gen_ximprove.py +Hunk #1 succeeded at 391 (offset 6 lines). patching file bin/internal/madevent_interface.py DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses/P1_gu_ttxu; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file auto_dsig1.f @@ -294,9 +295,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.957s -user 0m1.708s -sys 0m0.241s +real 0m4.915s +user 0m1.680s +sys 0m0.237s ************************************************************ * * * W E L C O M E to * @@ -309,7 +310,7 @@ sys 0m0.241s * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect * +* VERSION 3.5.2_lo_vect * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -342,7 +343,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect * +* VERSION 3.5.2_lo_vect * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat index dc07af3836..efb0752a31 100644 --- a/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.5.1_lo_vect 2023-08-08 * +#* VERSION 3.5.2_lo_vect 2023-11-08 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. 
* #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/gq_ttq.mad/MGMEVersion.txt b/epochX/cudacpp/gq_ttq.mad/MGMEVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/gq_ttq.mad/MGMEVersion.txt +++ b/epochX/cudacpp/gq_ttq.mad/MGMEVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MGVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc index c526dd6b31..649c608210 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h index cdc2dc91ac..bf037c6c28 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f index 249a3e4e3c..6c1667bc0f 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f index ba39cab867..ee1484ab56 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -231,7 +231,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/matrix1.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/matrix1.f index e6d01dad0b..bd8e2f143a 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/matrix1.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -333,7 +333,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc index 8d92e4e769..930da28159 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h index a90abc4ab4..0f49f5247b 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f index f2eba72de7..c9b8759b60 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f index 5ec9701b78..62c235de64 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -231,7 +231,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/matrix1.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/matrix1.f index 7a2e329e64..4c05be74a0 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/matrix1.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -333,7 +333,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/genps.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/genps.f index fe9c61504b..c00e33d954 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/genps.f @@ -1877,12 +1877,12 @@ double precision function get_channel_cut(p, config) d1 = iforest(1, -i, config) d2 = iforest(2, -i, config) do j=0,3 - if (d1.gt.0.and.d1.le.2) then + if (d1.gt.0.and.d1.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d1) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d1) endif - if (d2.gt.0.and.d2.le.2) then + if (d2.gt.0.and.d2.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d2) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d2) diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/__init__.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/__init__.py index 16e60d8182..0d17042f0d 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/__init__.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/__init__.py @@ -26,6 +26,7 @@ class aMCatNLOError(MadGraph5Error): import os import logging import time +pjoin = os.path.join #Look for basic file position MG5DIR and MG4DIR MG5DIR = os.path.realpath(os.path.join(os.path.dirname(__file__), diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py index ef1bf58979..3995ce8109 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py @@ -2704,7 +2704,8 @@ def __new__(cls, finput=None, **opt): except Exception as error: import launch_plugin target_class = launch_plugin.RunCard - + elif issubclass(finput, RunCard): + target_class = finput else: return None @@ -2968,11 +2969,12 @@ def write(self, output_file, template=None, python_template=False, if python_template and not to_write: import string if self.blocks: - text = string.Template(text) mapping = {} for b in self.blocks: mapping[b.name] = b.get_template(self) - text = text.substitute(mapping) + if "$%s" % b.name not in text: + text += "\n$%s\n" % b.name + text = string.Template(text).substitute(mapping) if not self.list_parameter: text = text % self @@ -4875,6 +4877,9 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): continue break + if proc_characteristic['ninitial'] == 1: + self['SDE_strategy'] =1 + if 'MLM' in proc_characteristic['limitations']: if self['dynamical_scale_choice'] == -1: self['dynamical_scale_choice'] = 3 @@ -5940,7 +5945,7 @@ def default_setup(self): self.add_param("CheckCycle", 3) self.add_param("MaxAttempts", 10) self.add_param("ZeroThres", 1e-9) - self.add_param("OSThres", 1.0e-13) + self.add_param("OSThres", 1.0e-8) self.add_param("DoubleCheckHelicityFilter", True) self.add_param("WriteOutFilters", True) self.add_param("UseLoopFilter", False) diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/common_run_interface.py index 14c7f310dc..87cb4b88df 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/common_run_interface.py @@ -4906,6 +4906,7 @@ def __init__(self, question, cards=[], from_banner=None, banner=None, mode='auto self.load_default() self.define_paths(**opt) self.last_editline_pos = 0 + self.update_dependent_done = False if 'allow_arg' not in opt or not opt['allow_arg']: # add some mininal content for this: @@ 
-6585,7 +6586,9 @@ def postcmd(self, stop, line): self.check_card_consistency() if self.param_consistency: try: - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) + self.update_dependent_done = False except MadGraph5Error as error: if 'Missing block:' in str(error): self.fail_due_to_format +=1 @@ -6638,6 +6641,8 @@ def do_update(self, line, timer=0): self.update_dependent(self.mother_interface, self.me_dir, self.param_card, self.paths['param'], timer, run_card=self.run_card, lhapdfconfig=self.lhapdf) + self.update_dependent_done = True + elif args[0] == 'missing': self.update_missing() @@ -6717,12 +6722,13 @@ class TimeOutError(Exception): def handle_alarm(signum, frame): raise TimeOutError signal.signal(signal.SIGALRM, handle_alarm) + if timer: - signal.alarm(timer) log_level=30 else: log_level=20 + if run_card: as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, @@ -6781,6 +6787,10 @@ def handle_alarm(signum, frame): logger.log(log_level, "update the strong coupling value (alpha_s) to the value from the pdf selected: %s", as_for_pdf[pdlabel]) modify = True + if timer: + signal.alarm(timer) + + # Try to load the model in the limited amount of time allowed try: model = mecmd.get_model() @@ -6909,7 +6919,8 @@ def check_block(self, blockname): def check_answer_consistency(self): """function called if the code reads a file""" self.check_card_consistency() - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) def help_set(self): '''help message for set''' diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_ximprove.py index a88d60b282..5fd170d18d 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_ximprove.py @@ -157,12 +157,16 @@ def get_helicity(self, to_submit=True, clean=True): (stdout, _) = p.communicate(''.encode()) stdout = stdout.decode('ascii',errors='ignore') - try: + if stdout: nb_channel = max([math.floor(float(d)) for d in stdout.split()]) - except Exception as error: - misc.sprint(stdout, 'no channel or error for %s' % Pdir) - continue - + else: + for matrix_file in misc.glob('matrix*orig.f', Pdir): + files.cp(matrix_file, matrix_file.replace('orig','optim')) + P_zero_result.append(Pdir) + if os.path.exists(pjoin(self.me_dir, 'error')): + os.remove(pjoin(self.me_dir, 'error')) + continue # bypass bad process + self.cmd.compile(['madevent_forhel'], cwd=Pdir) if not os.path.exists(pjoin(Pdir, 'madevent_forhel')): raise Exception('Error make madevent_forhel not successful') @@ -183,11 +187,13 @@ def get_helicity(self, to_submit=True, clean=True): #sym_input = "%(points)d %(iterations)d %(accuracy)f \n" % self.opts (stdout, _) = p.communicate(" ".encode()) stdout = stdout.decode('ascii',errors='ignore') - if os.path.exists(pjoin(self.me_dir,'error')): + if os.path.exists(pjoin(self.me_dir, 'error')): raise Exception(pjoin(self.me_dir,'error')) # note a continue is not enough here, we have in top to link # the matrixX_optim.f to matrixX_orig.f to let the code to work # after this error. 
+ # for matrix_file in misc.glob('matrix*orig.f', Pdir): + # files.cp(matrix_file, matrix_file.replace('orig','optim')) if 'no events passed cuts' in stdout: raise Exception diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py index c9d1c7706a..0b849330ef 100644 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py @@ -57,7 +57,7 @@ def reset_simd(self, old_value, new_value, name): return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) - + def plugin_input(self, finput): return @@ -79,7 +79,7 @@ def check_validity(self): self['sde_strategy'] = 1 if self['hel_recycling']: self['hel_recycling'] = False - + class GPURunCard(CPPRunCard): def default_setup(self): super(CPPRunCard, self).default_setup() diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py index d722702891..853aabc98a 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py @@ -1,4 +1,4 @@ -################################################################################ +############################################################################### # # Copyright (c) 2011 The MadGraph5_aMC@NLO Development team and Contributors # @@ -3675,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_combine_iteration(self, line): + def do_comine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3703,7 +3703,7 @@ def do_combine_iteration(self, line): ############################################################################ def do_combine_events(self, line): """Advanced commands: Launch combine events""" - + start=time.time() args = self.split_arg(line) start = time.time() # Check argument's validity @@ -3798,9 +3798,7 @@ def do_combine_events(self, line): self.correct_bias() elif self.run_card['custom_fcts']: self.correct_bias() - - logger.info("combine events done in %s", time.time()-start) - + logger.info("combination of events done in %s s ", time.time()-start) self.to_store.append('event') @@ -7368,7 +7366,7 @@ def wait_monitoring(Idle, Running, Done): import optparse # Get the directory of the script real path (bin) # and add it to the current PYTHONPATH - root_path = os.path.dirname(os.path.dirname(os.path.realpath( __file__ ))) + #root_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath( __file__ )))) sys.path.insert(0, root_path) class MyOptParser(optparse.OptionParser): @@ -7411,7 +7409,13 @@ def error(self, msg=''): import logging.config # Set logging level according to the logging level given by options #logging.basicConfig(level=vars(logging)[options.logging]) + import internal import internal.coloring_logging + # internal.file = XXX/bin/internal/__init__.py + # => need three dirname to get XXX + # we use internal to have any issue with pythonpath finding the wrong file + me_dir = os.path.dirname(os.path.dirname(os.path.dirname(internal.__file__))) + print("me_dir is", me_dir) try: if __debug__ and options.logging == 'INFO': options.logging = 'DEBUG' @@ -7419,7 +7423,8 @@ def error(self, msg=''): 
level = int(options.logging) else: level = eval('logging.' + options.logging) - logging.config.fileConfig(os.path.join(root_path, 'internal', 'me5_logging.conf')) + log_path = os.path.join(me_dir, 'bin', 'internal', 'me5_logging.conf') + logging.config.fileConfig(log_path) logging.root.setLevel(level) logging.getLogger('madgraph').setLevel(level) except: @@ -7433,9 +7438,9 @@ def error(self, msg=''): if '--web' in args: i = args.index('--web') args.pop(i) - cmd_line = MadEventCmd(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmd(me_dir, force_run=True) else: - cmd_line = MadEventCmdShell(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmdShell(me_dir, force_run=True) if not hasattr(cmd_line, 'do_%s' % args[0]): if parser_error: print(parser_error) diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/misc.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/misc.py index d3fed3baa2..91cd3e5c22 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/misc.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/misc.py @@ -347,7 +347,7 @@ def tell(msg): if dependency=='ninja': if cmd.options['ninja'] in ['None',None,''] or\ (cmd.options['ninja'] == './HEPTools/lib' and not MG5dir is None and\ - which_lib(pjoin(MG5dir,cmd.options['ninja'],'libninja.a')) is None): + which_lib(pjoin(MG5dir,cmd.options['ninja'],'lib','libninja.a')) is None): tell("Installing ninja...") cmd.do_install('ninja') diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/shower_card.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/shower_card.py index c6d3948cc4..c344ea1b15 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/shower_card.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/shower_card.py @@ -45,7 +45,9 @@ class ShowerCard(dict): false = ['.false.', 'f', 'false', '0'] logical_vars = ['ue_enabled', 'hadronize', 'b_stable', 'pi_stable', 'wp_stable', 'wm_stable', 'z_stable', 'h_stable', 'tap_stable', 'tam_stable', - 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td'] + 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td', + 'space_shower_me_corrections', 'time_shower_me_corrections', + 'time_shower_me_extended', 'time_shower_me_after_first'] string_vars = ['extralibs', 'extrapaths', 'includepaths', 'analyse'] for i in range(1,100): string_vars.append('dm_'+str(i)) @@ -82,7 +84,11 @@ class ShowerCard(dict): 'b_mass' : {'HERWIG6':'b_mass', 'PYTHIA6': 'b_mass', 'HERWIGPP': 'b_mass', 'PYTHIA8': 'b_mass'}, 'analyse' : {'HERWIG6':'hwuti', 'PYTHIA6':'pyuti', 'HERWIGPP':'hwpputi', 'PYTHIA8':'py8uti'}, 'qcut' : {'PYTHIA8':'qcut'}, - 'njmax' : {'PYTHIA8':'njmax'}} + 'njmax' : {'PYTHIA8':'njmax'}, + 'space_shower_me_corrections' : {'PYTHIA8':'space_shower_me_corrections'}, + 'time_shower_me_corrections' : {'PYTHIA8':'time_shower_me_corrections'}, + 'time_shower_me_extended' : {'PYTHIA8':'time_shower_me_extended'}, + 'time_shower_me_after_first' : {'PYTHIA8':'time_shower_me_after_first'}} stdhep_dict = {'HERWIG6':'mcatnlo_hwan_stdhep.o', 'PYTHIA6':'mcatnlo_pyan_stdhep.o'} diff --git a/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h b/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h index 0dd5f20f71..cd4e6de668 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 
3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc index d5eda63ee0..c06dcbb252 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h index 0c77cf58f0..a6eb185434 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt index ad74707ae9..16374bd28e 100644 --- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt @@ -14,7 +14,7 @@ Running MG5 in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect 2023-08-08 * +* VERSION 3.5.2_lo_vect 2023-11-08 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005573272705078125  +DEBUG: model prefixing takes 0.005791902542114258  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,7 +170,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. 
-8 processes with 40 diagrams generated in 0.079 s +8 processes with 40 diagrams generated in 0.078 s Total: 8 processes with 40 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gq_ttq Load PLUGIN.CUDACPP_OUTPUT @@ -206,12 +206,12 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. -Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s +Generated helas calls for 2 subprocesses (10 diagrams) in 0.030 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.145 s +ALOHA: aloha creates 2 routines in 0.144 s FFV1 FFV1 FFV1 @@ -228,6 +228,6 @@ INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/ DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  quit -real 0m0.658s -user 0m0.590s -sys 0m0.062s +real 0m3.656s +user 0m0.594s +sys 0m0.059s diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc index 037662f7db..4965f393c5 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h index cdc2dc91ac..bf037c6c28 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc index 12179b9801..5024e8e239 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h index a90abc4ab4..0f49f5247b 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h b/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h index 0dd5f20f71..cd4e6de668 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc index d5eda63ee0..c06dcbb252 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h index 0c77cf58f0..a6eb185434 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt index 9d96566eb2..3b04fc3fb3 100644 --- a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt +++ b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt @@ -14,7 +14,7 @@ Running MG5 in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect 2023-08-08 * +* VERSION 3.5.2_lo_vect 2023-11-08 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -166,6 +166,6 @@ INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  quit -real 0m0.429s +real 0m3.422s user 0m0.371s -sys 0m0.051s +sys 0m0.048s diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc index 6cc0be1461..1d59f8e3cf 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.h index d0312182d5..dbc5aa0e4e 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h b/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h index a2e9b6a70c..eae9ff5242 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h @@ -8,7 +8,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc index fde65d5571..e5442756b1 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc +++ b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h index d1a451b2c3..790485fee0 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt index bb2844f553..8b6ca99446 100644 --- a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt +++ b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt @@ -14,7 +14,7 @@ Running MG5 in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect 2023-08-08 * +* VERSION 3.5.2_lo_vect 2023-11-08 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -61,7 +61,7 @@ set zerowidth_tchannel F define j = p INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00551915168762207  +DEBUG: model prefixing takes 0.00538325309753418  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -212,7 +212,7 @@ INFO: Process d~ g > t t~ d~ added to mirror process g d~ > t t~ d~ INFO: Process d~ d > t t~ g added to mirror process d d~ > t t~ g INFO: Process s~ g > t t~ s~ added to mirror process g s~ > t t~ s~ INFO: Process s~ s > t t~ g added to mirror process s s~ > t t~ g -13 processes with 76 diagrams generated in 0.137 s +13 processes with 76 diagrams generated in 0.134 s Total: 18 processes with 83 diagrams add process p p > t t~ j j @2 INFO: Checking for minimal orders which gives processes. @@ -378,7 +378,7 @@ INFO: Process s~ u~ > t t~ u~ s~ added to mirror process u~ s~ > t t~ u~ s~ INFO: Process s~ c~ > t t~ c~ s~ added to mirror process c~ s~ > t t~ c~ s~ INFO: Process s~ d~ > t t~ d~ s~ added to mirror process d~ s~ > t t~ d~ s~ INFO: Crossed process found for s~ s~ > t t~ s~ s~, reuse diagrams. -65 processes with 1119 diagrams generated in 1.844 s +65 processes with 1119 diagrams generated in 1.811 s Total: 83 processes with 1202 diagrams output madevent ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -497,7 +497,7 @@ INFO: Combined process d d~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED INFO: Combined process s s~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED<=2 INFO: Creating files in directory P2_gg_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -506,15 +506,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 0, 46, 47, 48, 49, 50, 51, 52, 53, 54, 0, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 0, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 0, 88, 89, 90, 91, 92, 93, 0, 94, 95, 96, 97, 98, 99, 0, 100, 101, 102, 103, 104, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg INFO: Creating files in directory P2_gg_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -523,15 +523,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 0, 34, 35] [export_cpp.py at line 711]  DEBUG: subproc_number =  1 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxuux INFO: Creating files in directory P2_gu_ttxgu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -540,15 +540,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 0, 34, 35] [export_cpp.py at line 711]  DEBUG: subproc_number =  2 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u > t t~ g u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gu_ttxgu INFO: Creating files in directory P2_gux_ttxgux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -557,15 +557,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 0, 34, 35] [export_cpp.py at line 711]  DEBUG: subproc_number =  3 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u~ > t t~ g u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gux_ttxgux INFO: Creating files in directory P2_uux_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -574,15 +574,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 0, 34, 35] [export_cpp.py at line 711]  DEBUG: subproc_number =  4 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxgg INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -591,15 +591,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  5 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P2_uu_ttxuu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -608,15 +608,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] [export_cpp.py at line 711]  DEBUG: subproc_number =  6 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u > t t~ u u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uu_ttxuu INFO: Creating files in directory P2_uux_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -625,15 +625,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] [export_cpp.py at line 711]  DEBUG: subproc_number =  7 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxuux INFO: Creating files in directory P2_uxux_ttxuxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -642,15 +642,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] [export_cpp.py at line 711]  DEBUG: subproc_number =  8 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u~ u~ > t t~ u~ u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxux_ttxuxux INFO: Creating files in directory P2_uc_ttxuc DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -659,15 +659,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7] [export_cpp.py at line 711]  DEBUG: subproc_number =  9 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u c > t t~ u c WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uc_ttxuc INFO: Creating files in directory P2_uux_ttxccx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -676,15 +676,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7] [export_cpp.py at line 711]  DEBUG: subproc_number =  10 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ c c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxccx INFO: Creating files in directory P2_ucx_ttxucx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -693,15 +693,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7] [export_cpp.py at line 711]  DEBUG: subproc_number =  11 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u c~ > t t~ u c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group ucx_ttxucx INFO: Creating files in directory P2_uxcx_ttxuxcx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -710,15 +710,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7] [export_cpp.py at line 711]  DEBUG: subproc_number =  12 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u~ c~ > t t~ u~ c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxcx_ttxuxcx INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -727,15 +727,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5] [export_cpp.py at line 711]  DEBUG: subproc_number =  13 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -744,15 +744,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5] [export_cpp.py at line 711]  DEBUG: subproc_number =  14 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux INFO: Creating files in directory P1_uux_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -761,15 +761,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5] [export_cpp.py at line 711]  DEBUG: subproc_number =  15 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group uux_ttxg INFO: Creating files in directory P0_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -778,15 +778,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3] [export_cpp.py at line 711]  DEBUG: subproc_number =  16 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group gg_ttx INFO: Creating files in directory P0_uux_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -795,21 +795,21 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1] [export_cpp.py at line 711]  DEBUG: subproc_number =  17 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group uux_ttx -Generated helas calls for 18 subprocesses (372 diagrams) in 1.298 s -Wrote files for 810 helas calls in 3.297 s +Generated helas calls for 18 subprocesses (372 diagrams) in 1.267 s +Wrote files for 810 helas calls in 3.215 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.339 s +ALOHA: aloha creates 5 routines in 0.333 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -817,7 +817,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 
routines in 0.315 s +ALOHA: aloha creates 10 routines in 0.312 s VVV1 VVV1 FFV1 @@ -850,6 +850,7 @@ patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile patching file bin/internal/gen_ximprove.py +Hunk #1 succeeded at 391 (offset 6 lines). patching file bin/internal/madevent_interface.py DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P0_gg_ttx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file auto_dsig1.f @@ -1028,9 +1029,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m8.967s -user 0m8.408s -sys 0m0.506s +real 0m11.764s +user 0m8.242s +sys 0m0.480s ************************************************************ * * * W E L C O M E to * @@ -1043,7 +1044,7 @@ sys 0m0.506s * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect * +* VERSION 3.5.2_lo_vect * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -1076,7 +1077,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect * +* VERSION 3.5.2_lo_vect * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat index 944298ae75..c0b1a2fd98 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.5.1_lo_vect 2023-08-08 * +#* VERSION 3.5.2_lo_vect 2023-11-08 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/pp_tt012j.mad/MGMEVersion.txt b/epochX/cudacpp/pp_tt012j.mad/MGMEVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/pp_tt012j.mad/MGMEVersion.txt +++ b/epochX/cudacpp/pp_tt012j.mad/MGMEVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MGVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc index 0317bbc95a..30815cd085 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h index ecd2d1364e..448175be9d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig.f index dce732e252..963d8ec072 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig1.f index a48f6997f3..d4e2956b18 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -216,7 +216,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/matrix1.f index d803e4f19f..5b3b723e59 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -301,7 +301,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc index 75110e8fec..fa46e42b8f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h index 3d5ca9d556..e166fa1652 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig.f index 3d59efb411..2cc5a2026a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig1.f index f9147f699e..2344ddbe81 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -234,7 +234,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/matrix1.f index 4c21758744..1dea73e826 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -304,7 +304,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index f7f5899260..5e2bf0d19a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h index 9f559fe3ae..37d6ebe981 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f index d528b1d2f0..dd4cd3a0c2 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f index 110e204c24..e28575ead8 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -216,7 +216,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f index bf665ff6e0..a885b7fde3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -317,7 +317,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc index 90a457ac40..3b6b1a6c16 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. 
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h index cdc2dc91ac..bf037c6c28 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f index 249a3e4e3c..6c1667bc0f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f index ba39cab867..ee1484ab56 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -231,7 +231,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/matrix1.f index d61f0e1a21..b7d8649204 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -320,7 +320,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc index 9a73b3ed94..eb62f13990 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h index a90abc4ab4..0f49f5247b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f index f2eba72de7..c9b8759b60 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f index 5ec9701b78..62c235de64 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -231,7 +231,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/matrix1.f index b082becd2a..8a699645cd 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -320,7 +320,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc index dc1a3e9d26..c47ef64ec8 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h index 06af307caa..f8bdb38aee 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. 
Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig.f index 408403e5d9..628e0d8092 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig1.f index 842b1c72d4..b66a887225 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -234,7 +234,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/matrix1.f index 265f6006db..7bc63ee8a4 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -320,7 +320,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc index cbc45ff652..0cbb15fba7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h index a41aa7611a..9f43559181 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig.f index c23550e9b7..84ee7e5b85 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig1.f index 4e2bfe85ab..aa73f64dba 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -216,7 +216,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/matrix1.f index c8fbb1cc8b..46e6ff0da7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -349,7 +349,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc index 5723ed5665..d9f2d09952 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h index 95f4bf6912..f26b60c5bb 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig.f index d196e8ed65..abb75a925b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig1.f index e5a0390c47..d6bf2155ff 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -228,7 +228,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/matrix1.f index 4f966fab6d..fabc6786d3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -352,7 +352,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc index b8f74ecafe..0d1c319939 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. 
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h index a54b0bb8fe..853175b477 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig.f index bc732da055..94fe1937c3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig1.f index 309be94a99..50c024adc3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -231,7 +231,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/matrix1.f index c03cebacb0..210884dccf 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -352,7 +352,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc index 2495941a73..8e3985f427 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h index d31dd972a9..e60cb5b6d7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig.f index 399b68be58..3e0e30af23 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig1.f index 23d82657bf..e639ee4c34 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -231,7 +231,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/matrix1.f index 39422dc34c..a8c5f11ae3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -352,7 +352,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc index 529477ff3e..22398e7ab4 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h index 4f557f24ab..5329710b87 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig.f index da207359fc..94cfdd1487 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig1.f index 4d12dfeade..37f4a35577 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -240,7 +240,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/matrix1.f index 9e27e48c99..66b1820c10 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -354,7 +354,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc index e54a24ea57..3955de70dd 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h index 1818cf79ed..391789dc81 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig.f index cfd6a270b5..5ce83d5f12 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig1.f index 5bac32b00a..ea0697602c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -266,7 +266,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/matrix1.f index 6bdc5db576..9403b67a1a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -360,7 +360,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc index 8638bbefa2..bfc3d0809f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h index 41e15f6ad0..2d95f4b170 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. 
Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig.f index efdae70d19..44e8c9d920 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig1.f index 50c16edaac..302d0eda9c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -234,7 +234,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/matrix1.f index 8b2cf62531..f51744ae5d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -352,7 +352,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc index c071cc6900..222800dcfd 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h index b93bb3909d..14490d782f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig.f index 72e76f54e4..ab270fe554 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig1.f index 577a8d9c54..e9b4ddc613 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -266,7 +266,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/matrix1.f index c5a7b6787c..f93b850d5f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -360,7 +360,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc index 2eb6b491fa..ef9407041b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h index 2f4866b6ca..1543c29649 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig.f index 4b08b69f90..f5ef1f7b43 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig1.f index f4e431c5ce..83e40fb02c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -234,7 +234,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/matrix1.f index a843f4656a..9996fdea2d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -352,7 +352,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc index 8682128442..1aa88699db 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. 
Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h index dbd5b60487..58cece5c62 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig.f index 3e29e25982..867eb95566 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig1.f index 123a3ae00e..ae43656176 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -234,7 +234,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/matrix1.f index 6d8f6b4ed8..205e3daf83 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -352,7 +352,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc index 7d3141cfc4..5f356a519e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h index f92e527895..6bd3135c3c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig.f index 44da6cd9ce..8ded31027d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig1.f index a4cb748b19..7ce014f5f5 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -240,7 +240,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/matrix1.f index 53f591633e..dfbec413a8 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -354,7 +354,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc index 6ec302f68b..af04d58c3e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h index 53c3b7149b..4e53fa1250 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig.f index 43ccdff1e1..2acdc960db 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig1.f index 3a3ed05151..115e19c70e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -234,7 +234,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/matrix1.f index dce10b9553..392b30a39f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -352,7 +352,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/genps.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/genps.f index fe9c61504b..c00e33d954 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/genps.f @@ -1877,12 +1877,12 @@ double precision function get_channel_cut(p, config) d1 = iforest(1, -i, config) d2 = iforest(2, -i, config) do j=0,3 - if (d1.gt.0.and.d1.le.2) then + if (d1.gt.0.and.d1.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d1) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d1) endif - if (d2.gt.0.and.d2.le.2) then + if (d2.gt.0.and.d2.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d2) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d2) diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/__init__.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/__init__.py index 16e60d8182..0d17042f0d 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/__init__.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/__init__.py @@ -26,6 +26,7 @@ class aMCatNLOError(MadGraph5Error): import os import logging import time +pjoin = os.path.join #Look for basic file position MG5DIR and MG4DIR MG5DIR = os.path.realpath(os.path.join(os.path.dirname(__file__), diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py index ef1bf58979..3995ce8109 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py @@ -2704,7 +2704,8 @@ def __new__(cls, finput=None, **opt): except Exception as error: import launch_plugin target_class = launch_plugin.RunCard - + elif issubclass(finput, RunCard): + target_class = finput else: return None @@ -2968,11 +2969,12 @@ def write(self, output_file, template=None, python_template=False, if python_template and not to_write: import string if self.blocks: - text = string.Template(text) mapping = {} for b in self.blocks: mapping[b.name] = b.get_template(self) - text = text.substitute(mapping) + if "$%s" % b.name not in text: + text += "\n$%s\n" % b.name + text = string.Template(text).substitute(mapping) if not self.list_parameter: text = text % self @@ -4875,6 +4877,9 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): continue break + if proc_characteristic['ninitial'] == 1: + self['SDE_strategy'] =1 + if 'MLM' in proc_characteristic['limitations']: if self['dynamical_scale_choice'] == -1: self['dynamical_scale_choice'] = 3 @@ -5940,7 +5945,7 @@ def default_setup(self): self.add_param("CheckCycle", 3) self.add_param("MaxAttempts", 10) self.add_param("ZeroThres", 1e-9) - self.add_param("OSThres", 1.0e-13) + self.add_param("OSThres", 1.0e-8) self.add_param("DoubleCheckHelicityFilter", True) self.add_param("WriteOutFilters", True) self.add_param("UseLoopFilter", False) diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/common_run_interface.py index 14c7f310dc..87cb4b88df 100755 --- 
a/epochX/cudacpp/pp_tt012j.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/common_run_interface.py @@ -4906,6 +4906,7 @@ def __init__(self, question, cards=[], from_banner=None, banner=None, mode='auto self.load_default() self.define_paths(**opt) self.last_editline_pos = 0 + self.update_dependent_done = False if 'allow_arg' not in opt or not opt['allow_arg']: # add some mininal content for this: @@ -6585,7 +6586,9 @@ def postcmd(self, stop, line): self.check_card_consistency() if self.param_consistency: try: - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) + self.update_dependent_done = False except MadGraph5Error as error: if 'Missing block:' in str(error): self.fail_due_to_format +=1 @@ -6638,6 +6641,8 @@ def do_update(self, line, timer=0): self.update_dependent(self.mother_interface, self.me_dir, self.param_card, self.paths['param'], timer, run_card=self.run_card, lhapdfconfig=self.lhapdf) + self.update_dependent_done = True + elif args[0] == 'missing': self.update_missing() @@ -6717,12 +6722,13 @@ class TimeOutError(Exception): def handle_alarm(signum, frame): raise TimeOutError signal.signal(signal.SIGALRM, handle_alarm) + if timer: - signal.alarm(timer) log_level=30 else: log_level=20 + if run_card: as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, @@ -6781,6 +6787,10 @@ def handle_alarm(signum, frame): logger.log(log_level, "update the strong coupling value (alpha_s) to the value from the pdf selected: %s", as_for_pdf[pdlabel]) modify = True + if timer: + signal.alarm(timer) + + # Try to load the model in the limited amount of time allowed try: model = mecmd.get_model() @@ -6909,7 +6919,8 @@ def check_block(self, blockname): def check_answer_consistency(self): """function called if the code reads a file""" self.check_card_consistency() - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) def help_set(self): '''help message for set''' diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_ximprove.py index a88d60b282..5fd170d18d 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_ximprove.py @@ -157,12 +157,16 @@ def get_helicity(self, to_submit=True, clean=True): (stdout, _) = p.communicate(''.encode()) stdout = stdout.decode('ascii',errors='ignore') - try: + if stdout: nb_channel = max([math.floor(float(d)) for d in stdout.split()]) - except Exception as error: - misc.sprint(stdout, 'no channel or error for %s' % Pdir) - continue - + else: + for matrix_file in misc.glob('matrix*orig.f', Pdir): + files.cp(matrix_file, matrix_file.replace('orig','optim')) + P_zero_result.append(Pdir) + if os.path.exists(pjoin(self.me_dir, 'error')): + os.remove(pjoin(self.me_dir, 'error')) + continue # bypass bad process + self.cmd.compile(['madevent_forhel'], cwd=Pdir) if not os.path.exists(pjoin(Pdir, 'madevent_forhel')): raise Exception('Error make madevent_forhel not successful') @@ -183,11 +187,13 @@ def get_helicity(self, to_submit=True, clean=True): #sym_input = "%(points)d %(iterations)d %(accuracy)f \n" % self.opts (stdout, _) = p.communicate(" ".encode()) stdout = stdout.decode('ascii',errors='ignore') - if os.path.exists(pjoin(self.me_dir,'error')): + if os.path.exists(pjoin(self.me_dir, 'error')): raise Exception(pjoin(self.me_dir,'error')) # note a continue is not enough here, we have 
in top to link # the matrixX_optim.f to matrixX_orig.f to let the code to work # after this error. + # for matrix_file in misc.glob('matrix*orig.f', Pdir): + # files.cp(matrix_file, matrix_file.replace('orig','optim')) if 'no events passed cuts' in stdout: raise Exception diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py index c9d1c7706a..0b849330ef 100644 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py @@ -57,7 +57,7 @@ def reset_simd(self, old_value, new_value, name): return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) - + def plugin_input(self, finput): return @@ -79,7 +79,7 @@ def check_validity(self): self['sde_strategy'] = 1 if self['hel_recycling']: self['hel_recycling'] = False - + class GPURunCard(CPPRunCard): def default_setup(self): super(CPPRunCard, self).default_setup() diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py index d722702891..853aabc98a 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py @@ -1,4 +1,4 @@ -################################################################################ +############################################################################### # # Copyright (c) 2011 The MadGraph5_aMC@NLO Development team and Contributors # @@ -3675,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_combine_iteration(self, line): + def do_comine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3703,7 +3703,7 @@ def do_combine_iteration(self, line): ############################################################################ def do_combine_events(self, line): """Advanced commands: Launch combine events""" - + start=time.time() args = self.split_arg(line) start = time.time() # Check argument's validity @@ -3798,9 +3798,7 @@ def do_combine_events(self, line): self.correct_bias() elif self.run_card['custom_fcts']: self.correct_bias() - - logger.info("combine events done in %s", time.time()-start) - + logger.info("combination of events done in %s s ", time.time()-start) self.to_store.append('event') @@ -7368,7 +7366,7 @@ def wait_monitoring(Idle, Running, Done): import optparse # Get the directory of the script real path (bin) # and add it to the current PYTHONPATH - root_path = os.path.dirname(os.path.dirname(os.path.realpath( __file__ ))) + #root_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath( __file__ )))) sys.path.insert(0, root_path) class MyOptParser(optparse.OptionParser): @@ -7411,7 +7409,13 @@ def error(self, msg=''): import logging.config # Set logging level according to the logging level given by options #logging.basicConfig(level=vars(logging)[options.logging]) + import internal import internal.coloring_logging + # internal.file = XXX/bin/internal/__init__.py + # => need three dirname to get XXX + # we use internal to have any issue with pythonpath finding the wrong file + me_dir = os.path.dirname(os.path.dirname(os.path.dirname(internal.__file__))) + print("me_dir is", me_dir) 
try: if __debug__ and options.logging == 'INFO': options.logging = 'DEBUG' @@ -7419,7 +7423,8 @@ def error(self, msg=''): level = int(options.logging) else: level = eval('logging.' + options.logging) - logging.config.fileConfig(os.path.join(root_path, 'internal', 'me5_logging.conf')) + log_path = os.path.join(me_dir, 'bin', 'internal', 'me5_logging.conf') + logging.config.fileConfig(log_path) logging.root.setLevel(level) logging.getLogger('madgraph').setLevel(level) except: @@ -7433,9 +7438,9 @@ def error(self, msg=''): if '--web' in args: i = args.index('--web') args.pop(i) - cmd_line = MadEventCmd(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmd(me_dir, force_run=True) else: - cmd_line = MadEventCmdShell(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmdShell(me_dir, force_run=True) if not hasattr(cmd_line, 'do_%s' % args[0]): if parser_error: print(parser_error) diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/misc.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/misc.py index d3fed3baa2..91cd3e5c22 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/misc.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/misc.py @@ -347,7 +347,7 @@ def tell(msg): if dependency=='ninja': if cmd.options['ninja'] in ['None',None,''] or\ (cmd.options['ninja'] == './HEPTools/lib' and not MG5dir is None and\ - which_lib(pjoin(MG5dir,cmd.options['ninja'],'libninja.a')) is None): + which_lib(pjoin(MG5dir,cmd.options['ninja'],'lib','libninja.a')) is None): tell("Installing ninja...") cmd.do_install('ninja') diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/shower_card.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/shower_card.py index c6d3948cc4..c344ea1b15 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/shower_card.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/shower_card.py @@ -45,7 +45,9 @@ class ShowerCard(dict): false = ['.false.', 'f', 'false', '0'] logical_vars = ['ue_enabled', 'hadronize', 'b_stable', 'pi_stable', 'wp_stable', 'wm_stable', 'z_stable', 'h_stable', 'tap_stable', 'tam_stable', - 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td'] + 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td', + 'space_shower_me_corrections', 'time_shower_me_corrections', + 'time_shower_me_extended', 'time_shower_me_after_first'] string_vars = ['extralibs', 'extrapaths', 'includepaths', 'analyse'] for i in range(1,100): string_vars.append('dm_'+str(i)) @@ -82,7 +84,11 @@ class ShowerCard(dict): 'b_mass' : {'HERWIG6':'b_mass', 'PYTHIA6': 'b_mass', 'HERWIGPP': 'b_mass', 'PYTHIA8': 'b_mass'}, 'analyse' : {'HERWIG6':'hwuti', 'PYTHIA6':'pyuti', 'HERWIGPP':'hwpputi', 'PYTHIA8':'py8uti'}, 'qcut' : {'PYTHIA8':'qcut'}, - 'njmax' : {'PYTHIA8':'njmax'}} + 'njmax' : {'PYTHIA8':'njmax'}, + 'space_shower_me_corrections' : {'PYTHIA8':'space_shower_me_corrections'}, + 'time_shower_me_corrections' : {'PYTHIA8':'time_shower_me_corrections'}, + 'time_shower_me_extended' : {'PYTHIA8':'time_shower_me_extended'}, + 'time_shower_me_after_first' : {'PYTHIA8':'time_shower_me_after_first'}} stdhep_dict = {'HERWIG6':'mcatnlo_hwan_stdhep.o', 'PYTHIA6':'mcatnlo_pyan_stdhep.o'} diff --git a/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h b/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h index 9b946c21e1..8df465ad6d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc index 9d09eb6b62..64fc3fea62 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h index 6b32c66b9b..b6568d3761 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index bcf56600ba..16028d3846 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -2,9 +2,9 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/e CUDACPP_BUILDDIR='.' - make USEBUILDDIR=1 AVX=none + make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,25 +15,25 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' OMP_NUM_THREADS= -DATE: 2023-11-03_19:52:13 +DATE: 2023-11-09_18:26:07 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 3893 events (found 7395 events) - [COUNTERS] PROGRAM TOTAL : 0.6373s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6287s - [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.6383s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6302s + [COUNTERS] Fortran MEs ( 1 ) : 0.0081s for 8192 events => throughput is 1.01E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1807s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1728s - [COUNTERS] Fortran MEs ( 1 ) : 0.0079s for 8192 events => throughput is 1.03E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1882s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1797s + [COUNTERS] Fortran MEs ( 1 ) : 0.0085s for 8192 events => throughput is 9.66E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4217s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3352s - [COUNTERS] Fortran MEs ( 1 ) : 0.0865s for 90112 events => throughput is 1.04E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4280s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3388s + [COUNTERS] Fortran MEs ( 1 ) : 0.0892s for 90112 events => throughput is 1.01E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1919s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1852s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0067s for 8192 events => throughput is 1.23E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1893s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1830s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 
8192 events => throughput is 1.28E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4231s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3509s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0722s for 90112 events => throughput is 1.25E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4144s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3434s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0710s for 90112 events => throughput is 1.27E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.217666e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.246747e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.241611e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.254814e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1865s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1826s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0039s for 8192 events => throughput is 2.10E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1854s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1814s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0040s for 8192 events => throughput is 2.06E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813628E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3926s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3476s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0451s for 90112 events => throughput is 2.00E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3872s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3433s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0440s for 90112 events => throughput is 2.05E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.991197e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.008841e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.990100e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.041604e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1864s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1832s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0032s for 8192 events => throughput is 2.59E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1821s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1790s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.66E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3800s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3465s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0335s for 90112 events => throughput is 2.69E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3764s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3431s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0333s for 90112 events => throughput is 2.71E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.603611e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.648221e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.718712e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.737599e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1833s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1805s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.93E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1828s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1800s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.94E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3774s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3449s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0324s for 90112 events => throughput is 2.78E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3726s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3409s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0317s for 90112 events => throughput is 2.84E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.713996e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.822405e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.775269e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.840653e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1890s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1855s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.35E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1827s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1792s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.34E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3894s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3496s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0398s for 90112 events => throughput is 2.26E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3845s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3458s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0387s for 90112 events => throughput is 2.33E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.190424e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.213684e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.183626e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.288308e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.5997s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5992s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.68E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5941s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5936s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.56E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,8 +547,8 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813628E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.7696s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7647s + [COUNTERS] PROGRAM TOTAL : 0.7643s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7594s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0049s for 90112 events => throughput is 1.85E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.173877e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.122558e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.893710e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.902108e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.716630e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.029032e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.387595e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.427964e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.739579e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.990174e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.929113e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.966232e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.693635e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.011562e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.118370e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.099952e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index ff3c2ae8d4..bed8731e5c 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -4,8 +4,8 @@ CUDACPP_BUILDDIR='.' 
make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,10 +15,11 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. @@ -27,13 +28,12 @@ make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' OMP_NUM_THREADS= -DATE: 2023-11-03_19:52:30 +DATE: 2023-11-09_18:26:24 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 3893 events (found 7395 events) - [COUNTERS] PROGRAM TOTAL : 0.6418s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6338s - [COUNTERS] Fortran MEs ( 1 ) : 0.0080s for 8192 events => throughput is 1.02E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6375s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6295s + [COUNTERS] Fortran MEs ( 1 ) : 0.0080s for 8192 events => throughput is 1.03E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1827s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1742s - [COUNTERS] Fortran MEs ( 1 ) : 0.0085s for 8192 events => throughput is 9.67E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.1779s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1700s + [COUNTERS] Fortran MEs ( 1 ) : 0.0079s for 8192 events => throughput is 1.04E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4264s - [COUNTERS] Fortran Overhead 
( 0 ) : 0.3383s - [COUNTERS] Fortran MEs ( 1 ) : 0.0882s for 90112 events => throughput is 1.02E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4168s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3310s + [COUNTERS] Fortran MEs ( 1 ) : 0.0858s for 90112 events => throughput is 1.05E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747166087172673] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1909s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1845s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 8192 events => throughput is 1.27E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1876s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1813s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0063s for 8192 events => throughput is 1.31E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501907796603360E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4197s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3492s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0705s for 90112 events => throughput is 1.28E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4132s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3439s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0693s for 90112 events => throughput is 1.30E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.260485e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.290954e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.240620e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.269110e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747165570339780] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1824s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1798s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.18E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1799s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1773s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.11E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501905322826635E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3742s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3464s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0278s for 90112 events => throughput is 3.24E+06 
events/s + [COUNTERS] PROGRAM TOTAL : 0.3696s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3425s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0271s for 90112 events => throughput is 3.33E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.182676e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.211958e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.343050e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.331194e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747165593922979] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1914s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1892s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0022s for 8192 events => throughput is 3.72E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1846s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1823s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0022s for 8192 events => throughput is 3.66E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501905316084181E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3767s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3513s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0254s for 90112 events => throughput is 3.55E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3689s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3438s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0251s for 90112 events => throughput is 3.59E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.496883e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.583243e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.660390e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.664821e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747165593922979] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1867s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1844s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.57E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1881s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1858s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0022s for 8192 events => throughput is 3.64E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501905316084181E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3763s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3515s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0248s for 90112 events => throughput is 3.63E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4013s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3744s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0269s for 90112 events => throughput is 3.35E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.562187e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.708142e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.601892e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.716354e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747166440400542] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1875s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1852s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.57E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1876s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1854s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0022s for 8192 events => throughput is 3.69E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501908978565555E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3774s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3519s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0255s for 90112 events => throughput is 3.53E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3791s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3532s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0259s for 90112 events => throughput is 3.47E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.223682e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.388042e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.583359e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.799218e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747166823487174] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.5998s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5993s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.73E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5957s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5952s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.72E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501910542849674E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.7713s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7665s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0048s for 90112 events => throughput is 1.87E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7596s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7551s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0046s for 90112 events => throughput is 1.97E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.583398e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.613080e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.881767e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.898284e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.997979e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.543811e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.043514e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.026187e+09 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.954785e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.468953e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.219791e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.241582e+09 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.299152e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.812787e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.462264e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.411277e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index 7741c53b46..8b8c11aaf5 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,11 +1,11 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum CUDACPP_BUILDDIR='.' 
+ make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 - make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,13 +15,13 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-03_19:52:47 +DATE: 2023-11-09_18:26:42 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 3893 events (found 7395 events) - [COUNTERS] PROGRAM TOTAL : 0.6387s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6300s - [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.6293s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6211s + [COUNTERS] Fortran MEs ( 1 ) : 0.0082s for 8192 events => throughput is 1.00E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1817s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1737s - [COUNTERS] Fortran MEs ( 1 ) : 0.0079s for 8192 events => throughput is 1.03E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1828s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1742s + [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.52E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,8 +109,8 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4238s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3373s + [COUNTERS] PROGRAM TOTAL : 0.4185s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3320s [COUNTERS] Fortran MEs ( 1 ) : 0.0865s for 90112 events => throughput is 1.04E+06 events/s *** (2-none) EXECUTE 
MADEVENT_CPP x1 (create events.lhe) *** @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169074211734] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1953s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1883s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0070s for 8192 events => throughput is 1.17E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1883s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1817s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0066s for 8192 events => throughput is 1.24E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919915927155E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4433s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3677s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0757s for 90112 events => throughput is 1.19E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4177s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3452s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0725s for 90112 events => throughput is 1.24E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.177056e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.192297e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.187091e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.206668e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169074211728] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1964s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1922s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0042s for 8192 events => throughput is 1.95E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1831s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1792s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0039s for 8192 events => throughput is 2.12E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919915927155E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4148s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3686s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0462s for 90112 events => throughput is 1.95E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3847s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3426s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0421s for 90112 events => throughput is 2.14E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.000126e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.077610e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.127276e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.127798e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1855s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1824s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.62E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1845s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1815s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.72E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3807s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3465s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0342s for 90112 events => throughput is 2.63E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3749s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3413s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0336s for 90112 events => throughput is 2.68E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.610232e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.567900e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.645393e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.786544e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1900s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1870s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.68E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1822s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1793s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.82E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3808s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3484s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0325s for 90112 events => throughput is 2.78E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3728s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3415s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0313s for 90112 events => throughput is 2.88E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.740865e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.787216e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.846547e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.802177e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1860s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1826s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.46E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1827s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1792s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.33E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3896s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3515s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0381s for 90112 events => throughput is 2.36E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3808s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3437s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0371s for 90112 events => throughput is 2.43E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.142678e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.306669e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.406082e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.302969e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169066587257] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.5998s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5993s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.60E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5952s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5947s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.67E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919911173610E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.7721s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7672s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0049s for 90112 events => throughput is 1.85E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7615s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7567s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0048s for 90112 events => throughput is 1.88E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.181977e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.094813e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.926668e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.912678e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.726329e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.000800e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.399920e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.334730e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.694690e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.018486e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.877527e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.914438e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.708142e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.024074e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.118945e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.129214e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 1c30dae812..824a8e25d5 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -2,8 +2,8 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' 
-make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-03_19:53:04 +DATE: 2023-11-09_18:26:59 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 420 events (found 1577 events) - [COUNTERS] PROGRAM TOTAL : 0.3686s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3264s - [COUNTERS] Fortran MEs ( 1 ) : 0.0422s for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3548s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3140s + [COUNTERS] Fortran MEs ( 1 ) : 0.0408s for 8192 events => throughput is 2.01E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3158s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2736s - [COUNTERS] Fortran MEs ( 1 ) : 0.0422s for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3094s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2683s + [COUNTERS] Fortran MEs ( 1 ) : 0.0411s for 8192 events => throughput is 1.99E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775365] fbridge_mode=0 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.6988s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2402s - [COUNTERS] Fortran MEs ( 1 ) : 0.4586s for 90112 events => 
throughput is 1.97E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6956s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2429s + [COUNTERS] Fortran MEs ( 1 ) : 0.4528s for 90112 events => throughput is 1.99E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600102] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3516s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3139s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0377s for 8192 events => throughput is 2.17E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3445s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3078s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0367s for 8192 events => throughput is 2.23E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775372] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7148s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2977s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4171s for 90112 events => throughput is 2.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6787s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2659s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4128s for 90112 events => throughput is 2.18E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.143201e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.206364e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.178576e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.211188e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600102] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3233s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3012s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0221s for 8192 events => throughput is 3.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3133s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2921s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0211s for 8192 events => throughput is 3.88E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775379] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.5228s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2789s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2439s for 90112 events => throughput is 3.69E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4919s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2565s + [COUNTERS] 
CudaCpp MEs ( 2 ) : 0.2354s for 90112 events => throughput is 3.83E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.615837e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.806213e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.718240e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.795645e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3023s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2891s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0132s for 8192 events => throughput is 6.21E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2981s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2850s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0131s for 8192 events => throughput is 6.23E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775386] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4266s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2769s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1496s for 90112 events => throughput is 6.02E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3832s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2385s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1448s for 90112 events => throughput is 6.22E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.870487e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.053490e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.072305e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.106690e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3006s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2887s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0119s for 8192 events => throughput is 6.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2943s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2825s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0117s for 8192 events => throughput is 6.97E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775386] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4026s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2691s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1335s for 90112 events => throughput is 6.75E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3653s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2365s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1287s for 90112 events => throughput is 7.00E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.610205e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.704382e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.622254e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.799597e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3191s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2997s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0194s for 8192 events => throughput is 4.23E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3082s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2885s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0198s for 8192 events => throughput is 4.15E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775386] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.5054s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2826s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2228s for 90112 events => throughput is 4.04E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6624s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4291s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2333s for 90112 events => throughput is 3.86E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.911690e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.938387e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.045481e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.929754e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.7037s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7032s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.43E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6969s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6963s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.46E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775386] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.6871s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6805s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0066s for 90112 events => throughput is 1.37E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.6570s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6507s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0063s for 90112 events => throughput is 1.43E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.043596e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.071187e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.671088e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.692368e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.005777e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.183000e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.074802e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.074203e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.019573e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.195387e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.147636e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.150737e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.014036e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.203236e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.011683e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.040065e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 7edcebceb9..6ff403b879 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,12 +1,12 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' 
- make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 +make USEBUILDDIR=1 AVX=sse4 + make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z @@ -17,13 +17,13 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-03_19:53:30 +DATE: 2023-11-09_18:27:25 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 420 events (found 1577 events) - [COUNTERS] PROGRAM TOTAL : 0.3667s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3245s - [COUNTERS] Fortran MEs ( 1 ) : 0.0422s for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3494s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3093s + [COUNTERS] Fortran MEs ( 1 ) : 0.0401s for 8192 events => throughput is 2.04E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3249s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2811s - [COUNTERS] Fortran MEs ( 1 ) : 0.0438s for 8192 events => throughput is 1.87E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3068s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2663s + [COUNTERS] Fortran MEs ( 1 ) : 0.0405s for 8192 events => throughput is 2.02E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775365] fbridge_mode=0 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7464s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2748s - [COUNTERS] Fortran MEs ( 1 ) : 0.4716s for 90112 events => throughput is 1.91E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6536s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2070s + [COUNTERS] Fortran MEs ( 1 ) : 0.4466s for 90112 events => throughput is 2.02E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create 
events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690706767555099] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3467s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3115s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0352s for 8192 events => throughput is 2.33E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3397s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3049s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0348s for 8192 events => throughput is 2.35E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782605295497] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.6806s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2908s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3898s for 90112 events => throughput is 2.31E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6398s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2589s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3809s for 90112 events => throughput is 2.37E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.279168e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.342865e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.299428e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.331036e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690702885183541] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3091s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2943s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0148s for 8192 events => throughput is 5.53E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2992s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2845s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0147s for 8192 events => throughput is 5.59E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223778858016772] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4417s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2764s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1652s for 90112 events => throughput is 5.45E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4772s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3090s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1682s for 90112 events => throughput is 5.36E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.234141e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.225442e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.323283e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.299428e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690694374060818] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2903s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2825s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0077s for 8192 events => throughput is 1.06E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3093s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3001s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0092s for 8192 events => throughput is 8.88E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223775951815753] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3699s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2805s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0894s for 90112 events => throughput is 1.01E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.3166s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2317s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0848s for 90112 events => throughput is 1.06E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.010480e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.025673e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.003913e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.017812e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690694374060818] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2913s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2842s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0071s for 8192 events => throughput is 1.15E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2858s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2785s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0073s for 8192 events => throughput is 1.12E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223775951815753] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3439s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2627s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0812s for 90112 events => throughput is 1.11E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.3072s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2282s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0790s for 90112 events => throughput is 1.14E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.090791e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.097760e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.092579e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.119253e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690698914467276] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2963s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2859s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0104s for 8192 events => throughput is 7.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2907s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2807s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0099s for 8192 events => throughput is 8.25E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223780273983500] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3867s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2714s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1153s for 90112 events => throughput is 7.81E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3509s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2397s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1112s for 90112 events => throughput is 8.10E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.366599e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.884299e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.487198e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.701504e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690703397697980] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.7024s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7018s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.52E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6960s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6955s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.51E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223786763175951] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.6918s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6861s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0056s for 90112 events => throughput is 1.60E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.6624s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6571s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 90112 events => throughput is 1.68E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.243778e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.111635e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.844714e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.880409e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.837802e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.143607e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.769339e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.762374e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.775138e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.140173e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.863954e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.866583e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.397746e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.685718e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.449606e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.400545e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index 30dac17633..9b02995ca5 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -3,9 +3,9 @@ CUDACPP_BUILDDIR='.' 
make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,13 +15,13 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-03_19:53:55 +DATE: 2023-11-09_18:27:49 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 420 events (found 1577 events) - [COUNTERS] PROGRAM TOTAL : 0.3561s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3153s - [COUNTERS] Fortran MEs ( 1 ) : 0.0408s for 8192 events => throughput is 2.01E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3509s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3105s + [COUNTERS] Fortran MEs ( 1 ) : 0.0405s for 8192 events => throughput is 2.02E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3103s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2697s - [COUNTERS] Fortran MEs ( 1 ) : 0.0406s for 8192 events => throughput is 2.02E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3067s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2662s + [COUNTERS] Fortran MEs ( 1 ) : 0.0405s for 8192 events => throughput is 2.02E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775365] fbridge_mode=0 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.6795s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2275s - [COUNTERS] Fortran MEs ( 1 ) : 0.4521s for 90112 events => throughput is 1.99E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6580s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2117s + [COUNTERS] Fortran MEs ( 1 ) : 0.4462s for 90112 events => throughput is 2.02E+05 events/s *** 
(2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690709601032026] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3522s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3138s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0384s for 8192 events => throughput is 2.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3460s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3081s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0379s for 8192 events => throughput is 2.16E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223783635280988] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7156s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2968s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4188s for 90112 events => throughput is 2.15E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6700s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2581s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4119s for 90112 events => throughput is 2.19E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.113023e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.182152e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.146418e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.183502e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690709601032026] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3199s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2989s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0210s for 8192 events => throughput is 3.90E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3147s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2941s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0207s for 8192 events => throughput is 3.96E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223783635280988] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.5211s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2871s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2339s for 90112 events => throughput is 3.85E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4759s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2478s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2281s for 90112 events => throughput is 3.95E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.687467e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.820026e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.724259e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.775419e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690709681138244] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3043s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2913s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0129s for 8192 events => throughput is 6.33E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2965s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2837s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0128s for 8192 events => throughput is 6.38E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223783652032040] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4192s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2746s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1446s for 90112 events => throughput is 6.23E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3920s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2488s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1432s for 90112 events => throughput is 6.29E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.051901e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.159361e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.195854e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.220899e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690709681138244] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2995s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2875s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0120s for 8192 events => throughput is 6.80E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3061s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2938s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0123s for 8192 events => throughput is 6.65E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223783652032040] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3959s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2664s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1296s for 90112 events => throughput is 6.96E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3693s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2423s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1270s for 90112 events => throughput is 7.10E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.842430e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.912537e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.007264e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.069074e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690709681138244] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3146s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2960s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0186s for 8192 events => throughput is 4.40E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3327s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3109s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0218s for 8192 events => throughput is 3.75E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223783652032040] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.5378s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3142s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2236s for 90112 events => throughput is 4.03E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4629s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2551s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2078s for 90112 events => throughput is 4.34E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.894022e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.077933e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.946552e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.997576e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708266690699] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.7067s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7061s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.46E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6985s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6979s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.40E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782303744791] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.6929s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6862s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0067s for 90112 events => throughput is 1.34E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.6617s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6553s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 90112 events => throughput is 1.42E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.049753e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.060435e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.613651e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.608769e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.019403e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.186491e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.060699e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.059369e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.995962e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.182441e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.142982e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.136921e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.026315e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.174632e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.022885e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.949461e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index d992721ecf..241597d591 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -6,8 +6,8 @@ make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=avx2 +make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' @@ -16,7 +16,6 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' 
CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' @@ -24,6 +23,7 @@ make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-03_19:54:21 +DATE: 2023-11-09_18:28:15 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 42 events (found 469 events) - [COUNTERS] PROGRAM TOTAL : 0.5463s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2264s - [COUNTERS] Fortran MEs ( 1 ) : 0.3199s for 8192 events => throughput is 2.56E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5556s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2379s + [COUNTERS] Fortran MEs ( 1 ) : 0.3178s for 8192 events => throughput is 2.58E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.5423s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2222s - [COUNTERS] Fortran MEs ( 1 ) : 0.3201s for 8192 events => throughput is 2.56E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5351s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2203s + [COUNTERS] Fortran MEs ( 1 ) : 0.3148s for 8192 events => throughput is 2.60E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=0 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 4.9241s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4090s - [COUNTERS] Fortran MEs ( 1 ) : 3.5151s for 90112 events => throughput is 2.56E+04 events/s + [COUNTERS] PROGRAM TOTAL : 4.8579s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3886s + [COUNTERS] Fortran MEs ( 1 ) : 3.4692s for 90112 events => throughput is 2.60E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470791E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.8783s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5509s - [COUNTERS] CudaCpp MEs ( 2 ) : 
0.3274s for 8192 events => throughput is 2.50E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.8596s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5355s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3241s for 8192 events => throughput is 2.53E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872077655597E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 5.3304s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7125s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.6180s for 90112 events => throughput is 2.49E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.2563s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6842s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.5721s for 90112 events => throughput is 2.52E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.563855e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.570949e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.539633e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.596498e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470777E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.5609s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3903s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1705s for 8192 events => throughput is 4.80E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5542s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3858s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1684s for 8192 events => throughput is 4.87E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 3.4794s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5811s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.8984s for 90112 events => throughput is 4.75E+04 events/s + [COUNTERS] PROGRAM TOTAL : 3.5019s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5803s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.9216s for 90112 events => throughput is 4.69E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.820475e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.985717e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.874297e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.959096e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470750E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3928s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3073s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0854s for 8192 events => throughput is 9.59E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.3840s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3011s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0829s for 8192 events => throughput is 9.88E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872077655541E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.4294s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4857s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9437s for 90112 events => throughput is 9.55E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.3753s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4512s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9241s for 90112 events => throughput is 9.75E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.717012e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.005162e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.756457e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.000723e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470750E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3746s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2982s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0765s for 8192 events => throughput is 1.07E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3672s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2918s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0754s for 8192 events => throughput is 1.09E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872077655541E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.3655s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5058s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8597s for 90112 events => throughput is 1.05E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2690s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4430s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8260s for 90112 events => throughput is 1.09E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.094100e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.111268e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.081248e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.117996e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470750E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.4370s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3297s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1073s for 8192 events => throughput is 7.64E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4279s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3235s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1044s for 8192 events => throughput is 7.85E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872077655541E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.6869s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5079s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1790s for 90112 events => throughput is 7.64E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.6406s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4855s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1551s for 90112 events => throughput is 7.80E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.730653e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.832306e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.578143e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.896180e+04 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.6799s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6745s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0053s for 8192 events => throughput is 1.53E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6558s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6503s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0055s for 8192 events => throughput is 1.50E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872077655597E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.8667s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8438s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0229s for 90112 events => throughput is 3.94E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.8300s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8072s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0228s for 90112 events => throughput is 3.95E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.611230e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.613028e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.333105e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.229609e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.644038e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.871226e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.240451e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.236452e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.653799e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.869896e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.251657e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.247810e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.651458e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.851703e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.754830e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.745705e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index a339973536..9b1af7b411 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -2,10 +2,10 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' 
- make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 + +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,11 +15,11 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-03_19:55:03 +DATE: 2023-11-09_18:28:56 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 42 events (found 469 events) - [COUNTERS] PROGRAM TOTAL : 0.5498s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2259s - [COUNTERS] Fortran MEs ( 1 ) : 0.3239s for 8192 events => throughput is 2.53E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5377s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2218s + [COUNTERS] Fortran MEs ( 1 ) : 0.3159s for 8192 events => throughput is 2.59E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.5475s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2238s - [COUNTERS] Fortran MEs ( 1 ) : 0.3236s for 8192 events => throughput is 2.53E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5364s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2203s + [COUNTERS] Fortran MEs ( 1 ) : 0.3161s for 8192 events => throughput is 2.59E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=0 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 4.9843s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4284s - [COUNTERS] Fortran MEs ( 1 ) : 3.5559s for 90112 events => throughput is 2.53E+04 events/s + [COUNTERS] PROGRAM TOTAL : 4.9162s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3985s + [COUNTERS] Fortran MEs ( 1 ) : 3.5176s for 90112 events => throughput is 2.56E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' 
./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196349765248158E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.8606s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5403s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3204s for 8192 events => throughput is 2.56E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.8412s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5250s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3162s for 8192 events => throughput is 2.59E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310860767768514E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 5.2449s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7120s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.5329s for 90112 events => throughput is 2.55E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.1769s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6882s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.4887s for 90112 events => throughput is 2.58E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.612374e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.661457e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.564881e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.666467e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196334183509370E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.4339s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3327s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1012s for 8192 events => throughput is 8.10E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4080s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3132s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0948s for 8192 events => throughput is 8.64E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310847547651041E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.5445s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4937s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0508s for 90112 events => throughput is 8.58E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.5043s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4696s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0347s for 90112 events => throughput is 8.71E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.676181e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.800531e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.776153e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.815957e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196330801117323E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3149s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2698s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0450s for 8192 events => throughput is 1.82E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3077s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2643s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0434s for 8192 events => throughput is 1.89E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310847326088065E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.9260s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4419s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4841s for 90112 events => throughput is 1.86E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8998s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4206s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4793s for 90112 events => throughput is 1.88E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.865505e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.823286e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.837629e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.826868e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196330801117323E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3024s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2625s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0399s for 8192 events => throughput is 2.05E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2983s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2592s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0391s for 8192 events => throughput is 2.10E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310847326088065E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.8768s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4395s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4373s for 90112 events => throughput is 2.06E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8383s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4080s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4303s for 90112 events => throughput is 2.09E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.065719e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.101947e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.103855e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.126133e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196344079460428E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3291s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2768s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0523s for 8192 events => throughput is 1.57E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3220s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2717s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0503s for 8192 events => throughput is 1.63E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310857804286998E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.0319s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4573s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5745s for 90112 events => throughput is 1.57E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9888s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4251s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5637s for 90112 events => throughput is 1.60E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.561181e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.589248e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.560141e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.587181e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196349366365994E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.6502s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6494s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0009s for 8192 events => throughput is 9.50E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6498s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6490s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 9.66E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310864949473968E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.8485s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8390s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0096s for 90112 events => throughput is 9.41E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.8143s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8048s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0095s for 90112 events => throughput is 9.51E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.292780e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.303788e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.862148e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.857184e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.637111e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.727610e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.443658e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.358085e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.653596e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.712514e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.515346e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.447022e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.504423e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.573590e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.620516e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.621450e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index 0d971ecde6..e102a98f20 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -4,8 +4,8 @@ CUDACPP_BUILDDIR='.' 
make USEBUILDDIR=1 AVX=none - make USEBUILDDIR=1 AVX=sse4 + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-03_19:55:40 +DATE: 2023-11-09_18:29:33 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 42 events (found 469 events) - [COUNTERS] PROGRAM TOTAL : 0.5559s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2317s - [COUNTERS] Fortran MEs ( 1 ) : 0.3242s for 8192 events => throughput is 2.53E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5406s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2214s + [COUNTERS] Fortran MEs ( 1 ) : 0.3192s for 8192 events => throughput is 2.57E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.5470s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2235s - [COUNTERS] Fortran MEs ( 1 ) : 0.3235s for 8192 events => throughput is 2.53E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5369s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2199s + [COUNTERS] Fortran MEs ( 1 ) : 0.3170s for 8192 events => throughput is 2.58E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=0 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 4.9714s - [COUNTERS] Fortran Overhead ( 0 ) : 
1.4219s - [COUNTERS] Fortran MEs ( 1 ) : 3.5496s for 90112 events => throughput is 2.54E+04 events/s + [COUNTERS] PROGRAM TOTAL : 4.8531s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3845s + [COUNTERS] Fortran MEs ( 1 ) : 3.4687s for 90112 events => throughput is 2.60E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196358763382007E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.8877s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5532s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3345s for 8192 events => throughput is 2.45E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.8764s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5433s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3331s for 8192 events => throughput is 2.46E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872835011053E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 5.5218s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7614s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.7604s for 90112 events => throughput is 2.40E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.3597s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7144s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.6453s for 90112 events => throughput is 2.47E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.427313e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.553245e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.496439e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.536593e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196358804670396E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.5567s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3892s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1675s for 8192 events => throughput is 4.89E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5484s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3827s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1657s for 8192 events => throughput is 4.94E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872836789727E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 3.4587s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5767s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.8820s for 90112 events => throughput is 4.79E+04 events/s + 
[COUNTERS] PROGRAM TOTAL : 3.3712s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5426s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.8286s for 90112 events => throughput is 4.93E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.968795e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.047917e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.959892e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.047714e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196358586501358E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3947s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3085s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0862s for 8192 events => throughput is 9.51E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.3884s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3047s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0838s for 8192 events => throughput is 9.78E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872708918333E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.4507s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4993s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9513s for 90112 events => throughput is 9.47E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.3827s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4554s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9273s for 90112 events => throughput is 9.72E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.685236e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.985245e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.962312e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.974556e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196358586501358E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3728s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2978s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0750s for 8192 events => throughput is 1.09E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3689s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2951s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0738s for 8192 events => throughput is 1.11E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872708918333E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.2949s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4713s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8237s for 90112 events => throughput is 1.09E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2643s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4493s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8151s for 90112 events => throughput is 1.11E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.124486e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.067840e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.126890e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.069793e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196358757578441E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.4452s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3335s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1117s for 8192 events => throughput is 7.34E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4597s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3406s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1191s for 8192 events => throughput is 6.88E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872803699391E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.7230s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5103s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.2127s for 90112 events => throughput is 7.43E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.7582s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5206s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.2377s for 90112 events => throughput is 7.28E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.441126e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.675272e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.419166e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.626790e+04 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196358102981245E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.6594s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6540s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 8192 events => throughput is 1.51E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6588s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6533s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0055s for 8192 events => throughput is 1.50E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872068634174E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.8526s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8298s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0228s for 90112 events => throughput is 3.94E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.8293s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8065s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0228s for 90112 events => throughput is 3.95E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.626262e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.635720e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.888012e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.120274e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.627419e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.835173e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.234131e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.231986e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.606969e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.818919e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.246896e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.242590e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.626608e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.805414e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.728520e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.724480e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index ba8c60f62e..408d8d380a 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' 
make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-03_19:56:23 +DATE: 2023-11-09_18:30:15 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352988E-004] fbridge_mode=0 [UNWEIGHT] Wrote 48 events (found 439 events) - [COUNTERS] PROGRAM TOTAL : 4.4568s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2815s - [COUNTERS] Fortran MEs ( 1 ) : 4.1753s for 8192 events => throughput is 1.96E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.3928s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2780s + [COUNTERS] Fortran MEs ( 1 ) : 4.1147s for 8192 events => throughput is 1.99E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352988E-004] fbridge_mode=0 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 4.5175s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2787s - [COUNTERS] Fortran MEs ( 1 ) : 4.2387s for 8192 events => throughput is 1.93E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.3846s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2715s + [COUNTERS] Fortran MEs ( 1 ) : 4.1131s for 8192 events => throughput is 1.99E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421164E-004] fbridge_mode=0 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 48.0120s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9235s - [COUNTERS] Fortran MEs ( 1 ) : 46.0885s for 90112 events => throughput is 1.96E+03 events/s + [COUNTERS] PROGRAM TOTAL : 47.4210s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8888s + [COUNTERS] Fortran MEs ( 1 ) : 45.5321s for 90112 events => throughput is 1.98E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352998E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 8.7799s - [COUNTERS] Fortran Overhead ( 0 ) : 4.4663s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.3136s for 8192 events => throughput 
is 1.90E+03 events/s + [COUNTERS] PROGRAM TOTAL : 8.6565s + [COUNTERS] Fortran Overhead ( 0 ) : 4.4044s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.2521s for 8192 events => throughput is 1.93E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421161E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 53.8857s - [COUNTERS] Fortran Overhead ( 0 ) : 6.1301s - [COUNTERS] CudaCpp MEs ( 2 ) : 47.7557s for 90112 events => throughput is 1.89E+03 events/s + [COUNTERS] PROGRAM TOTAL : 52.9600s + [COUNTERS] Fortran Overhead ( 0 ) : 6.0482s + [COUNTERS] CudaCpp MEs ( 2 ) : 46.9118s for 90112 events => throughput is 1.92E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.953970e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.992604e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.950653e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.989276e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352993E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 4.8228s - [COUNTERS] Fortran Overhead ( 0 ) : 2.5191s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.3037s for 8192 events => throughput is 3.56E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7458s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4719s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.2739s for 8192 events => throughput is 3.60E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 29.7001s - [COUNTERS] Fortran Overhead ( 0 ) : 4.1956s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.5045s for 90112 events => throughput is 3.53E+03 events/s + [COUNTERS] PROGRAM TOTAL : 29.7086s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1354s + [COUNTERS] CudaCpp MEs ( 2 ) : 25.5732s for 90112 events => throughput is 3.52E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.686347e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.697279e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.681541e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.704506e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311353009E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.2608s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2531s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0077s for 8192 events => throughput is 8.13E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.2161s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2329s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9832s for 8192 events => throughput is 8.33E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 13.8799s - [COUNTERS] Fortran Overhead ( 0 ) : 2.8850s - [COUNTERS] CudaCpp MEs ( 2 ) : 10.9950s for 90112 events => throughput is 8.20E+03 events/s + [COUNTERS] PROGRAM TOTAL : 13.6675s + [COUNTERS] Fortran Overhead ( 0 ) : 2.8433s + [COUNTERS] CudaCpp MEs ( 2 ) : 10.8242s for 90112 events => throughput is 8.33E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.425637e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.632389e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.448586e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.597678e+03 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311353009E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.0082s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1311s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8771s for 8192 events => throughput is 9.34E+03 events/s + [COUNTERS] PROGRAM TOTAL : 1.9728s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1137s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8591s for 8192 events => throughput is 9.54E+03 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 12.4208s - [COUNTERS] Fortran Overhead ( 0 ) : 2.7744s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.6464s for 90112 events => throughput is 9.34E+03 events/s + [COUNTERS] PROGRAM TOTAL : 12.1508s + [COUNTERS] Fortran Overhead ( 0 ) : 2.7103s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.4405s for 90112 events => throughput is 9.55E+03 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.625406e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.863291e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.599473e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.840135e+03 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311353009E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.4768s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3764s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1005s for 8192 events => throughput is 7.44E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.5697s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4500s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1197s for 8192 events => throughput is 7.32E+03 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 15.3944s - [COUNTERS] Fortran Overhead ( 0 ) : 3.0207s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.3737s for 90112 events => throughput is 7.28E+03 events/s + [COUNTERS] PROGRAM TOTAL : 15.0835s + [COUNTERS] Fortran Overhead ( 0 ) : 3.0464s + [COUNTERS] CudaCpp MEs ( 2 ) : 12.0370s for 90112 events => throughput is 7.49E+03 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.487218e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.677485e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.501573e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.683279e+03 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352998E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 0.8150s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7821s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0330s for 8192 events => throughput is 2.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8101s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7773s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0327s for 8192 events => throughput is 2.50E+05 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421161E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 2.7813s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4228s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3586s for 90112 events => throughput is 2.51E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.7514s + [COUNTERS] Fortran Overhead ( 0 ) : 2.3963s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3551s for 90112 events => throughput is 2.54E+05 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.281506e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.285714e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.519229e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.505353e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.106281e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.109677e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.149081e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.147684e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.098811e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.113597e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.169654e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.164951e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.104970e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.106343e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.438070e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.432331e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index 2c58d8399d..f4a809f68b 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -2,10 +2,10 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' 
+ make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 - make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,11 +15,11 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-03_20:00:40 +DATE: 2023-11-09_18:34:30 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352988E-004] fbridge_mode=0 [UNWEIGHT] Wrote 48 events (found 439 events) - [COUNTERS] PROGRAM TOTAL : 4.4730s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2806s - [COUNTERS] Fortran MEs ( 1 ) : 4.1924s for 8192 events => throughput is 1.95E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.3944s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2768s + [COUNTERS] Fortran MEs ( 1 ) : 4.1176s for 8192 events => throughput is 1.99E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352988E-004] fbridge_mode=0 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 4.4924s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2814s - [COUNTERS] Fortran MEs ( 1 ) : 4.2110s for 8192 events => throughput is 1.95E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.5146s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2747s + [COUNTERS] Fortran MEs ( 1 ) : 4.2399s for 8192 events => throughput is 1.93E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421164E-004] fbridge_mode=0 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 48.0870s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9193s - [COUNTERS] Fortran MEs ( 1 ) : 46.1676s for 90112 events => throughput is 1.95E+03 events/s + [COUNTERS] PROGRAM TOTAL : 47.3456s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8962s + [COUNTERS] Fortran MEs ( 1 ) : 45.4494s for 90112 events => throughput is 1.98E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' 
./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277396490802749E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 8.5167s - [COUNTERS] Fortran Overhead ( 0 ) : 4.3246s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.1920s for 8192 events => throughput is 1.95E+03 events/s + [COUNTERS] PROGRAM TOTAL : 8.3558s + [COUNTERS] Fortran Overhead ( 0 ) : 4.2546s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.1013s for 8192 events => throughput is 2.00E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803774602344628E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 52.0969s - [COUNTERS] Fortran Overhead ( 0 ) : 5.9741s - [COUNTERS] CudaCpp MEs ( 2 ) : 46.1228s for 90112 events => throughput is 1.95E+03 events/s + [COUNTERS] PROGRAM TOTAL : 51.2827s + [COUNTERS] Fortran Overhead ( 0 ) : 5.9515s + [COUNTERS] CudaCpp MEs ( 2 ) : 45.3313s for 90112 events => throughput is 1.99E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.036738e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.068073e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.035901e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.068719e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277389126121586E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.5366s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3964s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1402s for 8192 events => throughput is 7.18E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.4998s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3795s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1203s for 8192 events => throughput is 7.31E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803771887543366E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 15.6834s - [COUNTERS] Fortran Overhead ( 0 ) : 3.0490s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.6344s for 90112 events => throughput is 7.13E+03 events/s + [COUNTERS] PROGRAM TOTAL : 15.4928s + [COUNTERS] Fortran Overhead ( 0 ) : 3.0115s + [COUNTERS] CudaCpp MEs ( 2 ) : 12.4813s for 90112 events => throughput is 7.22E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.385848e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.470531e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.336063e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.461238e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277390198115864E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 1.2706s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7693s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5013s for 8192 events => throughput is 1.63E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.2540s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7572s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4968s for 8192 events => throughput is 1.65E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803774416711566E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 7.9611s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4052s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.5558s for 90112 events => throughput is 1.62E+04 events/s + [COUNTERS] PROGRAM TOTAL : 7.8987s + [COUNTERS] Fortran Overhead ( 0 ) : 2.3780s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.5207s for 90112 events => throughput is 1.63E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.671775e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.671559e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.674155e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.684139e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277390198115864E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 1.1460s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7047s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4413s for 8192 events => throughput is 1.86E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.1397s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7044s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4354s for 8192 events => throughput is 1.88E+04 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803774416711566E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 7.1917s - [COUNTERS] Fortran Overhead ( 0 ) : 2.3395s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.8523s for 90112 events => throughput is 1.86E+04 events/s + [COUNTERS] PROGRAM TOTAL : 7.1254s + [COUNTERS] Fortran Overhead ( 0 ) : 2.3176s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.8078s for 90112 events => throughput is 1.87E+04 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.912795e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.932083e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.909696e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.934934e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277396394633404E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 1.3662s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8206s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5456s for 8192 events => throughput is 1.50E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.3342s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8031s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5312s for 8192 events => throughput is 1.54E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803777741065333E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 8.4389s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4516s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.9874s for 90112 events => throughput is 1.51E+04 events/s + [COUNTERS] PROGRAM TOTAL : 8.3073s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4189s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.8884s for 90112 events => throughput is 1.53E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.534307e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.547676e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.484518e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.546957e+04 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277400478491260E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 0.7763s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7549s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0214s for 8192 events => throughput is 3.82E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.7736s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7522s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0215s for 8192 events => throughput is 3.81E+05 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803779990154892E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 2.6207s - [COUNTERS] Fortran Overhead ( 0 ) : 2.3864s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2342s for 90112 events => throughput is 3.85E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.5981s + [COUNTERS] Fortran Overhead ( 0 ) : 2.3628s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2353s for 90112 events => throughput is 3.83E+05 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.582914e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.602414e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.939400e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.925045e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.483584e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.484752e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.662803e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.656642e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.489429e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.490786e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.631443e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.725267e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.463590e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.471712e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.531910e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.530964e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index 7032d72896..9bed8b02d9 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -2,10 +2,10 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' 
-make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,17 +15,17 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-03_20:04:02 +DATE: 2023-11-09_18:37:49 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352988E-004] fbridge_mode=0 [UNWEIGHT] Wrote 48 events (found 439 events) - [COUNTERS] PROGRAM TOTAL : 4.4626s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2774s - [COUNTERS] Fortran MEs ( 1 ) : 4.1852s for 8192 events => throughput is 1.96E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.3681s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2752s + [COUNTERS] Fortran MEs ( 1 ) : 4.0929s for 8192 events => throughput is 2.00E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352988E-004] fbridge_mode=0 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 4.4427s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2777s - [COUNTERS] Fortran MEs ( 1 ) : 4.1649s for 8192 events => throughput is 1.97E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.3422s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2703s + [COUNTERS] Fortran MEs ( 1 ) : 4.0719s for 8192 events => throughput is 2.01E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 
[1.5803725748421164E-004] fbridge_mode=0 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 48.3675s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9183s - [COUNTERS] Fortran MEs ( 1 ) : 46.4493s for 90112 events => throughput is 1.94E+03 events/s + [COUNTERS] PROGRAM TOTAL : 47.1722s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8864s + [COUNTERS] Fortran MEs ( 1 ) : 45.2857s for 90112 events => throughput is 1.99E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277432965013E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 9.0356s - [COUNTERS] Fortran Overhead ( 0 ) : 4.6432s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.3924s for 8192 events => throughput is 1.87E+03 events/s + [COUNTERS] PROGRAM TOTAL : 8.6914s + [COUNTERS] Fortran Overhead ( 0 ) : 4.4356s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.2558s for 8192 events => throughput is 1.92E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725813026109E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 54.3841s - [COUNTERS] Fortran Overhead ( 0 ) : 6.2075s - [COUNTERS] CudaCpp MEs ( 2 ) : 48.1766s for 90112 events => throughput is 1.87E+03 events/s + [COUNTERS] PROGRAM TOTAL : 54.0099s + [COUNTERS] Fortran Overhead ( 0 ) : 6.0604s + [COUNTERS] CudaCpp MEs ( 2 ) : 47.9495s for 90112 events => throughput is 1.88E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.891623e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.955214e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.924168e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.962469e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277430934464E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 4.7893s - [COUNTERS] Fortran Overhead ( 0 ) : 2.5036s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.2857s for 8192 events => throughput is 3.58E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7696s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4653s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.3043s for 8192 events => throughput is 3.56E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725816246317E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 29.4680s - [COUNTERS] Fortran Overhead ( 0 ) : 4.1631s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.3048s for 90112 events => throughput is 3.56E+03 events/s + [COUNTERS] PROGRAM TOTAL : 28.7487s + [COUNTERS] Fortran Overhead ( 0 ) : 4.0795s + [COUNTERS] CudaCpp MEs ( 2 ) : 24.6692s for 90112 events => throughput is 3.65E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.703810e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.767280e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.713606e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.771152e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277419683297E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.2254s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2372s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9882s for 8192 events => throughput is 8.29E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.1933s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2225s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9708s for 8192 events => throughput is 8.44E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725810769321E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 13.9261s - [COUNTERS] Fortran Overhead ( 0 ) : 2.8936s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.0324s for 90112 events => throughput is 8.17E+03 events/s + [COUNTERS] PROGRAM TOTAL : 13.6387s + [COUNTERS] Fortran Overhead ( 0 ) : 2.8343s + [COUNTERS] CudaCpp MEs ( 2 ) : 10.8044s for 90112 events => throughput is 8.34E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.503062e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.765902e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.519397e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.708316e+03 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277419683297E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.0128s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1323s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8805s for 8192 events => throughput is 9.30E+03 events/s + [COUNTERS] PROGRAM TOTAL : 1.9610s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1075s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8536s for 8192 events => throughput is 9.60E+03 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725810769321E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 12.3871s - [COUNTERS] Fortran Overhead ( 0 ) : 2.7750s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.6121s for 90112 events => throughput is 9.37E+03 events/s + [COUNTERS] PROGRAM TOTAL : 12.2117s + [COUNTERS] Fortran Overhead ( 0 ) : 2.7290s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.4827s for 90112 events => throughput is 9.50E+03 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.683983e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.837213e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.679001e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.813722e+03 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277419683297E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.5013s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3879s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1134s for 8192 events => throughput is 7.36E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.4206s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3396s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0809s for 8192 events => throughput is 7.58E+03 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725810769321E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 15.3721s - [COUNTERS] Fortran Overhead ( 0 ) : 3.0357s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.3363s for 90112 events => throughput is 7.30E+03 events/s + [COUNTERS] PROGRAM TOTAL : 14.8540s + [COUNTERS] Fortran Overhead ( 0 ) : 2.9517s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.9023s for 90112 events => throughput is 7.57E+03 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.423059e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.664729e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.425324e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.661148e+03 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277293084707E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 0.8158s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7828s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0331s for 8192 events => throughput is 2.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8068s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7745s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0323s for 8192 events => throughput is 2.54E+05 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725738731039E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 2.7756s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4130s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3626s for 90112 events => throughput is 2.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.7640s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4053s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3587s for 90112 events => throughput is 2.51E+05 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.294705e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.297023e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.524485e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.536170e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.113307e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.107408e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.174133e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.153471e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.119833e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.118088e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.183136e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.176343e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.103258e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.120562e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.436179e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.436751e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index 568f545851..635bc8aab0 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -2,9 +2,9 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' 
+make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-03_20:09:47 +DATE: 2023-11-09_18:43:28 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg @@ -51,14 +51,552 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' -ERROR! ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' failed -d R # 5 > -0.0 -0.0 -0.0 0.4 0.4 -d R # 6 > -0.0 -0.0 -0.0 -0.0 0.4 -s min # 3> 0.0119716.0 29929.0 29929.0 0.0 -s min # 4> 0.0 0.0 29929.0 29929.0 0.0 -s min # 5> 0.0 0.0 0.0 0.0 0.0 -s min # 6> 0.0 0.0 0.0 0.0 0.0 -xqcutij # 3> 0.0 0.0 0.0 0.0 0.0 -xqcutij # 4> 0.0 0.0 0.0 0.0 0.0 -xqcutij # 5> 0.0 0.0 0.0 0.0 0.0 -xqcutij # 6> 0.0 0.0 0.0 0.0 0.0 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435802E-006] fbridge_mode=0 + [UNWEIGHT] Wrote 1 events (found 166 events) + [COUNTERS] PROGRAM TOTAL : 96.1979s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4594s + [COUNTERS] Fortran MEs ( 1 ) : 95.7384s for 8192 events => throughput is 8.56E+01 events/s + +*** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! 
Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435802E-006] fbridge_mode=0 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 96.1938s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4572s + [COUNTERS] Fortran MEs ( 1 ) : 95.7366s for 8192 events => throughput is 8.56E+01 events/s + +*** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813976E-007] fbridge_mode=0 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 1056.1191s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1851s + [COUNTERS] Fortran MEs ( 1 ) : 1051.9341s for 90112 events => throughput is 8.57E+01 events/s + +*** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435831E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 221.2522s + [COUNTERS] Fortran Overhead ( 0 ) : 101.5022s + [COUNTERS] CudaCpp MEs ( 2 ) : 119.7500s for 8192 events => throughput is 6.84E+01 events/s + +*** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693100945435831E-006) differ by less than 2E-14 (2.4424906541753444e-15) + +*** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-none) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813953E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 1409.0435s + [COUNTERS] Fortran Overhead ( 0 ) : 99.0565s + [COUNTERS] CudaCpp MEs ( 2 ) : 1309.9869s for 90112 events => throughput is 6.88E+01 events/s + +*** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436158813953E-007) differ by less than 2E-14 (1.1102230246251565e-15) + +*** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.535302e+01 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.232167e+01 ) sec^-1 + +*** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! 
Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435827E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 107.7463s + [COUNTERS] Fortran Overhead ( 0 ) : 49.5074s + [COUNTERS] CudaCpp MEs ( 2 ) : 58.2390s for 8192 events => throughput is 1.41E+02 events/s + +*** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693100945435827E-006) differ by less than 2E-14 (2.220446049250313e-15) + +*** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-sse4) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 695.6110s + [COUNTERS] Fortran Overhead ( 0 ) : 53.4125s + [COUNTERS] CudaCpp MEs ( 2 ) : 642.1984s for 90112 events => throughput is 1.40E+02 events/s + +*** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (8.881784197001252e-16) + +*** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.667754e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.672792e+02 ) sec^-1 + +*** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435829E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 50.7441s + [COUNTERS] Fortran Overhead ( 0 ) : 23.3520s + [COUNTERS] CudaCpp MEs ( 2 ) : 27.3921s for 8192 events => throughput is 2.99E+02 events/s + +*** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693100945435829E-006) differ by less than 2E-14 (2.4424906541753444e-15) + +*** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-avx2) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 331.0298s + [COUNTERS] Fortran Overhead ( 0 ) : 27.1582s + [COUNTERS] CudaCpp MEs ( 2 ) : 303.8716s for 90112 events => throughput is 2.97E+02 events/s + +*** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (8.881784197001252e-16) + +*** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.602735e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.607119e+02 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435829E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 44.2409s + [COUNTERS] Fortran Overhead ( 0 ) : 20.3557s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.8852s for 8192 events => throughput is 3.43E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693100945435829E-006) differ by less than 2E-14 (2.4424906541753444e-15) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! 
Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 289.3981s + [COUNTERS] Fortran Overhead ( 0 ) : 23.9732s + [COUNTERS] CudaCpp MEs ( 2 ) : 265.4249s for 90112 events => throughput is 3.40E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (8.881784197001252e-16) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.111160e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.141844e+02 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435829E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 45.6199s + [COUNTERS] Fortran Overhead ( 0 ) : 22.4059s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.2139s for 8192 events => throughput is 3.53E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! 
xsec from fortran (1.1693100945435802E-006) and cpp (1.1693100945435829E-006) differ by less than 2E-14 (2.4424906541753444e-15) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 283.6130s + [COUNTERS] Fortran Overhead ( 0 ) : 26.2046s + [COUNTERS] CudaCpp MEs ( 2 ) : 257.4085s for 90112 events => throughput is 3.50E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (8.881784197001252e-16) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.763228e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.741992e+02 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435838E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 4.1979s + [COUNTERS] Fortran Overhead ( 0 ) : 3.1190s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0789s for 8192 events => throughput is 7.59E+03 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693100945435838E-006) differ by less than 2E-14 (3.1086244689504383e-15) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 18.6565s + [COUNTERS] Fortran Overhead ( 0 ) : 6.7674s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.8891s for 90112 events => throughput is 7.58E+03 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (8.881784197001252e-16) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.527117e+03 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.256112e+03 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.240392e+03 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 512 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.568765e+03 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.279873e+03 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.441727e+03 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.268118e+03 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.240204e+03 ) sec^-1 + +TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index e844ee5b79..9a7b15ddba 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-03_20:09:50 +DATE: 2023-11-09_20:10:26 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg @@ -51,14 +51,552 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' -ERROR! ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' failed -d R # 5 > -0.0 -0.0 -0.0 0.4 0.4 -d R # 6 > -0.0 -0.0 -0.0 -0.0 0.4 -s min # 3> 0.0119716.0 29929.0 29929.0 0.0 -s min # 4> 0.0 0.0 29929.0 29929.0 0.0 -s min # 5> 0.0 0.0 0.0 0.0 0.0 -s min # 6> 0.0 0.0 0.0 0.0 0.0 -xqcutij # 3> 0.0 0.0 0.0 0.0 0.0 -xqcutij # 4> 0.0 0.0 0.0 0.0 0.0 -xqcutij # 5> 0.0 0.0 0.0 0.0 0.0 -xqcutij # 6> 0.0 0.0 0.0 0.0 0.0 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435802E-006] fbridge_mode=0 + [UNWEIGHT] Wrote 1 events (found 166 events) + [COUNTERS] PROGRAM TOTAL : 95.6517s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4537s + [COUNTERS] Fortran MEs ( 1 ) : 95.1980s for 8192 events => throughput is 8.61E+01 events/s + +*** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435802E-006] fbridge_mode=0 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 95.5775s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4538s + [COUNTERS] Fortran MEs ( 1 ) : 95.1237s for 8192 events => throughput is 8.61E+01 events/s + +*** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! 
Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813976E-007] fbridge_mode=0 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 1055.1274s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1731s + [COUNTERS] Fortran MEs ( 1 ) : 1050.9543s for 90112 events => throughput is 8.57E+01 events/s + +*** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1694768344939596E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 198.8691s + [COUNTERS] Fortran Overhead ( 0 ) : 90.2534s + [COUNTERS] CudaCpp MEs ( 2 ) : 108.6157s for 8192 events => throughput is 7.54E+01 events/s + +*** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1694768344939596E-006) differ by less than 4E-4 (0.00014259686216466783) + +*** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-none) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1361436150871156E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 1275.3669s + [COUNTERS] Fortran Overhead ( 0 ) : 93.9491s + [COUNTERS] CudaCpp MEs ( 2 ) : 1181.4178s for 90112 events => throughput is 7.63E+01 events/s + +*** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361436150871156E-007) differ by less than 4E-4 (0.00014045934987350073) + +*** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.083570e+01 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.167448e+01 ) sec^-1 + +*** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1694765850750953E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 49.8398s + [COUNTERS] Fortran Overhead ( 0 ) : 23.4099s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.4299s for 8192 events => throughput is 3.10E+02 events/s + +*** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1694765850750953E-006) differ by less than 4E-4 (0.00014238355787066226) + +*** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-sse4) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! 
Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1361430669586527E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 320.3836s + [COUNTERS] Fortran Overhead ( 0 ) : 26.9904s + [COUNTERS] CudaCpp MEs ( 2 ) : 293.3932s for 90112 events => throughput is 3.07E+02 events/s + +*** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361430669586527E-007) differ by less than 4E-4 (0.00014020271663550687) + +*** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.524011e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.562557e+02 ) sec^-1 + +*** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1694764951124567E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 25.3018s + [COUNTERS] Fortran Overhead ( 0 ) : 11.8221s + [COUNTERS] CudaCpp MEs ( 2 ) : 13.4798s for 8192 events => throughput is 6.08E+02 events/s + +*** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! 
xsec from fortran (1.1693100945435802E-006) and cpp (1.1694764951124567E-006) differ by less than 4E-4 (0.00014230662135994443) + +*** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-avx2) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1361430425531218E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 161.8530s + [COUNTERS] Fortran Overhead ( 0 ) : 15.4501s + [COUNTERS] CudaCpp MEs ( 2 ) : 146.4028s for 90112 events => throughput is 6.16E+02 events/s + +*** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361430425531218E-007) differ by less than 4E-4 (0.0001401912899885449) + +*** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.213869e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.163477e+02 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1694764951124567E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 22.2497s + [COUNTERS] Fortran Overhead ( 0 ) : 10.3581s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.8916s for 8192 events => throughput is 6.89E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1694764951124567E-006) differ by less than 4E-4 (0.00014230662135994443) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1361430425531218E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 144.5243s + [COUNTERS] Fortran Overhead ( 0 ) : 14.0601s + [COUNTERS] CudaCpp MEs ( 2 ) : 130.4642s for 90112 events => throughput is 6.91E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361430425531218E-007) differ by less than 4E-4 (0.0001401912899885449) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.261245e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.179572e+02 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! 
Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1694767957195604E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 22.8272s + [COUNTERS] Fortran Overhead ( 0 ) : 11.2607s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.5665s for 8192 events => throughput is 7.08E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1694767957195604E-006) differ by less than 4E-4 (0.00014256370209930758) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1361435956349820E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 143.3402s + [COUNTERS] Fortran Overhead ( 0 ) : 14.9961s + [COUNTERS] CudaCpp MEs ( 2 ) : 128.3441s for 90112 events => throughput is 7.02E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361435956349820E-007) differ by less than 4E-4 (0.00014045024240250115) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.537594e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.456699e+02 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1694770708195000E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 2.4571s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9676s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4895s for 8192 events => throughput is 1.67E+04 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1694770708195000E-006) differ by less than 4E-4 (0.00014279896898083955) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1361443477565659E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 11.0626s + [COUNTERS] Fortran Overhead ( 0 ) : 5.6077s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.4549s for 90112 events => throughput is 1.65E+04 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361443477565659E-007) differ by less than 4E-4 (0.0001408023850304474) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.640892e+04 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.619412e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.340657e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 512 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.426283e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.326049e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.360046e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.341201e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.441486e+03 ) sec^-1 + +TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 43bf5072f2..e947131942 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt 
@@ -2,10 +2,10 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,17 +15,17 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-03_20:09:53 +DATE: 2023-11-09_21:16:08 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg @@ -51,14 +51,552 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' -ERROR! 
' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' failed -d R # 5 > -0.0 -0.0 -0.0 0.4 0.4 -d R # 6 > -0.0 -0.0 -0.0 -0.0 0.4 -s min # 3> 0.0119716.0 29929.0 29929.0 0.0 -s min # 4> 0.0 0.0 29929.0 29929.0 0.0 -s min # 5> 0.0 0.0 0.0 0.0 0.0 -s min # 6> 0.0 0.0 0.0 0.0 0.0 -xqcutij # 3> 0.0 0.0 0.0 0.0 0.0 -xqcutij # 4> 0.0 0.0 0.0 0.0 0.0 -xqcutij # 5> 0.0 0.0 0.0 0.0 0.0 -xqcutij # 6> 0.0 0.0 0.0 0.0 0.0 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435802E-006] fbridge_mode=0 + [UNWEIGHT] Wrote 1 events (found 166 events) + [COUNTERS] PROGRAM TOTAL : 95.6107s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4599s + [COUNTERS] Fortran MEs ( 1 ) : 95.1508s for 8192 events => throughput is 8.61E+01 events/s + +*** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435802E-006] fbridge_mode=0 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 95.5844s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4540s + [COUNTERS] Fortran MEs ( 1 ) : 95.1304s for 8192 events => throughput is 8.61E+01 events/s + +*** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813976E-007] fbridge_mode=0 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 1052.2893s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1570s + [COUNTERS] Fortran MEs ( 1 ) : 1048.1323s for 90112 events => throughput is 8.60E+01 events/s + +*** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693101016896846E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 223.0748s + [COUNTERS] Fortran Overhead ( 0 ) : 103.3973s + [COUNTERS] CudaCpp MEs ( 2 ) : 119.6775s for 8192 events => throughput is 6.85E+01 events/s + +*** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693101016896846E-006) differ by less than 2E-4 (6.111385175699979e-09) + +*** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-none) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436275882778E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 1425.4469s + [COUNTERS] Fortran Overhead ( 0 ) : 107.1167s + [COUNTERS] CudaCpp MEs ( 2 ) : 1318.3302s for 90112 events => throughput is 6.84E+01 events/s + +*** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436275882778E-007) differ by less than 2E-4 (5.48115042242614e-09) + +*** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.990567e+01 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.033316e+01 ) sec^-1 + +*** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693101020910778E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 112.1583s + [COUNTERS] Fortran Overhead ( 0 ) : 51.1368s + [COUNTERS] CudaCpp MEs ( 2 ) : 61.0216s for 8192 events => throughput is 1.34E+02 events/s + +*** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693101020910778E-006) differ by less than 2E-4 (6.454658807442115e-09) + +*** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-sse4) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! 
Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436284111598E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 719.1467s + [COUNTERS] Fortran Overhead ( 0 ) : 54.6964s + [COUNTERS] CudaCpp MEs ( 2 ) : 664.4503s for 90112 events => throughput is 1.36E+02 events/s + +*** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436284111598E-007) differ by less than 2E-4 (5.866422903011426e-09) + +*** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.625730e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.622146e+02 ) sec^-1 + +*** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693101021831071E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 48.7268s + [COUNTERS] Fortran Overhead ( 0 ) : 22.2016s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.5252s for 8192 events => throughput is 3.09E+02 events/s + +*** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! 
xsec from fortran (1.1693100945435802E-006) and cpp (1.1693101021831071E-006) differ by less than 2E-4 (6.5333627397023974e-09) + +*** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-avx2) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436281462142E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 312.7787s + [COUNTERS] Fortran Overhead ( 0 ) : 25.8939s + [COUNTERS] CudaCpp MEs ( 2 ) : 286.8848s for 90112 events => throughput is 3.14E+02 events/s + +*** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436281462142E-007) differ by less than 2E-4 (5.742375686068613e-09) + +*** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.761983e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.775859e+02 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693101021831071E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 42.1739s + [COUNTERS] Fortran Overhead ( 0 ) : 19.2356s + [COUNTERS] CudaCpp MEs ( 2 ) : 22.9383s for 8192 events => throughput is 3.57E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693101021831071E-006) differ by less than 2E-4 (6.5333627397023974e-09) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436281462142E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 277.3137s + [COUNTERS] Fortran Overhead ( 0 ) : 23.0478s + [COUNTERS] CudaCpp MEs ( 2 ) : 254.2659s for 90112 events => throughput is 3.54E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436281462142E-007) differ by less than 2E-4 (5.742375686068613e-09) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.346725e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.360141e+02 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! 
Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693101021831071E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 45.3760s + [COUNTERS] Fortran Overhead ( 0 ) : 21.9554s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.4206s for 8192 events => throughput is 3.50E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693101021831071E-006) differ by less than 2E-4 (6.5333627397023974e-09) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436281462142E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 283.3743s + [COUNTERS] Fortran Overhead ( 0 ) : 25.7277s + [COUNTERS] CudaCpp MEs ( 2 ) : 257.6465s for 90112 events => throughput is 3.50E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436281462142E-007) differ by less than 2E-4 (5.742375686068613e-09) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.787133e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.796022e+02 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693100942770687E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 3.5891s + [COUNTERS] Fortran Overhead ( 0 ) : 2.7218s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8672s for 8192 events => throughput is 9.45E+03 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693100942770687E-006) differ by less than 2E-4 (2.2792201459509442e-10) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436157495368E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 15.8181s + [COUNTERS] Fortran Overhead ( 0 ) : 6.3338s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.4843s for 90112 events => throughput is 9.50E+03 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436157495368E-007) differ by less than 2E-4 (6.173705990875078e-11) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.489325e+03 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.086868e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.112402e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 512 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.163573e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.112546e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.110187e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.113455e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.651684e+03 ) sec^-1 + +TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 2a2ae334de..17d6db3749 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -3,9 +3,9 @@ 
CUDACPP_BUILDDIR='.' make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=sse4 -make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-03_20:08:20 +DATE: 2023-11-09_18:42:02 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 78 events (found 561 events) - [COUNTERS] PROGRAM TOTAL : 0.3085s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2380s - [COUNTERS] Fortran MEs ( 1 ) : 0.0705s for 8192 events => throughput is 1.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3065s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2361s + [COUNTERS] Fortran MEs ( 1 ) : 0.0704s for 8192 events => throughput is 1.16E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3042s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2333s - [COUNTERS] Fortran MEs ( 1 ) : 0.0708s for 8192 events => throughput is 1.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2994s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2293s + [COUNTERS] Fortran MEs ( 1 ) : 0.0701s for 8192 events => throughput is 1.17E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615874] fbridge_mode=0 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.2114s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4363s - 
[COUNTERS] Fortran MEs ( 1 ) : 0.7751s for 90112 events => throughput is 1.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.1760s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4125s + [COUNTERS] Fortran MEs ( 1 ) : 0.7635s for 90112 events => throughput is 1.18E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3922s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3158s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0765s for 8192 events => throughput is 1.07E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3841s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3081s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0760s for 8192 events => throughput is 1.08E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615863] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.3858s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5438s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8420s for 90112 events => throughput is 1.07E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.3472s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5183s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8289s for 90112 events => throughput is 1.09E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.080426e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.089572e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.086485e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.081996e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703733] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3230s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2818s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0412s for 8192 events => throughput is 1.99E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3165s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2761s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0404s for 8192 events => throughput is 2.03E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615872] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.9553s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4982s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4571s for 90112 events => throughput is 1.97E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9307s + 
[COUNTERS] Fortran Overhead ( 0 ) : 1.4770s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4537s for 90112 events => throughput is 1.99E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.984398e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.997353e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.942353e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.027039e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703733] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2839s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2604s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0236s for 8192 events => throughput is 3.47E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2805s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2572s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0233s for 8192 events => throughput is 3.52E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615863] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7384s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4779s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2605s for 90112 events => throughput is 3.46E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7189s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4610s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2579s for 90112 events => throughput is 3.49E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.360936e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.495576e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.508331e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.465419e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703733] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2789s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2578s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0211s for 8192 events => throughput is 3.88E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2772s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2561s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0211s for 8192 events => throughput is 3.89E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615863] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7099s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4763s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2336s for 90112 events => throughput is 3.86E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6775s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4482s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2293s for 90112 events => throughput is 3.93E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.911581e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.760921e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.775740e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.978083e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703733] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3043s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2722s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0321s for 8192 events => throughput is 2.55E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2977s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2665s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0311s for 8192 events => throughput is 2.63E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615863] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.8369s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4881s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3488s for 90112 events => throughput is 2.58E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8099s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4686s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3413s for 90112 events => throughput is 2.64E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.489296e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.568787e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.512748e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.561174e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,8 +514,8 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703733] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.6694s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6687s + [COUNTERS] PROGRAM TOTAL : 0.6636s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6629s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.19E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615869] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.9131s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9051s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0080s for 90112 events => throughput is 1.13E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.8698s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8622s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0076s for 90112 events => throughput is 1.18E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.578046e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.555687e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.918680e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.006338e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.385541e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.515172e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.515910e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.526258e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.366310e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.533570e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.781318e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.783496e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.383694e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.532375e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.778819e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.774257e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 76ba714558..a15824491a 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -4,8 +4,8 @@ CUDACPP_BUILDDIR='.' 
make USEBUILDDIR=1 AVX=none - make USEBUILDDIR=1 AVX=sse4 + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,13 +15,13 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-03_20:08:49 +DATE: 2023-11-09_18:42:31 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 78 events (found 561 events) - [COUNTERS] PROGRAM TOTAL : 0.3082s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2374s - [COUNTERS] Fortran MEs ( 1 ) : 0.0709s for 8192 events => throughput is 1.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3036s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2341s + [COUNTERS] Fortran MEs ( 1 ) : 0.0695s for 8192 events => throughput is 1.18E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3089s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2376s - [COUNTERS] Fortran MEs ( 1 ) : 0.0713s for 8192 events => throughput is 1.15E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3003s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2303s + [COUNTERS] Fortran MEs ( 1 ) : 0.0699s for 8192 events => throughput is 1.17E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615874] fbridge_mode=0 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.2176s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4416s - [COUNTERS] Fortran MEs ( 1 ) : 0.7760s for 90112 events => throughput is 1.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2069s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4340s + [COUNTERS] Fortran MEs ( 1 
) : 0.7729s for 90112 events => throughput is 1.17E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050316058770007] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3831s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3106s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0725s for 8192 events => throughput is 1.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3749s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3033s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0716s for 8192 events => throughput is 1.14E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182797520666] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.3282s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5337s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7945s for 90112 events => throughput is 1.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2764s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4961s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7803s for 90112 events => throughput is 1.15E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.150985e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.160144e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.152855e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.172915e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050313133963987] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2893s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2630s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0263s for 8192 events => throughput is 3.11E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2845s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2592s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0253s for 8192 events => throughput is 3.23E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801179276862181] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7627s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4776s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2851s for 90112 events => throughput is 3.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7355s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4565s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2790s for 90112 events => throughput is 3.23E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN 
xsec *** @@ -258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.058447e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.194415e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.117460e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.097783e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050313344346482] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2617s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2490s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0127s for 8192 events => throughput is 6.47E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2583s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2455s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0128s for 8192 events => throughput is 6.41E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801179137376883] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.6264s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4861s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1403s for 90112 events => throughput is 6.42E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5855s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4486s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1369s for 90112 events => throughput is 6.58E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.320941e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.397086e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.304004e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.385448e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050313344346482] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2672s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2557s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0115s for 8192 events => throughput is 7.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2587s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2472s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0114s for 8192 events => throughput is 7.16E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801179137376883] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.6213s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4898s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1315s for 90112 events => throughput is 6.85E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5778s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4506s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1272s for 90112 events => throughput is 7.09E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.800881e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.864944e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.852775e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.826763e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050317064561834] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2707s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2550s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0157s for 8192 events => throughput is 5.22E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2685s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2527s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0158s for 8192 events => throughput is 5.17E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182143140752] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.6597s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4813s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1784s for 90112 events => throughput is 5.05E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6231s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4511s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1720s for 90112 events => throughput is 5.24E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.682841e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.932364e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.814031e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.764394e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050319131407651] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.6668s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6663s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.60E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6586s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6581s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.57E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801186038252196] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.9031s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8970s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0062s for 90112 events => throughput is 1.46E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.9395s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9332s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0063s for 90112 events => throughput is 1.43E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.810157e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.830948e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.442986e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.471030e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.776377e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.130497e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.714442e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.724199e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.784654e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.113825e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.791545e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.756435e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.353442e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.594258e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.984091e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.959495e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index d9f19e3972..3468beddc5 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -3,8 +3,8 @@ CUDACPP_BUILDDIR='.' 
- make USEBUILDDIR=1 AVX=none + make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-03_20:09:17 +DATE: 2023-11-09_18:42:59 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 78 events (found 561 events) - [COUNTERS] PROGRAM TOTAL : 0.3076s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2365s - [COUNTERS] Fortran MEs ( 1 ) : 0.0711s for 8192 events => throughput is 1.15E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3047s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2346s + [COUNTERS] Fortran MEs ( 1 ) : 0.0701s for 8192 events => throughput is 1.17E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3048s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2341s - [COUNTERS] Fortran MEs ( 1 ) : 0.0707s for 8192 events => throughput is 1.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3065s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2359s + [COUNTERS] Fortran MEs ( 1 ) : 0.0705s for 8192 events => throughput is 1.16E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615874] fbridge_mode=0 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.2173s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4390s - [COUNTERS] Fortran MEs ( 1 ) : 0.7783s for 90112 events => throughput is 1.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2175s + 
[COUNTERS] Fortran Overhead ( 0 ) : 1.4409s + [COUNTERS] Fortran MEs ( 1 ) : 0.7766s for 90112 events => throughput is 1.16E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333282657206] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3915s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3150s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0766s for 8192 events => throughput is 1.07E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3854s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3097s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0757s for 8192 events => throughput is 1.08E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182636608796] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.4080s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5555s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8525s for 90112 events => throughput is 1.06E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.3546s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5224s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8322s for 90112 events => throughput is 1.08E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.026153e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.083780e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.029864e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.087409e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,8 +210,8 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333282657201] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3183s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2784s + [COUNTERS] PROGRAM TOTAL : 0.3153s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2754s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0399s for 8192 events => throughput is 2.05E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182636608810] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.9529s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5041s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4488s for 90112 events => throughput is 2.01E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9817s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5338s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4479s for 90112 events => throughput is 2.01E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 
+258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.013366e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.021169e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.020889e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.048865e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333291481387] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2872s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2636s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0235s for 8192 events => throughput is 3.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2833s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2602s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0231s for 8192 events => throughput is 3.54E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182638680733] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7508s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4909s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2599s for 90112 events => throughput is 3.47E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7176s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4613s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2564s for 90112 events => throughput is 3.52E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.380135e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.495609e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.471740e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.519650e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333291481387] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2817s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2612s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0206s for 8192 events => throughput is 3.98E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2747s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2542s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0205s for 8192 events => throughput is 3.99E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182638680733] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7107s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4801s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2306s for 90112 events => throughput is 3.91E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6910s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4668s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2243s for 90112 events => throughput is 4.02E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.890792e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.857183e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.973788e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.991341e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333291481387] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3050s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2711s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0339s for 8192 events => throughput is 2.42E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3063s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2736s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0327s for 8192 events => throughput is 2.50E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182638680733] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.8573s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4973s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3600s for 90112 events => throughput is 2.50E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8379s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4792s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3586s for 90112 events => throughput is 2.51E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.438047e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.546786e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.395865e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.503592e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333301029693] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.6671s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6664s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.19E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6613s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6607s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.22E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182637219935] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.8923s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8845s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0077s for 90112 events => throughput is 1.16E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.8739s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8663s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0076s for 90112 events => throughput is 1.19E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.584492e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.582711e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.972938e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.041620e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.377134e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.534455e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.496287e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.524256e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.388325e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.513154e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.763560e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.797491e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.382255e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.528865e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.773123e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.779970e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index 4e0cc4f360..96be4f25ce 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:00:16 +DATE: 2023-11-09_17:36:35 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.995135e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.942022e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.073010e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.632744e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.846433e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.013402e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.649523 sec - 2,606,897,569 cycles # 2.955 GHz - 4,039,165,920 instructions # 1.55 insn per cycle - 0.938736477 seconds time elapsed +TOTAL : 0.666402 sec + 2,677,197,972 cycles # 3.012 GHz + 4,052,373,824 instructions # 1.51 insn per cycle + 0.957128261 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.116390e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.309346e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.309346e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.129159e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.324668e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.324668e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.039128 sec - 18,293,625,810 cycles # 3.027 GHz - 44,037,997,118 instructions # 2.41 insn per cycle - 6.044375342 seconds time elapsed +TOTAL : 5.970581 sec + 18,294,560,469 cycles # 3.063 GHz + 44,035,841,714 instructions # 2.41 insn per cycle + 5.975709847 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.650519e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.159299e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.159299e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.674808e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.201099e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.201099e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.212186 sec - 12,761,177,625 cycles # 3.027 GHz - 31,004,602,670 instructions # 2.43 insn per cycle - 4.217391637 seconds time elapsed +TOTAL : 4.151985 sec + 12,801,375,184 cycles # 3.080 GHz + 31,001,968,290 instructions # 2.42 insn per cycle + 4.157180427 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1643) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.065360e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.886676e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.886676e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.097286e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.929276e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.929276e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.440327 sec - 10,045,086,881 cycles # 2.916 GHz - 19,380,193,658 instructions # 1.93 insn per cycle - 3.445672409 seconds time elapsed +TOTAL : 3.388202 sec + 10,019,877,774 cycles # 2.954 GHz + 19,377,611,613 instructions # 1.93 insn per cycle + 3.393320382 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1965) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.092180e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.955480e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.955480e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.171888e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.054473e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.054473e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.409304 sec - 9,718,965,428 cycles # 2.848 GHz - 18,998,332,681 instructions # 1.95 insn per cycle - 3.414677998 seconds time elapsed +TOTAL : 3.283560 sec + 9,692,698,438 cycles # 2.948 GHz + 19,006,248,514 instructions # 1.96 insn per cycle + 3.288694745 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1689) (512y: 181) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.821062e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.417007e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.417007e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.836531e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.447502e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.447502e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.852694 sec - 8,598,148,642 cycles # 2.229 GHz - 15,740,848,417 instructions # 1.83 insn per cycle - 3.858015954 seconds time elapsed +TOTAL : 3.828285 sec + 8,619,412,035 cycles # 2.250 GHz + 15,739,302,747 instructions # 1.83 insn per cycle + 3.833534805 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 900) (512y: 154) (512z: 1258) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt index a2a2220e0b..46e9abca4a 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:34:09 +DATE: 2023-11-09_18:08:58 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.616160e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.542311e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.542311e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.786999e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.766835e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.766835e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.257075 sec - 7,500,299,564 cycles # 3.000 GHz - 13,128,281,558 instructions # 1.75 insn per cycle - 2.557069801 seconds time elapsed +TOTAL : 2.197852 sec + 7,407,513,320 cycles # 3.040 GHz + 13,213,549,787 instructions # 1.78 insn per cycle + 2.495471586 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost @@ -86,14 +86,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.074156e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.251964e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.251964e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.082808e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.262532e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.262532e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.457469 sec - 19,613,725,947 cycles # 3.035 GHz - 44,260,538,354 instructions # 2.26 insn per cycle - 6.464068851 seconds time elapsed +TOTAL : 6.417727 sec + 19,594,664,001 cycles # 3.052 GHz + 44,265,878,138 instructions # 2.26 insn per cycle + 6.424119903 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe @@ -114,14 +114,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.537992e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.980628e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.980628e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.589377e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.044221e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.044221e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.703362 sec - 14,014,545,412 cycles # 2.976 GHz - 31,843,317,256 instructions # 2.27 insn per cycle - 4.710044451 seconds time elapsed +TOTAL : 4.559857 sec + 14,005,526,343 cycles # 3.068 GHz + 31,844,006,198 instructions # 2.27 insn per cycle + 4.566322148 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1643) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -142,14 +142,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.930954e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.630364e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.630364e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.929770e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.628189e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.628189e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.870178 sec - 11,351,058,249 cycles # 2.929 GHz - 20,737,271,008 instructions # 1.83 insn per cycle - 3.876822605 seconds time elapsed +TOTAL : 3.878054 sec + 11,287,723,645 cycles # 2.906 GHz + 20,738,072,181 instructions # 1.84 insn per cycle + 3.884538371 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1965) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe @@ -170,14 +170,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.936889e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.651989e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.651989e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.014169e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.779352e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.779352e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.871998 sec - 11,000,759,855 cycles # 2.837 GHz - 20,365,657,381 instructions # 1.85 insn per cycle - 3.879015734 seconds time elapsed +TOTAL : 3.727856 sec + 11,041,223,612 cycles # 2.958 GHz + 20,355,670,345 instructions # 1.84 insn per cycle + 3.734291913 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1689) (512y: 181) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe @@ -198,14 +198,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.694377e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.207135e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.207135e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.744355e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.276403e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.276403e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.335020 sec - 9,935,731,633 cycles # 2.289 GHz - 16,882,918,411 instructions # 1.70 insn per cycle - 4.341683669 seconds time elapsed +TOTAL : 4.223001 sec + 9,961,082,180 cycles # 2.356 GHz + 16,884,642,255 instructions # 1.70 insn per cycle + 4.229415228 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 900) (512y: 154) (512z: 1258) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt index dedce3e2ef..06dd49c8ef 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:47:12 +DATE: 2023-11-09_18:21:50 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.493472e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.526211e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.980085e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.833760e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.622748e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.982780e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 1.335531 sec - 4,653,241,552 cycles # 2.971 GHz - 7,232,975,239 instructions # 1.55 insn per cycle - 1.623039981 seconds time elapsed +TOTAL : 1.311946 sec + 4,695,073,853 cycles # 3.035 GHz + 7,228,449,301 instructions # 1.54 insn per cycle + 1.606166442 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.100587e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.292616e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.292616e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.133856e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.330921e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.330921e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 6.487751 sec - 19,390,492,430 cycles # 2.987 GHz - 44,137,957,280 instructions # 2.28 insn per cycle - 6.493082825 seconds time elapsed +TOTAL : 6.295519 sec + 19,403,964,054 cycles # 3.081 GHz + 44,141,070,523 instructions # 2.27 insn per cycle + 6.300790833 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.649039e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.157189e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.157189e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.674176e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.191162e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.191162e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.573606 sec - 13,864,290,699 cycles # 3.029 GHz - 31,004,021,041 instructions # 2.24 insn per cycle - 4.579072706 seconds time elapsed +TOTAL : 4.504649 sec + 13,863,184,367 cycles # 3.075 GHz + 31,003,513,865 instructions # 2.24 insn per cycle + 4.509943224 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1643) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.050077e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.865714e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.865714e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.015608e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.805515e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.805515e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.825144 sec - 11,151,950,602 cycles # 2.912 GHz - 19,279,192,444 instructions # 1.73 insn per cycle - 3.830421553 seconds time elapsed +TOTAL : 3.880062 sec + 11,162,114,716 cycles # 2.882 GHz + 19,285,048,189 instructions # 1.73 insn per cycle + 3.885435669 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1965) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.125943e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.996151e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.996151e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.146900e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.045970e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.045970e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.721741 sec - 10,820,749,101 cycles # 2.904 GHz - 18,706,645,976 instructions # 1.73 insn per cycle - 3.727088912 seconds time elapsed +TOTAL : 3.683003 sec + 10,893,551,236 cycles # 2.955 GHz + 18,696,669,062 instructions # 1.72 insn per cycle + 3.688290519 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1689) (512y: 181) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.802766e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.399092e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.399092e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.858668e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.475829e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.475829e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.260983 sec - 9,758,383,682 cycles # 2.288 GHz - 15,439,422,037 instructions # 1.58 insn per cycle - 4.266311634 seconds time elapsed +TOTAL : 4.138576 sec + 9,729,969,286 cycles # 2.349 GHz + 15,438,316,077 instructions # 1.59 insn per cycle + 4.143776269 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 900) (512y: 154) (512z: 1258) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt index 753c8feb62..148fb0d2ee 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:43:56 +DATE: 2023-11-09_18:18:36 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.492551e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.537742e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.994776e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.853961e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.658990e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.049126e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.978991 sec - 3,581,699,122 cycles # 2.964 GHz - 7,061,755,742 instructions # 1.97 insn per cycle - 1.265379690 seconds time elapsed +TOTAL : 0.956476 sec + 3,586,792,512 cycles # 3.034 GHz + 7,163,432,319 instructions # 2.00 insn per cycle + 1.241060065 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.108457e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.301315e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.301315e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.134189e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.330626e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.330626e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.081290 sec - 18,339,334,415 cycles # 3.014 GHz - 44,033,842,254 instructions # 2.40 insn per cycle - 6.086519540 seconds time elapsed +TOTAL : 5.945995 sec + 18,306,649,766 cycles # 3.077 GHz + 44,036,304,039 instructions # 2.41 insn per cycle + 5.951221281 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.647910e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.158230e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.158230e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.656363e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.166761e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.166761e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.219825 sec - 12,790,482,904 cycles # 3.028 GHz - 31,000,190,511 instructions # 2.42 insn per cycle - 4.225042583 seconds time elapsed +TOTAL : 4.200416 sec + 12,751,192,820 cycles # 3.033 GHz + 31,001,487,666 instructions # 2.43 insn per cycle + 4.205764852 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1643) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.046562e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.846964e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.846964e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.102659e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.940126e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.940126e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.470466 sec - 10,075,062,185 cycles # 2.899 GHz - 19,376,808,574 instructions # 1.92 insn per cycle - 3.475725491 seconds time elapsed +TOTAL : 3.381762 sec + 10,061,410,412 cycles # 2.972 GHz + 19,378,394,064 instructions # 1.93 insn per cycle + 3.387061232 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1965) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.091991e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.948349e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.948349e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.165893e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.060121e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.060121e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.411832 sec - 9,706,821,336 cycles # 2.841 GHz - 18,993,945,887 instructions # 1.96 insn per cycle - 3.417093831 seconds time elapsed +TOTAL : 3.294663 sec + 9,710,957,285 cycles # 2.944 GHz + 18,994,988,980 instructions # 1.96 insn per cycle + 3.300038627 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1689) (512y: 181) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.817313e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.417390e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.417390e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.865019e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.483923e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.483923e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.864825 sec - 8,629,354,000 cycles # 2.231 GHz - 15,737,585,107 instructions # 1.82 insn per cycle - 3.870285071 seconds time elapsed +TOTAL : 3.767379 sec + 8,603,525,039 cycles # 2.281 GHz + 15,737,455,232 instructions # 1.83 insn per cycle + 3.772597879 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 900) (512y: 154) (512z: 1258) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt index 8472c31bea..d2d2949097 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:40:37 +DATE: 2023-11-09_18:15:21 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,14 +51,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.065913e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.488032e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.905997e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.240881e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.587683e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.915014e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 1.876732 sec - 6,299,612,348 cycles # 2.989 GHz - 11,571,253,190 instructions # 1.84 insn per cycle - 2.164294467 seconds time elapsed +TOTAL : 1.834661 sec + 6,293,478,609 cycles # 3.041 GHz + 11,504,742,224 instructions # 1.83 insn per cycle + 2.125902004 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -79,14 +79,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.111600e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.304742e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.304742e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.133681e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.328323e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.328323e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.064598 sec - 18,297,128,822 cycles # 3.015 GHz - 44,033,779,580 instructions # 2.41 insn per cycle - 6.069938342 seconds time elapsed +TOTAL : 5.944399 sec + 18,276,841,424 cycles # 3.072 GHz + 44,034,753,944 instructions # 2.41 insn per cycle + 5.949724506 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe @@ -106,14 +106,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.622403e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.120612e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.120612e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.688289e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.207763e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.207763e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.285433 sec - 12,790,120,071 cycles # 2.982 GHz - 31,000,688,554 instructions # 2.42 insn per cycle - 4.290779048 seconds time elapsed +TOTAL : 4.121368 sec + 12,748,827,025 cycles # 3.090 GHz + 31,001,833,202 instructions # 2.43 insn per cycle + 4.126844954 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1643) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -133,14 +133,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.044295e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.854365e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.854365e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.079781e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.896967e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.896967e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.476131 sec - 10,066,944,453 cycles # 2.893 GHz - 19,377,002,166 instructions # 1.92 insn per cycle - 3.481530813 seconds time elapsed +TOTAL : 3.417656 sec + 10,039,679,603 cycles # 2.934 GHz + 19,377,458,106 instructions # 1.93 insn per cycle + 3.423002014 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1965) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.095206e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.953285e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.953285e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.191213e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.094392e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.094392e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.401536 sec - 9,758,102,764 cycles # 2.865 GHz - 18,996,151,120 instructions # 1.95 insn per cycle - 3.406936941 seconds time elapsed +TOTAL : 3.256603 sec + 9,688,244,134 cycles # 2.971 GHz + 19,005,599,231 instructions # 1.96 insn per cycle + 3.261875957 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1689) (512y: 181) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe @@ -187,14 +187,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.814025e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.410019e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.410019e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.880720e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.508965e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.508965e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.870433 sec - 8,615,604,376 cycles # 2.224 GHz - 15,736,922,136 instructions # 1.83 insn per cycle - 3.875834680 seconds time elapsed +TOTAL : 3.737405 sec + 8,601,041,918 cycles # 2.299 GHz + 15,737,525,138 instructions # 1.83 insn per cycle + 3.742726567 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 900) (512y: 154) (512z: 1258) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt index b542059ad1..2943a1e3d5 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:00:50 +DATE: 2023-11-09_17:37:09 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.000398e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.960570e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.110004e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.636703e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.863019e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.046703e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.647549 sec - 2,611,748,045 cycles # 2.979 GHz - 4,046,502,501 instructions # 1.55 insn per cycle - 0.933750268 seconds time elapsed +TOTAL : 0.654694 sec + 2,666,558,745 cycles # 3.022 GHz + 4,096,338,325 instructions # 1.54 insn per cycle + 0.944612967 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.159227e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.372064e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.372064e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.202919e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.424199e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.424199e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.833108 sec - 17,445,226,847 cycles # 2.989 GHz - 41,885,202,351 instructions # 2.40 insn per cycle - 5.838346819 seconds time elapsed +TOTAL : 5.624164 sec + 17,409,154,909 cycles # 3.093 GHz + 41,881,099,052 instructions # 2.41 insn per cycle + 5.629252249 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 392) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.682893e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.222491e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.222491e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.734385e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.287483e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.287483e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.142121 sec - 12,470,632,862 cycles # 3.008 GHz - 30,166,171,065 instructions # 2.42 insn per cycle - 4.147564686 seconds time elapsed +TOTAL : 4.020839 sec + 12,439,753,645 cycles # 3.090 GHz + 30,163,334,779 instructions # 2.42 insn per cycle + 4.026082449 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1611) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.069225e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.895121e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.895121e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.071596e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.904428e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.904428e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.437470 sec - 9,952,077,094 cycles # 2.891 GHz - 19,112,450,451 instructions # 1.92 insn per cycle - 3.442739539 seconds time elapsed +TOTAL : 3.432943 sec + 9,954,541,311 cycles # 2.896 GHz + 19,109,473,980 instructions # 1.92 insn per cycle + 3.438069931 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1930) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.130212e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.018241e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.018241e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.172502e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.071351e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.071351e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.352335 sec - 9,644,260,853 cycles # 2.874 GHz - 18,779,667,176 instructions # 1.95 insn per cycle - 3.357742942 seconds time elapsed +TOTAL : 3.287111 sec + 9,635,946,931 cycles # 2.927 GHz + 18,764,577,329 instructions # 1.95 insn per cycle + 3.292294749 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1661) (512y: 178) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.865497e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.495990e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.495990e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.921117e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.582437e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.582437e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.772482 sec - 8,452,356,069 cycles # 2.238 GHz - 15,617,271,494 instructions # 1.85 insn per cycle - 3.777813091 seconds time elapsed +TOTAL : 3.666524 sec + 8,448,044,488 cycles # 2.302 GHz + 15,613,692,408 instructions # 1.85 insn per cycle + 3.671704856 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 886) (512y: 156) (512z: 1239) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt index 9fba89aff3..e7918e9c23 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:23:25 +DATE: 2023-11-09_17:58:09 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.483432e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.567049e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.058193e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.801176e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.647831e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.027831e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.676370 sec - 2,703,741,341 cycles # 2.971 GHz - 4,197,515,180 instructions # 1.55 insn per cycle - 0.967825669 seconds time elapsed +TOTAL : 0.681809 sec + 2,713,657,783 cycles # 2.966 GHz + 4,201,847,315 instructions # 1.55 insn per cycle + 0.974362645 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.672486e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.141310e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.141310e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.699910e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.178375e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.178375e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.163173 sec - 12,692,329,334 cycles # 3.045 GHz - 32,576,040,648 instructions # 2.57 insn per cycle - 4.168672183 seconds time elapsed +TOTAL : 4.094284 sec + 12,664,884,276 cycles # 3.090 GHz + 32,577,115,805 instructions # 2.57 insn per cycle + 4.099557701 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 296) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.116856e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.025219e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.025219e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.143219e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.065278e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.065278e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.372207 sec - 10,267,724,267 cycles # 3.041 GHz - 24,505,197,015 instructions # 2.39 insn per cycle - 3.377809241 seconds time elapsed +TOTAL : 3.331773 sec + 10,271,423,521 cycles # 3.079 GHz + 24,506,625,447 instructions # 2.39 insn per cycle + 3.337328311 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1251) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.304978e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.380785e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.380785e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.319805e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.394403e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.394403e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.125688 sec - 9,128,103,141 cycles # 2.916 GHz - 16,940,836,203 instructions # 1.86 insn per cycle - 3.131242434 seconds time elapsed +TOTAL : 3.108988 sec + 9,122,185,757 cycles # 2.931 GHz + 16,942,074,182 instructions # 1.86 insn per cycle + 3.114300266 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1631) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.298021e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.382509e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.382509e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.263608e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.556489e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.556489e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.144282 sec - 8,899,696,508 cycles # 2.834 GHz - 16,372,313,838 instructions # 1.84 insn per cycle - 3.149838418 seconds time elapsed +TOTAL : 3.169374 sec + 9,426,858,565 cycles # 2.970 GHz + 16,370,203,044 instructions # 1.74 insn per cycle + 3.174743316 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1370) (512y: 139) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.053092e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.845549e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.845549e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.105750e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.926413e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.926413e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.465226 sec - 7,910,184,141 cycles # 2.280 GHz - 14,591,740,895 instructions # 1.84 insn per cycle - 3.470686114 seconds time elapsed +TOTAL : 3.377253 sec + 7,897,254,276 cycles # 2.335 GHz + 14,592,693,571 instructions # 1.85 insn per cycle + 3.382567542 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1015) (512y: 158) (512z: 955) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt index 9b85799057..676eafadb1 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:23:55 +DATE: 2023-11-09_17:58:39 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.480686e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.569964e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.063993e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.818208e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.668713e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.053456e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.677772 sec - 2,691,282,086 cycles # 2.960 GHz - 4,219,338,579 instructions # 1.57 insn per cycle - 0.971577356 seconds time elapsed +TOTAL : 0.673433 sec + 2,679,233,058 cycles # 2.963 GHz + 4,187,218,910 instructions # 1.56 insn per cycle + 0.965878825 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.182406e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.087943e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.087943e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.244543e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.167572e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.167572e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.286151 sec - 9,910,806,255 cycles # 3.012 GHz - 25,456,031,111 instructions # 2.57 insn per cycle - 3.291763573 seconds time elapsed +TOTAL : 3.200260 sec + 9,840,700,159 cycles # 3.071 GHz + 25,456,933,061 instructions # 2.59 insn per cycle + 3.205821754 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 249) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.467752e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.800434e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.800434e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.515705e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.876135e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.876135e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.950518 sec - 8,946,482,743 cycles # 3.027 GHz - 21,514,123,834 instructions # 2.40 insn per cycle - 2.956056552 seconds time elapsed +TOTAL : 2.896836 sec + 8,925,793,988 cycles # 3.076 GHz + 21,514,573,078 instructions # 2.41 insn per cycle + 2.902177430 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1119) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.464134e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.723435e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.723435e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.506104e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.783990e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.783990e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.952533 sec - 8,633,003,733 cycles # 2.920 GHz - 15,829,431,121 instructions # 1.83 insn per cycle - 2.958100358 seconds time elapsed +TOTAL : 2.900756 sec + 8,606,887,419 cycles # 2.962 GHz + 15,829,788,154 instructions # 1.84 insn per cycle + 2.906279310 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1494) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.533505e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.859681e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.859681e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.541955e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.879613e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.879613e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.876122 sec - 8,428,640,196 cycles # 2.926 GHz - 15,527,735,744 instructions # 1.84 insn per cycle - 2.881608685 seconds time elapsed +TOTAL : 2.869140 sec + 8,396,471,591 cycles # 2.922 GHz + 15,529,030,850 instructions # 1.85 insn per cycle + 2.874505432 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1268) (512y: 139) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.128966e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.008830e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.008830e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.119247e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.990719e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.990719e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.361119 sec - 7,560,312,259 cycles # 2.246 GHz - 14,293,668,051 instructions # 1.89 insn per cycle - 3.366622669 seconds time elapsed +TOTAL : 3.376497 sec + 7,569,554,118 cycles # 2.239 GHz + 14,295,014,243 instructions # 1.89 insn per cycle + 3.381953719 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1041) (512y: 164) (512z: 874) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index 46e803358f..b0b6c7dbbf 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:01:23 +DATE: 2023-11-09_17:37:41 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.626199e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.328475e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.281681e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.535063e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.287307e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.259593e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.560646 sec - 2,313,886,918 cycles # 2.957 GHz - 3,567,705,327 instructions # 1.54 insn per cycle - 0.840116151 seconds time elapsed +TOTAL : 0.562225 sec + 2,332,457,444 cycles # 2.979 GHz + 3,625,755,159 instructions # 1.55 insn per cycle + 0.842176648 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.146010e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.358105e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.358105e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.164715e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.380430e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.380430e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 5.851033 sec - 17,813,996,987 cycles # 3.043 GHz - 43,616,814,202 instructions # 2.45 insn per cycle - 5.856069183 seconds time elapsed +TOTAL : 5.760009 sec + 17,802,097,031 cycles # 3.089 GHz + 43,613,527,077 instructions # 2.45 insn per cycle + 5.764750077 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.343466e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.599751e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.599751e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.392272e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.663586e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.663586e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.048613 sec - 9,276,606,540 cycles # 3.040 GHz - 21,930,294,042 instructions # 2.36 insn per cycle - 3.053688884 seconds time elapsed +TOTAL : 2.985891 sec + 9,233,559,019 cycles # 3.088 GHz + 21,925,837,880 instructions # 2.37 insn per cycle + 2.990875616 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1936) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.523694e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.872956e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.872956e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.561578e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.939602e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.939602e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.845518 sec - 8,308,772,789 cycles # 2.916 GHz - 15,593,301,532 instructions # 1.88 insn per cycle - 2.850623438 seconds time elapsed +TOTAL : 2.807792 sec + 8,302,482,665 cycles # 2.952 GHz + 15,590,734,796 instructions # 1.88 insn per cycle + 2.812825281 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2595) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.489948e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.840461e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.840461e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.577370e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.998184e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.998184e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.887357 sec - 8,231,785,355 cycles # 2.847 GHz - 15,437,944,905 instructions # 1.88 insn per cycle - 2.892363682 seconds time elapsed +TOTAL : 2.791624 sec + 8,243,582,435 cycles # 2.950 GHz + 15,435,159,534 instructions # 1.87 insn per cycle + 2.796691298 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.580760e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.973673e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.973673e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.534202e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.878199e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.878199e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.796324 sec - 6,629,287,981 cycles # 2.367 GHz - 12,873,018,117 instructions # 1.94 insn per cycle - 2.801456274 seconds time elapsed +TOTAL : 2.844579 sec + 6,638,595,923 cycles # 2.339 GHz + 12,873,058,969 instructions # 1.94 insn per cycle + 2.849721551 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1735) (512y: 17) (512z: 1439) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt index a12ca3b41d..198199e430 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:34:47 +DATE: 2023-11-09_18:09:35 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.243102e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.475352e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.475352e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.497702e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.965150e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.965150e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 -TOTAL : 1.676327 sec - 5,681,132,328 cycles # 2.981 GHz - 10,328,752,116 instructions # 1.82 insn per cycle - 1.962251346 seconds time elapsed +TOTAL : 1.636588 sec + 5,687,776,927 cycles # 3.043 GHz + 10,344,643,155 instructions # 1.82 insn per cycle + 1.926222709 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost @@ -86,14 +86,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.117341e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.320071e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.320071e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.124698e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.329265e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.329265e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.103747 sec - 18,503,457,384 cycles # 3.029 GHz - 43,763,268,873 instructions # 2.37 insn per cycle - 6.109986471 seconds time elapsed +TOTAL : 6.061752 sec + 18,474,797,660 cycles # 3.045 GHz + 43,763,223,756 instructions # 2.37 insn per cycle + 6.067744277 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe @@ -114,14 +114,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.169781e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.246790e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.246790e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.280805e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.418662e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.418662e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.406148 sec - 10,026,239,155 cycles # 2.945 GHz - 23,264,915,776 instructions # 2.32 insn per cycle - 3.412744895 seconds time elapsed +TOTAL : 3.239576 sec + 10,001,339,639 cycles # 3.083 GHz + 23,260,791,069 instructions # 2.33 insn per cycle + 3.245668541 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1936) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe @@ -142,14 +142,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.376931e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.582524e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.582524e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.455472e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.697664e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.697664e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.133404 sec - 9,115,108,969 cycles # 2.904 GHz - 16,712,850,458 instructions # 1.83 insn per cycle - 3.139765331 seconds time elapsed +TOTAL : 3.034907 sec + 9,092,859,245 cycles # 2.991 GHz + 16,710,213,462 instructions # 1.84 insn per cycle + 3.041109346 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2595) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe @@ -170,14 +170,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.412136e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.649634e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.649634e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.469626e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.746671e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.746671e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.093398 sec - 9,015,171,302 cycles # 2.909 GHz - 16,559,247,945 instructions # 1.84 insn per cycle - 3.099791137 seconds time elapsed +TOTAL : 3.026366 sec + 9,019,828,246 cycles # 2.976 GHz + 16,555,168,621 instructions # 1.84 insn per cycle + 3.032449491 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe @@ -198,14 +198,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.406219e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.608241e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.608241e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.460766e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.686068e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.686068e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 3.106272 sec - 7,475,444,541 cycles # 2.404 GHz - 14,076,958,110 instructions # 1.88 insn per cycle - 3.112522018 seconds time elapsed +TOTAL : 3.037604 sec + 7,413,210,247 cycles # 2.436 GHz + 14,077,138,025 instructions # 1.90 insn per cycle + 3.043934055 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1735) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt index e12a7cff38..38db2540d0 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:47:49 +DATE: 2023-11-09_18:22:25 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.309547e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.164321e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.211559e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.382431e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.208254e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.230961e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371863e-02 +- 3.269951e-06 ) GeV^0 -TOTAL : 1.178788 sec - 4,175,363,575 cycles # 2.986 GHz - 6,687,157,832 instructions # 1.60 insn per cycle - 1.455561692 seconds time elapsed +TOTAL : 1.150438 sec + 4,093,367,606 cycles # 2.986 GHz + 6,655,787,532 instructions # 1.63 insn per cycle + 1.427536965 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.139229e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.352216e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.352216e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.163257e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.379748e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.379748e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 6.211284 sec - 18,855,190,279 cycles # 3.034 GHz - 43,795,517,542 instructions # 2.32 insn per cycle - 6.216374296 seconds time elapsed +TOTAL : 6.085036 sec + 18,810,997,513 cycles # 3.089 GHz + 43,795,620,513 instructions # 2.33 insn per cycle + 6.090075734 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.318674e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.546898e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.546898e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.379076e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.642823e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.642823e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 3.402195 sec - 10,237,833,782 cycles # 3.006 GHz - 22,007,212,368 instructions # 2.15 insn per cycle - 3.407333694 seconds time elapsed +TOTAL : 3.315467 sec + 10,223,065,521 cycles # 3.080 GHz + 22,006,854,632 instructions # 2.15 insn per cycle + 3.320462987 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1936) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.476676e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.816143e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.816143e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.487454e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.825644e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.825644e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.234448 sec - 9,334,268,427 cycles # 2.883 GHz - 15,503,242,414 instructions # 1.66 insn per cycle - 3.239539945 seconds time elapsed +TOTAL : 3.212098 sec + 9,324,905,009 cycles # 2.900 GHz + 15,502,708,810 instructions # 1.66 insn per cycle + 3.217273015 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2595) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.532354e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.931778e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.931778e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.573485e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.002018e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.002018e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.179353 sec - 9,298,076,707 cycles # 2.921 GHz - 15,144,691,612 instructions # 1.63 insn per cycle - 3.184641880 seconds time elapsed +TOTAL : 3.120613 sec + 9,288,549,778 cycles # 2.973 GHz + 15,149,849,415 instructions # 1.63 insn per cycle + 3.125542581 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.550309e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.928739e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.928739e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.617810e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.038860e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.038860e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.163394 sec - 7,678,426,346 cycles # 2.424 GHz - 12,579,409,911 instructions # 1.64 insn per cycle - 3.168501704 seconds time elapsed +TOTAL : 3.081671 sec + 7,641,480,002 cycles # 2.476 GHz + 12,579,693,620 instructions # 1.65 insn per cycle + 3.086750346 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1735) (512y: 17) (512z: 1439) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt index ed97b2f8ed..6fcc7aa480 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:44:30 +DATE: 2023-11-09_18:19:10 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.311918e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.184761e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.263047e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.390821e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.223370e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.268045e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.849658 sec - 3,163,783,620 cycles # 2.955 GHz - 6,425,624,965 instructions # 2.03 insn per cycle - 1.127772989 seconds time elapsed +TOTAL : 0.831823 sec + 3,198,187,473 cycles # 3.040 GHz + 6,464,633,768 instructions # 2.02 insn per cycle + 1.108743988 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.132012e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.344208e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.344208e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.166393e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.383502e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.383502e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 5.993383 sec - 18,094,070,839 cycles # 3.017 GHz - 43,613,404,695 instructions # 2.41 insn per cycle - 5.998406050 seconds time elapsed +TOTAL : 5.750668 sec + 17,811,310,529 cycles # 3.095 GHz + 43,613,299,638 instructions # 2.45 insn per cycle + 5.755604942 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.281067e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.486158e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.486158e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.317079e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.552668e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.552668e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.130477 sec - 9,257,197,715 cycles # 2.953 GHz - 21,925,291,921 instructions # 2.37 insn per cycle - 3.135663717 seconds time elapsed +TOTAL : 3.082399 sec + 9,236,711,908 cycles # 2.992 GHz + 21,926,264,881 instructions # 2.37 insn per cycle + 3.087937460 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1936) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.526300e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.881905e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.881905e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.562942e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.932578e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.932578e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.846007 sec - 8,323,404,187 cycles # 2.920 GHz - 15,589,367,643 instructions # 1.87 insn per cycle - 2.851124263 seconds time elapsed +TOTAL : 2.803489 sec + 8,311,895,996 cycles # 2.960 GHz + 15,590,591,103 instructions # 1.88 insn per cycle + 2.808434072 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2595) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.559394e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.951403e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.951403e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.582757e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.993146e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.993146e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.815665 sec - 8,248,875,592 cycles # 2.925 GHz - 15,439,478,624 instructions # 1.87 insn per cycle - 2.820889860 seconds time elapsed +TOTAL : 2.784903 sec + 8,236,233,463 cycles # 2.953 GHz + 15,439,539,485 instructions # 1.87 insn per cycle + 2.790025696 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.553964e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.948928e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.948928e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.640868e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.066609e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.066609e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.827281 sec - 6,687,814,053 cycles # 2.363 GHz - 12,869,763,437 instructions # 1.92 insn per cycle - 2.832592565 seconds time elapsed +TOTAL : 2.739279 sec + 6,618,156,482 cycles # 2.412 GHz + 12,869,303,752 instructions # 1.94 insn per cycle + 2.744541017 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1735) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt index c7d745ef4d..ef7d7310ec 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:41:12 +DATE: 2023-11-09_18:15:56 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,14 +51,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.077097e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.138341e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.120075e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.457534e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.184951e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.150897e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 -TOTAL : 1.480161 sec - 5,077,584,264 cycles # 2.967 GHz - 9,258,149,444 instructions # 1.82 insn per cycle - 1.768271684 seconds time elapsed +TOTAL : 1.431692 sec + 5,029,016,765 cycles # 3.047 GHz + 9,191,843,408 instructions # 1.83 insn per cycle + 1.708626202 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -79,14 +79,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.142005e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.354012e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.354012e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.163854e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.380282e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.380282e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 5.874856 sec - 17,835,700,462 cycles # 3.034 GHz - 43,613,540,806 instructions # 2.45 insn per cycle - 5.879931479 seconds time elapsed +TOTAL : 5.764377 sec + 17,805,909,761 cycles # 3.087 GHz + 43,613,494,568 instructions # 2.45 insn per cycle + 5.769597959 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe @@ -106,14 +106,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.282759e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.491220e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.491220e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.391849e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.652855e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.652855e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.129132 sec - 9,269,728,355 cycles # 2.963 GHz - 21,928,484,188 instructions # 2.37 insn per cycle - 3.134244707 seconds time elapsed +TOTAL : 2.987897 sec + 9,257,292,453 cycles # 3.094 GHz + 21,926,827,781 instructions # 2.37 insn per cycle + 2.993012479 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1936) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe @@ -133,14 +133,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.516560e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.868004e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.868004e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.568515e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.950394e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.950394e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.857533 sec - 8,336,241,805 cycles # 2.913 GHz - 15,589,958,795 instructions # 1.87 insn per cycle - 2.862709487 seconds time elapsed +TOTAL : 2.797984 sec + 8,317,461,722 cycles # 2.968 GHz + 15,591,357,650 instructions # 1.87 insn per cycle + 2.803063629 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2595) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.536616e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.924197e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.924197e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.510607e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.874545e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.874545e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.838427 sec - 8,267,692,084 cycles # 2.908 GHz - 15,438,877,256 instructions # 1.87 insn per cycle - 2.843475918 seconds time elapsed +TOTAL : 2.865428 sec + 8,258,982,824 cycles # 2.878 GHz + 15,434,974,292 instructions # 1.87 insn per cycle + 2.870509731 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe @@ -187,14 +187,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.539393e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.905150e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.905150e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.534626e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.883996e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.883996e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.843291 sec - 6,667,785,493 cycles # 2.342 GHz - 12,868,798,226 instructions # 1.93 insn per cycle - 2.848396098 seconds time elapsed +TOTAL : 2.847653 sec + 6,630,370,490 cycles # 2.325 GHz + 12,869,864,045 instructions # 1.94 insn per cycle + 2.852728913 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1735) (512y: 17) (512z: 1439) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt index 2a5177092e..acb88982d2 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:01:53 +DATE: 2023-11-09_17:38:11 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.628396e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.344836e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.322116e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.537187e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.294303e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.293124e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.558495 sec - 2,344,289,295 cycles # 2.966 GHz - 3,579,154,611 instructions # 1.53 insn per cycle - 0.847997464 seconds time elapsed +TOTAL : 0.560273 sec + 2,360,194,998 cycles # 3.018 GHz + 3,675,767,532 instructions # 1.56 insn per cycle + 0.839402775 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.195436e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.435503e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.435503e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.245068e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.494792e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.494792e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 5.634613 sec - 16,757,667,455 cycles # 2.972 GHz - 41,375,848,460 instructions # 2.47 insn per cycle - 5.639688103 seconds time elapsed +TOTAL : 5.409364 sec + 16,727,058,520 cycles # 3.090 GHz + 41,371,618,921 instructions # 2.47 insn per cycle + 5.414214747 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 375) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.409189e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.740073e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.740073e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.441577e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.817766e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.817766e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.974456 sec - 9,031,167,153 cycles # 3.032 GHz - 21,234,204,961 instructions # 2.35 insn per cycle - 2.979655809 seconds time elapsed +TOTAL : 2.932681 sec + 9,069,604,999 cycles # 3.089 GHz + 21,230,786,011 instructions # 2.34 insn per cycle + 2.937680542 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1841) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.541260e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.926631e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.926631e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.599334e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.008101e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.008101e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.832126 sec - 8,284,857,543 cycles # 2.922 GHz - 15,430,300,133 instructions # 1.86 insn per cycle - 2.837298063 seconds time elapsed +TOTAL : 2.767998 sec + 8,243,229,329 cycles # 2.973 GHz + 15,424,533,858 instructions # 1.87 insn per cycle + 2.772999466 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2536) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.592912e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.031163e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.031163e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.643252e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.114551e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.114551e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.778473 sec - 8,124,076,124 cycles # 2.921 GHz - 15,242,043,085 instructions # 1.88 insn per cycle - 2.783650122 seconds time elapsed +TOTAL : 2.727085 sec + 8,130,917,009 cycles # 2.977 GHz + 15,244,999,510 instructions # 1.87 insn per cycle + 2.732127705 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2423) (512y: 8) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.583024e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.982786e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.982786e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.551006e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.930183e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.930183e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.793855 sec - 6,612,725,918 cycles # 2.363 GHz - 12,851,623,569 instructions # 1.94 insn per cycle - 2.799020549 seconds time elapsed +TOTAL : 2.826162 sec + 6,610,785,893 cycles # 2.336 GHz + 12,848,595,223 instructions # 1.94 insn per cycle + 2.831354272 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1705) (512y: 18) (512z: 1427) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt index b5507320b6..1f616951f6 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:24:23 +DATE: 2023-11-09_17:59:07 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.295762e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.181123e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.251991e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.379623e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.224230e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.277206e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.577748 sec - 2,371,472,909 cycles # 2.938 GHz - 3,662,215,838 instructions # 1.54 insn per cycle - 0.866645313 seconds time elapsed +TOTAL : 0.567867 sec + 2,380,227,304 cycles # 3.011 GHz + 3,716,615,660 instructions # 1.56 insn per cycle + 0.847985852 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.709669e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.230063e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.230063e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.702473e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.225484e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.225484e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 4.043238 sec - 12,201,253,013 cycles # 3.016 GHz - 32,520,928,331 instructions # 2.67 insn per cycle - 4.048480591 seconds time elapsed +TOTAL : 4.053519 sec + 12,216,293,497 cycles # 3.011 GHz + 32,522,254,109 instructions # 2.66 insn per cycle + 4.058663851 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 312) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.776736e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.688717e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.688717e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.830691e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.806288e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.806288e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.629894 sec - 8,006,523,859 cycles # 3.039 GHz - 18,689,561,969 instructions # 2.33 insn per cycle - 2.635155805 seconds time elapsed +TOTAL : 2.580563 sec + 7,975,462,428 cycles # 3.085 GHz + 18,690,132,924 instructions # 2.34 insn per cycle + 2.585721810 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1554) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.876319e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.776118e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.776118e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.931453e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.867355e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.867355e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.544972 sec - 7,483,863,921 cycles # 2.935 GHz - 14,252,784,118 instructions # 1.90 insn per cycle - 2.550249205 seconds time elapsed +TOTAL : 2.497040 sec + 7,461,995,802 cycles # 2.983 GHz + 14,254,175,720 instructions # 1.91 insn per cycle + 2.502220546 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2237) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.940665e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.960644e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.960644e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.990445e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.025789e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.025789e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.495422 sec - 7,326,781,172 cycles # 2.931 GHz - 13,945,833,508 instructions # 1.90 insn per cycle - 2.500698244 seconds time elapsed +TOTAL : 2.453022 sec + 7,312,763,088 cycles # 2.976 GHz + 13,952,233,674 instructions # 1.91 insn per cycle + 2.458314250 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2096) (512y: 3) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.636740e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.108198e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.108198e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.649642e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.141006e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.141006e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.746264 sec - 6,527,138,912 cycles # 2.373 GHz - 13,421,028,013 instructions # 2.06 insn per cycle - 2.751679406 seconds time elapsed +TOTAL : 2.733236 sec + 6,541,090,853 cycles # 2.390 GHz + 13,422,969,862 instructions # 2.05 insn per cycle + 2.738380923 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2071) (512y: 1) (512z: 1198) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt index b6c42e0895..374f2a331e 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:24:50 +DATE: 2023-11-09_17:59:34 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.300995e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.194789e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.295764e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.383788e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.237025e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.315197e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.573687 sec - 2,396,122,888 cycles # 2.957 GHz - 3,709,386,643 instructions # 1.55 insn per cycle - 0.867525381 seconds time elapsed +TOTAL : 0.566447 sec + 2,356,919,781 cycles # 2.991 GHz + 3,683,739,571 instructions # 1.56 insn per cycle + 0.846741071 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.274435e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.306451e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.306451e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.320968e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.384461e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.384461e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.128769 sec - 9,423,056,878 cycles # 3.008 GHz - 25,306,341,141 instructions # 2.69 insn per cycle - 3.134038482 seconds time elapsed +TOTAL : 3.069124 sec + 9,404,467,335 cycles # 3.060 GHz + 25,307,412,416 instructions # 2.69 insn per cycle + 3.074433972 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 263) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.099658e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.759584e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.759584e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.164094e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.875777e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.875777e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.397339 sec - 7,201,211,606 cycles # 2.998 GHz - 16,901,413,977 instructions # 2.35 insn per cycle - 2.402789017 seconds time elapsed +TOTAL : 2.347070 sec + 7,183,873,212 cycles # 3.055 GHz + 16,901,716,244 instructions # 2.35 insn per cycle + 2.352401841 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1359) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.019910e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.199492e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.199492e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.103853e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.343298e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.343298e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.443323 sec - 7,147,435,963 cycles # 2.920 GHz - 13,619,110,670 instructions # 1.91 insn per cycle - 2.448969091 seconds time elapsed +TOTAL : 2.377491 sec + 7,114,519,285 cycles # 2.987 GHz + 13,619,081,744 instructions # 1.91 insn per cycle + 2.382536600 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2060) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.050148e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.307582e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.307582e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.131276e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.434861e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.434861e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.423418 sec - 7,082,396,314 cycles # 2.918 GHz - 13,431,226,521 instructions # 1.90 insn per cycle - 2.429141482 seconds time elapsed +TOTAL : 2.360462 sec + 7,057,553,337 cycles # 2.985 GHz + 13,435,682,624 instructions # 1.90 insn per cycle + 2.365710938 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1945) (512y: 4) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.725279e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.338904e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.338904e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.814153e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.521058e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.521058e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.669392 sec - 6,366,623,257 cycles # 2.381 GHz - 13,153,230,984 instructions # 2.07 insn per cycle - 2.674848562 seconds time elapsed +TOTAL : 2.589000 sec + 6,345,330,255 cycles # 2.447 GHz + 13,153,121,215 instructions # 2.07 insn per cycle + 2.594408710 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2029) (512y: 1) (512z: 1083) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index 40be1e0fe4..8dc3126453 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:02:23 +DATE: 2023-11-09_17:38:40 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.986561e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.920506e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.026737e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.618205e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.831793e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.977288e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.651585 sec - 2,613,210,290 cycles # 2.977 GHz - 4,026,633,947 instructions # 1.54 insn per cycle - 0.940304085 seconds time elapsed +TOTAL : 0.652821 sec + 2,648,283,165 cycles # 3.003 GHz + 4,101,874,172 instructions # 1.55 insn per cycle + 0.942277172 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 @@ -77,14 +77,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.098312e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.283308e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.283308e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.110302e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.297006e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.297006e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.126587 sec - 18,732,621,094 cycles # 3.056 GHz - 44,288,636,649 instructions # 2.36 insn per cycle - 6.131702524 seconds time elapsed +TOTAL : 6.061696 sec + 18,702,058,697 cycles # 3.083 GHz + 44,286,744,373 instructions # 2.37 insn per cycle + 6.066885580 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 439) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.724748e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.279623e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.279623e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.748205e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.315149e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.315149e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.052368 sec - 12,345,078,225 cycles # 3.044 GHz - 30,962,385,061 instructions # 2.51 insn per cycle - 4.057665704 seconds time elapsed +TOTAL : 3.994898 sec + 12,345,141,895 cycles # 3.087 GHz + 30,960,600,041 instructions # 2.51 insn per cycle + 4.000031168 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1685) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.012805e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.801799e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.801799e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.024705e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.805066e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.805066e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.527503 sec - 10,105,777,222 cycles # 2.861 GHz - 19,402,091,411 instructions # 1.92 insn per cycle - 3.532885933 seconds time elapsed +TOTAL : 3.505414 sec + 10,100,327,501 cycles # 2.878 GHz + 19,399,870,617 instructions # 1.92 insn per cycle + 3.510718654 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2146) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.136223e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.011490e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.011490e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.175175e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.066367e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.066367e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.337554 sec - 9,780,270,182 cycles # 2.927 GHz - 18,984,447,401 instructions # 1.94 insn per cycle - 3.342834380 seconds time elapsed +TOTAL : 3.279954 sec + 9,681,673,426 cycles # 2.948 GHz + 18,969,865,921 instructions # 1.96 insn per cycle + 3.285422855 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1859) (512y: 188) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.916274e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.582982e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.582982e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.948024e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.629123e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.629123e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.678279 sec - 8,374,553,290 cycles # 2.274 GHz - 15,066,979,076 instructions # 1.80 insn per cycle - 3.683518796 seconds time elapsed +TOTAL : 3.620894 sec + 8,364,739,572 cycles # 2.308 GHz + 15,064,814,645 instructions # 1.80 insn per cycle + 3.626218437 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1023) (512y: 155) (512z: 1316) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt index d0448f95d2..a2d87f5da8 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:02:57 +DATE: 2023-11-09_17:39:14 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.995389e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.942657e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.069355e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.632265e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.861047e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.036900e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.648218 sec - 2,577,449,374 cycles # 2.937 GHz - 3,930,119,139 instructions # 1.52 insn per cycle - 0.934838617 seconds time elapsed +TOTAL : 0.649714 sec + 2,641,937,888 cycles # 3.008 GHz + 4,107,555,428 instructions # 1.55 insn per cycle + 0.938941535 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 @@ -77,14 +77,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.138539e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.340756e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.340756e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.158637e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.370951e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.370951e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.930169 sec - 17,940,598,550 cycles # 3.023 GHz - 42,539,439,563 instructions # 2.37 insn per cycle - 5.935391018 seconds time elapsed +TOTAL : 5.823305 sec + 18,013,373,486 cycles # 3.091 GHz + 42,535,982,962 instructions # 2.36 insn per cycle + 5.828417378 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.737380e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.320541e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.320541e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.770599e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.353490e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.353490e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.022351 sec - 12,179,829,023 cycles # 3.025 GHz - 30,269,422,152 instructions # 2.49 insn per cycle - 4.027705928 seconds time elapsed +TOTAL : 3.950045 sec + 12,171,205,402 cycles # 3.078 GHz + 30,268,628,414 instructions # 2.49 insn per cycle + 3.955313835 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1692) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.003006e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.791277e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.791277e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.099406e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.925166e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.925166e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.544763 sec - 10,086,483,930 cycles # 2.843 GHz - 19,285,075,836 instructions # 1.91 insn per cycle - 3.550049339 seconds time elapsed +TOTAL : 3.385614 sec + 10,033,748,773 cycles # 2.960 GHz + 19,281,534,051 instructions # 1.92 insn per cycle + 3.390768328 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2162) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.153713e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.048947e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.048947e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.135260e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.020042e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.020042e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.313722 sec - 9,652,564,948 cycles # 2.909 GHz - 18,773,850,855 instructions # 1.94 insn per cycle - 3.319022077 seconds time elapsed +TOTAL : 3.343797 sec + 9,615,342,352 cycles # 2.872 GHz + 18,771,093,665 instructions # 1.95 insn per cycle + 3.349067283 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1833) (512y: 191) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.911178e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.576380e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.576380e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.965653e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.666391e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.666391e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.691490 sec - 8,274,258,282 cycles # 2.239 GHz - 14,991,882,108 instructions # 1.81 insn per cycle - 3.696773496 seconds time elapsed +TOTAL : 3.592114 sec + 8,278,170,966 cycles # 2.302 GHz + 14,988,534,751 instructions # 1.81 insn per cycle + 3.597402233 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1020) (512y: 156) (512z: 1305) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index ecfe1f9032..dad81481e1 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-03_19:03:30 +DATE: 2023-11-09_17:39:47 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.269149e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.178306e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.270483e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.113101e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.178068e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.274620e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.515028 sec - 2,190,362,135 cycles # 2.945 GHz - 3,134,430,746 instructions # 1.43 insn per cycle - 0.801320986 seconds time elapsed +TOTAL : 0.513513 sec + 2,238,779,994 cycles # 3.016 GHz + 3,236,054,047 instructions # 1.45 insn per cycle + 0.800586540 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.141790e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.204663e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.204663e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.199296e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.263095e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.263095e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.001947 sec - 15,160,921,453 cycles # 3.029 GHz - 38,440,320,018 instructions # 2.54 insn per cycle - 5.007262329 seconds time elapsed +TOTAL : 4.870986 sec + 15,138,095,755 cycles # 3.105 GHz + 38,436,824,615 instructions # 2.54 insn per cycle + 4.876178872 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.537912e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.729582e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.729582e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.669942e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.869262e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.869262e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.070180 sec - 9,135,564,109 cycles # 2.971 GHz - 24,595,068,911 instructions # 2.69 insn per cycle - 3.075510770 seconds time elapsed +TOTAL : 2.960626 sec + 9,095,550,717 cycles # 3.068 GHz + 24,591,504,229 instructions # 2.70 insn per cycle + 2.966139239 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2156) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.794659e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.298456e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.298456e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.803896e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.327557e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.327557e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.915155 sec - 5,488,800,341 cycles # 2.860 GHz - 11,269,289,809 instructions # 2.05 insn per cycle - 1.920562747 seconds time elapsed +TOTAL : 1.909794 sec + 5,486,817,505 cycles # 2.866 GHz + 11,265,648,347 instructions # 2.05 insn per cycle + 1.915029323 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.465243e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.099655e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.099655e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.555272e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.195980e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.195980e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.726047 sec - 4,948,464,581 cycles # 2.859 GHz - 10,575,268,094 instructions # 2.14 insn per cycle - 1.731560491 seconds time elapsed +TOTAL : 1.704245 sec + 4,927,847,485 cycles # 2.884 GHz + 10,572,013,859 instructions # 2.15 insn per cycle + 1.709455619 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2077) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.977744e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.204839e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.204839e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.103362e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.341522e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.341522e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.740172 sec - 5,379,659,738 cycles # 1.960 GHz - 7,808,789,832 instructions # 1.45 insn per cycle - 2.745493260 seconds time elapsed +TOTAL : 2.658432 sec + 5,379,828,238 cycles # 2.021 GHz + 7,805,118,346 instructions # 1.45 insn per cycle + 2.663615123 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1446) (512y: 122) (512z: 1542) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt index dd2f256477..d089f3ea80 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-03_19:35:20 +DATE: 2023-11-09_18:10:07 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.496633e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.880527e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.880527e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.436618e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.989585e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.989585e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.808083 sec - 3,120,895,454 cycles # 2.971 GHz - 4,726,889,577 instructions # 1.51 insn per cycle - 1.107972527 seconds time elapsed +TOTAL : 0.820320 sec + 3,087,525,024 cycles # 2.881 GHz + 4,797,416,225 instructions # 1.55 insn per cycle + 1.129126082 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost @@ -86,14 +86,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.117962e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.179706e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.179706e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.137936e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.202451e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.202451e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.135527 sec - 15,504,544,823 cycles # 3.016 GHz - 38,497,224,440 instructions # 2.48 insn per cycle - 5.142229259 seconds time elapsed +TOTAL : 5.088780 sec + 15,506,176,025 cycles # 3.045 GHz + 38,500,320,484 instructions # 2.48 insn per cycle + 5.095207532 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -114,14 +114,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.595756e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.790745e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.790745e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.664205e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.863051e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.863051e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.098715 sec - 9,432,801,004 cycles # 3.038 GHz - 24,773,895,780 instructions # 2.63 insn per cycle - 3.105439323 seconds time elapsed +TOTAL : 3.042056 sec + 9,436,538,509 cycles # 3.096 GHz + 24,774,730,249 instructions # 2.63 insn per cycle + 3.048601444 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2156) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -142,14 +142,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.527781e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.981315e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.981315e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.821161e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.311886e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.311886e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.098555 sec - 5,826,323,105 cycles # 2.789 GHz - 11,554,423,664 instructions # 1.98 insn per cycle - 2.105206679 seconds time elapsed +TOTAL : 1.984151 sec + 5,841,767,961 cycles # 2.936 GHz + 11,552,228,699 instructions # 1.98 insn per cycle + 1.990639911 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -170,14 +170,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.300396e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.893264e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.893264e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.505257e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.122209e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.122209e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.849117 sec - 5,294,307,248 cycles # 2.854 GHz - 10,856,382,305 instructions # 2.05 insn per cycle - 1.855861110 seconds time elapsed +TOTAL : 1.793114 sec + 5,293,839,115 cycles # 2.943 GHz + 10,856,913,242 instructions # 2.05 insn per cycle + 1.799607546 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2077) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -198,14 +198,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.891057e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.111611e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.111611e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.021313e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.250852e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.250852e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.882235 sec - 5,742,873,090 cycles # 1.988 GHz - 8,048,787,968 instructions # 1.40 insn per cycle - 2.889049440 seconds time elapsed +TOTAL : 2.791071 sec + 5,762,529,693 cycles # 2.060 GHz + 8,048,857,986 instructions # 1.40 insn per cycle + 2.797719094 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1446) (512y: 122) (512z: 1542) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt index 70c42f96ca..d4092f872a 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-03_19:48:21 +DATE: 2023-11-09_18:22:57 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.579966e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.154296e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.270387e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.736311e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.160845e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.271332e+08 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 0.619804 sec - 2,500,171,473 cycles # 2.947 GHz - 3,610,462,854 instructions # 1.44 insn per cycle - 0.906022247 seconds time elapsed +TOTAL : 0.616077 sec + 2,487,675,163 cycles # 2.949 GHz + 3,609,155,412 instructions # 1.45 insn per cycle + 0.900867999 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.141469e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.204103e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.204103e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.176864e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.240941e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.240941e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 5.061672 sec - 15,345,417,554 cycles # 3.029 GHz - 38,452,483,858 instructions # 2.51 insn per cycle - 5.067127392 seconds time elapsed +TOTAL : 4.979715 sec + 15,323,819,271 cycles # 3.075 GHz + 38,452,992,607 instructions # 2.51 insn per cycle + 4.984901972 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.594441e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.787517e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.787517e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.677729e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.878488e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.878488e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.081938 sec - 9,306,122,505 cycles # 3.015 GHz - 24,590,602,612 instructions # 2.64 insn per cycle - 3.087467598 seconds time elapsed +TOTAL : 3.013873 sec + 9,290,869,776 cycles # 3.079 GHz + 24,592,367,735 instructions # 2.65 insn per cycle + 3.019043179 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2156) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.780444e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.284766e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.284766e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.850559e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.370312e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.370312e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.978919 sec - 5,659,108,727 cycles # 2.853 GHz - 11,248,307,846 instructions # 1.99 insn per cycle - 1.984493875 seconds time elapsed +TOTAL : 1.954495 sec + 5,685,208,050 cycles # 2.902 GHz + 11,247,975,749 instructions # 1.98 insn per cycle + 1.959795584 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.409554e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.043503e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.043503e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.607127e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.248201e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.248201e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.801971 sec - 5,131,678,035 cycles # 2.841 GHz - 10,518,217,961 instructions # 2.05 insn per cycle - 1.807387516 seconds time elapsed +TOTAL : 1.748348 sec + 5,124,696,849 cycles # 2.923 GHz + 10,520,869,381 instructions # 2.05 insn per cycle + 1.753705732 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2077) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.952294e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.178919e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.178919e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.874391e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.086224e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.086224e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.820832 sec - 5,565,619,645 cycles # 1.970 GHz - 7,754,617,723 instructions # 1.39 insn per cycle - 2.826352548 seconds time elapsed +TOTAL : 2.872375 sec + 5,588,777,867 cycles # 1.950 GHz + 7,758,258,898 instructions # 1.39 insn per cycle + 2.877703247 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1446) (512y: 122) (512z: 1542) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt index 4837b41444..b9b046957a 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-03_19:45:01 +DATE: 2023-11-09_18:19:40 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.583777e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.154968e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.271096e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.737213e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.157401e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.270983e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.557101 sec - 2,322,977,037 cycles # 2.953 GHz - 3,599,423,025 instructions # 1.55 insn per cycle - 0.843882316 seconds time elapsed +TOTAL : 0.551870 sec + 2,343,082,954 cycles # 3.005 GHz + 3,662,705,915 instructions # 1.56 insn per cycle + 0.837059271 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.134010e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.196717e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.196717e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.189334e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.253549e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.253549e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.022340 sec - 15,161,844,495 cycles # 3.017 GHz - 38,436,020,868 instructions # 2.54 insn per cycle - 5.028057319 seconds time elapsed +TOTAL : 4.893645 sec + 15,145,823,463 cycles # 3.092 GHz + 38,436,891,323 instructions # 2.54 insn per cycle + 4.899128465 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.611425e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.807723e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.807723e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.701689e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.903671e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.903671e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.009043 sec - 9,092,248,013 cycles # 3.018 GHz - 24,590,993,356 instructions # 2.70 insn per cycle - 3.014816078 seconds time elapsed +TOTAL : 2.937292 sec + 9,090,406,845 cycles # 3.091 GHz + 24,590,949,325 instructions # 2.71 insn per cycle + 2.942627315 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2156) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.765157e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.263695e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.263695e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.932093e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.448459e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.448459e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.924911 sec - 5,492,799,049 cycles # 2.847 GHz - 11,264,994,094 instructions # 2.05 insn per cycle - 1.930399853 seconds time elapsed +TOTAL : 1.870782 sec + 5,477,596,736 cycles # 2.921 GHz + 11,265,174,730 instructions # 2.06 insn per cycle + 1.876089705 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.461458e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.086226e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.086226e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.470328e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.111006e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.111006e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.728063 sec - 4,951,669,022 cycles # 2.858 GHz - 10,569,075,843 instructions # 2.13 insn per cycle - 1.733593807 seconds time elapsed +TOTAL : 1.725707 sec + 4,951,306,612 cycles # 2.866 GHz + 10,571,555,034 instructions # 2.14 insn per cycle + 1.731137280 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2077) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.938989e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.163796e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.163796e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.944088e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.162238e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.162238e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.768049 sec - 5,404,539,268 cycles # 1.950 GHz - 7,804,733,779 instructions # 1.44 insn per cycle - 2.773480694 seconds time elapsed +TOTAL : 2.762921 sec + 5,392,276,499 cycles # 1.949 GHz + 7,806,030,768 instructions # 1.45 insn per cycle + 2.768347372 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1446) (512y: 122) (512z: 1542) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt index 04f32ac3bc..655f8b81f2 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-03_19:41:43 +DATE: 2023-11-09_18:16:27 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,14 +51,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.845624e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.154000e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.267501e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.038584e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.158740e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.268341e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.705622 sec - 2,764,377,825 cycles # 2.955 GHz - 4,322,445,800 instructions # 1.56 insn per cycle - 0.992638570 seconds time elapsed +TOTAL : 0.696596 sec + 2,778,143,738 cycles # 3.016 GHz + 4,350,451,856 instructions # 1.57 insn per cycle + 0.980250782 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -79,14 +79,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.118266e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.179189e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.179189e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.182446e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.245839e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.245839e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.055003 sec - 15,355,352,228 cycles # 3.035 GHz - 38,436,037,499 instructions # 2.50 insn per cycle - 5.060369145 seconds time elapsed +TOTAL : 4.909547 sec + 15,150,996,904 cycles # 3.083 GHz + 38,436,637,567 instructions # 2.54 insn per cycle + 4.914838193 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -106,14 +106,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.619308e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.814626e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.814626e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.688279e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.888858e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.888858e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.002993 sec - 9,098,824,080 cycles # 3.025 GHz - 24,590,228,698 instructions # 2.70 insn per cycle - 3.008485414 seconds time elapsed +TOTAL : 2.947447 sec + 9,111,190,675 cycles # 3.087 GHz + 24,590,939,294 instructions # 2.70 insn per cycle + 2.952793630 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2156) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -133,14 +133,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.738465e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.252767e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.252767e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.931624e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.461725e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.461725e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.934521 sec - 5,491,674,204 cycles # 2.833 GHz - 11,265,170,941 instructions # 2.05 insn per cycle - 1.939950087 seconds time elapsed +TOTAL : 1.871366 sec + 5,440,450,573 cycles # 2.900 GHz + 11,265,206,629 instructions # 2.07 insn per cycle + 1.876659163 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.341479e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.957193e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.957193e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.623582e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.268733e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.268733e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.759206 sec - 4,958,873,003 cycles # 2.811 GHz - 10,570,272,367 instructions # 2.13 insn per cycle - 1.764825335 seconds time elapsed +TOTAL : 1.687174 sec + 4,939,929,910 cycles # 2.920 GHz + 10,570,291,125 instructions # 2.14 insn per cycle + 1.692619999 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2077) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -187,14 +187,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.934828e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.158501e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.158501e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.058474e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.295667e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.295667e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.772888 sec - 5,409,288,056 cycles # 1.948 GHz - 7,806,084,388 instructions # 1.44 insn per cycle - 2.778257755 seconds time elapsed +TOTAL : 2.687752 sec + 5,409,737,421 cycles # 2.010 GHz + 7,805,529,138 instructions # 1.44 insn per cycle + 2.693129228 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1446) (512y: 122) (512z: 1542) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt index 4e3b221e19..e703e9e5d5 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-03_19:03:57 +DATE: 2023-11-09_17:40:13 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.258167e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.174363e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.266024e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.110180e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.174406e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.270579e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.515882 sec - 2,147,525,845 cycles # 2.877 GHz - 3,086,933,024 instructions # 1.44 insn per cycle - 0.803849250 seconds time elapsed +TOTAL : 0.513301 sec + 2,237,705,656 cycles # 3.016 GHz + 3,206,861,926 instructions # 1.43 insn per cycle + 0.799816578 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.170531e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.234097e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.234097e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.213184e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.278334e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.278334e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.935479 sec - 15,016,135,362 cycles # 3.040 GHz - 40,166,123,209 instructions # 2.67 insn per cycle - 4.940913654 seconds time elapsed +TOTAL : 4.840849 sec + 15,026,294,462 cycles # 3.101 GHz + 40,163,846,165 instructions # 2.67 insn per cycle + 4.846092672 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.815308e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.035943e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.035943e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.848578e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.068499e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.068499e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.853658 sec - 8,679,305,567 cycles # 3.037 GHz - 23,688,803,932 instructions # 2.73 insn per cycle - 2.859362026 seconds time elapsed +TOTAL : 2.827567 sec + 8,771,607,406 cycles # 3.097 GHz + 23,683,918,687 instructions # 2.70 insn per cycle + 2.832818835 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2069) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.201194e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.599502e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.599502e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.290749e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.696907e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.696907e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.119971 sec - 6,076,924,812 cycles # 2.860 GHz - 13,078,281,182 instructions # 2.15 insn per cycle - 2.125352086 seconds time elapsed +TOTAL : 2.084491 sec + 6,075,216,707 cycles # 2.908 GHz + 13,074,699,153 instructions # 2.15 insn per cycle + 2.089762357 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2546) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.478450e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.920522e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.920522e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.571274e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.025491e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.025491e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.017570 sec - 5,787,274,892 cycles # 2.862 GHz - 12,336,105,279 instructions # 2.13 insn per cycle - 2.023012261 seconds time elapsed +TOTAL : 1.983621 sec + 5,795,280,725 cycles # 2.915 GHz + 12,334,890,295 instructions # 2.13 insn per cycle + 1.988789955 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2096) (512y: 294) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.519779e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.701184e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.701184e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.706784e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.899846e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.899846e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.086221 sec - 5,817,765,621 cycles # 1.888 GHz - 9,621,068,231 instructions # 1.65 insn per cycle - 3.091564620 seconds time elapsed +TOTAL : 2.932528 sec + 5,816,798,800 cycles # 1.981 GHz + 9,613,398,484 instructions # 1.65 insn per cycle + 2.938057800 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1510) (512y: 209) (512z: 1971) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt index 3337c01ad4..a5c5a0c704 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-03_19:25:16 +DATE: 2023-11-09_17:59:59 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.554755e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.155174e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.268743e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.735374e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.165776e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.275136e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.526687 sec - 2,250,994,801 cycles # 2.926 GHz - 3,097,737,524 instructions # 1.38 insn per cycle - 0.826717654 seconds time elapsed +TOTAL : 0.522181 sec + 2,183,845,501 cycles # 2.897 GHz + 3,063,497,760 instructions # 1.40 insn per cycle + 0.813008083 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.473532e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.556761e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.556761e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.487456e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.573222e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.573222e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.344783 sec - 13,019,193,404 cycles # 2.993 GHz - 34,405,663,599 instructions # 2.64 insn per cycle - 4.350365607 seconds time elapsed +TOTAL : 4.321242 sec + 13,015,032,492 cycles # 3.009 GHz + 34,406,787,342 instructions # 2.64 insn per cycle + 4.326519493 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 686) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.104680e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.249620e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.249620e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.121956e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.266333e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.266333e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.483866 sec - 10,607,531,951 cycles # 3.041 GHz - 24,022,392,993 instructions # 2.26 insn per cycle - 3.489298956 seconds time elapsed +TOTAL : 3.465165 sec + 10,606,115,107 cycles # 3.057 GHz + 24,023,886,202 instructions # 2.27 insn per cycle + 3.470527002 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2582) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.787875e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.125865e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.125865e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.813993e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.151107e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.151107e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.295291 sec - 6,588,895,934 cycles # 2.865 GHz - 12,413,954,044 instructions # 1.88 insn per cycle - 2.300926049 seconds time elapsed +TOTAL : 2.282824 sec + 6,624,207,523 cycles # 2.896 GHz + 12,414,593,585 instructions # 1.87 insn per cycle + 2.288220203 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3156) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.072251e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.445053e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.445053e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.113256e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.489865e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.489865e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.171777 sec - 6,238,931,665 cycles # 2.866 GHz - 11,585,660,605 instructions # 1.86 insn per cycle - 2.177410338 seconds time elapsed +TOTAL : 2.154567 sec + 6,244,302,737 cycles # 2.892 GHz + 11,586,784,905 instructions # 1.86 insn per cycle + 2.160119888 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2692) (512y: 239) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.998110e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.229600e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.229600e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.080168e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.315597e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.315597e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.727363 sec - 5,337,713,756 cycles # 1.954 GHz - 9,308,309,205 instructions # 1.74 insn per cycle - 2.732896997 seconds time elapsed +TOTAL : 2.674256 sec + 5,337,021,373 cycles # 1.992 GHz + 9,309,292,596 instructions # 1.74 insn per cycle + 2.679621915 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2116) (512y: 282) (512z: 1958) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt index 64e33308d5..04c22c3970 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-03_19:25:43 +DATE: 2023-11-09_18:00:26 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.571117e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.157677e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.270835e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.730812e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.162658e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.271522e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.523342 sec - 2,241,527,426 cycles # 2.944 GHz - 3,209,964,665 instructions # 1.43 insn per cycle - 0.819917937 seconds time elapsed +TOTAL : 0.517390 sec + 2,237,231,843 cycles # 2.985 GHz + 3,219,482,821 instructions # 1.44 insn per cycle + 0.806478536 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.658099e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.754988e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.754988e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.686328e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.783817e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.783817e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.049112 sec - 12,374,606,485 cycles # 3.053 GHz - 35,058,016,337 instructions # 2.83 insn per cycle - 4.054549094 seconds time elapsed +TOTAL : 4.006446 sec + 12,372,456,833 cycles # 3.085 GHz + 35,059,205,099 instructions # 2.83 insn per cycle + 4.011874603 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 457) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.088523e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.231607e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.231607e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.113185e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.255090e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.255090e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.500477 sec - 10,694,410,777 cycles # 3.051 GHz - 23,099,336,289 instructions # 2.16 insn per cycle - 3.506159729 seconds time elapsed +TOTAL : 3.471336 sec + 10,684,507,667 cycles # 3.074 GHz + 23,099,965,959 instructions # 2.16 insn per cycle + 3.476724591 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2363) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.105721e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.492220e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.492220e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.172732e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.564192e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.564192e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.158641 sec - 6,163,495,994 cycles # 2.849 GHz - 11,969,488,967 instructions # 1.94 insn per cycle - 2.164367762 seconds time elapsed +TOTAL : 2.130496 sec + 6,169,121,187 cycles # 2.891 GHz + 11,970,628,399 instructions # 1.94 insn per cycle + 2.136000238 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2511) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.169198e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.571659e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.571659e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.314737e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.728928e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.728928e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.133549 sec - 6,039,094,179 cycles # 2.824 GHz - 11,144,077,781 instructions # 1.85 insn per cycle - 2.139096234 seconds time elapsed +TOTAL : 2.076859 sec + 6,006,071,025 cycles # 2.885 GHz + 11,143,550,799 instructions # 1.86 insn per cycle + 2.082481137 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2128) (512y: 174) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.003701e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.233597e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.233597e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.186490e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.434908e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.434908e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.726476 sec - 5,224,063,612 cycles # 1.913 GHz - 9,034,702,359 instructions # 1.73 insn per cycle - 2.732050023 seconds time elapsed +TOTAL : 2.608202 sec + 5,201,388,823 cycles # 1.991 GHz + 9,034,449,537 instructions # 1.74 insn per cycle + 2.613510222 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1651) (512y: 208) (512z: 1567) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 8d92c550fe..b055a915bb 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-03_19:04:25 +DATE: 2023-11-09_17:40:41 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.099342e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.699387e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.953526e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.058988e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.701786e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.976764e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.471293 sec - 2,042,101,644 cycles # 2.948 GHz - 2,946,816,826 instructions # 1.44 insn per cycle - 0.749881107 seconds time elapsed +TOTAL : 0.470897 sec + 2,078,401,117 cycles # 3.001 GHz + 2,953,650,991 instructions # 1.42 insn per cycle + 0.749721776 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.296642e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.371475e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.371475e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.334914e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.410542e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.410542e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.650028 sec - 14,160,157,406 cycles # 3.043 GHz - 38,398,040,352 instructions # 2.71 insn per cycle - 4.655270250 seconds time elapsed +TOTAL : 4.574332 sec + 14,151,959,917 cycles # 3.091 GHz + 38,392,913,322 instructions # 2.71 insn per cycle + 4.579307325 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.139917e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.562152e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.562152e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.213719e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.641599e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.641599e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.124632 sec - 6,476,959,128 cycles # 3.042 GHz - 15,834,256,517 instructions # 2.44 insn per cycle - 2.129768462 seconds time elapsed +TOTAL : 2.094684 sec + 6,471,158,629 cycles # 3.083 GHz + 15,829,971,957 instructions # 2.45 insn per cycle + 2.099849038 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2689) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.088663e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.043198e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.043198e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.559598e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.101002e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.101002e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.237397 sec - 3,465,504,689 cycles # 2.794 GHz - 7,611,207,779 instructions # 2.20 insn per cycle - 1.242588855 seconds time elapsed +TOTAL : 1.179944 sec + 3,466,899,201 cycles # 2.927 GHz + 7,607,183,710 instructions # 2.19 insn per cycle + 1.185084453 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3051) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.457008e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.096549e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.096549e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.023293e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.190211e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.190211e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.196326 sec - 3,247,822,045 cycles # 2.704 GHz - 7,220,309,293 instructions # 2.22 insn per cycle - 1.201704693 seconds time elapsed +TOTAL : 1.106259 sec + 3,248,324,558 cycles # 2.924 GHz + 7,215,751,749 instructions # 2.22 insn per cycle + 1.111467205 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2850) (512y: 23) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.679715e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.389169e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.389169e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.338108e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.142577e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.142577e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.658315 sec - 3,062,288,257 cycles # 1.842 GHz - 5,850,668,317 instructions # 1.91 insn per cycle - 1.663822965 seconds time elapsed +TOTAL : 1.512902 sec + 3,068,145,100 cycles # 2.024 GHz + 5,846,808,445 instructions # 1.91 insn per cycle + 1.518114660 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2364) (512y: 24) (512z: 1889) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt index a1ebef89d2..b4b4f0117a 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-03_19:35:48 +DATE: 2023-11-09_18:10:35 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.064201e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.498245e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.498245e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.332495e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.768677e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.768677e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.670260 sec - 2,637,877,021 cycles # 2.942 GHz - 4,088,256,570 instructions # 1.55 insn per cycle - 0.955124097 seconds time elapsed +TOTAL : 0.657541 sec + 2,664,976,053 cycles # 3.017 GHz + 4,137,029,639 instructions # 1.55 insn per cycle + 0.940709573 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost @@ -86,14 +86,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.270912e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.344925e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.344925e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.284632e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.359311e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.359311e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.744982 sec - 14,378,860,027 cycles # 3.027 GHz - 38,435,472,086 instructions # 2.67 insn per cycle - 4.751370421 seconds time elapsed +TOTAL : 4.717410 sec + 14,339,509,352 cycles # 3.036 GHz + 38,436,261,270 instructions # 2.68 insn per cycle + 4.723588153 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe @@ -114,14 +114,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.017460e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.422989e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.422989e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.161401e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.579571e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.579571e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.222354 sec - 6,685,137,863 cycles # 3.001 GHz - 16,109,819,565 instructions # 2.41 insn per cycle - 2.228696460 seconds time elapsed +TOTAL : 2.160392 sec + 6,674,034,151 cycles # 3.082 GHz + 16,110,239,223 instructions # 2.41 insn per cycle + 2.166483007 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2689) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe @@ -142,14 +142,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.204872e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.057185e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.057185e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.368587e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.075649e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.075649e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.267912 sec - 3,665,496,802 cycles # 2.878 GHz - 7,843,464,752 instructions # 2.14 insn per cycle - 1.274414413 seconds time elapsed +TOTAL : 1.245937 sec + 3,665,898,836 cycles # 2.929 GHz + 7,844,268,726 instructions # 2.14 insn per cycle + 1.252070096 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3051) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe @@ -170,14 +170,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.639653e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.116975e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.116975e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.007320e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.169448e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.169448e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.220373 sec - 3,444,640,052 cycles # 2.810 GHz - 7,451,522,975 instructions # 2.16 insn per cycle - 1.226715796 seconds time elapsed +TOTAL : 1.168797 sec + 3,453,510,139 cycles # 2.941 GHz + 7,453,168,499 instructions # 2.16 insn per cycle + 1.174935345 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2850) (512y: 23) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe @@ -198,14 +198,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.178040e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.972638e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.972638e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.465484e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.304262e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.304262e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.593853 sec - 3,283,201,976 cycles # 2.053 GHz - 6,099,788,393 instructions # 1.86 insn per cycle - 1.600161746 seconds time elapsed +TOTAL : 1.534111 sec + 3,274,248,388 cycles # 2.127 GHz + 6,100,577,921 instructions # 1.86 insn per cycle + 1.540213764 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2364) (512y: 24) (512z: 1889) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt index b7fb0d6959..375a817a79 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-03_19:48:49 +DATE: 2023-11-09_18:23:25 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.431152e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.624289e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.946132e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.824516e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.637814e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.946525e+08 ) sec^-1 MeanMatrixElemValue = ( 2.079446e+00 +- 3.403306e-03 ) GeV^0 -TOTAL : 0.564134 sec - 2,302,613,621 cycles # 2.942 GHz - 3,377,451,746 instructions # 1.47 insn per cycle - 0.841499880 seconds time elapsed +TOTAL : 0.570368 sec + 2,261,358,530 cycles # 2.855 GHz + 3,305,358,456 instructions # 1.46 insn per cycle + 0.849017060 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.289992e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.364715e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.364715e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.325738e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.401234e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.401234e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 4.718949 sec - 14,318,249,819 cycles # 3.032 GHz - 38,421,429,911 instructions # 2.68 insn per cycle - 4.724102129 seconds time elapsed +TOTAL : 4.645582 sec + 14,325,809,375 cycles # 3.082 GHz + 38,422,987,894 instructions # 2.68 insn per cycle + 4.650648560 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.077786e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.487595e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.487595e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.201320e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.630090e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.630090e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 2.204441 sec - 6,639,814,735 cycles # 3.006 GHz - 15,841,902,427 instructions # 2.39 insn per cycle - 2.209539727 seconds time elapsed +TOTAL : 2.152860 sec + 6,643,060,083 cycles # 3.080 GHz + 15,842,584,477 instructions # 2.38 insn per cycle + 2.158023571 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2689) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.307822e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.070999e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.070999e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.450401e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.089441e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.089441e+06 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.265035 sec - 3,649,285,785 cycles # 2.875 GHz - 7,591,137,573 instructions # 2.08 insn per cycle - 1.270319196 seconds time elapsed +TOTAL : 1.246774 sec + 3,643,683,352 cycles # 2.913 GHz + 7,592,040,005 instructions # 2.08 insn per cycle + 1.251723719 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3051) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.974832e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.160037e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.160037e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.014057e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.180349e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.180349e+06 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.191816 sec - 3,426,519,284 cycles # 2.864 GHz - 7,166,067,248 instructions # 2.09 insn per cycle - 1.197132868 seconds time elapsed +TOTAL : 1.172023 sec + 3,431,252,645 cycles # 2.917 GHz + 7,165,511,136 instructions # 2.09 insn per cycle + 1.177142051 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2850) (512y: 23) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.265683e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.068951e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.068951e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.431571e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.259454e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.259454e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.584018 sec - 3,241,188,093 cycles # 2.041 GHz - 5,795,628,367 instructions # 1.79 insn per cycle - 1.589192883 seconds time elapsed +TOTAL : 1.550630 sec + 3,238,644,111 cycles # 2.083 GHz + 5,796,702,494 instructions # 1.79 insn per cycle + 1.555869344 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2364) (512y: 24) (512z: 1889) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt index 30f4fadf92..573aa8a1a6 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-03_19:45:28 +DATE: 2023-11-09_18:20:06 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.447666e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.634082e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.951326e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.875401e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.666103e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.969743e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.513708 sec - 2,149,338,807 cycles # 2.936 GHz - 3,363,855,189 instructions # 1.57 insn per cycle - 0.790810409 seconds time elapsed +TOTAL : 0.505509 sec + 2,155,710,329 cycles # 2.977 GHz + 3,399,528,814 instructions # 1.58 insn per cycle + 0.781726612 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.247364e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.319306e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.319306e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.331654e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.407717e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.407717e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.751713 sec - 14,161,394,696 cycles # 2.978 GHz - 38,393,782,229 instructions # 2.71 insn per cycle - 4.756965371 seconds time elapsed +TOTAL : 4.581584 sec + 14,155,354,915 cycles # 3.087 GHz + 38,394,211,404 instructions # 2.71 insn per cycle + 4.586893992 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.102956e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.519127e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.519127e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.232934e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.666455e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.666455e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.140784 sec - 6,476,072,518 cycles # 3.019 GHz - 15,828,662,766 instructions # 2.44 insn per cycle - 2.146087935 seconds time elapsed +TOTAL : 2.087317 sec + 6,475,857,503 cycles # 3.096 GHz + 15,829,568,301 instructions # 2.44 insn per cycle + 2.092497637 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2689) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.357298e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.077430e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.077430e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.589829e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.103388e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.103388e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.205006 sec - 3,468,184,099 cycles # 2.868 GHz - 7,606,030,531 instructions # 2.19 insn per cycle - 1.210138102 seconds time elapsed +TOTAL : 1.175635 sec + 3,460,928,709 cycles # 2.933 GHz + 7,606,660,397 instructions # 2.20 insn per cycle + 1.180756657 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3051) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.559739e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.106426e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.106426e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.939659e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.155071e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.155071e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.182909 sec - 3,252,386,286 cycles # 2.739 GHz - 7,215,128,616 instructions # 2.22 insn per cycle - 1.188234183 seconds time elapsed +TOTAL : 1.139254 sec + 3,252,781,739 cycles # 2.845 GHz + 7,214,861,555 instructions # 2.22 insn per cycle + 1.144377149 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2850) (512y: 23) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.332938e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.163555e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.163555e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.585994e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.448568e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.448568e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.514986 sec - 3,076,222,583 cycles # 2.024 GHz - 5,845,646,643 instructions # 1.90 insn per cycle - 1.520503790 seconds time elapsed +TOTAL : 1.467081 sec + 3,063,258,508 cycles # 2.082 GHz + 5,845,738,451 instructions # 1.91 insn per cycle + 1.472345808 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2364) (512y: 24) (512z: 1889) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt index 65eed836f1..415792c712 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-03_19:42:11 +DATE: 2023-11-09_18:16:53 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,14 +51,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.910755e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.623741e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.938668e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.158996e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.650796e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.951969e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.613295 sec - 2,456,965,302 cycles # 2.952 GHz - 3,803,211,416 instructions # 1.55 insn per cycle - 0.890835389 seconds time elapsed +TOTAL : 0.604748 sec + 2,477,150,875 cycles # 3.008 GHz + 3,827,452,997 instructions # 1.55 insn per cycle + 0.882802717 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -79,14 +79,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.291712e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.365790e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.365790e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.278227e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.352095e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.352095e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.660750 sec - 14,151,818,953 cycles # 3.034 GHz - 38,392,284,342 instructions # 2.71 insn per cycle - 4.665929439 seconds time elapsed +TOTAL : 4.688006 sec + 14,149,964,703 cycles # 3.016 GHz + 38,393,052,805 instructions # 2.71 insn per cycle + 4.693060305 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe @@ -106,14 +106,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.100691e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.531126e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.531126e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.195001e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.620625e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.620625e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.141262 sec - 6,484,613,456 cycles # 3.022 GHz - 15,829,197,800 instructions # 2.44 insn per cycle - 2.146554392 seconds time elapsed +TOTAL : 2.102860 sec + 6,473,914,859 cycles # 3.072 GHz + 15,829,595,595 instructions # 2.45 insn per cycle + 2.107994821 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2689) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe @@ -133,14 +133,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.341999e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.073892e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.073892e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.498213e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.092419e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.092419e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.207094 sec - 3,469,517,910 cycles # 2.864 GHz - 7,605,958,162 instructions # 2.19 insn per cycle - 1.212334488 seconds time elapsed +TOTAL : 1.186261 sec + 3,464,671,010 cycles # 2.910 GHz + 7,606,636,115 instructions # 2.20 insn per cycle + 1.191422669 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3051) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.000164e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.163047e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.163047e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.018341e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.184021e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.184021e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.132933 sec - 3,264,238,503 cycles # 2.869 GHz - 7,214,964,009 instructions # 2.21 insn per cycle - 1.138315941 seconds time elapsed +TOTAL : 1.113056 sec + 3,253,634,801 cycles # 2.912 GHz + 7,214,825,947 instructions # 2.22 insn per cycle + 1.118242022 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2850) (512y: 23) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe @@ -187,14 +187,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.339791e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.166023e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.166023e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.525206e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.371211e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.371211e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.514355 sec - 3,071,490,694 cycles # 2.022 GHz - 5,845,279,944 instructions # 1.90 insn per cycle - 1.519539150 seconds time elapsed +TOTAL : 1.477022 sec + 3,066,754,541 cycles # 2.070 GHz + 5,845,673,759 instructions # 1.91 insn per cycle + 1.482222084 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2364) (512y: 24) (512z: 1889) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt index 06d8f7d09d..dbd0c88759 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-03_19:04:48 +DATE: 2023-11-09_17:41:04 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.108032e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.751852e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.017010e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.062894e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.751636e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.032491e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.473084 sec - 2,025,626,323 cycles # 2.920 GHz - 2,923,341,053 instructions # 1.44 insn per cycle - 0.752440698 seconds time elapsed +TOTAL : 0.471370 sec + 2,069,916,039 cycles # 2.986 GHz + 2,893,797,319 instructions # 1.40 insn per cycle + 0.749930288 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.226197e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.296658e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.296658e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.241816e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.314075e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.314075e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.795639 sec - 14,422,319,778 cycles # 3.005 GHz - 39,889,404,210 instructions # 2.77 insn per cycle - 4.800761254 seconds time elapsed +TOTAL : 4.763197 sec + 14,419,363,408 cycles # 3.025 GHz + 39,885,822,805 instructions # 2.77 insn per cycle + 4.768145939 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 570) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.840353e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.410043e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.410043e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.077159e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.666017e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.666017e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 1.880181 sec - 5,610,891,745 cycles # 2.978 GHz - 15,305,908,167 instructions # 2.73 insn per cycle - 1.885354787 seconds time elapsed +TOTAL : 1.809175 sec + 5,591,744,554 cycles # 3.083 GHz + 15,300,029,522 instructions # 2.74 insn per cycle + 1.814409785 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2473) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.584020e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.270908e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.270908e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.801496e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.504366e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.504366e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.679496 sec - 4,739,407,479 cycles # 2.814 GHz - 9,752,382,085 instructions # 2.06 insn per cycle - 1.685063058 seconds time elapsed +TOTAL : 1.624464 sec + 4,741,141,330 cycles # 2.911 GHz + 9,747,661,132 instructions # 2.06 insn per cycle + 1.629561959 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3710) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.785300e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.495008e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.495008e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.005329e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.745480e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.745480e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.630325 sec - 4,628,420,386 cycles # 2.831 GHz - 9,343,264,044 instructions # 2.02 insn per cycle - 1.635531127 seconds time elapsed +TOTAL : 1.578447 sec + 4,623,271,493 cycles # 2.921 GHz + 9,339,033,786 instructions # 2.02 insn per cycle + 1.583594825 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3497) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.035393e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.577354e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.577354e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.210537e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.774289e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.774289e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.821625 sec - 3,652,061,133 cycles # 2.000 GHz - 7,049,331,376 instructions # 1.93 insn per cycle - 1.826875192 seconds time elapsed +TOTAL : 1.770504 sec + 3,648,791,259 cycles # 2.056 GHz + 7,045,498,641 instructions # 1.93 insn per cycle + 1.775670307 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2606) (512y: 12) (512z: 2221) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt index 430bbd2c8e..c0790b6e36 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-03_19:26:11 +DATE: 2023-11-09_18:00:53 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.386931e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.620878e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.939459e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.858794e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.673199e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.981057e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.478570 sec - 2,066,322,031 cycles # 2.937 GHz - 2,939,169,205 instructions # 1.42 insn per cycle - 0.760998289 seconds time elapsed +TOTAL : 0.474263 sec + 2,125,063,536 cycles # 3.002 GHz + 3,025,852,918 instructions # 1.42 insn per cycle + 0.764897313 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.585659e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.679951e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.679951e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.589894e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.682971e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.682971e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.141107 sec - 12,606,870,018 cycles # 3.041 GHz - 34,392,677,682 instructions # 2.73 insn per cycle - 4.146310630 seconds time elapsed +TOTAL : 4.133418 sec + 12,609,458,975 cycles # 3.048 GHz + 34,395,001,210 instructions # 2.73 insn per cycle + 4.138439483 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 696) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.476247e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.957210e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.957210e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.435122e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.914251e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.914251e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.000613 sec - 6,098,731,252 cycles # 3.041 GHz - 14,873,462,613 instructions # 2.44 insn per cycle - 2.006051106 seconds time elapsed +TOTAL : 2.014773 sec + 6,085,710,075 cycles # 3.014 GHz + 14,874,327,590 instructions # 2.44 insn per cycle + 2.020198945 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3009) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.182448e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.992665e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.992665e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.550169e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.423492e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.423492e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.544245 sec - 4,326,302,580 cycles # 2.793 GHz - 9,041,454,033 instructions # 2.09 insn per cycle - 1.549495391 seconds time elapsed +TOTAL : 1.471458 sec + 4,290,277,982 cycles # 2.907 GHz + 9,041,954,393 instructions # 2.11 insn per cycle + 1.476543510 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4445) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.602793e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.504278e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.504278e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.705610e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.621776e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.621776e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.462983 sec - 4,209,847,303 cycles # 2.868 GHz - 8,675,528,842 instructions # 2.06 insn per cycle - 1.468300337 seconds time elapsed +TOTAL : 1.443048 sec + 4,208,694,980 cycles # 2.909 GHz + 8,677,287,895 instructions # 2.06 insn per cycle + 1.448442097 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4244) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.697162e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.177263e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.177263e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.842247e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.341676e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.341676e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.925379 sec - 3,842,178,645 cycles # 1.991 GHz - 7,819,452,293 instructions # 2.04 insn per cycle - 1.930845155 seconds time elapsed +TOTAL : 1.878702 sec + 3,847,091,668 cycles # 2.044 GHz + 7,820,914,226 instructions # 2.03 insn per cycle + 1.883936977 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4420) (512y: 0) (512z: 2556) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt index c32244c33c..a8fdecb532 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-03_19:26:34 +DATE: 2023-11-09_18:01:17 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.460575e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.684792e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.012555e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.862525e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.715295e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.030318e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.478960 sec - 2,073,686,428 cycles # 2.952 GHz - 2,982,309,893 instructions # 1.44 insn per cycle - 0.760465246 seconds time elapsed +TOTAL : 0.474547 sec + 2,129,920,938 cycles # 3.015 GHz + 3,022,843,622 instructions # 1.42 insn per cycle + 0.763729431 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.768420e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.879887e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.879887e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.720126e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.824009e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.824009e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 3.874951 sec - 11,759,850,982 cycles # 3.031 GHz - 35,129,174,459 instructions # 2.99 insn per cycle - 3.880406297 seconds time elapsed +TOTAL : 3.938814 sec + 11,787,930,920 cycles # 2.995 GHz + 35,134,515,128 instructions # 2.98 insn per cycle + 3.943783291 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 470) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.548911e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.058975e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.058975e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.688740e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.207831e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.207831e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 1.977553 sec - 5,960,287,184 cycles # 3.008 GHz - 14,484,169,544 instructions # 2.43 insn per cycle - 1.983134337 seconds time elapsed +TOTAL : 1.927645 sec + 5,955,477,747 cycles # 3.083 GHz + 14,483,875,890 instructions # 2.43 insn per cycle + 1.932605425 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2572) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.662372e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.600563e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.600563e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.792092e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.717382e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.717382e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.451994 sec - 4,186,509,528 cycles # 2.874 GHz - 8,887,826,504 instructions # 2.12 insn per cycle - 1.457581768 seconds time elapsed +TOTAL : 1.428222 sec + 4,172,426,658 cycles # 2.912 GHz + 8,888,638,577 instructions # 2.13 insn per cycle + 1.433579963 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3576) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.782199e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.721549e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.721549e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.830183e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.777369e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.777369e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.432127 sec - 4,128,776,992 cycles # 2.874 GHz - 8,424,271,434 instructions # 2.04 insn per cycle - 1.437420732 seconds time elapsed +TOTAL : 1.421326 sec + 4,143,555,691 cycles # 2.906 GHz + 8,424,122,393 instructions # 2.03 insn per cycle + 1.426420575 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3320) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.779314e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.273574e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.273574e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.911357e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.422090e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.422090e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.899022 sec - 3,798,792,191 cycles # 1.996 GHz - 7,712,429,012 instructions # 2.03 insn per cycle - 1.904382082 seconds time elapsed +TOTAL : 1.856974 sec + 3,783,077,119 cycles # 2.033 GHz + 7,713,045,733 instructions # 2.04 insn per cycle + 1.862087187 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3436) (512y: 0) (512z: 2108) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index 4284e04c80..bc7d9de588 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-03_19:05:13 +DATE: 2023-11-09_17:41:28 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.262595e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.173145e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.266137e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.109904e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.171630e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.269382e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.516288 sec - 2,170,206,194 cycles # 2.914 GHz - 3,121,753,700 instructions # 1.44 insn per cycle - 0.802206987 seconds time elapsed +TOTAL : 0.511961 sec + 2,222,105,943 cycles # 3.001 GHz + 3,180,029,506 instructions # 1.43 insn per cycle + 0.797868325 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 @@ -77,14 +77,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.129811e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.193121e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.193121e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.142732e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.204413e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.204413e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.027744 sec - 15,293,663,581 cycles # 3.040 GHz - 38,642,438,156 instructions # 2.53 insn per cycle - 5.032856601 seconds time elapsed +TOTAL : 4.999043 sec + 15,266,738,883 cycles # 3.052 GHz + 38,639,692,678 instructions # 2.53 insn per cycle + 5.004417103 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 672) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.666972e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.869148e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.869148e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.675686e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.874485e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.874485e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.964411 sec - 8,933,093,188 cycles # 3.009 GHz - 24,243,353,502 instructions # 2.71 insn per cycle - 2.969821465 seconds time elapsed +TOTAL : 2.956696 sec + 8,943,278,567 cycles # 3.020 GHz + 24,239,461,473 instructions # 2.71 insn per cycle + 2.961985342 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2188) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.660709e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.167400e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.167400e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.810568e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.309343e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.309343e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.961588 sec - 5,410,079,541 cycles # 2.752 GHz - 11,291,080,205 instructions # 2.09 insn per cycle - 1.966921243 seconds time elapsed +TOTAL : 1.907942 sec + 5,390,382,442 cycles # 2.818 GHz + 11,287,870,279 instructions # 2.09 insn per cycle + 1.913175131 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2480) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.588007e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.231756e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.231756e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.736389e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.412733e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.412733e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.695283 sec - 4,860,759,917 cycles # 2.859 GHz - 10,541,284,808 instructions # 2.17 insn per cycle - 1.700590360 seconds time elapsed +TOTAL : 1.660808 sec + 4,859,407,660 cycles # 2.918 GHz + 10,535,709,652 instructions # 2.17 insn per cycle + 1.666185530 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2167) (512y: 148) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.107588e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.350535e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.350535e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.170238e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.418556e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.418556e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.656629 sec - 5,204,386,075 cycles # 1.956 GHz - 7,617,502,706 instructions # 1.46 insn per cycle - 2.661905103 seconds time elapsed +TOTAL : 2.618902 sec + 5,253,729,468 cycles # 2.003 GHz + 7,613,729,309 instructions # 1.45 insn per cycle + 2.624316082 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1633) (512y: 126) (512z: 1608) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt index 58d2d743b0..008a5e172d 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-03_19:05:40 +DATE: 2023-11-09_17:41:55 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.265506e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.176728e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.270375e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.128890e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.181968e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.279178e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.513169 sec - 2,175,922,923 cycles # 2.936 GHz - 3,154,957,492 instructions # 1.45 insn per cycle - 0.799013980 seconds time elapsed +TOTAL : 0.513214 sec + 2,219,973,022 cycles # 2.991 GHz + 3,202,428,118 instructions # 1.44 insn per cycle + 0.799522630 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 @@ -77,14 +77,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.110999e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.171227e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.171227e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.124085e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.184530e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.184530e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.072155 sec - 15,377,556,110 cycles # 3.029 GHz - 40,435,905,161 instructions # 2.63 insn per cycle - 5.077406066 seconds time elapsed +TOTAL : 5.039921 sec + 15,384,037,518 cycles # 3.050 GHz + 40,433,132,851 instructions # 2.63 insn per cycle + 5.045085372 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.761885e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.974310e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.974310e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.855191e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.079392e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.079392e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.891901 sec - 8,516,736,770 cycles # 2.941 GHz - 23,273,421,536 instructions # 2.73 insn per cycle - 2.897134410 seconds time elapsed +TOTAL : 2.823965 sec + 8,503,215,845 cycles # 3.006 GHz + 23,269,764,862 instructions # 2.74 insn per cycle + 2.829223148 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2091) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.041812e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.416387e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.416387e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.125017e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.510855e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.510855e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.184891 sec - 6,239,964,038 cycles # 2.850 GHz - 12,976,938,369 instructions # 2.08 insn per cycle - 2.190210603 seconds time elapsed +TOTAL : 2.149257 sec + 6,265,408,652 cycles # 2.910 GHz + 12,973,997,697 instructions # 2.07 insn per cycle + 2.154583439 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2669) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.262419e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.673980e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.673980e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.427179e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.860121e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.860121e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.097286 sec - 5,931,604,060 cycles # 2.822 GHz - 12,254,844,972 instructions # 2.07 insn per cycle - 2.102596228 seconds time elapsed +TOTAL : 2.035544 sec + 5,944,578,726 cycles # 2.915 GHz + 12,250,352,313 instructions # 2.06 insn per cycle + 2.040880399 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2209) (512y: 296) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.636806e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.830983e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.830983e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.896609e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.113493e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.113493e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.989274 sec - 5,599,763,733 cycles # 1.871 GHz - 8,758,209,944 instructions # 1.56 insn per cycle - 2.994808333 seconds time elapsed +TOTAL : 2.794321 sec + 5,604,210,205 cycles # 2.003 GHz + 8,753,670,387 instructions # 1.56 insn per cycle + 2.799501421 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1490) (512y: 183) (512z: 1909) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index c973ded005..a6a310dca7 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-03_19:06:08 +DATE: 2023-11-09_17:42:23 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.987778e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.047089e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.059978e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.987135e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.050792e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.063302e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.462314 sec - 1,969,733,176 cycles # 2.915 GHz - 2,854,417,454 instructions # 1.45 insn per cycle - 0.732902295 seconds time elapsed +TOTAL : 0.461215 sec + 2,013,982,440 cycles # 2.996 GHz + 2,888,271,641 instructions # 1.43 insn per cycle + 0.731639311 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.125374e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.318187e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.329149e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.121271e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.323663e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.335167e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.595579 sec - 2,446,683,532 cycles # 2.952 GHz - 3,726,903,800 instructions # 1.52 insn per cycle - 0.888429467 seconds time elapsed +TOTAL : 0.596567 sec + 2,489,603,363 cycles # 2.997 GHz + 3,769,346,991 instructions # 1.51 insn per cycle + 0.890614911 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.543975e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.556543e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.556543e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.576698e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.589005e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.589005e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.463148 sec - 19,697,684,289 cycles # 3.046 GHz - 59,611,728,869 instructions # 3.03 insn per cycle - 6.467313414 seconds time elapsed +TOTAL : 6.380855 sec + 19,728,048,826 cycles # 3.090 GHz + 59,610,032,345 instructions # 3.02 insn per cycle + 6.384875624 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1466) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.806236e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.850408e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.850408e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.837473e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.882254e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.882254e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.430883 sec - 10,361,092,942 cycles # 3.017 GHz - 30,679,655,225 instructions # 2.96 insn per cycle - 3.435128458 seconds time elapsed +TOTAL : 3.409518 sec + 10,359,121,121 cycles # 3.036 GHz + 30,679,203,213 instructions # 2.96 insn per cycle + 3.413745701 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5153) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.723128e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.902993e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.902993e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.786469e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.964416e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.964416e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.707466 sec - 4,879,146,362 cycles # 2.851 GHz - 11,021,709,924 instructions # 2.26 insn per cycle - 1.711937944 seconds time elapsed +TOTAL : 1.696222 sec + 4,887,496,480 cycles # 2.875 GHz + 11,021,602,656 instructions # 2.26 insn per cycle + 1.700511665 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4467) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.083664e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.105516e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.105516e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.093744e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.115987e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.115987e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.533989 sec - 4,371,523,225 cycles # 2.843 GHz - 10,299,869,041 instructions # 2.36 insn per cycle - 1.538284203 seconds time elapsed +TOTAL : 1.520406 sec + 4,369,323,760 cycles # 2.867 GHz + 10,298,269,078 instructions # 2.36 insn per cycle + 1.524718704 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 
4137) (512y: 91) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.583252e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.691167e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.691167e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.753883e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.865687e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.865687e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.184881 sec - 4,101,268,943 cycles # 1.874 GHz - 5,846,549,953 instructions # 1.43 insn per cycle - 2.189162148 seconds time elapsed +TOTAL : 2.137350 sec + 4,099,012,031 cycles # 1.915 GHz + 5,845,815,520 instructions # 1.43 insn per cycle + 2.141590310 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1540) (512y: 95) (512z: 3466) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt index cc88ce6db1..47e341807c 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-03_19:36:12 +DATE: 2023-11-09_18:10:58 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.617150e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.773641e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.773641e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.707712e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.862456e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.862456e+06 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.490872 sec - 2,070,161,118 cycles # 2.946 GHz - 3,152,579,676 instructions # 1.52 insn per cycle - 0.759960652 seconds time elapsed +TOTAL : 0.491938 sec + 2,095,418,329 cycles # 2.943 GHz + 3,181,792,573 instructions # 1.52 insn per cycle + 0.771165711 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -80,14 +80,14 @@ WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.687018e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.487518e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.487518e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.763222e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.617411e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.617411e+06 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.832612 sec - 3,193,307,533 cycles # 2.947 GHz - 4,978,788,975 instructions # 1.56 insn per cycle - 1.143205796 seconds time elapsed +TOTAL : 0.818666 sec + 3,177,291,822 cycles # 2.975 GHz + 5,098,451,441 instructions # 1.60 insn per cycle + 1.129356217 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.529162e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.541866e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.541866e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.524248e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.536588e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.536588e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.507705 sec - 19,736,202,639 cycles # 3.031 GHz - 59,616,040,959 instructions # 3.02 insn per cycle - 6.512416242 seconds time elapsed +TOTAL : 6.519126 sec + 19,771,628,211 cycles # 3.032 GHz + 59,619,366,283 instructions # 3.02 insn per cycle + 6.523440391 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1466) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe @@ -132,14 +132,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.815393e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.861165e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.861165e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.881918e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.927973e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.927973e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.431600 sec - 10,398,990,181 cycles # 3.027 GHz - 30,726,516,620 instructions # 2.95 insn per cycle - 3.436080496 seconds time elapsed +TOTAL : 3.385706 sec + 10,402,667,023 cycles # 3.069 GHz + 30,728,506,666 instructions # 2.95 insn per cycle + 3.390173573 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5153) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.253880e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.426870e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.426870e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.797699e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.978652e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.978652e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.802152 sec - 4,928,997,803 cycles # 2.730 GHz - 11,072,368,065 instructions # 2.25 insn per cycle - 1.806633331 seconds time elapsed +TOTAL : 1.701840 sec + 4,920,530,137 cycles # 2.885 GHz + 11,072,335,054 instructions # 2.25 insn per cycle + 1.706256708 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4467) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe @@ -188,14 +188,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.076136e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.098656e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.098656e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.099458e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.122078e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.122078e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.553423 sec - 4,411,400,335 cycles # 2.833 GHz - 10,349,798,385 instructions # 2.35 insn per cycle - 1.557941492 seconds time elapsed +TOTAL : 1.518361 sec + 4,398,354,549 cycles # 2.890 GHz + 10,347,368,561 instructions # 2.35 insn per cycle + 1.522642923 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4137) (512y: 91) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe @@ -216,14 +216,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.266833e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.375233e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.375233e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.773044e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.885749e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.885749e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.287929 sec - 4,148,582,308 cycles # 1.811 GHz - 5,885,924,420 instructions # 1.42 insn per cycle - 2.292472050 seconds time elapsed +TOTAL : 2.139176 sec + 4,134,059,026 cycles # 1.929 GHz + 5,885,050,529 instructions # 1.42 insn per cycle + 2.143583199 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1540) (512y: 95) (512z: 3466) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt index 890a9e444f..de9a4f17b0 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-03_19:06:37 +DATE: 2023-11-09_17:42:52 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.934806e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.040123e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.052620e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.944811e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.043287e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.055886e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.460430 sec - 1,973,324,046 cycles # 2.928 GHz - 2,840,856,751 instructions # 1.44 insn per cycle - 0.731489352 seconds time elapsed +TOTAL : 0.460458 sec + 2,026,281,331 cycles # 3.005 GHz + 2,900,924,761 instructions # 1.43 insn per cycle + 0.731473724 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.120884e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.312101e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.322916e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.115492e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.315818e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.327216e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.593653 sec - 2,438,307,110 cycles # 2.956 GHz - 3,770,815,852 instructions # 1.55 insn per cycle - 0.884294118 seconds time elapsed +TOTAL : 0.589562 sec + 2,467,189,653 cycles # 3.006 GHz + 3,742,728,616 instructions # 1.52 insn per cycle + 0.882301885 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.568377e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.581048e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.581048e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.562701e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.575539e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.575539e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.401933 sec - 19,482,758,220 cycles # 3.042 GHz - 58,802,978,389 instructions # 3.02 insn per cycle - 6.406140471 seconds time elapsed +TOTAL : 6.415806 sec + 19,556,589,093 cycles # 3.047 GHz + 58,802,097,142 instructions # 3.01 insn per cycle + 6.419943255 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1313) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.917983e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.963815e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.963815e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.964793e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.010479e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.010479e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.353380 sec - 10,239,214,469 cycles # 3.050 GHz - 30,351,045,797 instructions # 2.96 insn per cycle - 3.357673213 seconds time elapsed +TOTAL : 3.321576 sec + 10,234,879,480 cycles # 3.078 GHz + 30,349,718,565 instructions # 2.97 insn per cycle + 3.325925546 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4970) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.402320e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.570383e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.570383e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.508412e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.675254e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.675254e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.764710 sec - 5,042,998,580 cycles # 2.852 GHz - 11,486,615,235 instructions # 2.28 insn per cycle - 1.768978894 seconds time elapsed +TOTAL : 1.744975 sec + 5,046,123,954 cycles # 2.887 GHz + 11,486,788,981 instructions # 2.28 insn per cycle + 1.749151834 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4591) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.003860e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.023445e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.023445e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.033659e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.053692e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.053692e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.654433 sec - 4,647,317,234 cycles # 2.803 GHz - 10,844,918,785 instructions # 2.33 insn per cycle - 1.658681615 seconds time elapsed +TOTAL : 1.606653 sec + 4,645,095,124 cycles # 2.885 GHz + 10,843,590,320 instructions # 2.33 insn per cycle + 1.610949978 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4183) (512y: 244) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.419133e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.526721e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.526721e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.741864e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.853507e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.853507e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.233568 sec - 4,119,227,015 cycles # 1.842 GHz - 6,111,995,104 instructions # 1.48 insn per cycle - 2.238507475 seconds time elapsed +TOTAL : 2.140995 sec + 4,112,867,345 cycles # 1.919 GHz + 6,110,383,002 instructions # 1.49 insn per cycle + 2.145162136 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1457) (512y: 139) (512z: 3568) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index 906002ccef..f7b3cf47d9 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-03_19:07:06 +DATE: 2023-11-09_17:43:21 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.570718e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.332431e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.423909e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.559244e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.332615e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.416599e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 -TOTAL : 0.445719 sec - 1,977,839,409 cycles # 2.946 GHz - 2,766,831,818 instructions # 1.40 insn per cycle - 0.728762524 seconds time elapsed +TOTAL : 0.442703 sec + 1,956,548,432 cycles # 2.973 GHz + 2,743,818,395 instructions # 1.40 insn per cycle + 0.717328196 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 254 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.444258e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.461256e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.527187e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.415878e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.488188e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.558288e+07 ) sec^-1 MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 -TOTAL : 0.490311 sec - 2,098,277,441 cycles # 2.940 GHz - 3,050,395,563 instructions # 1.45 insn per cycle - 0.771282830 seconds time elapsed +TOTAL : 0.487204 sec + 2,131,239,677 cycles # 3.000 GHz + 3,082,245,234 instructions # 1.45 insn per cycle + 0.768130616 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -86,9 +86,9 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions - 32,139,063 cycles # 2.763 GHz - 49,369,582 instructions # 1.54 insn per cycle - 0.012019390 seconds time elapsed + 31,971,805 cycles # 2.811 GHz + 48,583,386 instructions # 1.52 insn per cycle + 0.011876482 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1034) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt index afa8c22c25..e1663755b4 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-03_19:36:42 +DATE: 2023-11-09_18:11:27 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.935100e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.139273e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.139273e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.114759e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.213627e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.213627e+07 ) sec^-1 MeanMatrixElemValue = ( 1.009071e+02 +- 5.002295e+01 ) GeV^-2 -TOTAL : 0.458037 sec - 1,958,659,698 cycles # 2.936 GHz - 2,907,533,469 instructions # 1.48 insn per cycle - 0.726231579 seconds time elapsed +TOTAL : 0.453382 sec + 1,979,110,250 cycles # 2.985 GHz + 2,941,718,851 instructions # 1.49 insn per cycle + 0.719982475 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -80,14 +80,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.639472e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.576828e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.576828e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.789515e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.657512e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.657512e+07 ) sec^-1 MeanMatrixElemValue = ( 6.737500e+02 +- 4.776370e+02 ) GeV^-2 -TOTAL : 0.638235 sec - 2,567,083,186 cycles # 2.951 GHz - 3,965,073,751 instructions # 1.54 insn per cycle - 0.927254467 seconds time elapsed +TOTAL : 0.632326 sec + 2,585,787,492 cycles # 3.000 GHz + 3,972,159,776 instructions # 1.54 insn per cycle + 0.920056111 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -99,9 +99,9 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) - 38,813,158 cycles # 2.791 GHz - 52,008,055 instructions # 1.34 insn per cycle - 0.014463641 seconds time elapsed + 38,570,643 cycles # 2.885 GHz + 52,119,941 instructions # 1.35 insn per cycle + 0.013856202 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1034) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt index e0c37ae81b..e8b37410be 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-03_19:07:15 +DATE: 2023-11-09_17:43:30 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.552711e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.312060e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.409477e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.567326e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.333824e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.424930e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 -TOTAL : 0.443645 sec - 1,939,887,285 cycles # 2.958 GHz - 2,753,223,301 instructions # 1.42 insn per cycle - 0.713433638 seconds time elapsed +TOTAL : 0.444883 sec + 1,998,454,742 cycles # 2.980 GHz + 2,813,430,207 instructions # 1.41 insn per cycle + 0.728667460 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 248 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.420862e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.422248e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.487501e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.379215e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.422915e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.490315e+07 ) sec^-1 MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 -TOTAL : 0.489840 sec - 2,095,642,051 cycles # 2.944 GHz - 3,058,032,700 instructions # 1.46 insn per cycle - 0.771189239 seconds time elapsed +TOTAL : 0.488237 sec + 2,124,585,750 cycles # 2.987 GHz + 3,077,258,575 instructions # 1.45 insn per cycle + 0.769041859 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 @@ -86,9 +86,9 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions - 31,454,006 cycles # 2.782 GHz - 48,514,001 instructions # 1.54 insn per cycle - 0.011695448 seconds time elapsed + 31,375,066 cycles # 2.814 GHz + 47,697,134 instructions # 1.52 insn per cycle + 0.011523392 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1029) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index 9bd85e98d0..aa3d979423 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-03_19:07:25 +DATE: 2023-11-09_17:43:40 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.981637e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.050998e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.064107e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.974532e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.049892e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.062592e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.460239 sec - 1,991,164,692 cycles # 2.956 GHz - 2,861,513,835 instructions # 1.44 insn per cycle - 0.731121053 seconds time elapsed +TOTAL : 0.466235 sec + 1,982,451,794 cycles # 2.881 GHz + 2,904,128,689 instructions # 1.46 insn per cycle + 0.746029193 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.125939e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.318916e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.329956e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.118841e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.320828e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.332362e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.595711 sec - 2,444,157,832 cycles # 2.957 GHz - 3,696,457,333 instructions # 1.51 insn per cycle - 0.888026518 seconds time elapsed +TOTAL : 0.602851 sec + 2,418,002,144 cycles # 2.873 GHz + 3,684,858,061 instructions # 1.52 insn per cycle + 0.899181828 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 @@ -86,9 +86,9 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions - 35,021,922 cycles # 2.756 GHz - 50,809,631 instructions # 1.45 insn per cycle - 0.013111359 seconds time elapsed + 34,749,440 cycles # 2.771 GHz + 50,090,467 instructions # 1.44 insn per cycle + 0.013126058 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1399) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt index 659836495f..fa1b7c54dc 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-03_19:07:34 +DATE: 2023-11-09_17:43:49 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.948465e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.041856e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.054410e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.943854e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.040668e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.053565e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.460434 sec - 1,981,925,545 cycles # 2.941 GHz - 2,855,578,890 instructions # 1.44 insn per cycle - 0.731466835 seconds time elapsed +TOTAL : 0.466224 sec + 1,967,855,817 cycles # 2.845 GHz + 2,813,069,845 instructions # 1.43 insn per cycle + 0.750240924 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.114794e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.303596e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.314294e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.108193e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.305249e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.316509e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.592739 sec - 2,423,209,817 cycles # 2.940 GHz - 3,698,114,761 instructions # 1.53 insn per cycle - 0.885260737 seconds time elapsed +TOTAL : 0.594738 sec + 2,473,036,884 cycles # 2.994 GHz + 3,768,499,475 instructions # 1.52 insn per cycle + 0.886746599 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 @@ -86,9 +86,9 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions - 34,542,827 cycles # 2.778 GHz - 50,097,141 instructions # 1.45 insn per cycle - 0.012808089 seconds time elapsed + 34,257,253 cycles # 2.793 GHz + 49,140,913 instructions # 1.43 insn per cycle + 0.012667194 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1276) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index a9f9e7f9b0..5de2ca45d8 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-03_19:07:44 +DATE: 2023-11-09_17:43:59 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.471280e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.495513e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.497667e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.498898e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.522792e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.525024e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.521778 sec - 2,221,753,731 cycles # 2.953 GHz - 3,509,979,793 instructions # 1.58 insn per cycle - 0.811888374 seconds time elapsed +TOTAL : 0.521868 sec + 2,246,075,293 cycles # 2.975 GHz + 3,415,991,617 instructions # 1.52 insn per cycle + 0.815510814 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.130694e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.157314e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.158457e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.122388e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.150135e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.151328e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.024926 sec - 9,877,023,451 cycles # 3.016 GHz - 20,938,621,148 instructions # 2.12 insn per cycle - 3.332222792 seconds time elapsed +TOTAL : 3.026853 sec + 9,913,864,058 cycles # 3.024 GHz + 22,195,735,281 instructions # 2.24 insn per cycle + 3.335346642 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.942881e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.943811e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.943811e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.927075e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.927983e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.927983e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.450914 sec - 25,661,004,969 cycles # 3.035 GHz - 78,943,064,293 instructions # 3.08 insn per cycle - 8.455241133 seconds time elapsed +TOTAL : 8.520375 sec + 25,675,362,415 cycles # 3.013 GHz + 78,943,710,554 instructions # 3.07 insn per cycle + 8.524455360 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4892) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.566286e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.569647e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.569647e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.557363e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.560585e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.560585e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.607952 sec - 12,925,846,736 cycles # 2.803 GHz - 39,287,875,718 instructions # 3.04 insn per cycle - 4.612260028 seconds time elapsed +TOTAL : 4.619361 sec + 12,935,854,234 cycles # 2.798 GHz + 39,286,025,399 instructions # 3.04 insn per cycle + 4.623706542 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13182) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.376392e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.393376e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.393376e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.091948e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.108522e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.108522e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.967322 sec - 5,576,808,906 cycles # 2.829 GHz - 13,690,679,702 instructions # 2.45 insn per cycle - 1.971661788 seconds time elapsed +TOTAL : 2.036281 sec + 5,584,766,890 cycles # 2.738 GHz + 13,690,141,249 instructions # 2.45 insn per cycle + 2.040702440 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.568825e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.591271e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.591271e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.675809e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.698948e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.698948e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.723570 sec - 4,897,962,779 cycles # 2.836 GHz - 12,345,795,320 instructions # 2.52 insn per cycle - 1.727906957 seconds time elapsed +TOTAL : 1.704074 sec + 4,897,181,740 cycles # 2.868 GHz + 12,344,518,245 instructions # 2.52 insn per cycle + 1.708309061 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) 
(avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.463403e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.476893e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.476893e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.632146e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.645889e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.645889e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.207008 sec - 4,113,706,051 cycles # 1.861 GHz - 6,338,446,257 instructions # 1.54 insn per cycle - 2.211395304 seconds time elapsed +TOTAL : 2.158505 sec + 4,118,735,499 cycles # 1.905 GHz + 6,336,932,858 instructions # 1.54 insn per cycle + 2.162776211 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt index 05b9b7b471..322fb0150d 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-03_19:37:26 +DATE: 2023-11-09_18:12:12 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.138586e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.475297e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.475297e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.165662e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.477249e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.477249e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.514369 sec - 2,174,774,169 cycles # 2.935 GHz - 3,408,753,270 instructions # 1.57 insn per cycle - 0.802511668 seconds time elapsed +TOTAL : 0.512387 sec + 2,201,868,575 cycles # 2.980 GHz + 3,430,381,187 instructions # 1.56 insn per cycle + 0.801238529 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -80,14 +80,14 @@ WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.635405e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.119639e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.119639e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.642632e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.111769e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.111769e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.311178 sec - 10,730,531,324 cycles # 2.994 GHz - 24,179,707,994 instructions # 2.25 insn per cycle - 3.640277810 seconds time elapsed +TOTAL : 3.299595 sec + 10,919,109,000 cycles # 3.053 GHz + 24,319,272,982 instructions # 2.23 insn per cycle + 3.633626468 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.906612e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.907549e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.907549e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.957325e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.958258e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.958258e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.615680 sec - 25,666,310,685 cycles # 2.978 GHz - 78,949,148,944 instructions # 3.08 insn per cycle - 8.620265583 seconds time elapsed +TOTAL : 8.392728 sec + 25,662,881,797 cycles # 3.059 GHz + 78,952,840,684 instructions # 3.08 insn per cycle + 8.396994023 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4892) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe @@ -132,14 +132,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.685334e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.688850e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.688850e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.730470e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.733980e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.733980e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.463406 sec - 12,942,626,026 cycles # 2.897 GHz - 39,297,696,719 instructions # 3.04 insn per cycle - 4.468216686 seconds time elapsed +TOTAL : 4.409754 sec + 12,949,002,647 cycles # 2.934 GHz + 39,297,510,156 instructions # 3.03 insn per cycle + 4.414215325 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13182) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.403877e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.422097e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.422097e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.533999e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.551780e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.551780e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.965161 sec - 5,597,716,321 cycles # 2.843 GHz - 13,700,115,311 instructions # 2.45 insn per cycle - 1.969720229 seconds time elapsed +TOTAL : 1.934795 sec + 5,595,375,698 cycles # 2.886 GHz + 13,699,668,832 instructions # 2.45 insn per cycle + 1.939106700 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe @@ -188,14 +188,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.573549e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.596918e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.596918e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.706839e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.728905e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.728905e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.726627 sec - 4,910,197,742 cycles # 2.838 GHz - 12,354,930,161 instructions # 2.52 insn per cycle - 1.731069519 seconds time elapsed +TOTAL : 1.702912 sec + 4,912,481,885 cycles # 2.879 GHz + 12,355,076,796 instructions # 2.52 insn per cycle + 1.707414472 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe @@ -216,14 +216,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.408369e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.421923e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.421923e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.525002e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.540499e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.540499e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.227463 sec - 4,132,274,023 cycles # 1.852 GHz - 6,348,232,709 instructions # 1.54 insn per cycle - 2.231941444 seconds time elapsed +TOTAL : 2.193518 sec + 4,132,016,890 cycles # 1.881 GHz + 6,348,500,069 instructions # 1.54 insn per cycle + 2.198089448 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) 
(avx2: 1821) (512y: 102) (512z: 9375) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt index d4a13c45dc..4e138ec032 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-03_19:49:13 +DATE: 2023-11-09_18:23:48 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.490628e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.519771e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.522013e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.485315e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.511617e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.513675e+05 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.506263 sec - 2,193,209,541 cycles # 2.934 GHz - 3,448,112,270 instructions # 1.57 insn per cycle - 0.811794626 seconds time elapsed +TOTAL : 0.505341 sec + 2,219,350,607 cycles # 2.986 GHz + 3,460,374,619 instructions # 1.56 insn per cycle + 0.811034575 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.140777e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.174961e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.176419e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.144642e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.176791e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.178152e+05 ) sec^-1 MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 3.133332 sec - 10,144,803,574 cycles # 2.992 GHz - 22,979,164,856 instructions # 2.27 insn per cycle - 3.446699997 seconds time elapsed +TOTAL : 3.133190 sec + 10,226,911,008 cycles # 3.021 GHz + 21,462,701,558 instructions # 2.10 insn per cycle + 3.444111151 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.934897e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.935823e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.935823e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.962376e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.963339e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.963339e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 8.487397 sec - 25,642,144,633 cycles # 3.020 GHz - 78,942,503,354 instructions # 3.08 insn per cycle - 8.491509185 seconds time elapsed +TOTAL : 8.368138 sec + 25,660,792,563 cycles # 3.066 GHz + 78,945,591,899 instructions # 3.08 insn per cycle + 8.372166508 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4892) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.604711e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.608085e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.608085e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.725556e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.729176e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.729176e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 4.560510 sec - 12,949,935,406 cycles # 2.841 GHz - 39,287,959,625 instructions # 3.03 insn per cycle - 4.564590789 seconds time elapsed +TOTAL : 4.413013 sec + 12,940,530,582 cycles # 2.932 GHz + 39,286,713,275 instructions # 3.04 insn per cycle + 4.417069788 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13182) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.331820e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.349574e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.349574e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.541485e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.558659e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.558659e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.979581 sec - 5,585,242,942 cycles # 2.817 GHz - 13,688,645,923 instructions # 2.45 insn per cycle - 1.983846301 seconds time elapsed +TOTAL : 1.930984 sec + 5,584,027,716 cycles # 2.887 GHz + 13,688,917,418 instructions # 2.45 insn per cycle + 1.935195895 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.501909e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.523734e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.523734e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.785385e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.808420e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.808420e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.737131 sec - 4,904,473,574 cycles # 2.818 GHz - 12,343,066,066 instructions # 2.52 insn per cycle - 1.741373569 seconds time elapsed +TOTAL : 1.686810 sec + 4,897,782,017 cycles # 2.898 GHz + 12,342,341,736 instructions # 2.52 insn per cycle + 1.690859675 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.326865e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.339889e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.339889e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.578298e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.591405e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.591405e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.249568 sec - 4,122,823,033 cycles # 1.830 GHz - 6,335,244,526 instructions # 1.54 insn per cycle - 2.253741280 seconds time elapsed +TOTAL : 2.175070 sec + 4,121,604,366 cycles # 1.892 GHz + 6,334,904,963 instructions # 1.54 insn per cycle + 2.179001381 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt index 8a019b9732..a5bd4bb577 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-03_19:45:52 +DATE: 2023-11-09_18:20:29 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.497991e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.525524e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.527678e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.495033e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.521313e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.523414e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.505150 sec - 2,198,803,568 cycles # 2.954 GHz - 3,469,496,289 instructions # 1.58 insn per cycle - 0.812740673 seconds time elapsed +TOTAL : 0.502389 sec + 2,234,960,295 cycles # 3.014 GHz + 3,501,182,478 instructions # 1.57 insn per cycle + 0.813908762 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.149366e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.183697e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.185208e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.146228e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.178481e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.179832e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.069067 sec - 9,961,450,693 cycles # 3.001 GHz - 22,775,488,914 instructions # 2.29 insn per cycle - 3.378594275 seconds time elapsed +TOTAL : 3.070610 sec + 10,014,430,488 cycles # 3.015 GHz + 23,183,698,994 instructions # 2.32 insn per cycle + 3.378407946 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.919154e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.920062e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.920062e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.972782e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.973730e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.973730e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.555088 sec - 25,630,164,257 cycles # 2.995 GHz - 78,942,698,347 instructions # 3.08 insn per cycle - 8.559388166 seconds time elapsed +TOTAL : 8.322572 sec + 25,630,767,892 cycles # 3.079 GHz + 78,944,418,555 instructions # 3.08 insn per cycle + 8.326671797 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4892) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.673575e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.677034e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.677034e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.718928e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.722195e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.722195e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.474572 sec - 12,938,774,287 cycles # 2.890 GHz - 39,284,863,862 instructions # 3.04 insn per cycle - 4.478882140 seconds time elapsed +TOTAL : 4.419054 sec + 12,933,087,616 cycles # 2.925 GHz + 39,284,437,808 instructions # 3.04 insn per cycle + 4.423270824 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13182) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.365364e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.382422e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.382422e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.554509e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.572221e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.572221e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.970072 sec - 5,585,160,191 cycles # 2.830 GHz - 13,689,327,859 instructions # 2.45 insn per cycle - 1.974279626 seconds time elapsed +TOTAL : 1.926889 sec + 5,576,123,810 cycles # 2.889 GHz + 13,689,166,422 instructions # 2.45 insn per cycle + 1.931047296 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.573694e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.596726e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.596726e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.729620e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.752389e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.752389e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.722482 sec - 4,895,075,879 cycles # 2.836 GHz - 12,344,411,096 instructions # 2.52 insn per cycle - 1.726704102 seconds time elapsed +TOTAL : 1.694985 sec + 4,901,721,494 cycles # 2.886 GHz + 12,344,869,251 instructions # 2.52 insn per cycle + 1.699075447 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.342892e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.356180e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.356180e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.451359e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.465184e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.465184e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.243467 sec - 4,145,301,834 cycles # 1.845 GHz - 6,337,134,423 instructions # 1.53 insn per cycle - 2.247770943 seconds time elapsed +TOTAL : 2.210683 sec + 4,119,158,466 cycles # 1.861 GHz + 6,337,202,754 instructions # 1.54 insn per cycle + 2.214903970 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt index 0761c0d014..e1894928b5 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-03_19:42:34 +DATE: 2023-11-09_18:17:17 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,14 +51,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.224877e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.534029e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.536870e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.185134e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.497070e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.499968e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.509685 sec - 2,194,677,974 cycles # 2.952 GHz - 3,468,699,947 instructions # 1.58 insn per cycle - 0.805522488 seconds time elapsed +TOTAL : 0.512712 sec + 2,117,085,337 cycles # 2.853 GHz + 3,348,083,687 instructions # 1.58 insn per cycle + 0.802067553 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -71,14 +71,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.741528e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.176834e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.178277e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.746826e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.178822e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.180194e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.196085 sec - 10,332,938,289 cycles # 2.993 GHz - 23,233,171,839 instructions # 2.25 insn per cycle - 3.511259911 seconds time elapsed +TOTAL : 3.195850 sec + 10,403,522,722 cycles # 3.010 GHz + 22,812,003,731 instructions # 2.19 insn per cycle + 3.513623861 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -94,14 +94,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.927835e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.928807e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.928807e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.978212e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.979161e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.979161e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.516575 sec - 25,626,746,874 cycles # 3.008 GHz - 78,942,783,638 instructions # 3.08 insn per cycle - 8.520860421 seconds time elapsed +TOTAL : 8.300221 sec + 25,643,059,514 cycles # 3.089 GHz + 78,945,101,648 instructions # 3.08 insn per cycle + 8.304495187 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4892) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe @@ -121,14 +121,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.674456e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.677849e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.677849e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.720030e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.723443e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.723443e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.472834 sec - 12,938,647,402 cycles # 2.891 GHz - 39,285,558,550 instructions # 3.04 insn per cycle - 4.477166946 seconds time elapsed +TOTAL : 4.417696 sec + 12,936,090,694 cycles # 2.926 GHz + 39,285,549,332 instructions # 3.04 insn per cycle + 4.421886330 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13182) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe @@ -148,14 +148,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.290335e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.307469e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.307469e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.467679e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.484549e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.484549e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.987857 sec - 5,582,015,296 cycles # 2.804 GHz - 13,690,066,849 instructions # 2.45 insn per cycle - 1.992149312 seconds time elapsed +TOTAL : 1.946291 sec + 5,575,526,782 cycles # 2.860 GHz + 13,689,232,963 instructions # 2.46 insn per cycle + 1.950526745 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe @@ -175,14 +175,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.537627e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.561759e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.561759e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.714029e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.737204e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.737204e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.729438 sec - 4,899,116,746 cycles # 2.827 GHz - 12,344,356,410 instructions # 2.52 insn per cycle - 1.733854664 seconds time elapsed +TOTAL : 1.697675 sec + 4,893,869,630 cycles # 2.877 GHz + 12,345,121,576 instructions # 2.52 insn per cycle + 1.701906664 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe @@ -202,14 +202,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.331605e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.345774e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.345774e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.624620e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.638794e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.638794e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.247519 sec - 4,126,377,191 cycles # 1.833 GHz - 6,337,288,668 instructions # 1.54 insn per cycle - 2.251874954 seconds time elapsed +TOTAL : 2.160421 sec + 4,114,771,943 cycles # 1.902 GHz + 6,336,936,596 instructions # 1.54 insn per cycle + 2.164683207 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt index d519ec18af..d9a60f4c2d 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-03_19:08:21 +DATE: 2023-11-09_17:44:36 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.482135e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.509267e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.511176e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.474117e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.499523e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.501625e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.519206 sec - 2,212,325,201 cycles # 2.954 GHz - 3,433,704,735 instructions # 1.55 insn per cycle - 0.807580904 seconds time elapsed +TOTAL : 0.521490 sec + 2,250,098,032 cycles # 2.995 GHz + 3,547,618,625 instructions # 1.58 insn per cycle + 0.811334512 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.159162e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.186085e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.187240e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.144032e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.172097e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.173315e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.004869 sec - 9,812,463,662 cycles # 3.013 GHz - 21,581,231,713 instructions # 2.20 insn per cycle - 3.312573877 seconds time elapsed +TOTAL : 3.014234 sec + 9,779,736,194 cycles # 2.987 GHz + 19,303,224,180 instructions # 1.97 insn per cycle + 3.330161859 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.947345e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.948277e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.948277e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.971538e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.972487e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.972487e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.431137 sec - 25,590,035,480 cycles # 3.034 GHz - 78,715,048,416 instructions # 3.08 insn per cycle - 8.435307792 seconds time elapsed +TOTAL : 8.327938 sec + 25,611,620,219 cycles # 3.074 GHz + 78,715,429,796 instructions # 3.07 insn per cycle + 8.332111280 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4263) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.620452e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.623805e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.623805e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.709838e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.713193e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.713193e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.539871 sec - 12,909,848,042 cycles # 2.843 GHz - 39,233,023,972 instructions # 3.04 insn per cycle - 4.544176080 seconds time elapsed +TOTAL : 4.429736 sec + 12,908,947,595 cycles # 2.912 GHz + 39,230,824,629 instructions # 3.04 insn per cycle + 4.433832156 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12949) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.331174e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.348654e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.348654e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.184366e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.200734e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.200734e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.977747 sec - 5,618,064,764 cycles # 2.836 GHz - 13,804,762,963 instructions # 2.46 insn per cycle - 1.981982814 seconds time elapsed +TOTAL : 2.013363 sec + 5,615,451,412 cycles # 2.785 GHz + 13,804,151,174 instructions # 2.46 insn per cycle + 2.017493867 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11422) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.463129e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.484771e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.484771e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.496512e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.518383e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.518383e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.742192 sec - 4,960,747,667 cycles # 2.842 GHz - 12,470,817,922 instructions # 2.51 insn per cycle - 1.746604551 seconds time elapsed +TOTAL : 1.736002 sec + 4,961,501,370 cycles # 2.852 GHz + 12,469,539,646 instructions # 2.51 insn per cycle + 1.740286680 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10258) (512y: 240) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.427183e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.440655e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.440655e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.549305e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.563023e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.563023e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.217977 sec - 4,119,292,054 cycles # 1.855 GHz - 6,462,314,928 instructions # 1.57 insn per cycle - 2.222289185 seconds time elapsed +TOTAL : 2.181875 sec + 4,116,495,870 cycles # 1.884 GHz + 6,461,064,172 instructions # 1.57 insn per cycle + 2.186117492 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1647) (512y: 192) (512z: 9375) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt index 0e734b6c9d..909bf4e735 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-03_19:26:58 +DATE: 2023-11-09_18:01:40 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.237666e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.262462e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.264647e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.239370e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.263076e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.265061e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.533653 sec - 2,219,666,724 cycles # 2.910 GHz - 3,445,153,040 instructions # 1.55 insn per cycle - 0.821091738 seconds time elapsed +TOTAL : 0.531588 sec + 2,281,405,083 cycles # 2.976 GHz + 3,558,676,633 instructions # 1.56 insn per cycle + 0.825879944 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.775197e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.803191e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.804422e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.775154e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.802017e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.803118e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.300230 sec - 10,634,484,052 cycles # 2.991 GHz - 23,844,861,281 instructions # 2.24 insn per cycle - 3.611693691 seconds time elapsed +TOTAL : 3.293832 sec + 10,794,008,612 cycles # 3.043 GHz + 23,569,569,961 instructions # 2.18 insn per cycle + 3.607202529 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.361422e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.361903e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.361903e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.420862e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.421336e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.421336e+02 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 37.613350 sec - 113,653,626,732 cycles # 3.022 GHz - 144,966,182,806 instructions # 1.28 insn per cycle - 37.617592948 seconds time elapsed +TOTAL : 37.106861 sec + 113,630,776,289 cycles # 3.063 GHz + 144,980,863,935 instructions # 1.28 insn per cycle + 37.110990461 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:21605) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.197160e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.199710e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.199710e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.245783e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.248348e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.248348e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 5.138561 sec - 14,751,525,638 cycles # 2.870 GHz - 37,578,516,323 instructions # 2.55 insn per cycle - 5.143061031 seconds time elapsed +TOTAL : 5.061979 sec + 14,717,920,983 cycles # 2.906 GHz + 37,577,837,464 instructions # 2.55 insn per cycle + 5.066177833 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:68118) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.662015e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.676566e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.676566e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.791579e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.806069e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.806069e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.150367 sec - 6,125,090,080 cycles # 2.844 GHz - 13,063,740,704 instructions # 2.13 insn per cycle - 2.154679772 seconds time elapsed +TOTAL : 2.114146 sec + 6,120,754,225 cycles # 2.890 GHz + 13,063,521,271 instructions # 2.13 insn per cycle + 2.118343855 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:46960) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.263953e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.285040e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.285040e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.380050e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.401402e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.401402e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.780016 sec - 5,060,160,878 cycles # 2.837 GHz - 11,442,229,361 instructions # 2.26 insn per cycle - 1.784487029 seconds time elapsed +TOTAL : 1.757697 sec + 5,060,306,566 cycles # 2.873 GHz + 11,442,262,844 instructions # 2.26 insn per cycle + 1.761841609 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:40434) (512y: 285) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.515689e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.530167e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.530167e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.755291e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.769173e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.769173e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.192230 sec - 3,982,582,654 cycles # 1.814 GHz - 5,943,874,364 instructions # 1.49 insn per cycle - 2.196624515 seconds time elapsed +TOTAL : 2.124539 sec + 3,983,245,523 cycles # 1.872 GHz + 5,944,184,553 instructions # 1.49 insn per cycle + 2.128814459 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) 
(avx2: 2455) (512y: 337) (512z:39411) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt index a431669edb..8be167a2b3 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-03_19:28:07 +DATE: 2023-11-09_18:02:48 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.227099e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.252215e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.254306e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.258787e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.282651e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.285304e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.530677 sec - 2,254,800,400 cycles # 2.956 GHz - 3,541,881,168 instructions # 1.57 insn per cycle - 0.819833622 seconds time elapsed +TOTAL : 0.525087 sec + 2,271,033,028 cycles # 3.019 GHz + 3,503,626,972 instructions # 1.54 insn per cycle + 0.810303022 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.792463e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.821318e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.822521e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.795218e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.822430e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.823559e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.276536 sec - 10,598,798,874 cycles # 3.001 GHz - 22,505,546,793 instructions # 2.12 insn per cycle - 3.590880872 seconds time elapsed +TOTAL : 3.267432 sec + 10,775,752,309 cycles # 3.062 GHz + 23,804,895,620 instructions # 2.21 insn per cycle + 3.575584891 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.316847e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.317310e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.317310e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.382161e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.382658e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.382658e+02 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 38.002712 sec - 114,613,209,494 cycles # 3.016 GHz - 145,560,103,749 instructions # 1.27 insn per cycle - 38.007069023 seconds time elapsed +TOTAL : 37.434460 sec + 114,573,902,263 cycles # 3.060 GHz + 145,559,795,063 instructions # 1.27 insn per cycle + 37.438717752 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:22248) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.101440e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.103871e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.103871e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.172461e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.174968e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.174968e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 5.297737 sec - 15,180,958,119 cycles # 2.864 GHz - 37,765,704,407 instructions # 2.49 insn per cycle - 5.302092232 seconds time elapsed +TOTAL : 5.178309 sec + 15,150,664,399 cycles # 2.924 GHz + 37,765,142,558 instructions # 2.49 insn per cycle + 5.182585019 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:68446) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.750289e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.764988e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.764988e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.899691e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.915108e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.915108e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.125646 sec - 6,006,519,083 cycles # 2.821 GHz - 12,897,926,690 instructions # 2.15 insn per cycle - 2.130039886 seconds time elapsed +TOTAL : 2.085123 sec + 6,007,372,451 cycles # 2.876 GHz + 12,897,891,125 instructions # 2.15 insn per cycle + 2.089322243 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:45929) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.134516e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.155464e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.155464e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.290925e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.312116e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.312116e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.805195 sec - 5,111,264,978 cycles # 2.826 GHz - 11,448,660,091 instructions # 2.24 insn per cycle - 1.809562076 seconds time elapsed +TOTAL : 1.774574 sec + 5,109,183,395 cycles # 2.874 GHz + 11,448,665,866 instructions # 2.24 insn per cycle + 1.778819443 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:40123) (512y: 219) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.713307e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.727980e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.727980e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.900466e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.915540e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.915540e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.136153 sec - 3,956,606,945 cycles # 1.850 GHz - 5,898,384,643 instructions # 1.49 insn per cycle - 2.140545061 seconds time elapsed +TOTAL : 2.085227 sec + 3,957,731,000 cycles # 1.895 GHz + 5,897,967,734 instructions # 1.49 insn per cycle + 2.089481596 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) 
(avx2: 1971) (512y: 259) (512z:38937) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index 389fe370ef..24e6fadbe8 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-03_19:08:57 +DATE: 2023-11-09_17:45:12 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.330449e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.375316e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.385679e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.337209e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.383457e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.391632e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.478801 sec - 2,034,971,060 cycles # 2.940 GHz - 3,054,212,240 instructions # 1.50 insn per cycle - 0.749375620 seconds time elapsed +TOTAL : 0.480161 sec + 2,056,195,749 cycles # 2.969 GHz + 3,041,501,171 instructions # 1.48 insn per cycle + 0.751973888 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.529589e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.587136e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.589764e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.613057e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.675362e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.678111e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.723184 sec - 5,782,983,871 cycles # 2.964 GHz - 12,066,403,823 instructions # 2.09 insn per cycle - 2.008243733 seconds time elapsed +TOTAL : 1.713246 sec + 5,908,983,228 cycles # 3.045 GHz + 11,684,311,184 instructions # 1.98 insn per cycle + 1.997404675 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.003677e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.004662e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.004662e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.054709e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.055772e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.055772e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.193664 sec - 24,655,416,435 cycles # 3.008 GHz - 78,134,412,275 instructions # 3.17 insn per cycle - 8.197717930 seconds time elapsed +TOTAL : 7.990756 sec + 24,645,365,645 cycles # 3.083 GHz + 78,136,702,059 instructions # 3.17 insn per cycle + 7.994878538 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3602) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.270897e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.285143e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.285143e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.432830e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.446994e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.446994e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.263632 sec - 6,475,526,341 cycles # 2.856 GHz - 20,124,982,632 instructions # 3.11 insn per cycle - 2.267936828 seconds time elapsed +TOTAL : 2.213938 sec + 6,478,911,538 cycles # 2.922 GHz + 20,124,199,414 instructions # 3.11 insn per cycle + 2.218115274 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.655891e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.662862e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.662862e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.680617e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.687674e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.687674e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.998679 sec - 2,840,454,971 cycles # 2.834 GHz - 6,992,590,525 instructions # 2.46 insn per cycle - 1.002898964 seconds time elapsed +TOTAL : 0.983889 sec + 2,838,821,051 cycles # 2.875 GHz + 6,991,598,423 instructions # 2.46 insn per cycle + 0.988065526 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.904708e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.914180e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.914180e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.841366e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.850029e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.850029e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.868982 sec - 2,491,374,231 cycles # 2.855 GHz - 6,299,681,276 instructions # 2.53 insn per cycle - 0.873227215 seconds time elapsed +TOTAL : 0.898688 sec + 2,488,990,380 cycles # 2.759 GHz + 6,298,918,188 instructions # 2.53 insn per cycle + 0.902843603 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.509691e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.515612e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.515612e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.538961e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.547910e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.547910e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.094413 sec - 2,048,957,877 cycles # 1.866 GHz - 3,269,073,408 instructions # 1.60 insn per cycle - 1.098654820 seconds time elapsed +TOTAL : 1.073829 sec + 2,048,858,820 cycles # 1.904 GHz + 3,269,526,835 instructions # 1.60 insn per cycle + 1.078196054 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2415) (512y: 46) (512z: 9571) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt index 5a5ccf0962..741b2db05e 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-03_19:38:03 +DATE: 2023-11-09_18:12:49 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.621379e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.322960e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.322960e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.661835e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.358766e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.358766e+05 ) sec^-1 MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.467719 sec - 2,022,012,389 cycles # 2.930 GHz - 3,029,595,627 instructions # 1.50 insn per cycle - 0.748028952 seconds time elapsed +TOTAL : 0.465492 sec + 2,015,187,483 cycles # 2.973 GHz + 3,002,049,942 instructions # 1.49 insn per cycle + 0.734544576 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -80,14 +80,14 @@ WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.232227e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.472561e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.472561e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.271779e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.483162e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.483162e+05 ) sec^-1 MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4 -TOTAL : 1.900347 sec - 6,375,786,665 cycles # 2.982 GHz - 13,373,135,596 instructions # 2.10 insn per cycle - 2.195039568 seconds time elapsed +TOTAL : 1.878261 sec + 6,418,416,087 cycles # 3.037 GHz + 13,442,701,753 instructions # 2.09 insn per cycle + 2.169965161 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.008350e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.009347e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.009347e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.022346e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.023320e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.023320e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.176665 sec - 24,649,325,474 cycles # 3.013 GHz - 78,138,045,806 instructions # 3.17 insn per cycle - 8.180908705 seconds time elapsed +TOTAL : 8.120106 sec + 24,656,495,142 cycles # 3.035 GHz + 78,138,532,268 instructions # 3.17 insn per cycle + 8.124268827 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3602) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe @@ -132,14 +132,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.326247e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.339746e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.339746e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.385899e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.400170e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.400170e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.249404 sec - 6,483,421,678 cycles # 2.878 GHz - 20,133,640,820 instructions # 3.11 insn per cycle - 2.253658931 seconds time elapsed +TOTAL : 2.230708 sec + 6,485,115,953 cycles # 2.903 GHz + 20,133,634,822 instructions # 3.10 insn per cycle + 2.234788671 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.657895e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.664866e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.664866e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.666755e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.673825e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.673825e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.999874 sec - 2,846,897,865 cycles # 2.837 GHz - 7,001,448,108 instructions # 2.46 insn per cycle - 1.004235579 seconds time elapsed +TOTAL : 0.994493 sec + 2,844,577,237 cycles # 2.850 GHz + 7,001,609,472 instructions # 2.46 insn per cycle + 0.998731395 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe @@ -188,14 +188,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.899947e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.909346e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.909346e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.867923e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.876610e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.876610e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.873710 sec - 2,498,501,131 cycles # 2.848 GHz - 6,308,536,459 instructions # 2.52 insn per cycle - 0.877964105 seconds time elapsed +TOTAL : 0.888528 sec + 2,499,243,226 cycles # 2.802 GHz + 6,308,730,841 instructions # 2.52 insn per cycle + 0.892798888 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe @@ -216,14 +216,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.494285e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.499863e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.499863e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.495920e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.501735e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.501735e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.108704 sec - 2,059,473,334 cycles # 1.852 GHz - 3,279,338,884 instructions # 1.59 insn per cycle - 1.113120539 seconds time elapsed +TOTAL : 1.107724 sec + 2,056,932,102 cycles # 1.850 GHz + 3,279,291,488 instructions # 1.59 insn per cycle + 1.112281401 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2415) (512y: 46) (512z: 9571) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt index 12ad22d5a3..341f303aae 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-03_19:49:50 +DATE: 2023-11-09_18:24:24 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.340393e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.392051e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.397944e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.311526e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.361390e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.366448e+05 ) sec^-1 MeanMatrixElemValue = ( 4.159397e-01 +- 3.238804e-01 ) GeV^-4 -TOTAL : 0.462195 sec - 1,986,930,742 cycles # 2.947 GHz - 3,005,964,493 instructions # 1.51 insn per cycle - 0.730831332 seconds time elapsed +TOTAL : 0.464707 sec + 2,008,078,996 cycles # 2.985 GHz + 3,036,723,964 instructions # 1.51 insn per cycle + 0.732085987 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.547500e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.620827e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.624055e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.547836e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.616999e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.620197e+05 ) sec^-1 MeanMatrixElemValue = ( 1.094367e+02 +- 1.071509e+02 ) GeV^-4 -TOTAL : 1.798053 sec - 6,062,916,752 cycles # 2.993 GHz - 11,569,516,184 instructions # 1.91 insn per cycle - 2.082278895 seconds time elapsed +TOTAL : 1.809045 sec + 6,020,726,960 cycles # 2.958 GHz + 11,569,273,710 instructions # 1.92 insn per cycle + 2.092173630 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.005661e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.006690e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.006690e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.048605e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.049604e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.049604e+03 ) sec^-1 MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 8.186459 sec - 24,671,953,454 cycles # 3.013 GHz - 78,137,621,710 instructions # 3.17 insn per cycle - 8.190517160 seconds time elapsed +TOTAL : 8.015102 sec + 24,651,277,493 cycles # 3.074 GHz + 78,133,763,667 instructions # 3.17 insn per cycle + 8.018994302 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3602) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.107458e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.120841e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.120841e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.377691e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.391250e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.391250e+03 ) sec^-1 MeanMatrixElemValue = ( 4.208457e-01 +- 3.253445e-01 ) GeV^-4 -TOTAL : 2.317106 sec - 6,488,771,451 cycles # 2.796 GHz - 20,124,539,496 instructions # 3.10 insn per cycle - 2.321142527 seconds time elapsed +TOTAL : 2.232285 sec + 6,481,088,653 cycles # 2.899 GHz + 20,124,382,938 instructions # 3.11 insn per cycle + 2.236275849 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.647793e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.654673e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.654673e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.686029e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.693351e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.693351e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 1.005506 sec - 2,843,966,049 cycles # 2.818 GHz - 6,991,496,346 instructions # 2.46 insn per cycle - 1.009548479 seconds time elapsed +TOTAL : 0.981416 sec + 2,838,446,580 cycles # 2.882 GHz + 6,989,000,726 instructions # 2.46 insn per cycle + 0.985356553 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.895349e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.904605e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.904605e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.921238e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.930307e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.930307e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 0.875040 sec - 2,495,845,822 cycles # 2.841 GHz - 6,297,369,404 instructions # 2.52 insn per cycle - 0.879134455 seconds time elapsed +TOTAL : 0.863261 sec + 2,495,681,706 cycles # 2.880 GHz + 6,297,112,783 instructions # 2.52 insn per cycle + 0.867346097 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.504042e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.510113e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.510113e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.544822e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.550907e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.550907e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 1.099941 sec - 2,050,409,457 cycles # 1.858 GHz - 3,265,015,309 instructions # 1.59 insn per cycle - 1.104007255 seconds time elapsed +TOTAL : 1.070627 sec + 2,048,550,465 cycles # 1.908 GHz + 3,265,201,106 instructions # 1.59 insn per cycle + 1.074629445 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2415) (512y: 46) (512z: 9571) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt index 5b13ff9774..63178ad027 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-03_19:46:29 +DATE: 2023-11-09_18:21:06 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.339869e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.391844e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.397472e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.362546e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.415600e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.420893e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.461224 sec - 1,973,907,563 cycles # 2.940 GHz - 2,969,869,707 instructions # 1.50 insn per cycle - 0.729741448 seconds time elapsed +TOTAL : 0.460752 sec + 2,005,673,830 cycles # 2.989 GHz + 2,996,841,960 instructions # 1.49 insn per cycle + 0.729795900 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.563612e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.637504e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.640751e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.567426e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.636917e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.639909e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.749654 sec - 5,928,680,592 cycles # 2.999 GHz - 12,893,930,524 instructions # 2.17 insn per cycle - 2.033490620 seconds time elapsed +TOTAL : 1.748848 sec + 5,960,134,043 cycles # 3.018 GHz + 12,821,096,532 instructions # 2.15 insn per cycle + 2.031326515 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.014770e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.015759e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.015759e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.057810e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.058811e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.058811e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.147957 sec - 24,635,567,678 cycles # 3.022 GHz - 78,133,891,626 instructions # 3.17 insn per cycle - 8.152140443 seconds time elapsed +TOTAL : 7.977700 sec + 24,629,048,089 cycles # 3.086 GHz + 78,132,914,520 instructions # 3.17 insn per cycle + 7.981637101 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3602) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.062428e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.074909e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.074909e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.439696e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.453635e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.453635e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.330015 sec - 6,475,827,642 cycles # 2.775 GHz - 20,124,634,132 instructions # 3.11 insn per cycle - 2.334037311 seconds time elapsed +TOTAL : 2.212183 sec + 6,477,339,632 cycles # 2.924 GHz + 20,124,428,604 instructions # 3.11 insn per cycle + 2.216339188 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.595519e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.602006e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.602006e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.594939e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.601395e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.601395e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.036160 sec - 2,838,919,957 cycles # 2.730 GHz - 6,991,694,320 instructions # 2.46 insn per cycle - 1.040335460 seconds time elapsed +TOTAL : 1.036317 sec + 2,842,114,214 cycles # 2.733 GHz + 6,991,999,004 instructions # 2.46 insn per cycle + 1.040742925 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.893954e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.903085e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.903085e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.922697e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.931896e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.931896e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.873924 sec - 2,489,283,092 cycles # 2.837 GHz - 6,298,948,511 instructions # 2.53 insn per cycle - 0.878050091 seconds time elapsed +TOTAL : 0.860619 sec + 2,490,053,798 cycles # 2.883 GHz + 6,298,956,842 instructions # 2.53 insn per cycle + 0.864591382 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.497242e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.502884e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.502884e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.526848e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.532542e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.532542e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.103482 sec - 2,049,248,209 cycles # 1.852 GHz - 3,268,952,113 instructions # 1.60 insn per cycle - 1.107551558 seconds time elapsed +TOTAL : 1.082468 sec + 2,049,657,294 cycles # 1.888 GHz + 3,269,097,732 instructions # 1.59 insn per cycle + 1.086487061 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2415) (512y: 46) (512z: 9571) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt index cdb252ac3a..2548057249 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-03_19:43:11 +DATE: 2023-11-09_18:17:53 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,14 +51,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.764175e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.406414e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.411755e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.733376e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.369757e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.375069e+05 ) sec^-1 MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.466281 sec - 1,989,064,547 cycles # 2.930 GHz - 3,017,212,928 instructions # 1.52 insn per cycle - 0.737783039 seconds time elapsed +TOTAL : 0.461931 sec + 2,018,026,618 cycles # 3.001 GHz + 3,012,517,263 instructions # 1.49 insn per cycle + 0.729223266 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -71,14 +71,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.472408e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.626435e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.629621e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.494168e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.614081e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.617046e+05 ) sec^-1 MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4 -TOTAL : 1.825252 sec - 6,129,357,136 cycles # 2.985 GHz - 13,024,512,874 instructions # 2.12 insn per cycle - 2.110041533 seconds time elapsed +TOTAL : 1.820293 sec + 6,248,771,087 cycles # 3.054 GHz + 13,452,131,003 instructions # 2.15 insn per cycle + 2.111868581 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -94,14 +94,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.017146e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.018188e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.018188e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.046417e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.047425e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.047425e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.138680 sec - 24,636,857,889 cycles # 3.027 GHz - 78,136,646,989 instructions # 3.17 insn per cycle - 8.142807331 seconds time elapsed +TOTAL : 8.022116 sec + 24,641,165,344 cycles # 3.070 GHz + 78,133,947,271 instructions # 3.17 insn per cycle + 8.026095295 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3602) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe @@ -121,14 +121,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.266088e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.280126e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.280126e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.418320e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.431601e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.431601e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.265043 sec - 6,477,387,096 cycles # 2.855 GHz - 20,124,193,083 instructions # 3.11 insn per cycle - 2.269259910 seconds time elapsed +TOTAL : 2.218818 sec + 6,476,858,939 cycles # 2.915 GHz + 20,124,080,031 instructions # 3.11 insn per cycle + 2.222978465 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe @@ -148,14 +148,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.644884e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.651718e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.651718e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.673631e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.680333e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.680333e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.005131 sec - 2,839,448,871 cycles # 2.816 GHz - 6,991,884,623 instructions # 2.46 insn per cycle - 1.009345557 seconds time elapsed +TOTAL : 0.987621 sec + 2,839,487,470 cycles # 2.865 GHz + 6,991,564,753 instructions # 2.46 insn per cycle + 0.991606693 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe @@ -175,14 +175,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.866159e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.874920e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.874920e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.883610e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.892494e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.892494e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.887083 sec - 2,489,977,422 cycles # 2.796 GHz - 6,298,695,060 instructions # 2.53 insn per cycle - 0.891225776 seconds time elapsed +TOTAL : 0.878347 sec + 2,488,399,526 cycles # 2.822 GHz + 6,298,882,599 instructions # 2.53 insn per cycle + 0.882234875 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe @@ -202,14 +202,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.498745e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.504407e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.504407e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.534627e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.540393e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.540393e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.102380 sec - 2,046,697,565 cycles # 1.851 GHz - 3,268,682,926 instructions # 1.60 insn per cycle - 1.106464577 seconds time elapsed +TOTAL : 1.076545 sec + 2,047,724,498 cycles # 1.897 GHz + 3,268,770,442 instructions # 1.60 insn per cycle + 1.080450235 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2415) (512y: 46) (512z: 9571) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt index 9fe77f3bb4..3e46ada377 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-03_19:09:27 +DATE: 2023-11-09_17:45:41 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.327293e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.373619e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.378917e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.305671e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.350688e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.358120e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.480093 sec - 2,046,992,398 cycles # 2.957 GHz - 3,008,261,627 instructions # 1.47 insn per cycle - 0.750577809 seconds time elapsed +TOTAL : 0.483459 sec + 2,029,909,968 cycles # 2.855 GHz + 2,962,980,745 instructions # 1.46 insn per cycle + 0.768081643 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.515177e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.572348e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.574911e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.574581e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.636147e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.638743e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.715757 sec - 5,871,373,370 cycles # 3.006 GHz - 12,204,738,560 instructions # 2.08 insn per cycle - 2.009775672 seconds time elapsed +TOTAL : 1.716632 sec + 5,921,965,881 cycles # 3.044 GHz + 11,852,981,757 instructions # 2.00 insn per cycle + 2.001901523 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.026797e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.027818e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.027818e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.062728e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.063773e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.063773e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.100053 sec - 24,563,227,881 cycles # 3.031 GHz - 77,860,200,084 instructions # 3.17 insn per cycle - 8.104232064 seconds time elapsed +TOTAL : 7.958572 sec + 24,559,190,224 cycles # 3.085 GHz + 77,859,989,303 instructions # 3.17 insn per cycle + 7.962642501 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3113) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.430084e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.444359e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.444359e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.583566e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.598037e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.598037e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.215968 sec - 6,421,588,621 cycles # 2.894 GHz - 20,090,220,099 instructions # 3.13 insn per cycle - 2.220335001 seconds time elapsed +TOTAL : 2.170856 sec + 6,426,627,449 cycles # 2.956 GHz + 20,090,039,565 instructions # 3.13 insn per cycle + 2.175014616 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13452) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.625861e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.632520e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.632520e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.591188e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.597484e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.597484e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.016598 sec - 2,906,571,537 cycles # 2.849 GHz - 7,134,546,428 instructions # 2.45 insn per cycle - 1.020819368 seconds time elapsed +TOTAL : 1.038604 sec + 2,902,688,212 cycles # 2.785 GHz + 7,133,529,057 instructions # 2.46 insn per cycle + 1.042821386 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:12261) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.810175e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.818358e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.818358e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.840190e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.848739e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.848739e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.914087 sec - 2,595,791,217 cycles # 2.828 GHz - 6,442,852,611 instructions # 2.48 insn per cycle - 0.918452804 seconds time elapsed +TOTAL : 0.898885 sec + 2,595,883,470 cycles # 2.877 GHz + 6,441,979,586 instructions # 2.48 insn per cycle + 0.902832877 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11276) (512y: 27) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.453251e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.458727e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.458727e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.492137e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.497778e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.497778e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.136514 sec - 2,124,554,510 cycles # 1.864 GHz - 3,431,456,558 instructions # 1.62 insn per cycle - 1.140688320 seconds time elapsed +TOTAL : 1.106744 sec + 2,123,250,955 cycles # 1.918 GHz + 3,431,574,417 instructions # 1.62 insn per cycle + 1.110853762 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2912) (512y: 22) (512z: 9647) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt index 6d22eac4d2..764181f824 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-03_19:29:17 +DATE: 2023-11-09_18:03:57 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.584275e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.627587e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.631963e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.601175e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.638676e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.643535e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.489018 sec - 2,081,067,674 cycles # 2.934 GHz - 3,133,776,802 instructions # 1.51 insn per cycle - 0.771988427 seconds time elapsed +TOTAL : 0.484907 sec + 2,101,276,759 cycles # 2.981 GHz + 3,149,706,582 instructions # 1.50 insn per cycle + 0.766736785 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.747350e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.808169e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.810857e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.695736e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.752372e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.754868e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.853996 sec - 6,275,481,753 cycles # 3.001 GHz - 12,514,155,894 instructions # 1.99 insn per cycle - 2.147936222 seconds time elapsed +TOTAL : 1.853224 sec + 6,303,125,801 cycles # 3.016 GHz + 12,982,819,660 instructions # 2.06 insn per cycle + 2.146815240 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.644036e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.644860e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.644860e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.841033e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.841866e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.841866e+02 ) sec^-1 MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4 -TOTAL : 29.065327 sec - 87,424,924,787 cycles # 3.008 GHz - 135,567,300,472 instructions # 1.55 insn per cycle - 29.069446346 seconds time elapsed +TOTAL : 28.085663 sec + 86,167,672,431 cycles # 3.068 GHz + 135,565,357,772 instructions # 1.57 insn per cycle + 28.089696347 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:15486) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.026233e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.038857e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.038857e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.152037e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.164422e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.164422e+03 ) sec^-1 MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4 -TOTAL : 2.342565 sec - 6,786,587,363 cycles # 2.893 GHz - 19,387,387,931 instructions # 2.86 insn per cycle - 2.346831164 seconds time elapsed +TOTAL : 2.302124 sec + 6,785,316,910 cycles # 2.944 GHz + 19,388,398,647 instructions # 2.86 insn per cycle + 2.306338036 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:69680) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.459444e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.464900e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.464900e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.500496e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.506041e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.506041e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 1.132478 sec - 3,179,013,562 cycles # 2.798 GHz - 6,809,043,401 instructions # 2.14 insn per cycle - 1.136902959 seconds time elapsed +TOTAL : 1.100781 sec + 3,177,227,261 cycles # 2.877 GHz + 6,808,813,623 instructions # 2.14 insn per cycle + 1.104867562 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:49077) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.738168e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.745907e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.745907e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.797362e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.805452e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.805452e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 0.952016 sec - 2,651,392,730 cycles # 2.774 GHz - 5,987,188,755 instructions # 2.26 insn per cycle - 0.956397839 seconds time elapsed +TOTAL : 0.920545 sec + 2,652,149,170 cycles # 2.870 GHz + 5,986,924,086 instructions # 2.26 insn per cycle + 0.924698406 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:42677) (512y: 11) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.472802e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.478184e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.478184e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.476030e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.481355e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.481355e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060904e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.121995 sec - 2,073,738,270 cycles # 1.843 GHz - 3,501,511,021 instructions # 1.69 insn per cycle - 1.126283052 seconds time elapsed +TOTAL : 1.119541 sec + 2,077,679,044 cycles # 1.851 GHz + 3,501,921,791 instructions # 1.69 insn per cycle + 1.123804705 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5198) (512y: 3) (512z:44822) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt index 5c9ad24a46..7b7c373ccc 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-03_19:30:09 +DATE: 2023-11-09_18:04:48 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.558233e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.598421e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.603327e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.541471e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.579175e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.583358e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.487345 sec - 2,076,063,570 cycles # 2.928 GHz - 3,124,474,063 instructions # 1.50 insn per cycle - 0.769324674 seconds time elapsed +TOTAL : 0.484837 sec + 2,105,287,248 cycles # 2.990 GHz + 3,132,361,933 instructions # 1.49 insn per cycle + 0.765772342 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.647182e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.706650e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.709351e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.694480e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.751016e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.753615e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.865548 sec - 6,314,327,402 cycles # 2.992 GHz - 13,540,816,282 instructions # 2.14 insn per cycle - 2.170188129 seconds time elapsed +TOTAL : 1.853183 sec + 6,341,276,975 cycles # 3.036 GHz + 13,434,801,047 instructions # 2.12 insn per cycle + 2.144878674 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.736423e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.737265e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.737265e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.834166e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.834994e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.834994e+02 ) sec^-1 MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4 -TOTAL : 28.597065 sec - 86,035,998,776 cycles # 3.009 GHz - 135,911,265,736 instructions # 1.58 insn per cycle - 28.601145029 seconds time elapsed +TOTAL : 28.118822 sec + 86,081,697,198 cycles # 3.062 GHz + 135,906,074,576 instructions # 1.58 insn per cycle + 28.122852922 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:15910) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.976771e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.989628e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.989628e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.132688e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.145964e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.145964e+03 ) sec^-1 MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4 -TOTAL : 2.358802 sec - 6,848,676,061 cycles # 2.899 GHz - 19,439,456,701 instructions # 2.84 insn per cycle - 2.362995374 seconds time elapsed +TOTAL : 2.306989 sec + 6,845,463,882 cycles # 2.963 GHz + 19,440,308,006 instructions # 2.84 insn per cycle + 2.311118522 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:69722) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.510619e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.516450e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.516450e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.544215e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.549994e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.549994e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 1.093889 sec - 3,110,977,160 cycles # 2.835 GHz - 6,719,869,092 instructions # 2.16 insn per cycle - 1.098127483 seconds time elapsed +TOTAL : 1.069611 sec + 3,120,065,313 cycles # 2.908 GHz + 6,719,636,670 instructions # 2.15 insn per cycle + 1.073683656 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:47667) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.794946e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.802956e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.802956e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.829756e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.837937e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.837937e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 0.922821 sec - 2,627,235,427 cycles # 2.838 GHz - 5,970,250,488 instructions # 2.27 insn per cycle - 0.926978795 seconds time elapsed +TOTAL : 0.904097 sec + 2,625,695,846 cycles # 2.892 GHz + 5,970,269,399 instructions # 2.27 insn per cycle + 0.908318447 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:41842) (512y: 13) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.483560e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.489106e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.489106e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.517896e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.523661e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.523661e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060904e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.114177 sec - 2,080,137,201 cycles # 1.861 GHz - 3,494,948,543 instructions # 1.68 insn per cycle - 1.118521627 seconds time elapsed +TOTAL : 1.088404 sec + 2,079,379,564 cycles # 1.905 GHz + 3,494,888,851 instructions # 1.68 insn per cycle + 1.092417864 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4162) (512y: 4) (512z:44465) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index b38c13fcd9..93a0b75f12 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-03_19:09:56 +DATE: 2023-11-09_17:46:11 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.468828e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.491770e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.493892e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.470867e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.494695e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.496793e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.519876 sec - 2,215,127,737 cycles # 2.957 GHz - 3,487,212,374 instructions # 1.57 insn per cycle - 0.807913712 seconds time elapsed +TOTAL : 0.522613 sec + 2,231,403,620 cycles # 2.972 GHz + 3,427,736,994 instructions # 1.54 insn per cycle + 0.812895260 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.135164e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.161799e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.162966e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.127962e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.155846e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.156998e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.028303 sec - 9,769,796,186 cycles # 2.979 GHz - 22,335,132,843 instructions # 2.29 insn per cycle - 3.336784998 seconds time elapsed +TOTAL : 3.024603 sec + 10,040,286,484 cycles # 3.065 GHz + 20,701,312,854 instructions # 2.06 insn per cycle + 3.332453984 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.912244e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.913140e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.913140e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.954833e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.955774e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.955774e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.586099 sec - 25,914,180,302 cycles # 3.017 GHz - 79,445,505,152 instructions # 3.07 insn per cycle - 8.590406292 seconds time elapsed +TOTAL : 8.399305 sec + 25,922,061,106 cycles # 3.085 GHz + 79,443,494,538 instructions # 3.06 insn per cycle + 8.403427486 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4857) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.695684e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.699049e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.699049e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.761504e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.765123e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.765123e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.447189 sec - 12,656,450,439 cycles # 2.844 GHz - 38,554,825,829 instructions # 3.05 insn per cycle - 4.451478069 seconds time elapsed +TOTAL : 4.369308 sec + 12,659,894,478 cycles # 2.895 GHz + 38,554,080,405 instructions # 3.05 insn per cycle + 4.373596593 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13161) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.537952e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.556620e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.556620e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.648175e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.665781e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.665781e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.930375 sec - 5,512,214,802 cycles # 2.850 GHz - 13,486,265,307 instructions # 2.45 insn per cycle - 1.934770358 seconds time elapsed +TOTAL : 1.905268 sec + 5,516,001,376 cycles # 2.890 GHz + 13,483,921,346 instructions # 2.44 insn per cycle + 1.909531551 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11242) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.638550e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.660856e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.660856e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.803935e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.827738e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.827738e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.711054 sec - 4,872,445,248 cycles # 2.842 GHz - 12,141,983,198 instructions # 2.49 insn per cycle - 1.715434660 seconds time elapsed +TOTAL : 1.682277 sec + 4,871,353,432 cycles # 2.890 GHz + 12,140,803,788 instructions # 2.49 insn per cycle + 1.686455915 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10154) (512y: 79) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.406789e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.420159e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.420159e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.374652e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.387771e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.387771e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.223975 sec - 4,144,217,356 cycles # 1.862 GHz - 6,340,578,545 instructions # 1.53 insn per cycle - 2.228285470 seconds time elapsed +TOTAL : 2.233464 sec + 4,145,054,475 cycles # 1.853 GHz + 6,339,255,297 instructions # 1.53 insn per cycle + 2.237809120 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1802) (512y: 93) (512z: 9358) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt index 46f37c0a90..5c4ca592f3 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-03_19:10:33 +DATE: 2023-11-09_17:46:47 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.484364e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.507714e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.509764e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.487617e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.512149e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.514706e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.519968 sec - 2,216,873,411 cycles # 2.952 GHz - 3,459,675,597 instructions # 1.56 insn per cycle - 0.809738739 seconds time elapsed +TOTAL : 0.518884 sec + 2,241,934,817 cycles # 2.999 GHz + 3,518,298,272 instructions # 1.57 insn per cycle + 0.808606683 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.134555e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.161246e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.162402e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.131184e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.159088e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.160252e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.016486 sec - 9,822,814,204 cycles # 3.004 GHz - 22,339,986,571 instructions # 2.27 insn per cycle - 3.325238208 seconds time elapsed +TOTAL : 3.016798 sec + 10,040,228,896 cycles # 3.072 GHz + 22,037,859,926 instructions # 2.19 insn per cycle + 3.324922224 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.909809e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.910727e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.910727e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.950722e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.951656e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.951656e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.597381 sec - 25,939,435,501 cycles # 3.017 GHz - 79,457,351,519 instructions # 3.06 insn per cycle - 8.601657625 seconds time elapsed +TOTAL : 8.416183 sec + 25,916,224,646 cycles # 3.078 GHz + 79,453,865,963 instructions # 3.07 insn per cycle + 8.420247127 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4504) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.664829e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.668218e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.668218e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.759672e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.763188e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.763188e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.484461 sec - 12,651,418,370 cycles # 2.819 GHz - 38,525,727,884 instructions # 3.05 insn per cycle - 4.488762135 seconds time elapsed +TOTAL : 4.371398 sec + 12,639,801,464 cycles # 2.889 GHz + 38,524,761,271 instructions # 3.05 insn per cycle + 4.375560053 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12928) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.385701e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.404187e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.404187e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.630529e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.648410e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.648410e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.965077 sec - 5,557,225,506 cycles # 2.823 GHz - 13,610,780,927 instructions # 2.45 insn per cycle - 1.969439061 seconds time elapsed +TOTAL : 1.909613 sec + 5,559,227,570 cycles # 2.906 GHz + 13,609,303,550 instructions # 2.45 insn per cycle + 1.913823155 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11327) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.328216e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.349743e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.349743e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.332740e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.353313e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.353313e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.767465 sec - 4,920,931,185 cycles # 2.779 GHz - 12,278,542,674 instructions # 2.50 insn per cycle - 1.771926617 seconds time elapsed +TOTAL : 1.766447 sec + 4,917,170,589 cycles # 2.778 GHz + 12,276,136,667 instructions # 2.50 insn per cycle + 1.770689432 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10143) (512y: 239) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.389874e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.403004e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.403004e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.605174e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.618655e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.618655e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.228912 sec - 4,146,930,402 cycles # 1.858 GHz - 6,446,453,346 instructions # 1.55 insn per cycle - 2.233245374 seconds time elapsed +TOTAL : 2.166306 sec + 4,144,641,386 cycles # 1.911 GHz + 6,445,298,096 instructions # 1.56 insn per cycle + 2.170508580 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1627) (512y: 191) (512z: 9356) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index 2048a9698e..b73b517066 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-03_19:12:52 +DATE: 2023-11-09_17:49:06 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.071850e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.072225e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.072335e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.070656e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.071067e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.071174e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.421447 sec - 8,245,731,454 cycles # 3.012 GHz - 18,688,279,165 instructions # 2.27 insn per cycle - 2.797097094 seconds time elapsed +TOTAL : 2.421343 sec + 8,332,807,450 cycles # 3.040 GHz + 16,939,230,243 instructions # 2.03 insn per cycle + 2.799270804 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.261920e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.263777e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.264034e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.271200e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.273122e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.273304e+03 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.993277 sec - 12,924,149,664 cycles # 2.993 GHz - 29,920,520,122 instructions # 2.32 insn per cycle - 4.373104302 seconds time elapsed +TOTAL : 3.985063 sec + 13,247,174,015 cycles # 3.069 GHz + 30,019,215,878 instructions # 2.27 insn per cycle + 4.374841890 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.414546e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.414780e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.414780e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.228283e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.228511e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.228511e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.278557 sec - 18,784,400,880 cycles # 2.990 GHz - 53,915,743,321 instructions # 2.87 insn per cycle - 6.282578284 seconds time elapsed +TOTAL : 6.424546 sec + 18,798,364,918 cycles # 2.925 GHz + 53,916,162,526 instructions # 2.87 insn per cycle + 6.428517349 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:32447) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.622225e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.622313e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.622313e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.657858e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.657947e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.657947e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.260349 sec - 9,843,353,366 cycles # 3.016 GHz - 27,093,120,012 instructions # 2.75 insn per cycle - 3.264542212 seconds time elapsed +TOTAL : 3.191098 sec + 9,844,225,763 cycles # 3.082 GHz + 27,092,778,504 instructions # 2.75 insn per cycle + 3.195159677 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96441) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.543297e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.543763e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.543763e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.638511e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.638939e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.638939e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.494911 sec - 4,247,565,583 cycles # 2.835 GHz - 9,561,660,282 instructions # 2.25 insn per cycle - 1.498994646 seconds time elapsed +TOTAL : 1.457101 sec + 4,229,207,978 cycles # 2.896 GHz + 9,561,222,824 instructions # 2.26 insn per cycle + 1.461220413 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84390) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.041064e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.041630e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.041630e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.119963e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.120507e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.120507e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.312043 sec - 3,711,873,932 cycles # 2.822 GHz - 8,485,580,977 instructions # 2.29 insn per cycle - 1.316064551 seconds time elapsed +TOTAL : 1.286739 sec + 3,714,427,423 cycles # 2.879 GHz + 8,485,272,385 instructions # 2.28 insn per cycle + 1.290826596 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:79991) (512y: 91) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.655846e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.656376e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.656376e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.600399e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.600911e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.600911e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.450066 sec - 2,692,078,825 cycles # 1.852 GHz - 4,273,245,565 instructions # 1.59 insn per cycle - 1.454158841 seconds time elapsed +TOTAL : 1.474924 sec + 2,695,875,361 cycles # 1.824 GHz + 4,273,169,567 instructions # 1.59 insn per cycle + 1.479057981 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) 
(avx2: 2284) (512y: 105) (512z:79105) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt index fbbae31086..28081b2160 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-03_19:38:33 +DATE: 2023-11-09_18:13:18 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gp Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.071334e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.072304e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.072304e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.064318e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.065254e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.065254e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.376547 sec - 8,066,576,997 cycles # 2.992 GHz - 17,224,378,863 instructions # 2.14 insn per cycle - 2.753340167 seconds time elapsed +TOTAL : 2.361712 sec + 8,164,385,199 cycles # 3.041 GHz + 16,942,565,052 instructions # 2.08 insn per cycle + 2.743176660 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -80,14 +80,14 @@ WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.219956e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.252584e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.252584e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.190361e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.223459e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.223459e+03 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.983566 sec - 12,755,700,095 cycles # 2.969 GHz - 26,780,853,821 instructions # 2.10 insn per cycle - 4.362214203 seconds time elapsed +TOTAL : 3.988233 sec + 13,123,079,634 cycles # 3.036 GHz + 28,841,455,416 instructions # 2.20 insn per cycle + 4.378479494 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.520548e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.520796e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.520796e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.307342e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.307565e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.307565e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.203566 sec - 18,829,459,475 cycles # 3.034 GHz - 53,915,868,697 instructions # 2.86 insn per cycle - 6.207586404 seconds time elapsed +TOTAL : 6.364568 sec + 18,927,213,544 cycles # 2.973 GHz + 53,918,164,087 instructions # 2.85 insn per cycle + 6.368577598 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:32447) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe @@ -132,14 +132,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.632618e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.632708e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.632708e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.666025e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.666114e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.666114e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.239187 sec - 9,805,468,555 cycles # 3.024 GHz - 27,094,086,958 instructions # 2.76 insn per cycle - 3.243245202 seconds time elapsed +TOTAL : 3.173683 sec + 9,797,609,023 cycles # 3.084 GHz + 27,093,782,808 instructions # 2.77 insn per cycle + 3.177702749 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96441) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.541893e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.542348e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.542348e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.255906e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.256265e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.256265e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.496042 sec - 4,247,154,617 cycles # 2.833 GHz - 9,562,315,517 instructions # 2.25 insn per cycle - 1.500165545 seconds time elapsed +TOTAL : 1.626420 sec + 4,592,212,308 cycles # 2.818 GHz + 9,562,781,549 instructions # 2.08 insn per cycle + 1.630448393 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84390) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe @@ -188,14 +188,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.062512e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.063083e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.063083e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.133405e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.134023e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.134023e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.305609 sec - 3,707,362,205 cycles # 2.832 GHz - 8,486,374,508 instructions # 2.29 insn per cycle - 1.309600698 seconds time elapsed +TOTAL : 1.281959 sec + 3,704,600,058 cycles # 2.882 GHz + 8,486,385,133 instructions # 2.29 insn per cycle + 1.285885098 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:79991) (512y: 91) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe @@ -216,14 +216,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.623189e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.623772e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.623772e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.663239e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.663889e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.663889e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.463361 sec - 2,697,367,089 cycles # 1.839 GHz - 4,274,143,132 instructions # 1.58 insn per cycle - 1.467446249 seconds time elapsed +TOTAL : 1.446962 sec + 2,696,700,654 cycles # 1.860 GHz + 4,274,559,971 instructions # 1.59 insn per cycle + 1.451147700 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) 
(avx2: 2284) (512y: 105) (512z:79105) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt index c51993cada..4570a77a9f 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-03_19:13:56 +DATE: 2023-11-09_17:50:09 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.063023e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.063394e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.063534e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.067332e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.067722e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.067853e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.424921 sec - 8,232,683,158 cycles # 2.990 GHz - 17,655,317,796 instructions # 2.14 insn per cycle - 2.812107310 seconds time elapsed +TOTAL : 2.421722 sec + 8,395,936,189 cycles # 3.053 GHz + 18,623,375,460 instructions # 2.22 insn per cycle + 2.807666336 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.268141e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.269954e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.270195e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.274592e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.276551e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.276737e+03 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.993040 sec - 12,961,046,511 cycles # 3.002 GHz - 29,041,240,897 instructions # 2.24 insn per cycle - 4.374135451 seconds time elapsed +TOTAL : 3.997048 sec + 13,290,560,155 cycles # 3.077 GHz + 29,230,575,077 instructions # 2.20 insn per cycle + 4.378333342 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.423791e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.424026e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.424026e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.641666e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.641939e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.641939e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.281065 sec - 18,737,351,960 cycles # 2.982 GHz - 53,924,990,961 instructions # 2.88 insn per cycle - 6.285160496 seconds time elapsed +TOTAL : 6.117499 sec + 18,785,945,280 cycles # 3.070 GHz + 53,927,524,861 instructions # 2.87 insn per cycle + 6.121375903 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:32062) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.617244e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.617330e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.617330e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.649159e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.649256e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.649256e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.274940 sec - 9,810,206,221 cycles # 2.993 GHz - 27,090,315,670 instructions # 2.76 insn per cycle - 3.279033724 seconds time elapsed +TOTAL : 3.206425 sec + 9,787,082,067 cycles # 3.050 GHz + 27,089,817,225 instructions # 2.77 insn per cycle + 3.210577008 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96284) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.504500e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.504945e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.504945e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.558533e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.558987e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.558987e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.511756 sec - 4,249,692,377 cycles # 2.805 GHz - 9,561,658,782 instructions # 2.25 insn per cycle - 1.515796071 seconds time elapsed +TOTAL : 1.489258 sec + 4,261,284,391 cycles # 2.855 GHz + 9,561,306,757 instructions # 2.24 insn per cycle + 1.493274617 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84478) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.067567e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.068141e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.068141e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.116449e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.116994e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.116994e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.304248 sec - 3,697,935,435 cycles # 2.828 GHz - 8,485,512,243 instructions # 2.29 insn per cycle - 1.308302011 seconds time elapsed +TOTAL : 1.287600 sec + 3,697,517,464 cycles # 2.864 GHz + 8,485,532,294 instructions # 2.29 insn per cycle + 1.291548783 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:80014) (512y: 241) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.626044e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.626572e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.626572e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.666755e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.667279e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.667279e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.462511 sec - 2,704,261,685 cycles # 1.846 GHz - 4,277,565,036 instructions # 1.58 insn per cycle - 1.466688212 seconds time elapsed +TOTAL : 1.444368 sec + 2,694,896,725 cycles # 1.862 GHz + 4,276,159,790 instructions # 1.59 insn per cycle + 1.448419547 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) 
(avx2: 2169) (512y: 187) (512z:79110) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index 0a60ba6d62..4a0d02936a 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-03_19:14:59 +DATE: 2023-11-09_17:51:12 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.757584e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.758488e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.758845e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.745896e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.746749e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.746990e+02 ) sec^-1 MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 1.659896 sec - 5,702,631,198 cycles # 2.947 GHz - 11,810,983,379 instructions # 2.07 insn per cycle - 1.991424837 seconds time elapsed +TOTAL : 1.657612 sec + 5,852,337,885 cycles # 3.029 GHz + 12,128,434,322 instructions # 2.07 insn per cycle + 1.989363075 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.332515e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.333177e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.333265e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.334998e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.335676e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.335767e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 -TOTAL : 1.929687 sec - 6,546,483,377 cycles # 2.952 GHz - 14,155,312,120 instructions # 2.16 insn per cycle - 2.273547514 seconds time elapsed +TOTAL : 1.921239 sec + 6,689,269,410 cycles # 3.045 GHz + 13,766,829,986 instructions # 2.06 insn per cycle + 2.253627777 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.817807e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.818080e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.818080e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.077848e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.078128e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.078128e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 5.991502 sec - 17,897,297,418 cycles # 2.986 GHz - 53,590,305,749 instructions # 2.99 insn per cycle - 5.995609214 seconds time elapsed +TOTAL : 5.821316 sec + 17,888,760,787 cycles # 3.072 GHz + 53,591,267,283 instructions # 3.00 insn per cycle + 5.825272234 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:20207) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.535145e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.535592e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.535592e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.576360e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.576807e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.576807e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.499015 sec - 4,559,682,745 cycles # 3.035 GHz - 13,762,791,022 instructions # 3.02 insn per cycle - 1.503172123 seconds time elapsed +TOTAL : 1.480982 sec + 4,560,162,627 cycles # 3.072 GHz + 13,762,313,674 instructions # 3.02 insn per cycle + 1.485020552 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96986) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.101340e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.103065e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.103065e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.154943e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.156669e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.156669e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.748885 sec - 2,136,693,329 cycles # 2.841 GHz - 4,817,082,222 instructions # 2.25 insn per cycle - 0.752876610 seconds time elapsed +TOTAL : 0.743454 sec + 2,138,545,582 cycles # 2.865 GHz + 4,816,682,793 instructions # 2.25 insn per cycle + 0.747370846 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84904) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.112158e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.114365e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.114365e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.228374e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.230533e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.230533e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.656308 sec - 1,869,942,366 cycles # 2.835 GHz - 4,274,318,244 instructions # 2.29 insn per cycle - 0.660301551 seconds time elapsed +TOTAL : 0.646748 sec + 1,869,005,080 cycles # 2.875 GHz + 4,273,904,960 instructions # 2.29 insn per cycle + 0.650625419 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:80610) (512y: 46) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.296564e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.298817e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.298817e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.373581e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.376135e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.376135e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.728798 sec - 1,352,736,555 cycles # 1.847 GHz - 2,158,877,197 instructions # 1.60 insn per cycle - 0.732817833 seconds time elapsed +TOTAL : 0.721971 sec + 1,354,973,724 cycles # 1.868 GHz + 2,158,504,507 instructions # 1.59 insn per cycle + 0.726042839 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2878) (512y: 49) (512z:79298) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt index 17034b30a2..b3edd3819c 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-03_19:39:36 +DATE: 2023-11-09_18:14:21 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gp Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.806522e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.808414e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.808414e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.797007e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.798750e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.798750e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187094e-05 +- 9.825664e-06 ) GeV^-6 -TOTAL : 1.595844 sec - 5,598,060,641 cycles # 2.994 GHz - 11,899,085,664 instructions # 2.13 insn per cycle - 1.927316991 seconds time elapsed +TOTAL : 1.595659 sec + 5,717,240,119 cycles # 3.061 GHz + 12,288,497,969 instructions # 2.15 insn per cycle + 1.924944467 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -80,14 +80,14 @@ WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.306726e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.320071e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.320071e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.290056e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.302765e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.302765e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856441e-04 +- 8.331096e-05 ) GeV^-6 -TOTAL : 1.896426 sec - 6,482,998,335 cycles # 2.990 GHz - 13,087,346,923 instructions # 2.02 insn per cycle - 2.228516012 seconds time elapsed +TOTAL : 1.886551 sec + 6,639,132,324 cycles # 3.056 GHz + 14,322,788,387 instructions # 2.16 insn per cycle + 2.229781396 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.982697e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.982966e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.982966e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.171261e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.171565e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.171565e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 5.882902 sec - 17,886,003,642 cycles # 3.039 GHz - 53,589,820,489 instructions # 3.00 insn per cycle - 5.886864227 seconds time elapsed +TOTAL : 5.764943 sec + 17,824,241,728 cycles # 3.090 GHz + 53,589,840,001 instructions # 3.01 insn per cycle + 5.768783827 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:20207) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe @@ -132,14 +132,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.517559e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.518006e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.518006e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.577193e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.577612e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.577612e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.508545 sec - 4,560,262,414 cycles # 3.016 GHz - 13,763,353,615 instructions # 3.02 insn per cycle - 1.512732617 seconds time elapsed +TOTAL : 1.481390 sec + 4,567,533,848 cycles # 3.077 GHz + 13,763,213,169 instructions # 3.01 insn per cycle + 1.485335177 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96986) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.047943e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.049624e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.049624e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.234763e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.236470e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.236470e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.755133 sec - 2,153,006,129 cycles # 2.839 GHz - 4,818,213,561 instructions # 2.24 insn per cycle - 0.759225829 seconds time elapsed +TOTAL : 0.735214 sec + 2,134,795,694 cycles # 2.891 GHz + 4,817,744,368 instructions # 2.26 insn per cycle + 0.739133829 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84904) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe @@ -188,14 +188,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.134004e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.136209e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.136209e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.254949e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.257396e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.257396e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.654556 sec - 1,870,329,136 cycles # 2.842 GHz - 4,274,869,931 instructions # 2.29 insn per cycle - 0.658687365 seconds time elapsed +TOTAL : 0.644560 sec + 1,871,614,525 cycles # 2.889 GHz + 4,274,807,727 instructions # 2.28 insn per cycle + 0.648424122 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:80610) (512y: 46) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe @@ -216,14 +216,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.265196e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.267580e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.267580e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.456942e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.459224e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.459224e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.732395 sec - 1,354,970,411 cycles # 1.842 GHz - 2,159,667,135 instructions # 1.59 insn per cycle - 0.736399157 seconds time elapsed +TOTAL : 0.714093 sec + 1,353,332,363 cycles # 1.886 GHz + 2,159,539,680 instructions # 1.60 insn per cycle + 0.718064585 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) 
(avx2: 2878) (512y: 49) (512z:79298) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt index 9247dc6a21..0346c64d8e 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-03_19:15:46 +DATE: 2023-11-09_17:51:59 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.757824e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.758656e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.758919e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.750539e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.751383e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.751707e+02 ) sec^-1 MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 1.658123 sec - 5,805,953,943 cycles # 3.008 GHz - 12,018,291,784 instructions # 2.07 insn per cycle - 1.988767308 seconds time elapsed +TOTAL : 1.659496 sec + 5,776,495,417 cycles # 2.991 GHz + 11,901,437,818 instructions # 2.06 insn per cycle + 2.001980183 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.327280e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.327957e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.328041e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.353072e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.353765e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.353865e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 -TOTAL : 1.929152 sec - 6,666,976,802 cycles # 3.013 GHz - 13,831,721,664 instructions # 2.07 insn per cycle - 2.269150647 seconds time elapsed +TOTAL : 1.912117 sec + 6,490,117,914 cycles # 2.968 GHz + 14,058,143,997 instructions # 2.17 insn per cycle + 2.245070466 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.798758e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.799028e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.799028e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.137878e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.138152e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.138152e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 6.002749 sec - 17,897,748,334 cycles # 2.981 GHz - 53,583,210,251 instructions # 2.99 insn per cycle - 6.006727820 seconds time elapsed +TOTAL : 5.784705 sec + 17,870,079,189 cycles # 3.088 GHz + 53,579,576,519 instructions # 3.00 insn per cycle + 5.788683996 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:20206) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.533102e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.533527e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.533527e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.609484e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.609917e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.609917e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.498905 sec - 4,550,573,846 cycles # 3.029 GHz - 13,756,139,320 instructions # 3.02 insn per cycle - 1.503009468 seconds time elapsed +TOTAL : 1.467915 sec + 4,547,996,475 cycles # 3.091 GHz + 13,755,684,665 instructions # 3.02 insn per cycle + 1.471804589 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96606) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.049905e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.051589e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.051589e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.135956e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.137601e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.137601e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.753863 sec - 2,147,980,052 cycles # 2.837 GHz - 4,819,413,658 instructions # 2.24 insn per cycle - 0.757858909 seconds time elapsed +TOTAL : 0.744886 sec + 2,148,725,562 cycles # 2.872 GHz + 4,818,942,438 instructions # 2.24 insn per cycle + 0.748866334 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:85359) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.121398e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.123528e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.123528e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.165432e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.167702e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.167702e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.655569 sec - 1,875,337,702 cycles # 2.847 GHz - 4,276,013,202 instructions # 2.28 insn per cycle - 0.659452126 seconds time elapsed +TOTAL : 0.651347 sec + 1,877,062,772 cycles # 2.867 GHz + 4,276,072,949 instructions # 2.28 insn per cycle + 0.655395180 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:81075) (512y: 26) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.258028e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.260328e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.260328e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.338677e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.341123e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.341123e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.732438 sec - 1,358,895,231 cycles # 1.851 GHz - 2,165,631,438 instructions # 1.59 insn per cycle - 0.736476884 seconds time elapsed +TOTAL : 0.724588 sec + 1,360,263,123 cycles # 1.868 GHz + 2,164,996,305 instructions # 1.59 insn per cycle + 0.728742359 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3475) (512y: 34) (512z:79492) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 67db6760e6..8c7934b526 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-03_19:16:34 +DATE: 2023-11-09_17:52:46 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.697393e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.698008e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.698206e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.693982e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.694475e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.694605e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.168263 sec - 7,466,161,453 cycles # 3.002 GHz - 16,782,968,221 instructions # 2.25 insn per cycle - 2.544374597 seconds time elapsed +TOTAL : 2.169924 sec + 7,570,631,130 cycles # 3.042 GHz + 15,729,510,401 instructions # 2.08 insn per cycle + 2.547214982 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.111494e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.111753e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.111788e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.111663e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.111941e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.111967e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.403934 sec - 11,261,999,951 cycles # 3.015 GHz - 23,279,217,600 instructions # 2.07 insn per cycle - 3.795199307 seconds time elapsed +TOTAL : 3.399776 sec + 11,464,618,476 cycles # 3.079 GHz + 23,776,601,911 instructions # 2.07 insn per cycle + 3.779394913 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.891205e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.891420e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.891420e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.884667e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.884874e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.884874e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.695697 sec - 19,121,802,644 cycles # 2.855 GHz - 54,152,938,154 instructions # 2.83 insn per cycle - 6.699723618 seconds time elapsed +TOTAL : 6.698742 sec + 19,113,024,695 cycles # 2.852 GHz + 54,153,033,540 instructions # 2.83 insn per cycle + 6.702658032 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:32066) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.589938e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.590022e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.590022e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.621402e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.621488e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.621488e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.327144 sec - 9,411,187,085 cycles # 2.826 GHz - 26,159,441,613 instructions # 2.78 insn per cycle - 3.331341639 seconds time elapsed +TOTAL : 3.261482 sec + 9,398,350,643 cycles # 2.879 GHz + 26,158,977,284 instructions # 2.78 insn per cycle + 3.265504352 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96005) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.556465e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.556911e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.556911e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.791341e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.791883e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.791883e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.488421 sec - 4,038,495,427 cycles # 2.707 GHz - 9,228,280,089 instructions # 2.29 insn per cycle - 1.492543554 seconds time elapsed +TOTAL : 1.398109 sec + 4,039,627,179 cycles # 2.883 GHz + 9,228,162,054 instructions # 2.28 insn per cycle + 1.402192827 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84155) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.276116e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.276827e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.276827e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.351031e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.351641e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.351641e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.240270 sec - 3,525,917,357 cycles # 2.835 GHz - 8,175,363,577 instructions # 2.32 insn per cycle - 1.244573424 seconds time elapsed +TOTAL : 1.218443 sec + 3,518,124,342 cycles # 2.879 GHz + 8,175,077,517 instructions # 2.32 insn per cycle + 1.222409560 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:79844) (512y: 79) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.671636e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.672174e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.672174e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.765628e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.766216e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.766216e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.443903 sec - 2,654,961,238 cycles # 1.834 GHz - 4,155,116,507 instructions # 1.57 insn per cycle - 1.448186385 seconds time elapsed +TOTAL : 1.407689 sec + 2,655,252,329 cycles # 1.882 GHz + 4,154,811,941 instructions # 1.56 insn per cycle + 1.411617738 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2045) (512y: 93) (512z:78760) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt index ba876e5994..b26dd71707 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-03_19:17:35 +DATE: 2023-11-09_17:53:47 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.679011e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.679665e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.679866e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.674330e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.674838e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.674969e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.173579 sec - 7,474,410,637 cycles # 3.001 GHz - 15,946,585,145 instructions # 2.13 insn per cycle - 2.550103231 seconds time elapsed +TOTAL : 2.174009 sec + 7,611,935,314 cycles # 3.054 GHz + 16,836,441,609 instructions # 2.21 insn per cycle + 2.551658489 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.109202e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.109461e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.109492e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.107370e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.107637e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.107663e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.402138 sec - 11,227,553,919 cycles # 3.005 GHz - 23,286,904,291 instructions # 2.07 insn per cycle - 3.792137186 seconds time elapsed +TOTAL : 3.413929 sec + 11,386,114,072 cycles # 3.048 GHz + 23,902,448,329 instructions # 2.10 insn per cycle + 3.794282526 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.862068e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.862272e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.862272e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.931164e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.931386e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.931386e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.723543 sec - 19,074,467,052 cycles # 2.836 GHz - 54,156,087,092 instructions # 2.84 insn per cycle - 6.727488337 seconds time elapsed +TOTAL : 6.662052 sec + 19,079,234,145 cycles # 2.863 GHz + 54,153,851,240 instructions # 2.84 insn per cycle + 6.666006074 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:32243) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.568667e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.568765e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.568765e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.620269e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.620358e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.620358e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.375716 sec - 9,382,313,393 cycles # 2.776 GHz - 26,079,058,590 instructions # 2.78 insn per cycle - 3.379999018 seconds time elapsed +TOTAL : 3.263602 sec + 9,383,434,712 cycles # 2.872 GHz + 26,078,178,648 instructions # 2.78 insn per cycle + 3.267785109 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:95899) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.662540e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.663002e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.663002e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.732412e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.732940e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.732940e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.448110 sec - 4,074,555,185 cycles # 2.807 GHz - 9,213,769,276 instructions # 2.26 insn per cycle - 1.452285529 seconds time elapsed +TOTAL : 1.420295 sec + 4,071,120,210 cycles # 2.859 GHz + 9,213,520,884 instructions # 2.26 insn per cycle + 1.424453149 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:83776) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.250454e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.251202e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.251202e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.308670e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.309271e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.309271e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.248074 sec - 3,536,570,557 cycles # 2.826 GHz - 8,168,521,757 instructions # 2.31 insn per cycle - 1.252256213 seconds time elapsed +TOTAL : 1.231097 sec + 3,538,361,762 cycles # 2.867 GHz + 8,168,060,632 instructions # 2.31 insn per cycle + 1.234995598 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:79373) (512y: 229) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.691090e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.691677e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.691677e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.830037e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.830636e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.830636e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.437256 sec - 2,622,132,529 cycles # 1.820 GHz - 4,153,851,791 instructions # 1.58 insn per cycle - 1.441375266 seconds time elapsed +TOTAL : 1.385915 sec + 2,618,303,188 cycles # 1.885 GHz + 4,153,502,106 instructions # 1.59 insn per cycle + 1.389952232 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1492) (512y: 175) (512z:78776) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 32c5e2345e..6d792821e6 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-03_19:11:10 +DATE: 2023-11-09_17:47:24 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.931878e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.341004e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.663503e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.838115e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.336717e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.669956e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.446607 sec - 1,970,164,515 cycles # 2.938 GHz - 2,759,248,123 instructions # 1.40 insn per cycle - 0.729204009 seconds time elapsed +TOTAL : 0.441585 sec + 1,966,591,447 cycles # 2.991 GHz + 2,767,621,879 instructions # 1.41 insn per cycle + 0.715631755 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.710415e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.163714e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.497427e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.614381e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.150528e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.499874e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.523022 sec - 2,217,601,456 cycles # 2.938 GHz - 3,205,519,009 instructions # 1.45 insn per cycle - 0.813078242 seconds time elapsed +TOTAL : 0.519980 sec + 2,272,719,542 cycles # 3.015 GHz + 3,282,015,462 instructions # 1.44 insn per cycle + 0.810626224 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.073669e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.096220e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.096220e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.097912e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.120487e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.120487e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.548155 sec - 4,698,700,649 cycles # 3.029 GHz - 13,467,797,998 instructions # 2.87 insn per cycle - 1.552304744 seconds time elapsed +TOTAL : 1.514129 sec + 4,699,091,915 cycles # 3.096 GHz + 13,466,947,436 instructions # 2.87 insn per cycle + 1.518294228 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 860) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.948763e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.021816e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.021816e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.983607e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.058142e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.058142e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.862036 sec - 2,624,478,574 cycles # 3.032 GHz - 7,556,486,050 instructions # 2.88 insn per cycle - 0.866308924 seconds time elapsed +TOTAL : 0.847498 sec + 2,625,908,011 cycles # 3.086 GHz + 7,555,492,469 instructions # 2.88 insn per cycle + 0.851823974 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3095) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.306326e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.524533e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.524533e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.394636e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.619511e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.619511e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.517734 sec - 1,480,526,951 cycles # 2.839 GHz - 3,123,082,416 instructions # 2.11 insn per cycle - 0.522085763 seconds time elapsed +TOTAL : 0.504120 sec + 1,476,957,330 cycles # 2.909 GHz + 3,122,047,526 instructions # 2.11 insn per cycle + 0.508259108 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.669407e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.933881e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.933881e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.754841e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.026481e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.026481e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.468132 sec - 1,341,729,382 cycles # 2.844 GHz - 2,984,537,487 instructions # 2.22 insn per cycle - 0.472335074 seconds time elapsed +TOTAL : 0.457617 sec + 1,342,416,487 cycles # 2.911 GHz + 2,984,161,058 instructions # 2.22 insn per cycle + 0.461673437 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2694) (512y: 104) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.279474e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.384367e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.384367e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.547509e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.672958e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.672958e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.743758 sec - 1,327,382,690 cycles # 1.776 GHz - 1,956,119,028 instructions # 1.47 insn per cycle - 0.747985259 seconds time elapsed +TOTAL : 0.666989 sec + 1,325,861,856 cycles # 1.977 GHz + 1,955,811,920 instructions # 1.48 insn per cycle + 0.671229633 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1372) (512y: 106) (512z: 2173) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt index 83cbc116b3..8337df6649 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-03_19:36:51 +DATE: 2023-11-09_18:11:37 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.568026e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.132079e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.132079e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.580013e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.253753e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.253753e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.473711 sec - 2,006,451,769 cycles # 2.929 GHz - 2,970,353,925 instructions # 1.48 insn per cycle - 0.742629859 seconds time elapsed +TOTAL : 0.470514 sec + 2,029,905,705 cycles # 2.983 GHz + 3,022,396,069 instructions # 1.49 insn per cycle + 0.739050820 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -80,14 +80,14 @@ WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.250433e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.283042e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.283042e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.291351e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.372563e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.372563e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.748674 sec - 3,002,657,574 cycles # 2.966 GHz - 4,543,695,427 instructions # 1.51 insn per cycle - 1.069550305 seconds time elapsed +TOTAL : 0.742777 sec + 2,970,951,255 cycles # 2.999 GHz + 4,514,637,368 instructions # 1.52 insn per cycle + 1.047584901 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.069178e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.091931e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.091931e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.084622e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.107537e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.107537e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.560726 sec - 4,731,718,585 cycles # 3.025 GHz - 13,472,168,375 instructions # 2.85 insn per cycle - 1.565141837 seconds time elapsed +TOTAL : 1.538614 sec + 4,724,111,132 cycles # 3.063 GHz + 13,474,132,709 instructions # 2.85 insn per cycle + 1.542829058 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 860) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe @@ -132,14 +132,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.899999e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.973174e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.973174e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.968452e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.042732e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.042732e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.892296 sec - 2,670,244,018 cycles # 2.980 GHz - 7,605,526,435 instructions # 2.85 insn per cycle - 0.896907337 seconds time elapsed +TOTAL : 0.860148 sec + 2,657,657,312 cycles # 3.076 GHz + 7,605,024,054 instructions # 2.86 insn per cycle + 0.864557816 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3095) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.091835e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.296236e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.296236e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.339093e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.562110e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.562110e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.561077 sec - 1,524,432,631 cycles # 2.698 GHz - 3,172,781,548 instructions # 2.08 insn per cycle - 0.565642937 seconds time elapsed +TOTAL : 0.520524 sec + 1,514,451,185 cycles # 2.892 GHz + 3,172,765,595 instructions # 2.09 insn per cycle + 0.524939185 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe @@ -188,14 +188,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.608228e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.871141e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.871141e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.708754e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.978270e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.978270e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.483758 sec - 1,382,209,807 cycles # 2.835 GHz - 3,035,256,040 instructions # 2.20 insn per cycle - 0.488244630 seconds time elapsed +TOTAL : 0.469539 sec + 1,371,933,121 cycles # 2.899 GHz + 3,033,200,949 instructions # 2.21 insn per cycle + 0.473789571 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2694) (512y: 104) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe @@ -216,14 +216,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.425183e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.544675e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.544675e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.533145e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.657118e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.657118e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.707575 sec - 1,368,070,277 cycles # 1.923 GHz - 1,995,483,449 instructions # 1.46 insn per cycle - 0.712159059 seconds time elapsed +TOTAL : 0.676505 sec + 1,357,238,089 cycles # 1.995 GHz + 1,995,412,477 instructions # 1.47 insn per cycle + 0.680880338 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1372) (512y: 106) (512z: 2173) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt index 5c16312148..2ec6b9dc47 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-03_19:11:27 +DATE: 2023-11-09_17:47:41 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.898292e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.236740e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.548470e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.819082e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.206686e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.526015e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.444696 sec - 1,938,726,844 cycles # 2.937 GHz - 2,756,323,630 instructions # 1.42 insn per cycle - 0.718363875 seconds time elapsed +TOTAL : 0.443165 sec + 1,961,379,003 cycles # 2.989 GHz + 2,781,357,072 instructions # 1.42 insn per cycle + 0.713330689 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.682843e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.082328e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.409380e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.580414e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.034117e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.374431e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.522202 sec - 2,220,530,283 cycles # 2.941 GHz - 3,184,953,404 instructions # 1.43 insn per cycle - 0.811776517 seconds time elapsed +TOTAL : 0.527713 sec + 2,200,622,669 cycles # 2.860 GHz + 3,134,287,672 instructions # 1.42 insn per cycle + 0.826392715 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.070337e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.092872e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.092872e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.033805e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.055304e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.055304e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.552829 sec - 4,705,329,544 cycles # 3.023 GHz - 13,461,758,666 instructions # 2.86 insn per cycle - 1.556952692 seconds time elapsed +TOTAL : 1.607267 sec + 4,703,491,098 cycles # 2.920 GHz + 13,461,246,606 instructions # 2.86 insn per cycle + 1.611368977 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 849) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.948045e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.021952e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.021952e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.985735e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.061359e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.061359e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.862207 sec - 2,624,178,818 cycles # 3.031 GHz - 7,555,487,904 instructions # 2.88 insn per cycle - 0.866510467 seconds time elapsed +TOTAL : 0.845910 sec + 2,624,687,455 cycles # 3.090 GHz + 7,554,687,341 instructions # 2.88 insn per cycle + 0.850163593 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3088) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.292100e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.512278e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.512278e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.383208e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.600735e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.600735e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.519779 sec - 1,479,324,919 cycles # 2.825 GHz - 3,121,432,800 instructions # 2.11 insn per cycle - 0.524166869 seconds time elapsed +TOTAL : 0.505300 sec + 1,477,429,478 cycles # 2.904 GHz + 3,120,730,266 instructions # 2.11 insn per cycle + 0.509369657 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2900) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.586783e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.851292e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.851292e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.736623e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.003084e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.003084e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.479341 sec - 1,345,156,968 cycles # 2.785 GHz - 2,982,279,143 instructions # 2.22 insn per cycle - 0.483569808 seconds time elapsed +TOTAL : 0.460033 sec + 1,340,907,328 cycles # 2.892 GHz + 2,981,159,149 instructions # 2.22 insn per cycle + 0.464174349 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2670) (512y: 104) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.481639e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.600263e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.600263e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.537070e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.658764e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.658764e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.683909 sec - 1,326,826,217 cycles # 1.930 GHz - 1,955,120,469 instructions # 1.47 insn per cycle - 0.688253496 seconds time elapsed +TOTAL : 0.669277 sec + 1,326,031,179 cycles # 1.971 GHz + 1,954,098,862 instructions # 1.47 insn per cycle + 0.673467594 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1348) (512y: 106) (512z: 2173) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 59e9dbfb13..25d66c7041 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-03_19:11:45 +DATE: 2023-11-09_17:47:59 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.904199e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.231536e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.359887e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.746320e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.236957e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.360917e+08 ) sec^-1 MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 -TOTAL : 0.438419 sec - 1,915,720,301 cycles # 2.940 GHz - 2,722,845,778 instructions # 1.42 insn per cycle - 0.708695201 seconds time elapsed +TOTAL : 0.440483 sec + 1,942,018,247 cycles # 2.976 GHz + 2,734,614,888 instructions # 1.41 insn per cycle + 0.710582968 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.256707e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.834983e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.952518e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.010716e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.836484e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.960610e+08 ) sec^-1 MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2 -TOTAL : 0.473385 sec - 2,068,832,196 cycles # 2.955 GHz - 2,965,580,704 instructions # 1.43 insn per cycle - 0.757067346 seconds time elapsed +TOTAL : 0.476376 sec + 2,093,828,578 cycles # 2.973 GHz + 2,983,215,115 instructions # 1.42 insn per cycle + 0.763925577 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.135878e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.161149e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.161149e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.150183e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.175828e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.175828e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.462461 sec - 4,454,737,328 cycles # 3.039 GHz - 13,053,159,453 instructions # 2.93 insn per cycle - 1.466494148 seconds time elapsed +TOTAL : 1.444504 sec + 4,454,034,181 cycles # 3.077 GHz + 13,052,158,813 instructions # 2.93 insn per cycle + 1.448436066 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 745) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.046237e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.238088e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.238088e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.075306e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.270472e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.270472e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.557296 sec - 1,699,998,155 cycles # 3.031 GHz - 4,515,681,552 instructions # 2.66 insn per cycle - 0.561435544 seconds time elapsed +TOTAL : 0.552218 sec + 1,700,873,014 cycles # 3.061 GHz + 4,515,081,496 instructions # 2.65 insn per cycle + 0.556201186 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3601) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.648399e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.355867e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.355867e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.031649e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.790374e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.790374e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.311054 sec - 851,131,460 cycles # 2.704 GHz - 1,899,263,660 instructions # 2.23 insn per cycle - 0.315235937 seconds time elapsed +TOTAL : 0.291602 sec + 850,563,357 cycles # 2.883 GHz + 1,898,510,633 instructions # 2.23 insn per cycle + 0.295657443 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3491) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.243995e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.098185e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.098185e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.014318e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.832565e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.832565e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.282792 sec - 800,211,416 cycles # 2.794 GHz - 1,822,370,089 instructions # 2.28 insn per cycle - 0.286974618 seconds time elapsed +TOTAL : 0.293482 sec + 802,625,962 cycles # 2.700 GHz + 1,821,591,063 instructions # 2.27 insn per cycle + 0.297764671 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3335) (512y: 22) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe @@ -194,9 +194,9 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions - 29,217,754 cycles # 2.652 GHz - 42,284,295 instructions # 1.45 insn per cycle - 0.011406114 seconds time elapsed + 29,732,895 cycles # 2.697 GHz + 41,670,508 instructions # 1.40 insn per cycle + 0.011409242 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1969) (512y: 32) (512z: 2383) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt index f15afb12c1..687daa906c 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-03_19:37:10 +DATE: 2023-11-09_18:11:56 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.572083e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.023629e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.023629e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.747186e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.237510e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.237510e+07 ) sec^-1 MeanMatrixElemValue = ( 2.017654e+01 +- 1.429184e+01 ) GeV^-2 -TOTAL : 0.454387 sec - 1,955,352,187 cycles # 2.938 GHz - 2,863,812,902 instructions # 1.46 insn per cycle - 0.722319097 seconds time elapsed +TOTAL : 0.451024 sec + 1,971,298,564 cycles # 2.989 GHz + 2,921,186,990 instructions # 1.48 insn per cycle + 0.718171404 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -80,14 +80,14 @@ WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.087118e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.599283e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.599283e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.154719e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.829239e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.829239e+07 ) sec^-1 MeanMatrixElemValue = ( 2.609942e+02 +- 2.115590e+02 ) GeV^-2 -TOTAL : 0.623566 sec - 2,498,674,729 cycles # 2.923 GHz - 3,766,117,574 instructions # 1.51 insn per cycle - 0.913465239 seconds time elapsed +TOTAL : 0.620701 sec + 2,514,914,307 cycles # 2.959 GHz + 3,812,117,615 instructions # 1.52 insn per cycle + 0.908673198 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.124937e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.150391e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.150391e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.130761e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.156131e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.156131e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.480709 sec - 4,471,348,915 cycles # 3.013 GHz - 13,056,806,498 instructions # 2.92 insn per cycle - 1.485019275 seconds time elapsed +TOTAL : 1.472797 sec + 4,472,979,155 cycles # 3.030 GHz + 13,056,761,338 instructions # 2.92 insn per cycle + 1.477050712 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 745) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe @@ -132,14 +132,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.015036e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.208624e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.208624e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.077738e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.274919e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.274919e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.567631 sec - 1,721,622,943 cycles # 3.014 GHz - 4,563,283,810 instructions # 2.65 insn per cycle - 0.571796628 seconds time elapsed +TOTAL : 0.555619 sec + 1,722,866,665 cycles # 3.081 GHz + 4,563,322,469 instructions # 2.65 insn per cycle + 0.559797755 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3601) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.904492e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.650265e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.650265e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.956375e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.689121e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.689121e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.302253 sec - 872,846,100 cycles # 2.852 GHz - 1,935,401,156 instructions # 2.22 insn per cycle - 0.306655862 seconds time elapsed +TOTAL : 0.298686 sec + 869,037,023 cycles # 2.875 GHz + 1,935,544,426 instructions # 2.23 insn per cycle + 0.302811266 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3491) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe @@ -188,14 +188,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.271441e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.120717e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.120717e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.465666e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.344453e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.344453e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.285638 sec - 819,147,203 cycles # 2.831 GHz - 1,858,340,668 instructions # 2.27 insn per cycle - 0.289825539 seconds time elapsed +TOTAL : 0.276910 sec + 817,448,595 cycles # 2.915 GHz + 1,858,610,780 instructions # 2.27 insn per cycle + 0.280974833 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3335) (512y: 22) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe @@ -211,9 +211,9 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) - 37,779,421 cycles # 2.664 GHz - 50,267,131 instructions # 1.33 insn per cycle - 0.014729622 seconds time elapsed + 37,531,426 cycles # 2.805 GHz + 50,366,354 instructions # 1.34 insn per cycle + 0.013813903 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1969) (512y: 32) (512z: 2383) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt index c8e32c45f6..8bc404b84b 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-03_19:12:01 +DATE: 2023-11-09_17:48:15 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.816263e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.233557e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.356584e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.693711e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.215042e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.339602e+08 ) sec^-1 MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 -TOTAL : 0.439029 sec - 1,906,384,387 cycles # 2.932 GHz - 2,668,630,925 instructions # 1.40 insn per cycle - 0.709025104 seconds time elapsed +TOTAL : 0.438778 sec + 1,941,964,603 cycles # 2.979 GHz + 2,729,283,404 instructions # 1.41 insn per cycle + 0.709465120 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.165457e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.788318e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.899924e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.971564e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.799531e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.917090e+08 ) sec^-1 MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2 -TOTAL : 0.475153 sec - 2,060,825,458 cycles # 2.945 GHz - 2,959,751,148 instructions # 1.44 insn per cycle - 0.758667305 seconds time elapsed +TOTAL : 0.470244 sec + 2,084,172,928 cycles # 3.008 GHz + 2,971,888,877 instructions # 1.43 insn per cycle + 0.750810002 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.129555e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.154905e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.154905e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.156981e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.183336e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.183336e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.470613 sec - 4,452,780,841 cycles # 3.021 GHz - 13,033,295,085 instructions # 2.93 insn per cycle - 1.474743963 seconds time elapsed +TOTAL : 1.435690 sec + 4,451,626,158 cycles # 3.094 GHz + 13,032,987,489 instructions # 2.93 insn per cycle + 1.439578191 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 727) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.000043e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.190804e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.190804e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.129722e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.328624e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.328624e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.566289 sec - 1,691,331,084 cycles # 2.968 GHz - 4,511,809,710 instructions # 2.67 insn per cycle - 0.570477990 seconds time elapsed +TOTAL : 0.542754 sec + 1,689,058,698 cycles # 3.092 GHz + 4,510,968,389 instructions # 2.67 insn per cycle + 0.546880720 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3589) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.392978e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.034440e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.034440e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.059640e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.837369e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.837369e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.325119 sec - 853,124,200 cycles # 2.596 GHz - 1,896,337,755 instructions # 2.22 insn per cycle - 0.329328797 seconds time elapsed +TOTAL : 0.290425 sec + 852,449,044 cycles # 2.901 GHz + 1,895,470,717 instructions # 2.22 insn per cycle + 0.294595816 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3461) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.399192e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.280649e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.280649e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.503379e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.376998e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.376998e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.275864 sec - 799,266,525 cycles # 2.860 GHz - 1,818,357,527 instructions # 2.28 insn per cycle - 0.279975539 seconds time elapsed +TOTAL : 0.271227 sec + 799,263,402 cycles # 2.909 GHz + 1,817,410,136 instructions # 2.27 insn per cycle + 0.275264605 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3298) (512y: 22) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest.exe @@ -194,9 +194,9 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions - 28,695,242 cycles # 2.686 GHz - 41,682,313 instructions # 1.45 insn per cycle - 0.011083970 seconds time elapsed + 28,811,890 cycles # 2.702 GHz + 40,903,960 instructions # 1.42 insn per cycle + 0.011044926 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1932) (512y: 32) (512z: 2383) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index 2f090614c3..eab7ec279c 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-03_19:12:17 +DATE: 2023-11-09_17:48:31 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.924011e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.312316e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.652376e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.897435e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.394394e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.726655e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.444193 sec - 1,982,708,723 cycles # 2.960 GHz - 2,773,326,834 instructions # 1.40 insn per cycle - 0.727594315 seconds time elapsed +TOTAL : 0.442647 sec + 2,004,900,013 cycles # 3.007 GHz + 2,826,895,466 instructions # 1.41 insn per cycle + 0.724412660 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.716781e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.189044e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.525460e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.620708e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.161875e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.511766e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.521362 sec - 2,209,841,644 cycles # 2.939 GHz - 3,173,284,555 instructions # 1.44 insn per cycle - 0.811280771 seconds time elapsed +TOTAL : 0.520790 sec + 2,257,400,704 cycles # 2.997 GHz + 3,259,917,218 instructions # 1.44 insn per cycle + 0.810908697 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.069544e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.093797e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.093797e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.088540e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.110933e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.110933e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.554251 sec - 4,735,824,731 cycles # 3.041 GHz - 13,470,683,397 instructions # 2.84 insn per cycle - 1.558385201 seconds time elapsed +TOTAL : 1.526966 sec + 4,723,154,452 cycles # 3.087 GHz + 13,469,602,667 instructions # 2.85 insn per cycle + 1.531097432 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 840) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.965218e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.040121e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.040121e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.988494e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.063440e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.063440e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.855114 sec - 2,601,303,673 cycles # 3.029 GHz - 7,389,579,625 instructions # 2.84 insn per cycle - 0.859411839 seconds time elapsed +TOTAL : 0.845345 sec + 2,599,329,855 cycles # 3.062 GHz + 7,388,612,618 instructions # 2.84 insn per cycle + 0.849529924 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3073) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.103178e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.304731e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.304731e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.404332e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.629825e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.629825e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.550950 sec - 1,470,989,933 cycles # 2.653 GHz - 3,058,765,662 instructions # 2.08 insn per cycle - 0.555184249 seconds time elapsed +TOTAL : 0.502979 sec + 1,466,711,057 cycles # 2.896 GHz + 3,057,623,965 instructions # 2.08 insn per cycle + 0.507143043 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3013) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.774277e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.060098e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.060098e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.803609e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.085245e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.085245e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.455765 sec - 1,309,522,407 cycles # 2.852 GHz - 2,933,428,757 instructions # 2.24 insn per cycle - 0.459981977 seconds time elapsed +TOTAL : 0.452713 sec + 1,309,685,857 cycles # 2.871 GHz + 2,932,566,248 instructions # 2.24 insn per cycle + 0.456835979 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2799) (512y: 110) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.411920e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.526016e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.526016e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.397391e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.510097e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.510097e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.703353 sec - 1,366,582,014 cycles # 1.933 GHz - 1,972,774,215 instructions # 1.44 insn per cycle - 0.707707323 seconds time elapsed +TOTAL : 0.707515 sec + 1,366,670,273 cycles # 1.922 GHz + 1,971,774,412 instructions # 1.44 insn per cycle + 0.711692701 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1700) (512y: 114) (512z: 2171) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt index f9fb6155f7..804124a528 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-03_19:12:35 +DATE: 2023-11-09_17:48:49 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.886874e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.228157e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.568514e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.811798e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.176696e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.495530e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.444068 sec - 1,946,521,853 cycles # 2.951 GHz - 2,755,422,178 instructions # 1.42 insn per cycle - 0.717280231 seconds time elapsed +TOTAL : 0.443833 sec + 2,007,951,396 cycles # 2.999 GHz + 2,822,905,809 instructions # 1.41 insn per cycle + 0.728453943 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.675020e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.027076e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.349457e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.587196e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.041060e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.377539e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.523198 sec - 2,222,274,900 cycles # 2.946 GHz - 3,198,191,753 instructions # 1.44 insn per cycle - 0.813003520 seconds time elapsed +TOTAL : 0.523091 sec + 2,298,379,472 cycles # 2.986 GHz + 3,299,691,245 instructions # 1.44 insn per cycle + 0.827230276 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.069395e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.091866e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.091866e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.081127e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.103599e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.103599e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.554132 sec - 4,733,166,680 cycles # 3.039 GHz - 13,456,716,984 instructions # 2.84 insn per cycle - 1.558278315 seconds time elapsed +TOTAL : 1.537190 sec + 4,726,723,623 cycles # 3.068 GHz + 13,455,766,194 instructions # 2.85 insn per cycle + 1.541247326 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 827) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.963106e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.038064e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.038064e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.984806e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.061569e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.061569e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.856001 sec - 2,603,447,344 cycles # 3.028 GHz - 7,393,362,148 instructions # 2.84 insn per cycle - 0.860294166 seconds time elapsed +TOTAL : 0.846326 sec + 2,602,293,302 cycles # 3.065 GHz + 7,392,635,608 instructions # 2.84 insn per cycle + 0.850454133 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3062) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.354162e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.573385e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.573385e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.380134e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.599128e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.599128e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.509733 sec - 1,467,381,346 cycles # 2.859 GHz - 3,058,521,485 instructions # 2.08 insn per cycle - 0.513844239 seconds time elapsed +TOTAL : 0.506085 sec + 1,466,467,612 cycles # 2.876 GHz + 3,058,106,145 instructions # 2.09 insn per cycle + 0.510457197 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2990) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.783084e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.065773e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.065773e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.778195e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.059768e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.059768e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.454796 sec - 1,307,019,802 cycles # 2.851 GHz - 2,934,565,738 instructions # 2.25 insn per cycle - 0.459066978 seconds time elapsed +TOTAL : 0.455384 sec + 1,311,774,111 cycles # 2.858 GHz + 2,933,399,487 instructions # 2.24 insn per cycle + 0.459674797 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2775) (512y: 110) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.408065e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.519741e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.519741e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.385780e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.497799e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.497799e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.704354 sec - 1,368,218,437 cycles # 1.933 GHz - 1,972,609,636 instructions # 1.44 insn per cycle - 0.708886358 seconds time elapsed +TOTAL : 0.711136 sec + 1,370,131,308 cycles # 1.917 GHz + 1,971,581,787 instructions # 1.44 insn per cycle + 0.715633425 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1676) (512y: 114) (512z: 2171) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest.exe