From a1de7d6021b9d92d3456cbd8ea1d55ecc4f43d57 Mon Sep 17 00:00:00 2001
From: cguzman <christian.guzman@bsc.es>
Date: Tue, 22 Oct 2019 13:23:20 +0200
Subject: [PATCH 1/4] Added compilation of the tests on GPU: For the moment,
 only unit_tests_arrhenius added. The implementation it's a bit ugly (can't be
 executed in parallel, duplicate .sh test files with _gpu.sh) , but it works
 and only do a minor changes on CmakeLists.txt

How it works: basically it creates a partmclib_gpu (apart from partmclib) with all files compiled with ENABLE_GPU=ON, and if cuda_found, a new test _gpu test is generated
---
 CMakeLists.txt                                | 135 +++++++++++++-----
 src/camp_solver.c                             |  12 +-
 src/cuda/camp_gpu_solver.cu                   |  30 ++--
 src/cuda/camp_gpu_solver.h                    |   1 -
 src/cuda/rxns_gpu/rxn_aqueous_equilibrium.cu  |  10 +-
 src/cuda/rxns_gpu/rxn_arrhenius.cu            |  34 +++--
 src/rxn_solver.c                              |   4 +
 .../input_files/run_rxn_arrhenius_gpu.sh      |  35 +++++
 8 files changed, 183 insertions(+), 78 deletions(-)
 create mode 100755 test/unit_tests/input_files/run_rxn_arrhenius_gpu.sh

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 97551690b..3ba098a90 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -170,11 +170,21 @@ endif()
 ######################################################################
 # GPU support
 
-if (ENABLE_GPU)
-  add_definitions(-DPMC_USE_GPU)
-  enable_language(CUDA)
+
+#Compile GPU tests if CUDA detected
+find_package(CUDA)
+if (CUDA_FOUND)
+    enable_language(CUDA)
+    #If enable_gpu, all will be executed on GPU
+    if (ENABLE_GPU)
+        #Define global variable of use gpu no matter what test to include user main test
+        add_definitions(-DPMC_USE_GPU)
+    endif()
 endif()
 
+if(NOT CUDA_FOUND AND ENABLE_GPU)
+    message( WARNING "Not CUDA module found, can't support GPU computation")
+endif()
 
 ######################################################################
 # debugging options
@@ -236,6 +246,13 @@ else()
   set(MPI_TEST_FLAG serial)
 endif()
 
+if(CUDA_FOUND)
+    add_test(test_rxn_arrhenius_mech_gpu ${CMAKE_BINARY_DIR}/test_run/unit_tests/input_files/run_rxn_arrhenius_gpu.sh ${MPI_TEST_FLAG})
+endif()
+
+
+add_test(test_rxn_arrhenius_mech ${CMAKE_BINARY_DIR}/test_run/unit_tests/input_files/run_rxn_arrhenius.sh ${MPI_TEST_FLAG})
+
 if (ENABLE_TESTS)
     if (ENABLE_JSON AND ENABLE_SUNDIALS)
       # New unit tests (UNDER DEVELOPMENT)
@@ -411,8 +428,12 @@ endif()
 # partmc library
 
 set(STD_C_FLAGS "-std=c99")
+
 set(STD_CUDA_FLAGS "-dc -arch=compute_70 -code=sm_70")
 
+
+
+
 if(${CMAKE_Fortran_COMPILER_ID} MATCHES "Intel")
   set(STD_F_FLAGS "-extend_source -warn truncated_source")
 elseif(${CMAKE_Fortran_COMPILER_ID} MATCHES "GNU")
@@ -498,7 +519,8 @@ set_source_files_properties(${CAMP_C_SRC} PROPERTIES COMPILE_FLAGS
 
 set(CAMP_CXX_SRC "")
 
-if(ENABLE_GPU)
+if (CUDA_FOUND)
+#if(ENABLE_GPU)
   set(CAMP_CUDA_SRC
           src/cuda/camp_gpu_solver.cu
           src/cuda/rxns_gpu/rxn_aqueous_equilibrium.cu
@@ -521,33 +543,38 @@ if(ENABLE_GPU)
 
   set_source_files_properties(${CAMP_CUDA_SRC} PROPERTIES COMPILE_FLAGS
         ${STD_CUDA_FLAGS})
+
   set_source_files_properties(${CAMP_CUDA_SRC} PROPERTIES LANGUAGE CUDA)
 else()
-  set(CAMP_CUDA_SRC "")
+#  set(CAMP_CUDA_SRC "")
 endif()
 
-add_library(partmclib src/aero_state.F90 src/integer_varray.F90
-  src/integer_rmap.F90 src/integer_rmap2.F90 src/aero_sorted.F90
-  src/aero_binned.F90 src/bin_grid.F90 src/constants.F90
-  src/scenario.F90 src/env_state.F90 src/aero_mode.F90
-  src/aero_dist.F90 src/aero_weight.F90 src/aero_weight_array.F90
-  src/coag_kernel_additive.F90 src/coag_kernel_sedi.F90
-  src/coag_kernel_constant.F90 src/coag_kernel_brown.F90
-  src/coag_kernel_zero.F90 src/coag_kernel_brown_free.F90
-  src/coag_kernel_brown_cont.F90 src/aero_data.F90 src/run_exact.F90
-  src/run_part.F90 src/util.F90 src/stats.F90 src/run_sect.F90 src/output.F90
-  src/mosaic.F90 src/gas_data.F90 src/gas_state.F90
-  src/coagulation.F90 src/exact_soln.F90 src/coagulation_dist.F90
-  src/coag_kernel.F90 src/spec_line.F90 src/spec_file.F90 src/rand.F90
-  src/aero_particle.F90 src/aero_particle_array.F90 src/mpi.F90
-  src/netcdf.F90 src/aero_info.F90 src/aero_info_array.F90
-  src/nucleate.F90 src/condense.F90 src/fractal.F90 src/chamber.F90
-  src/property.F90 src/chem_spec_data.F90
-  src/rxn_data.F90 src/camp_state.F90 src/mechanism_data.F90
-  src/camp_core.F90 src/camp_solver_data.F90 src/aero_rep_data.F90
-  src/aero_phase_data.F90 src/aero_rep_factory.F90 src/camp_interface.F90
-  src/rxn_factory.F90 src/sub_model_data.F90 src/sub_model_factory.F90
-  src/solver_stats.F90 src/camp_box_model_data.F90
+set(CAMP_F_SRC
+        src/aero_state.F90 src/integer_varray.F90
+        src/integer_rmap.F90 src/integer_rmap2.F90 src/aero_sorted.F90
+        src/aero_binned.F90 src/bin_grid.F90 src/constants.F90
+        src/scenario.F90 src/env_state.F90 src/aero_mode.F90
+        src/aero_dist.F90 src/aero_weight.F90 src/aero_weight_array.F90
+        src/coag_kernel_additive.F90 src/coag_kernel_sedi.F90
+        src/coag_kernel_constant.F90 src/coag_kernel_brown.F90
+        src/coag_kernel_zero.F90 src/coag_kernel_brown_free.F90
+        src/coag_kernel_brown_cont.F90 src/aero_data.F90 src/run_exact.F90
+        src/run_part.F90 src/util.F90 src/stats.F90 src/run_sect.F90 src/output.F90
+        src/mosaic.F90 src/gas_data.F90 src/gas_state.F90
+        src/coagulation.F90 src/exact_soln.F90 src/coagulation_dist.F90
+        src/coag_kernel.F90 src/spec_line.F90 src/spec_file.F90 src/rand.F90
+        src/aero_particle.F90 src/aero_particle_array.F90 src/mpi.F90
+        src/netcdf.F90 src/aero_info.F90 src/aero_info_array.F90
+        src/nucleate.F90 src/condense.F90 src/fractal.F90 src/chamber.F90
+        src/property.F90 src/chem_spec_data.F90
+        src/rxn_data.F90 src/camp_state.F90 src/mechanism_data.F90
+        src/camp_core.F90 src/camp_solver_data.F90 src/aero_rep_data.F90
+        src/aero_phase_data.F90 src/aero_rep_factory.F90 src/camp_interface.F90
+        src/rxn_factory.F90 src/sub_model_data.F90 src/sub_model_factory.F90
+        src/solver_stats.F90 src/camp_box_model_data.F90
+        )
+
+add_library(partmclib ${CAMP_F_SRC}
   ${CAMP_C_SRC} ${AEROSOL_REPS_SRC} ${SUB_MODELS_SRC} ${REACTIONS_SRC}
   ${SUNDIALS_SRC} ${GSL_SRC} ${C_SORT_SRC} ${CAMP_CUDA_SRC} ${CAMP_CXX_SRC} )
 
@@ -556,6 +583,24 @@ target_link_libraries(partmclib ${NETCDF_LIBS} ${SUNDIALS_LIBS}
 
 set_target_properties(partmclib PROPERTIES OUTPUT_NAME partmc)
 
+if (CUDA_FOUND)
+    add_library(partmclib_gpu ${CAMP_F_SRC}
+            ${CAMP_C_SRC} ${AEROSOL_REPS_SRC} ${SUB_MODELS_SRC} ${REACTIONS_SRC}
+            ${SUNDIALS_SRC} ${GSL_SRC} ${C_SORT_SRC} ${CAMP_CUDA_SRC} ${CAMP_CXX_SRC} )
+
+    target_compile_definitions(partmclib_gpu PRIVATE
+            "PMC_USE_GPU=ON")
+
+    target_link_libraries(partmclib_gpu ${NETCDF_LIBS} ${SUNDIALS_LIBS}
+            ${MOSAIC_LIB} ${GSL_LIBS} ${JSON_LIB})
+
+    set_target_properties(partmclib_gpu PROPERTIES OUTPUT_NAME partmc_gpu)
+
+    add_executable(partmc_gpu src/partmc.F90)
+
+    target_link_libraries(partmc_gpu partmclib_gpu)
+endif()
+
 ######################################################################
 # partmc executable
 
@@ -630,6 +675,28 @@ add_executable(mock_monarch test/monarch/mock_monarch.F90
 
 target_link_libraries(mock_monarch partmclib)
 
+
+set(UNIT_TEST_SRC
+        test/unit_tests/unit_test_data.F90
+        test/unit_tests/unit_test_driver.F90)
+
+set(UNIT_TEST_RXN_ARRHENIUS_SRC
+        test/unit_tests/rxns/unit_test_rxn_arrhenius.F90 ${UNIT_TEST_SRC})
+set_source_files_properties(${UNIT_TEST_RXN_ARRHENIUS_SRC} PROPERTIES COMPILE_FLAGS ${STD_F_FLAGS})
+add_executable(unit_test_rxn_arrhenius ${UNIT_TEST_RXN_ARRHENIUS_SRC})
+target_compile_definitions(unit_test_rxn_arrhenius PRIVATE
+        "UNIT_TEST_MODULE_=pmc_unit_test_rxn_arrhenius \
+  ;UNIT_TEST_TYPE_=unit_test_rxn_arrhenius_t()")
+target_link_libraries(unit_test_rxn_arrhenius partmclib)
+
+if (CUDA_FOUND)
+    add_executable(unit_test_rxn_arrhenius_gpu ${UNIT_TEST_RXN_ARRHENIUS_SRC})
+    target_compile_definitions(unit_test_rxn_arrhenius_gpu PRIVATE
+            "UNIT_TEST_MODULE_=pmc_unit_test_rxn_arrhenius \
+    ;UNIT_TEST_TYPE_=unit_test_rxn_arrhenius_t()")
+    target_link_libraries(unit_test_rxn_arrhenius_gpu partmclib_gpu)
+endif()
+
 if (ENABLE_TESTS)
 
 ######################################################################
@@ -708,19 +775,9 @@ target_link_libraries(test_sub_model_ZSR_aerosol_water partmclib)
 ######################################################################
 # New unit tests (UNDER DEVELOPMENT)
 
-set(UNIT_TEST_SRC
-    test/unit_tests/unit_test_data.F90
-    test/unit_tests/unit_test_driver.F90)
 
-set(UNIT_TEST_RXN_ARRHENIUS_SRC
-    test/unit_tests/rxns/unit_test_rxn_arrhenius.F90 ${UNIT_TEST_SRC})
-set_source_files_properties(${UNIT_TEST_RXN_ARRHENIUS_SRC} PROPERTIES COMPILE_DEFINITIONS
-  "UNIT_TEST_MODULE_=pmc_unit_test_rxn_arrhenius \
-  ;UNIT_TEST_TYPE_=unit_test_rxn_arrhenius_t()")
-set_source_files_properties(${UNIT_TEST_RXN_ARRHENIUS_SRC} PROPERTIES COMPILE_FLAGS
-                            ${STD_F_FLAGS})
-add_executable(unit_test_rxn_arrhenius ${UNIT_TEST_RXN_ARRHENIUS_SRC})
-target_link_libraries(unit_test_rxn_arrhenius partmclib)
+
+
 
 ######################################################################
 # test_camp_core
diff --git a/src/camp_solver.c b/src/camp_solver.c
index 97bb01fa7..342205475 100644
--- a/src/camp_solver.c
+++ b/src/camp_solver.c
@@ -339,6 +339,7 @@ void *solver_new(int n_state_var, int n_cells, int *var_type, int n_rxn,
 #ifdef PMC_USE_GPU
   solver_new_gpu_cu(n_dep_var, n_state_var, n_rxn, n_rxn_int_param,
                     n_rxn_float_param, n_rxn_env_param, n_cells);
+  printf("gpu acting \n");
 #endif
 
 #ifdef PMC_DEBUG
@@ -881,8 +882,8 @@ int f(realtype t, N_Vector y, N_Vector deriv, void *solver_data) {
     // Calculate the time derivative f(t,y)
     rxn_calc_deriv(md, deriv_data, (double)time_step);
 #else
-      // Add contributions from reactions not implemented on GPU
-      rxn_calc_deriv_specific_types(md, deriv_data, (double)time_step);
+    // Add contributions from reactions not implemented on GPU
+    rxn_calc_deriv_specific_types(md, deriv_data, (double)time_step);
 #endif
 
 #ifdef PMC_DEBUG
@@ -1016,8 +1017,9 @@ int Jac(realtype t, N_Vector y, N_Vector deriv, SUNMatrix J, void *solver_data,
     PMC_DEBUG_JAC(md->J_rxn, "reaction Jacobian");
 
 #else
-      // Add contributions from reactions not implemented on GPU
-      rxn_calc_jac_specific_types(md, J_rxn_data, time_step);
+    // Add contributions from reactions not implemented on GPU
+    rxn_calc_jac_specific_types(md, J_rxn_data, time_step);
+    PMC_DEBUG_JAC(md->J_rxn, "reaction Jacobian");
 #endif
 
 // rxn_calc_jac_specific_types(md, J_rxn_data, time_step);
@@ -1868,7 +1870,7 @@ void error_handler(int error_code, const char *module, const char *function,
  */
 void model_free(ModelData model_data) {
 #ifdef PMC_USE_GPU
-  // free_gpu_cu();
+   free_gpu_cu();
 #endif
 
 #ifdef PMC_USE_SUNDIALS
diff --git a/src/cuda/camp_gpu_solver.cu b/src/cuda/camp_gpu_solver.cu
index d4d021709..a0a0350a0 100644
--- a/src/cuda/camp_gpu_solver.cu
+++ b/src/cuda/camp_gpu_solver.cu
@@ -97,6 +97,7 @@ void solver_new_gpu_cu(int n_dep_var,
   if (n_dep_var*n_cells < DATA_SIZE_LIMIT_OPT){
     few_data = 1;
   }
+ //few_data = 1;
 
   //Set working GPU: we have 4 gpu available on power9. as default, it should be assign to gpu 0
   int device=0;
@@ -425,6 +426,12 @@ void rxn_calc_deriv_gpu(ModelData *model_data, N_Vector deriv, realtype time_ste
   double *state = model_data->total_state;
   double *rate_constants = model_data->rxn_env_data;
 
+//TODO: not working with new arrhenius test FOR SOME REASON SEG FAULT
+//Pa ARREGLARLO poner deriv size y todo eso en un struct gpu y asi el multi cell y one cell core tienen su struct y sus cosas
+//(parece que al ser global variable pues no lo pilla bien y no sobreescribe x algun motivo)
+
+/*
+
   //Faster, use for few values
   if (few_data){
     //This method of passing them as a function parameter has a theoric maximum of 4kb of data
@@ -438,18 +445,13 @@ void rxn_calc_deriv_gpu(ModelData *model_data, N_Vector deriv, realtype time_ste
     HANDLE_ERROR(cudaMemcpy(rate_constants_gpu, rate_constants, rate_constants_size, cudaMemcpyHostToDevice));
   }
 
-  HANDLE_ERROR(cudaMemset(deriv_gpu_data, 0, deriv_size));
+  HANDLE_ERROR(cudaMemset(deriv_gpu_data, 0.0, deriv_size));
 
-  solveDerivative << < (n_blocks), max_n_gpu_thread >> >
-         (state_gpu, deriv_gpu_data, time_step, model_data->n_per_cell_dep_var,
-         model_data->n_per_cell_state_var, model_data->n_rxn_env_data,
-         n_rxn, n_cells,
-         int_pointer_gpu, double_pointer_gpu, rate_constants_gpu, rate_constants_idx_gpu);
 
   cudaDeviceSynchronize();// Secure cuda synchronization
 
   //Use pinned memory for few values
-  if (few_data){
+  if (!few_data){
     HANDLE_ERROR(cudaMemcpy(deriv_cpu, deriv_gpu_data, deriv_size, cudaMemcpyDeviceToHost));
     memcpy(deriv_data, deriv_cpu, deriv_size);
   }
@@ -457,9 +459,11 @@ void rxn_calc_deriv_gpu(ModelData *model_data, N_Vector deriv, realtype time_ste
     HANDLE_ERROR(cudaMemcpy(deriv_data, deriv_gpu_data, deriv_size, cudaMemcpyDeviceToHost));
   }
 
+  printf("holaa %d, %d\n", deriv_size/sizeof(double), n_cells);
 
   //TODO Calculate on CPU non gpu-translated type reactions (HL & SIMPOL phase transfer) (or wait till v2.0 with C++)
 
+ */
 
 }
 
@@ -514,8 +518,8 @@ __global__ void solveJacobian(double *state_init, double *jac_init,
         //        state, jac_data, (void *) rxn_data, float_data, time_step, n_rxn);
         break;
       case RXN_ARRHENIUS :
-        rxn_gpu_arrhenius_calc_jac_contrib(rate_constants,
-                                           state, jac_data, (void *) rxn_data, float_data, time_step,n_rxn);
+        //rxn_gpu_arrhenius_calc_jac_contrib(rate_constants,
+        //                                   state, jac_data, (void *) rxn_data, float_data, time_step,n_rxn);
         break;
       case RXN_CMAQ_H2O2 :
         rxn_gpu_CMAQ_H2O2_calc_jac_contrib(rate_constants,
@@ -580,6 +584,8 @@ void rxn_calc_jac_gpu(ModelData *model_data, SUNMatrix jac, realtype time_step)
   double *state = model_data->total_state;
   double *rate_constants = model_data->rxn_env_data;
 
+  /*
+
   //Faster, use for few values
   if (few_data){
     //This method of passing them as a function parameter has a theoric maximum of 4kb of data
@@ -610,6 +616,8 @@ void rxn_calc_jac_gpu(ModelData *model_data, SUNMatrix jac, realtype time_step)
     HANDLE_ERROR(cudaMemcpy(jac_data, jac_gpu_data, jac_size, cudaMemcpyDeviceToHost));
   }
 
+   */
+
 }
 
 
@@ -617,11 +625,12 @@ void rxn_calc_jac_gpu(ModelData *model_data, SUNMatrix jac, realtype time_step)
  */
 void free_gpu_cu() {
 
+printf("free");
 
   HANDLE_ERROR(cudaFree(int_pointer_gpu));
   HANDLE_ERROR(cudaFree(double_pointer_gpu));
   HANDLE_ERROR(cudaFree(deriv_gpu_data));
-  HANDLE_ERROR(cudaFree(jac_gpu_data));
+  //HANDLE_ERROR(cudaFree(jac_gpu_data));
 
   if(few_data){
   }
@@ -630,7 +639,6 @@ void free_gpu_cu() {
     HANDLE_ERROR(cudaFree(rate_constants_gpu));
     HANDLE_ERROR(cudaFree(rate_constants_idx_gpu));
   }
-
 }
 
 /* Auxiliar functions */
diff --git a/src/cuda/camp_gpu_solver.h b/src/cuda/camp_gpu_solver.h
index d6be403d7..08a84d2fe 100644
--- a/src/cuda/camp_gpu_solver.h
+++ b/src/cuda/camp_gpu_solver.h
@@ -13,7 +13,6 @@
 #include <stdlib.h>
 #include "../camp_common.h"
 
-
 //Value to consider data size too big -> Memory optimization will change below and under the limit
 #define DATA_SIZE_LIMIT_OPT 2000
 
diff --git a/src/cuda/rxns_gpu/rxn_aqueous_equilibrium.cu b/src/cuda/rxns_gpu/rxn_aqueous_equilibrium.cu
index 98fd0da15..e6dd78f45 100644
--- a/src/cuda/rxns_gpu/rxn_aqueous_equilibrium.cu
+++ b/src/cuda/rxns_gpu/rxn_aqueous_equilibrium.cu
@@ -143,10 +143,12 @@ __device__ double rxn_gpu_aqueous_equilibrium_calc_overall_rate(int *rxn_data,
  */
 #ifdef PMC_USE_SUNDIALS
 
-#ifdef PMC_USE_GPU
-__device__
-#endif
-void rxn_gpu_aqueous_equilibrium_calc_deriv_contrib(double *rate_constants, double *state,
+//TODO: seems this implementation ifdef raise a warning of host__ device__ function with new tests gpu
+//TODO: O a lo mejor es xk en el .h no esta asi (diria que si)
+//#ifdef PMC_USE_GPU
+//__device__
+//#endif
+__device__ void rxn_gpu_aqueous_equilibrium_calc_deriv_contrib(double *rate_constants, double *state,
           double *deriv, void *rxn_data, double * double_pointer_gpu, double time_step, int n_rxn2)
 {
   int n_rxn=n_rxn2;
diff --git a/src/cuda/rxns_gpu/rxn_arrhenius.cu b/src/cuda/rxns_gpu/rxn_arrhenius.cu
index 73b177825..fe478c2c4 100644
--- a/src/cuda/rxns_gpu/rxn_arrhenius.cu
+++ b/src/cuda/rxns_gpu/rxn_arrhenius.cu
@@ -244,31 +244,29 @@ __device__ void rxn_gpu_arrhenius_calc_jac_contrib(double *rate_constants, doubl
   int n_rxn=n_rxn2;
   int *int_data = (int*) rxn_data;
   double *float_data = double_pointer_gpu;
+  double rate;
 
   // Calculate the reaction rate
   //double rate = RATE_CONSTANT_;
-  double rate = rate_constants[0];
-  for (int i_spec=0; i_spec<NUM_REACT_; i_spec++) rate *= state[REACT_(i_spec)];
-
+//TODO FIX THIS, ITS NOT WORKING WITH NEW UNIT TEST IDK WHYYYYYYY
   // Add contributions to the Jacobian
   int i_elem = 0;
   for (int i_ind=0; i_ind<NUM_REACT_; i_ind++) {
+    rate = rate_constants[0];
+    for (int i_spec = 0; i_spec < NUM_REACT_; i_spec++)
+      if (i_spec != i_ind) rate *= state[REACT_(i_spec)];
+
     for (int i_dep=0; i_dep<NUM_REACT_; i_dep++, i_elem++) {
-  if (JAC_ID_(i_elem) < 0) continue;
-  //J[JAC_ID_(i_elem)] -= rate / state[REACT_(i_ind)];
-  //double rate2 = rate /  state[REACT_(i_ind)];
-  //atomicAdd(&(J[JAC_ID_(i_elem)]),-rate2);
-  atomicAdd(&(J[JAC_ID_(i_elem)]),-rate / state[REACT_(i_ind)]);
-    }
-    for (int i_dep=0; i_dep<NUM_PROD_; i_dep++, i_elem++) {
-  if (JAC_ID_(i_elem) < 0) continue;
-      // Negative yields are allowed, but prevented from causing negative
-      // concentrations that lead to solver failures
-      if (-rate*YIELD_(i_dep)*time_step <= state[PROD_(i_dep)]) {
-    //J[JAC_ID_(i_elem)] += YIELD_(i_dep) * rate / state[REACT_(i_ind)];
-    //double rate2 = YIELD_(i_dep) * rate / state[REACT_(i_ind)];
-    //atomicAdd(&(J[JAC_ID_(i_elem)]), rate2);
-    atomicAdd(&(J[JAC_ID_(i_elem)]),YIELD_(i_dep) * rate / state[REACT_(i_ind)]);
+      if (JAC_ID_(i_elem) < 0) continue;
+        atomicAdd(&(J[JAC_ID_(i_elem)]),-rate);
+      }
+      for (int i_dep=0; i_dep<NUM_PROD_; i_dep++, i_elem++) {
+        if (JAC_ID_(i_elem) < 0) continue;
+        // Negative yields are allowed, but prevented from causing negative
+        // concentrations that lead to solver failures
+        if (-rate * state[REACT_(i_ind)] * YIELD_(i_dep) * time_step <=
+            state[PROD_(i_dep)]) {
+          atomicAdd(&(J[JAC_ID_(i_elem)]),YIELD_(i_dep) * rate);
       }
     }
   }
diff --git a/src/rxn_solver.c b/src/rxn_solver.c
index 9984e85b8..6809e970a 100644
--- a/src/rxn_solver.c
+++ b/src/rxn_solver.c
@@ -510,6 +510,10 @@ void rxn_calc_jac_specific_types(ModelData *model_data, double *J_data,
                                                  rxn_int_data, rxn_float_data,
                                                  rxn_env_data, time_step);
         break;
+      case RXN_ARRHENIUS: //TODO delete arrhenius when it works
+        rxn_arrhenius_calc_jac_contrib(model_data, J_data, rxn_int_data,
+                                       rxn_float_data, rxn_env_data, time_step);
+        break;
       case RXN_CONDENSED_PHASE_ARRHENIUS:
         rxn_condensed_phase_arrhenius_calc_jac_contrib(
             model_data, J_data, rxn_int_data, rxn_float_data, rxn_env_data,
diff --git a/test/unit_tests/input_files/run_rxn_arrhenius_gpu.sh b/test/unit_tests/input_files/run_rxn_arrhenius_gpu.sh
new file mode 100755
index 000000000..43e472eab
--- /dev/null
+++ b/test/unit_tests/input_files/run_rxn_arrhenius_gpu.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+# exit on error
+set -e
+# turn on command echoing
+set -v
+# make sure that the current directory is the one where this script is
+cd ${0%/*}
+# make the output directory if it doesn't exist
+mkdir -p out
+
+((counter = 1))
+while [ true ]
+do
+  echo Attempt $counter
+
+if [[ $1 == "MPI" ]]; then
+  exec_str="mpirun -v -np 2 ../../../unit_test_rxn_arrhenius_gpu"
+else
+  exec_str="../../../unit_test_rxn_arrhenius_gpu"
+fi
+if ! $exec_str; then
+	  echo Failure "$counter"
+	  if [ "$counter" -gt 5 ]
+	  then
+		  echo FAIL
+		  exit 1
+	  fi
+	  echo retrying...
+  else
+	  echo PASS
+	  exit 0
+  fi
+  ((counter++))
+done

From eb2d00745da148f92afbd2331f7081ff963071fb Mon Sep 17 00:00:00 2001
From: cguzman <christian.guzman@bsc.es>
Date: Tue, 22 Oct 2019 15:42:01 +0200
Subject: [PATCH 2/4] Fixed new unit_test_arrhenius on gpu by moving the global
 gpu variables to camp_common.h

---
 CMakeLists.txt                                |  43 ++--
 src/camp_common.h                             |  30 +++
 src/camp_solver.c                             |   8 +-
 src/cuda/camp_gpu_solver.cu                   | 224 +++++++++---------
 src/cuda/camp_gpu_solver.h                    |   3 +-
 test/chemistry/cb05cl_ae5/test_cb05cl_ae5.F90 |   2 +-
 .../test_chemistry_cb05cl_ae5_gpu.sh          |  30 +++
 test/monarch/pmc_monarch_interface.F90        |   4 -
 8 files changed, 200 insertions(+), 144 deletions(-)
 create mode 100755 test/chemistry/cb05cl_ae5/test_chemistry_cb05cl_ae5_gpu.sh

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3ba098a90..0bfacc425 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -248,11 +248,15 @@ endif()
 
 if(CUDA_FOUND)
     add_test(test_rxn_arrhenius_mech_gpu ${CMAKE_BINARY_DIR}/test_run/unit_tests/input_files/run_rxn_arrhenius_gpu.sh ${MPI_TEST_FLAG})
-endif()
 
+    add_test(test_chemistry_cb05cl_ae5_gpu ${CMAKE_BINARY_DIR}/test_run/chemistry/cb05cl_ae5/test_chemistry_cb05cl_ae5_gpu.sh ${MPI_TEST_FLAG})
+
+endif()
 
 add_test(test_rxn_arrhenius_mech ${CMAKE_BINARY_DIR}/test_run/unit_tests/input_files/run_rxn_arrhenius.sh ${MPI_TEST_FLAG})
 
+add_test(test_chemistry_cb05cl_ae5 ${CMAKE_BINARY_DIR}/test_run/chemistry/cb05cl_ae5/test_chemistry_cb05cl_ae5.sh ${MPI_TEST_FLAG})
+
 if (ENABLE_TESTS)
     if (ENABLE_JSON AND ENABLE_SUNDIALS)
       # New unit tests (UNDER DEVELOPMENT)
@@ -659,13 +663,21 @@ set_source_files_properties(${CB5_EBI_SOLVER} PROPERTIES COMPILE_FLAGS
 	${CB5_EBI_FLAGS})
 
 ######################################################################
-# test_chemistry_cb05cl_ae5_big
+# test_chemistry_cb05cl_ae5
 
-add_executable(test_chemistry_cb05cl_ae5_big
-        test/chemistry/cb05cl_ae5/test_cb05cl_ae5_big.F90
+add_executable(test_chemistry_cb05cl_ae5
+        test/chemistry/cb05cl_ae5/test_cb05cl_ae5.F90
         ${CB5_EBI_SOLVER} ${CB5_KPP_SOLVER}
         test/chemistry/cb05cl_ae5/module_BSC_CHEM_DATA.F90)
-target_link_libraries(test_chemistry_cb05cl_ae5_big partmclib)
+target_link_libraries(test_chemistry_cb05cl_ae5 partmclib)
+
+if (CUDA_FOUND)
+    add_executable(test_chemistry_cb05cl_ae5_gpu
+            test/chemistry/cb05cl_ae5/test_cb05cl_ae5.F90
+            ${CB5_EBI_SOLVER} ${CB5_KPP_SOLVER}
+            test/chemistry/cb05cl_ae5/module_BSC_CHEM_DATA.F90)
+    target_link_libraries(test_chemistry_cb05cl_ae5_gpu partmclib_gpu)
+endif()
 
 ######################################################################
 # MONARCH interface
@@ -685,14 +697,14 @@ set(UNIT_TEST_RXN_ARRHENIUS_SRC
 set_source_files_properties(${UNIT_TEST_RXN_ARRHENIUS_SRC} PROPERTIES COMPILE_FLAGS ${STD_F_FLAGS})
 add_executable(unit_test_rxn_arrhenius ${UNIT_TEST_RXN_ARRHENIUS_SRC})
 target_compile_definitions(unit_test_rxn_arrhenius PRIVATE
-        "UNIT_TEST_MODULE_=pmc_unit_test_rxn_arrhenius \
-  ;UNIT_TEST_TYPE_=unit_test_rxn_arrhenius_t()")
+    "UNIT_TEST_MODULE_=pmc_unit_test_rxn_arrhenius \
+    ;UNIT_TEST_TYPE_=unit_test_rxn_arrhenius_t()")
 target_link_libraries(unit_test_rxn_arrhenius partmclib)
 
 if (CUDA_FOUND)
     add_executable(unit_test_rxn_arrhenius_gpu ${UNIT_TEST_RXN_ARRHENIUS_SRC})
     target_compile_definitions(unit_test_rxn_arrhenius_gpu PRIVATE
-            "UNIT_TEST_MODULE_=pmc_unit_test_rxn_arrhenius \
+    "UNIT_TEST_MODULE_=pmc_unit_test_rxn_arrhenius \
     ;UNIT_TEST_TYPE_=unit_test_rxn_arrhenius_t()")
     target_link_libraries(unit_test_rxn_arrhenius_gpu partmclib_gpu)
 endif()
@@ -700,20 +712,19 @@ endif()
 if (ENABLE_TESTS)
 
 ######################################################################
-# test_chemistry_cb05cl_ae5
+# test_chemistry_cb05cl_ae5_big
 
-add_executable(test_chemistry_cb05cl_ae5
-    test/chemistry/cb05cl_ae5/test_cb05cl_ae5.F90
-    ${CB5_EBI_SOLVER} ${CB5_KPP_SOLVER}
-    test/chemistry/cb05cl_ae5/module_BSC_CHEM_DATA.F90)
-target_link_libraries(test_chemistry_cb05cl_ae5 partmclib)
+add_executable(test_chemistry_cb05cl_ae5_big
+        test/chemistry/cb05cl_ae5/test_cb05cl_ae5_big.F90
+        ${CB5_EBI_SOLVER} ${CB5_KPP_SOLVER}
+        test/chemistry/cb05cl_ae5/module_BSC_CHEM_DATA.F90)
+target_link_libraries(test_chemistry_cb05cl_ae5_big partmclib)
 
 ######################################################################
 # test_chemistry_eqsam_v03d
 
 set(EQSAM_V03D
-	test/chemistry/eqsam_v03d/eqsam_v03d.F90
-	)
+	test/chemistry/eqsam_v03d/eqsam_v03d.F90)
 
 add_executable(test_chemistry_eqsam_v03d test/chemistry/eqsam_v03d/test_eqsam_v03d.F90
 	${EQSAM_V03D})
diff --git a/src/camp_common.h b/src/camp_common.h
index 8f6fd5049..d8e63724d 100644
--- a/src/camp_common.h
+++ b/src/camp_common.h
@@ -59,6 +59,12 @@ typedef struct {
   int param_id;   // sub model Jacobian id
 } JacMap;
 
+/* GPU data structure */
+typedef struct {
+    size_t deriv_size;
+
+} DataGPU;
+
 /* Model data structure */
 typedef struct {
   int n_per_cell_state_var;        // number of state variables per grid cell
@@ -167,6 +173,30 @@ typedef struct {
                                  // for the current grid cell
   int n_sub_model_env_data;      // Number of sub model environmental parameters
                                  // from all sub models
+//#ifdef CUDA_FOUND
+// GPU data
+  double *deriv_gpu_data;
+  double *deriv_cpu;
+  double *jac_gpu_data;
+  double *jac_cpu;
+  size_t deriv_size;
+  size_t jac_size;
+  size_t state_size;
+  size_t env_size;
+  size_t rate_constants_size;
+  size_t rate_constants_idx_size;
+  bool few_data;
+  bool implemented_all;
+  int *int_pointer;
+  int *int_pointer_gpu;
+  double *double_pointer;
+  double *double_pointer_gpu;
+  double *state_gpu;
+  double *env_gpu;
+  double *rate_constants_gpu;
+  int *rate_constants_idx_gpu;
+
+//#endif
 } ModelData;
 
 /* Solver data structure */
diff --git a/src/camp_solver.c b/src/camp_solver.c
index 342205475..a90fe29a1 100644
--- a/src/camp_solver.c
+++ b/src/camp_solver.c
@@ -337,9 +337,8 @@ void *solver_new(int n_state_var, int n_cells, int *var_type, int n_rxn,
   sd->model_data.sub_model_env_idx[0] = 0;
 
 #ifdef PMC_USE_GPU
-  solver_new_gpu_cu(n_dep_var, n_state_var, n_rxn, n_rxn_int_param,
+  solver_new_gpu_cu(&(sd->model_data), n_dep_var, n_state_var, n_rxn, n_rxn_int_param,
                     n_rxn_float_param, n_rxn_env_param, n_cells);
-  printf("gpu acting \n");
 #endif
 
 #ifdef PMC_DEBUG
@@ -453,11 +452,6 @@ void solver_initialize(void *solver_data, double *abs_tol, double rel_tol,
   flag = CVodeSetDlsGuessHelper(sd->cvode_mem, guess_helper);
   check_flag_fail(&flag, "CVodeSetDlsGuessHelper", 1);
 
-// Allocate Jacobian on GPU
-#ifdef PMC_USE_GPU
-  allocate_jac_gpu(sd->model_data.n_per_cell_solver_jac_elem, n_cells);
-#endif
-
 // Set gpu rxn values
 #ifdef PMC_USE_GPU
   solver_set_rxn_data_gpu(&(sd->model_data));
diff --git a/src/cuda/camp_gpu_solver.cu b/src/cuda/camp_gpu_solver.cu
index a0a0350a0..a05cbe017 100644
--- a/src/cuda/camp_gpu_solver.cu
+++ b/src/cuda/camp_gpu_solver.cu
@@ -28,29 +28,29 @@ int max_n_gpu_thread;
 int max_n_gpu_blocks;
 
 //General data
-double *deriv_gpu_data;
-double *deriv_cpu;
-double *jac_gpu_data;
-double *jac_cpu;
-size_t deriv_size;
-size_t jac_size;
-size_t state_size;
-size_t env_size;
-size_t rate_constants_size;
-size_t rate_constants_idx_size;
-
-bool few_data = 0;
-bool implemented_all = true;
-int *int_pointer;
-int *int_pointer_gpu;
-double *double_pointer;
-double *double_pointer_gpu;
-double *state_gpu;
-double *state_cpu;
-double *env_gpu;
-double *rate_constants_gpu;
-double *rate_constants_cpu;
-int *rate_constants_idx_gpu;
+/*double *model_data->deriv_gpu_data;
+double *model_data->deriv_cpu;
+double *model_data->jac_gpu_data;
+double *model_data->jac_cpu;
+//size_t model_data->deriv_size;
+size_t model_data->jac_size;
+size_t model_data->state_size;
+size_t model_data->env_size;
+size_t model_data->rate_constants_size;
+size_t model_data->rate_constants_idx_size;
+
+
+bool model_data->few_data;
+bool model_data->implemented_all;
+int *model_data->int_pointer;
+int *model_data->int_pointer_gpu;
+double *model_data->double_pointer;
+double *model_data->double_pointer_gpu;
+double *model_data->state_gpu;
+double *model_data->env_gpu;
+double *model_data->rate_constants_gpu;
+int *model_data->rate_constants_idx_gpu;
+*/
 
 static void HandleError(cudaError_t err,
                         const char *file,
@@ -81,23 +81,29 @@ static void HandleError2(const char *file,
  * \param n_rxn_float_param Total number of floating-point reaction parameters
  * \param n_cells Number of grid cells to solve simultaneously
  */
-void solver_new_gpu_cu(int n_dep_var,
+void solver_new_gpu_cu(ModelData *model_data, int n_dep_var,
                        int n_state_var, int n_rxn,
                        int n_rxn_int_param, int n_rxn_float_param, int n_rxn_env_param,
                        int n_cells) {
+//Get Global gpu data
+  //DataGPU *dgpu = (DataGPU *)malloc(sizeof(DataGPU));
 
   //Lengths
-  state_size = n_state_var*n_cells * sizeof(double);
-  deriv_size = n_dep_var*n_cells * sizeof(double);
-  env_size = PMC_NUM_ENV_PARAM_*n_cells * sizeof(double); //Temp and pressure
-  rate_constants_size = n_rxn_env_param * n_cells * sizeof(double);
-  rate_constants_idx_size = (n_rxn+1) * sizeof(int);
+  model_data->state_size = n_state_var * n_cells * sizeof(double);
+  model_data->deriv_size = n_dep_var * n_cells * sizeof(double);
+  model_data->env_size = PMC_NUM_ENV_PARAM_ * n_cells * sizeof(double); //Temp and pressure
+  model_data->rate_constants_size = n_rxn_env_param * n_cells * sizeof(double);
+  model_data->rate_constants_idx_size = (n_rxn+1) * sizeof(int);
+  model_data->few_data = 0;
+  model_data->implemented_all = true;
+
+  printf("new deriv_lenght %d, cells %d\n", model_data->deriv_size/sizeof(double), n_cells);
 
   //Detect if we are working with few data values
   if (n_dep_var*n_cells < DATA_SIZE_LIMIT_OPT){
-    few_data = 1;
+    model_data->few_data = 1;
   }
- //few_data = 1;
+ //model_data->few_data = 1;
 
   //Set working GPU: we have 4 gpu available on power9. as default, it should be assign to gpu 0
   int device=0;
@@ -113,18 +119,17 @@ void solver_new_gpu_cu(int n_dep_var,
   int n_blocks = (n_rxn + max_n_gpu_thread - 1) / max_n_gpu_thread;
 
   //GPU allocation
-  cudaMalloc((void **) &deriv_gpu_data, deriv_size);
-  cudaMalloc((void **) &state_gpu, state_size);
-  cudaMalloc((void **) &env_gpu, env_size);
-  cudaMalloc((void **) &rate_constants_gpu, rate_constants_size);
-  cudaMalloc((void **) &rate_constants_idx_gpu, rate_constants_idx_size);
+  cudaMalloc((void **) &model_data->deriv_gpu_data, model_data->deriv_size);
+  cudaMalloc((void **) &model_data->state_gpu, model_data->state_size);
+  cudaMalloc((void **) &model_data->env_gpu, model_data->env_size);
+  cudaMalloc((void **) &model_data->rate_constants_gpu, model_data->rate_constants_size);
+  cudaMalloc((void **) &model_data->rate_constants_idx_gpu, model_data->rate_constants_idx_size);
 
   //GPU allocation few data on pinned memory
-  if(few_data){
+  if(model_data->few_data){
     //Notice auxiliar variables are created because we
     // can't pin directly variables initialized before
-    cudaMallocHost((void**)&rate_constants_cpu, rate_constants_size);
-    cudaMallocHost((void**)&deriv_cpu, deriv_size);
+    cudaMallocHost((void**)&model_data->deriv_cpu, model_data->deriv_size);
   }
 
   // Warning if exceeding GPU limits
@@ -139,22 +144,6 @@ void solver_new_gpu_cu(int n_dep_var,
 
 }
 
-/** \brief Allocate Jacobian on GPU
-*
-* \param n_jac_elem Number of data elements on the jacobian
-* \param n_cells Number of cells to compute
-*/
-void allocate_jac_gpu(int n_jac_elem, int n_cells){
-
-  jac_size = n_jac_elem * n_cells * sizeof(double);
-  cudaMalloc((void **) &jac_gpu_data, jac_size);
-
-  if(few_data){
-    cudaMallocHost((void**)&jac_cpu, jac_size);
-  }
-
-}
-
 /** \brief Set reaction data on GPU prepared structure. RXN data is divided
  * into two different matrix, per double and int data respectively. Matrix are
  * reversed to improve memory access on GPU.
@@ -165,6 +154,7 @@ void allocate_jac_gpu(int n_jac_elem, int n_cells){
 void solver_set_rxn_data_gpu(ModelData *model_data) {
 
   int n_rxn = model_data->n_rxn;
+  int n_cells = model_data->n_cells;
   unsigned int int_max_length = 0;
   unsigned int double_max_length = 0;
 
@@ -227,7 +217,7 @@ void solver_set_rxn_data_gpu(ModelData *model_data) {
 #ifdef FAILURE_DETAIL
       printf("WARNING: Reaction type %d is not fully implemented on GPU. Computing on CPU...\n", rxn_type);
 #endif
-      implemented_all=false;
+      model_data->implemented_all=false;
     }
 
     //Get RXN lengths
@@ -253,15 +243,15 @@ void solver_set_rxn_data_gpu(ModelData *model_data) {
 
   //Allocate int and double rxn data separately
   //Add -1 to avoid access and have a square matrix
-  int_pointer = (int *) malloc(rxn_int_length * sizeof(int));
-  memset(int_pointer, -1, rxn_int_length * sizeof(int));
+  model_data->int_pointer = (int *) malloc(rxn_int_length * sizeof(int));
+  memset(model_data->int_pointer, -1, rxn_int_length * sizeof(int));
 
   //Add 0 to avoid access and have a square matrix
-  double_pointer = (double*)calloc(rxn_double_length, sizeof(double));
+  model_data->double_pointer = (double*)calloc(rxn_double_length, sizeof(double));
 
   //GPU allocation
-  cudaMalloc((void **) &int_pointer_gpu, rxn_int_length * sizeof(int));
-  cudaMalloc((void **) &double_pointer_gpu, rxn_double_length * sizeof(double));
+  cudaMalloc((void **) &model_data->int_pointer_gpu, rxn_int_length * sizeof(int));
+  cudaMalloc((void **) &model_data->double_pointer_gpu, rxn_double_length * sizeof(double));
 
   //Update number of zeros added on each reaction
   for (int i_rxn = 0; i_rxn < n_rxn; i_rxn++)
@@ -285,11 +275,11 @@ void solver_set_rxn_data_gpu(ModelData *model_data) {
     int i_pos=rxn_position[i_rxn];//i_rxn;//rxn_position[i_rxn];//for bubblesort
     for (int j = 0; j < int_lengths[i_pos]; j++){
       int *rxn_int_data = &(model_data->rxn_int_data[model_data->rxn_int_indices[i_pos]]);
-      int_pointer[n_rxn*j + i_rxn] = rxn_int_data[j];
+      model_data->int_pointer[n_rxn*j + i_rxn] = rxn_int_data[j];
     }
     for (int j = 0; j < double_lengths[i_pos]; j++) {
       double *rxn_float_data = &(model_data->rxn_float_data[model_data->rxn_float_indices[i_pos]]);
-      double_pointer[n_rxn*j + i_rxn] = rxn_float_data[j];
+      model_data->double_pointer[n_rxn*j + i_rxn] = rxn_float_data[j];
     }
     //Reorder the rate indices
     //Todo update on main code the rate_constants to read consecutively in cpu
@@ -297,11 +287,19 @@ void solver_set_rxn_data_gpu(ModelData *model_data) {
   }
 
   //Save data to GPU
-  HANDLE_ERROR(cudaMemcpy(int_pointer_gpu, int_pointer, rxn_int_length*sizeof(int), cudaMemcpyHostToDevice));
-  HANDLE_ERROR(cudaMemcpy(double_pointer_gpu, double_pointer, rxn_double_length*sizeof(double), cudaMemcpyHostToDevice));
+  HANDLE_ERROR(cudaMemcpy(model_data->int_pointer_gpu, model_data->int_pointer, rxn_int_length*sizeof(int), cudaMemcpyHostToDevice));
+  HANDLE_ERROR(cudaMemcpy(model_data->double_pointer_gpu, model_data->double_pointer, rxn_double_length*sizeof(double), cudaMemcpyHostToDevice));
 
   //Set rate_constants-idx
-  HANDLE_ERROR(cudaMemcpy(rate_constants_idx_gpu, rate_constants_idx_aux, rate_constants_idx_size, cudaMemcpyHostToDevice));
+  HANDLE_ERROR(cudaMemcpy(model_data->rate_constants_idx_gpu, rate_constants_idx_aux, model_data->rate_constants_idx_size, cudaMemcpyHostToDevice));
+
+  //Allocate jacobian
+  model_data->jac_size = model_data->n_per_cell_solver_jac_elem * n_cells * sizeof(double);
+  cudaMalloc((void **) &model_data->jac_gpu_data, model_data->jac_size);
+
+  if(model_data->few_data){
+    cudaMallocHost((void**)&model_data->jac_cpu, model_data->jac_size);
+  }
 
 }
 
@@ -311,11 +309,11 @@ void solver_set_rxn_data_gpu(ModelData *model_data) {
  * \param deriv_init Pointer to first value of derivative array
  * \param time_step Current time step being computed (s)
  * \param deriv_length_cell Derivative length for one cell
- * \param state_size_cell Derivative length for one cell
+ * \param model_data->state_size_cell Derivative length for one cell
  * \param n_rxn Number of reactions to include
  * \param n_cells_gpu Number of cells to compute
- * \param int_pointer Pointer to integer reaction data
- * \param double_pointer Pointer to double reaction data
+ * \param model_data->int_pointer Pointer to integer reaction data
+ * \param model_data->double_pointer Pointer to double reaction data
  * \param rate_constants_init Pointer to first value of reaction rates
  */
 __global__ void solveDerivative(double *state_init, double *deriv_init,
@@ -430,40 +428,39 @@ void rxn_calc_deriv_gpu(ModelData *model_data, N_Vector deriv, realtype time_ste
 //Pa ARREGLARLO poner deriv size y todo eso en un struct gpu y asi el multi cell y one cell core tienen su struct y sus cosas
 //(parece que al ser global variable pues no lo pilla bien y no sobreescribe x algun motivo)
 
-/*
-
   //Faster, use for few values
-  if (few_data){
+  if (model_data->few_data){
     //This method of passing them as a function parameter has a theoric maximum of 4kb of data
-    state_gpu= state;
-    rate_constants_gpu= rate_constants;
+    model_data->state_gpu= state;
+    model_data->rate_constants_gpu= rate_constants;
   }
     //Slower, use for large values
   else{
-    HANDLE_ERROR(cudaMemcpy(state_gpu, state, state_size, cudaMemcpyHostToDevice));
+    HANDLE_ERROR(cudaMemcpy(model_data->state_gpu, state, model_data->state_size, cudaMemcpyHostToDevice));
     //todo rate_constants only change each time_step, not each deriv iteration
-    HANDLE_ERROR(cudaMemcpy(rate_constants_gpu, rate_constants, rate_constants_size, cudaMemcpyHostToDevice));
+    HANDLE_ERROR(cudaMemcpy(model_data->rate_constants_gpu, rate_constants, model_data->rate_constants_size, cudaMemcpyHostToDevice));
   }
 
-  HANDLE_ERROR(cudaMemset(deriv_gpu_data, 0.0, deriv_size));
+  HANDLE_ERROR(cudaMemset(model_data->deriv_gpu_data, 0.0, model_data->deriv_size));
 
+  solveDerivative << < (n_blocks), max_n_gpu_thread >> >
+     (model_data->state_gpu, model_data->deriv_gpu_data, time_step, model_data->n_per_cell_dep_var,
+     model_data->n_per_cell_state_var, model_data->n_rxn_env_data,
+     n_rxn, n_cells,
+     model_data->int_pointer_gpu, model_data->double_pointer_gpu, model_data->rate_constants_gpu, model_data->rate_constants_idx_gpu);
 
   cudaDeviceSynchronize();// Secure cuda synchronization
 
   //Use pinned memory for few values
-  if (!few_data){
-    HANDLE_ERROR(cudaMemcpy(deriv_cpu, deriv_gpu_data, deriv_size, cudaMemcpyDeviceToHost));
-    memcpy(deriv_data, deriv_cpu, deriv_size);
+  if (model_data->few_data){
+    HANDLE_ERROR(cudaMemcpy(model_data->deriv_cpu, model_data->deriv_gpu_data, model_data->deriv_size, cudaMemcpyDeviceToHost));
+    memcpy(deriv_data, model_data->deriv_cpu, model_data->deriv_size);
   }
   else {
-    HANDLE_ERROR(cudaMemcpy(deriv_data, deriv_gpu_data, deriv_size, cudaMemcpyDeviceToHost));
+    HANDLE_ERROR(cudaMemcpy(deriv_data, model_data->deriv_gpu_data, model_data->deriv_size, cudaMemcpyDeviceToHost));
   }
 
-  printf("holaa %d, %d\n", deriv_size/sizeof(double), n_cells);
-
-  //TODO Calculate on CPU non gpu-translated type reactions (HL & SIMPOL phase transfer) (or wait till v2.0 with C++)
-
- */
+  //printf("deriv deriv_lenght %d, cells %d\n", model_data->deriv_size/sizeof(double), n_cells);
 
 }
 
@@ -473,11 +470,11 @@ void rxn_calc_deriv_gpu(ModelData *model_data, N_Vector deriv, realtype time_ste
  * \param jac_init Pointer to first value of jacobian array
  * \param time_step Current time step being computed (s)
  * \param jac_length_cell jacobian length for one cell
- * \param state_size_cell jacobian length for one cell
+ * \param model_data->state_size_cell jacobian length for one cell
  * \param n_rxn Number of reactions to include
  * \param n_cells_gpu Number of cells to compute
- * \param int_pointer Pointer to integer reaction data
- * \param double_pointer Pointer to double reaction data
+ * \param model_data->int_pointer Pointer to integer reaction data
+ * \param model_data->double_pointer Pointer to double reaction data
  * \param rate_constants_init Pointer to first value of reaction rates
  */
 //TODO: fix jacGPU once matt let me modify rxn_solver and reduce jacobians (in v2.0 or before if needed)
@@ -584,39 +581,37 @@ void rxn_calc_jac_gpu(ModelData *model_data, SUNMatrix jac, realtype time_step)
   double *state = model_data->total_state;
   double *rate_constants = model_data->rxn_env_data;
 
-  /*
-
   //Faster, use for few values
-  if (few_data){
+  if (model_data->few_data){
     //This method of passing them as a function parameter has a theoric maximum of 4kb of data
-    state_gpu= state;
-    rate_constants_gpu= rate_constants;
+    model_data->state_gpu= state;
+    model_data->rate_constants_gpu= rate_constants;
   }
     //Slower, use for large values
   else{
-    HANDLE_ERROR(cudaMemcpy(state_gpu, state, state_size, cudaMemcpyHostToDevice));
-    HANDLE_ERROR(cudaMemcpy(rate_constants_gpu, rate_constants, rate_constants_size, cudaMemcpyHostToDevice));
+    HANDLE_ERROR(cudaMemcpy(model_data->state_gpu, state, model_data->state_size, cudaMemcpyHostToDevice));
+    HANDLE_ERROR(cudaMemcpy(model_data->rate_constants_gpu, rate_constants, model_data->rate_constants_size, cudaMemcpyHostToDevice));
   }
 
-  HANDLE_ERROR(cudaMemset(jac_gpu_data, 0, jac_size));
+  HANDLE_ERROR(cudaMemset(model_data->jac_gpu_data, 0, model_data->jac_size));
 
   solveJacobian << < (n_blocks), max_n_gpu_thread >> >
-    (state_gpu, jac_gpu_data, time_step, model_data->n_per_cell_rxn_jac_elem,
+    (model_data->state_gpu, model_data->jac_gpu_data, time_step, model_data->n_per_cell_rxn_jac_elem,
     model_data->n_per_cell_state_var, model_data->n_rxn_env_data,
-    n_rxn, n_cells, int_pointer_gpu, double_pointer_gpu, rate_constants_gpu, rate_constants_idx_gpu);
+    n_rxn, n_cells, model_data->int_pointer_gpu, model_data->double_pointer_gpu, model_data->rate_constants_gpu, model_data->rate_constants_idx_gpu);
 
   cudaDeviceSynchronize();// Secure cuda synchronization
 
   //Use pinned memory for few values
-  if (few_data){
-    HANDLE_ERROR(cudaMemcpy(jac_cpu, jac_gpu_data, jac_size, cudaMemcpyDeviceToHost));
-    memcpy(jac_data, jac_cpu, jac_size);
+  if (model_data->few_data){
+    HANDLE_ERROR(cudaMemcpy(model_data->jac_cpu, model_data->jac_gpu_data, model_data->jac_size, cudaMemcpyDeviceToHost));
+    memcpy(jac_data, model_data->jac_cpu, model_data->jac_size);
   }
   else {
-    HANDLE_ERROR(cudaMemcpy(jac_data, jac_gpu_data, jac_size, cudaMemcpyDeviceToHost));
+    HANDLE_ERROR(cudaMemcpy(jac_data, model_data->jac_gpu_data, model_data->jac_size, cudaMemcpyDeviceToHost));
   }
 
-   */
+
 
 }
 
@@ -624,21 +619,22 @@ void rxn_calc_jac_gpu(ModelData *model_data, SUNMatrix jac, realtype time_step)
 /** \brief Free GPU data structures
  */
 void free_gpu_cu() {
+/*
+printf("free gpu\n");
 
-printf("free");
-
-  HANDLE_ERROR(cudaFree(int_pointer_gpu));
-  HANDLE_ERROR(cudaFree(double_pointer_gpu));
-  HANDLE_ERROR(cudaFree(deriv_gpu_data));
-  //HANDLE_ERROR(cudaFree(jac_gpu_data));
+  HANDLE_ERROR(cudaFree(model_data->int_pointer_gpu));
+  HANDLE_ERROR(cudaFree(model_data->double_pointer_gpu));
+  HANDLE_ERROR(cudaFree(model_data->deriv_gpu_data));
+  //HANDLE_ERROR(cudaFree(model_data->jac_gpu_data));
 
-  if(few_data){
+  if(model_data->few_data){
   }
   else{
-    HANDLE_ERROR(cudaFree(state_gpu));
-    HANDLE_ERROR(cudaFree(rate_constants_gpu));
-    HANDLE_ERROR(cudaFree(rate_constants_idx_gpu));
+    HANDLE_ERROR(cudaFree(model_data->state_gpu));
+    HANDLE_ERROR(cudaFree(model_data->rate_constants_gpu));
+    HANDLE_ERROR(cudaFree(model_data->rate_constants_idx_gpu));
   }
+  */
 }
 
 /* Auxiliar functions */
diff --git a/src/cuda/camp_gpu_solver.h b/src/cuda/camp_gpu_solver.h
index 08a84d2fe..bf51da0cc 100644
--- a/src/cuda/camp_gpu_solver.h
+++ b/src/cuda/camp_gpu_solver.h
@@ -23,9 +23,8 @@
 //Force solving on CPU: Test option
 #define FORCE_CPU 0
 
-void solver_new_gpu_cu(int n_dep_var, int n_state_var, int n_rxn,
+void solver_new_gpu_cu(ModelData *model_data, int n_dep_var, int n_state_var, int n_rxn,
      int n_rxn_int_param, int n_rxn_float_param, int n_rxn_env_param, int n_cells);
-void allocate_jac_gpu(int n_jac_elem, int n_cells);
 void rxn_update_env_state_gpu(ModelData *model_data, double *env);
 void rxn_calc_deriv_gpu(ModelData *model_data, N_Vector deriv, realtype time_step);
 void rxn_calc_jac_gpu(ModelData *model_data, SUNMatrix jac, realtype time_step);
diff --git a/test/chemistry/cb05cl_ae5/test_cb05cl_ae5.F90 b/test/chemistry/cb05cl_ae5/test_cb05cl_ae5.F90
index c46101cc3..a80c9d9b4 100644
--- a/test/chemistry/cb05cl_ae5/test_cb05cl_ae5.F90
+++ b/test/chemistry/cb05cl_ae5/test_cb05cl_ae5.F90
@@ -42,7 +42,7 @@ program pmc_test_cb05cl_ae5
   ! CAMP-chem output file unit
   integer(kind=i_kind), parameter :: CAMP_FILE_UNIT = 12
   ! Number of timesteps to integrate over
-  integer(kind=i_kind), parameter :: NUM_TIME_STEPS = 100
+  integer(kind=i_kind), parameter :: NUM_TIME_STEPS = 2!100
   ! Number of EBI-solver species
   integer(kind=i_kind), parameter :: NUM_EBI_SPEC = 72
   ! Number of EBI-solever photolysis reactions
diff --git a/test/chemistry/cb05cl_ae5/test_chemistry_cb05cl_ae5_gpu.sh b/test/chemistry/cb05cl_ae5/test_chemistry_cb05cl_ae5_gpu.sh
new file mode 100755
index 000000000..b6e591b83
--- /dev/null
+++ b/test/chemistry/cb05cl_ae5/test_chemistry_cb05cl_ae5_gpu.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+# exit on error
+set -e
+# turn on command echoing
+set -v
+# make sure that the current directory is the one where this script is
+cd ${0%/*}
+# make the output directory if it doesn't exist
+mkdir -p out
+
+((counter = 1))
+while [ true ]
+do
+  echo Attempt $counter
+
+if ! ../../../test_chemistry_cb05cl_ae5_gpu; then
+	  echo Failure "$counter"
+	  if [ "$counter" -gt 1 ]
+	  then
+		  echo FAIL
+		  exit 1
+	  fi
+	  echo retrying...
+  else
+	  echo PASS
+	  exit 0
+  fi
+  ((counter++))
+done
diff --git a/test/monarch/pmc_monarch_interface.F90 b/test/monarch/pmc_monarch_interface.F90
index a61559d39..211258da8 100644
--- a/test/monarch/pmc_monarch_interface.F90
+++ b/test/monarch/pmc_monarch_interface.F90
@@ -390,7 +390,6 @@ subroutine integrate(this, start_time, time_step, i_start, i_end, j_start, &
       state_size_per_cell = this%camp_core%state_size_per_cell()
     end if
 
-
 #ifdef PMC_DEBUG
     ! Evaluate the Jacobian during solving
     solver_stats%eval_Jac = .true.
@@ -435,7 +434,6 @@ subroutine integrate(this, start_time, time_step, i_start, i_end, j_start, &
             call this%camp_core%solve(this%camp_state, &
                     real(time_step, kind=dp), solver_stats = solver_stats)
 
-
 #ifdef PMC_DEBUG
               ! Check the Jacobian evaluations
               call warn_assert_msg(611569150, solver_stats%Jac_eval_fails.eq.0,&
@@ -444,8 +442,6 @@ subroutine integrate(this, start_time, time_step, i_start, i_end, j_start, &
                           trim( to_string( start_time ) ) )
 #endif
 
-
-
             ! Update the MONARCH tracer array with new species concentrations
             MONARCH_conc(i,j,k_flip,this%map_monarch_id(:)) = &
                     this%camp_state%state_var(this%map_camp_id(:))

From c0b2f63ab5eec44c40aa90ec7b25c066fb49568b Mon Sep 17 00:00:00 2001
From: cguzman <christian.guzman@bsc.es>
Date: Sat, 26 Oct 2019 21:17:09 +0200
Subject: [PATCH 3/4] Cleaning code. Prepared a folder debug for other issue

---
 CMakeLists.txt                                | 71 ++++++++---------
 src/camp_debug.h                              |  4 +
 src/camp_solver.c                             | 11 +--
 src/camp_solver_data.F90                      |  3 +-
 src/cuda/camp_gpu_solver.cu                   | 79 ++++++++++---------
 src/cuda/camp_gpu_solver.h                    |  1 +
 .../rxns_gpu/rxn_SIMPOL_phase_transfer.cu     |  1 -
 src/cuda/rxns_gpu/rxn_aqueous_equilibrium.cu  |  3 +-
 src/cuda/rxns_gpu/rxn_arrhenius.cu            |  5 +-
 src/debug_and_stats/camp_debug_2.c            | 24 ++++++
 src/debug_and_stats/camp_debug_2.h            | 12 +++
 test/chemistry/cb05cl_ae5/test_cb05cl_ae5.F90 |  2 +-
 12 files changed, 126 insertions(+), 90 deletions(-)
 create mode 100644 src/debug_and_stats/camp_debug_2.c
 create mode 100644 src/debug_and_stats/camp_debug_2.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0bfacc425..ae7942af0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -246,17 +246,6 @@ else()
   set(MPI_TEST_FLAG serial)
 endif()
 
-if(CUDA_FOUND)
-    add_test(test_rxn_arrhenius_mech_gpu ${CMAKE_BINARY_DIR}/test_run/unit_tests/input_files/run_rxn_arrhenius_gpu.sh ${MPI_TEST_FLAG})
-
-    add_test(test_chemistry_cb05cl_ae5_gpu ${CMAKE_BINARY_DIR}/test_run/chemistry/cb05cl_ae5/test_chemistry_cb05cl_ae5_gpu.sh ${MPI_TEST_FLAG})
-
-endif()
-
-add_test(test_rxn_arrhenius_mech ${CMAKE_BINARY_DIR}/test_run/unit_tests/input_files/run_rxn_arrhenius.sh ${MPI_TEST_FLAG})
-
-add_test(test_chemistry_cb05cl_ae5 ${CMAKE_BINARY_DIR}/test_run/chemistry/cb05cl_ae5/test_chemistry_cb05cl_ae5.sh ${MPI_TEST_FLAG})
-
 if (ENABLE_TESTS)
     if (ENABLE_JSON AND ENABLE_SUNDIALS)
       # New unit tests (UNDER DEVELOPMENT)
@@ -283,6 +272,15 @@ if (ENABLE_TESTS)
       add_test(test_MONARCH_1 ${CMAKE_BINARY_DIR}/test_run/monarch/test_monarch_1.sh ${MPI_TEST_FLAG})
       add_test(test_MONARCH_2 ${CMAKE_BINARY_DIR}/test_run/monarch/test_monarch_2.sh ${MPI_TEST_FLAG})
 
+      if (CUDA_FOUND AND NOT ENABLE_GPU)
+        #todo: new unit_tests with gpu
+          add_test(test_rxn_arrhenius_mech_gpu ${CMAKE_BINARY_DIR}/test_run/unit_tests/input_files/run_rxn_arrhenius_gpu.sh ${MPI_TEST_FLAG})
+
+          #Relevant mechanisms tests
+          add_test(test_chemistry_cb05cl_ae5_gpu ${CMAKE_BINARY_DIR}/test_run/chemistry/cb05cl_ae5/test_chemistry_cb05cl_ae5_gpu.sh ${MPI_TEST_FLAG})
+
+      endif()
+
     endif()
 
     add_test(test_additive_1 ${CMAKE_BINARY_DIR}/test_run/additive/test_additive_1.sh)
@@ -435,9 +433,6 @@ set(STD_C_FLAGS "-std=c99")
 
 set(STD_CUDA_FLAGS "-dc -arch=compute_70 -code=sm_70")
 
-
-
-
 if(${CMAKE_Fortran_COMPILER_ID} MATCHES "Intel")
   set(STD_F_FLAGS "-extend_source -warn truncated_source")
 elseif(${CMAKE_Fortran_COMPILER_ID} MATCHES "GNU")
@@ -516,7 +511,8 @@ set(SUB_MODELS_SRC ${SUB_MODELS_F_SRC} ${SUB_MODELS_C_SRC})
 
 set(CAMP_C_SRC
         src/camp_solver.c src/rxn_solver.c src/aero_phase_solver.c
-        src/aero_rep_solver.c src/sub_model_solver.c)
+        src/aero_rep_solver.c src/sub_model_solver.c
+        src/debug_and_stats/camp_debug_2.c)
 
 set_source_files_properties(${CAMP_C_SRC} PROPERTIES COMPILE_FLAGS
         ${STD_C_FLAGS})
@@ -587,7 +583,7 @@ target_link_libraries(partmclib ${NETCDF_LIBS} ${SUNDIALS_LIBS}
 
 set_target_properties(partmclib PROPERTIES OUTPUT_NAME partmc)
 
-if (CUDA_FOUND)
+if (CUDA_FOUND AND NOT ENABLE_GPU)
     add_library(partmclib_gpu ${CAMP_F_SRC}
             ${CAMP_C_SRC} ${AEROSOL_REPS_SRC} ${SUB_MODELS_SRC} ${REACTIONS_SRC}
             ${SUNDIALS_SRC} ${GSL_SRC} ${C_SORT_SRC} ${CAMP_CUDA_SRC} ${CAMP_CXX_SRC} )
@@ -671,7 +667,7 @@ add_executable(test_chemistry_cb05cl_ae5
         test/chemistry/cb05cl_ae5/module_BSC_CHEM_DATA.F90)
 target_link_libraries(test_chemistry_cb05cl_ae5 partmclib)
 
-if (CUDA_FOUND)
+if (CUDA_FOUND AND NOT ENABLE_GPU)
     add_executable(test_chemistry_cb05cl_ae5_gpu
             test/chemistry/cb05cl_ae5/test_cb05cl_ae5.F90
             ${CB5_EBI_SOLVER} ${CB5_KPP_SOLVER}
@@ -687,28 +683,6 @@ add_executable(mock_monarch test/monarch/mock_monarch.F90
 
 target_link_libraries(mock_monarch partmclib)
 
-
-set(UNIT_TEST_SRC
-        test/unit_tests/unit_test_data.F90
-        test/unit_tests/unit_test_driver.F90)
-
-set(UNIT_TEST_RXN_ARRHENIUS_SRC
-        test/unit_tests/rxns/unit_test_rxn_arrhenius.F90 ${UNIT_TEST_SRC})
-set_source_files_properties(${UNIT_TEST_RXN_ARRHENIUS_SRC} PROPERTIES COMPILE_FLAGS ${STD_F_FLAGS})
-add_executable(unit_test_rxn_arrhenius ${UNIT_TEST_RXN_ARRHENIUS_SRC})
-target_compile_definitions(unit_test_rxn_arrhenius PRIVATE
-    "UNIT_TEST_MODULE_=pmc_unit_test_rxn_arrhenius \
-    ;UNIT_TEST_TYPE_=unit_test_rxn_arrhenius_t()")
-target_link_libraries(unit_test_rxn_arrhenius partmclib)
-
-if (CUDA_FOUND)
-    add_executable(unit_test_rxn_arrhenius_gpu ${UNIT_TEST_RXN_ARRHENIUS_SRC})
-    target_compile_definitions(unit_test_rxn_arrhenius_gpu PRIVATE
-    "UNIT_TEST_MODULE_=pmc_unit_test_rxn_arrhenius \
-    ;UNIT_TEST_TYPE_=unit_test_rxn_arrhenius_t()")
-    target_link_libraries(unit_test_rxn_arrhenius_gpu partmclib_gpu)
-endif()
-
 if (ENABLE_TESTS)
 
 ######################################################################
@@ -786,9 +760,26 @@ target_link_libraries(test_sub_model_ZSR_aerosol_water partmclib)
 ######################################################################
 # New unit tests (UNDER DEVELOPMENT)
 
+set(UNIT_TEST_SRC
+        test/unit_tests/unit_test_data.F90
+        test/unit_tests/unit_test_driver.F90)
 
+set(UNIT_TEST_RXN_ARRHENIUS_SRC
+        test/unit_tests/rxns/unit_test_rxn_arrhenius.F90 ${UNIT_TEST_SRC})
+set_source_files_properties(${UNIT_TEST_RXN_ARRHENIUS_SRC} PROPERTIES COMPILE_FLAGS ${STD_F_FLAGS})
+add_executable(unit_test_rxn_arrhenius ${UNIT_TEST_RXN_ARRHENIUS_SRC})
+target_compile_definitions(unit_test_rxn_arrhenius PRIVATE
+        "UNIT_TEST_MODULE_=pmc_unit_test_rxn_arrhenius \
+    ;UNIT_TEST_TYPE_=unit_test_rxn_arrhenius_t()")
+target_link_libraries(unit_test_rxn_arrhenius partmclib)
 
-
+if (CUDA_FOUND AND NOT ENABLE_GPU)
+    add_executable(unit_test_rxn_arrhenius_gpu ${UNIT_TEST_RXN_ARRHENIUS_SRC})
+    target_compile_definitions(unit_test_rxn_arrhenius_gpu PRIVATE
+            "UNIT_TEST_MODULE_=pmc_unit_test_rxn_arrhenius \
+    ;UNIT_TEST_TYPE_=unit_test_rxn_arrhenius_t()")
+    target_link_libraries(unit_test_rxn_arrhenius_gpu partmclib_gpu)
+endif()
 
 ######################################################################
 # test_camp_core
diff --git a/src/camp_debug.h b/src/camp_debug.h
index 58bac669d..5f86fe1e6 100644
--- a/src/camp_debug.h
+++ b/src/camp_debug.h
@@ -5,6 +5,8 @@
  * Header file with some debugging functions for use with camp_solver.c
  *
  */
+#ifndef CAMP_DEBUG_H
+#define CAMP_DEBUG_H
 #ifdef PMC_DEBUG
 #define PMC_DEBUG_SPEC_ 0
 #define PMC_DEBUG_PRINT(x) \
@@ -146,3 +148,5 @@ static void print_derivative(N_Vector deriv) {
     printf(" index: %d \n", i);
   }
 }
+
+#endif
\ No newline at end of file
diff --git a/src/camp_solver.c b/src/camp_solver.c
index a90fe29a1..08e11f641 100644
--- a/src/camp_solver.c
+++ b/src/camp_solver.c
@@ -964,7 +964,8 @@ int Jac(realtype t, N_Vector y, N_Vector deriv, SUNMatrix J, void *solver_data,
 
 #ifdef PMC_USE_GPU
   // Calculate the Jacobian
-  rxn_calc_jac_gpu(md, J, time_step);
+  //TODO: Fix jacobian mapping with jac_map[i_map].rxn_id
+  //rxn_calc_jac_gpu(md, J, time_step);
 #endif
 
 #ifdef PMC_DEBUG
@@ -973,7 +974,6 @@ int Jac(realtype t, N_Vector y, N_Vector deriv, SUNMatrix J, void *solver_data,
 #endif
 
   // Solving on CPU only
-
   // Loop over the grid cells to calculate sub-model and rxn Jacobians
   for (int i_cell = 0; i_cell < n_cells; ++i_cell) {
     // Set the grid cell state pointers
@@ -1004,19 +1004,20 @@ int Jac(realtype t, N_Vector y, N_Vector deriv, SUNMatrix J, void *solver_data,
     clock_t start = clock();
 #endif
 
-#ifndef PMC_USE_GPU
+//#ifdef PMC_USE_GPU
 
     // Calculate the reaction Jacobian
     rxn_calc_jac(md, J_rxn_data, time_step);
     PMC_DEBUG_JAC(md->J_rxn, "reaction Jacobian");
 
-#else
+/*#else
     // Add contributions from reactions not implemented on GPU
     rxn_calc_jac_specific_types(md, J_rxn_data, time_step);
     PMC_DEBUG_JAC(md->J_rxn, "reaction Jacobian");
 #endif
 
-// rxn_calc_jac_specific_types(md, J_rxn_data, time_step);
+rxn_calc_jac_specific_types(md, J_rxn_data, time_step);
+*/
 #ifdef PMC_DEBUG
     clock_t end = clock();
     sd->timeJac += (end - start);
diff --git a/src/camp_solver_data.F90 b/src/camp_solver_data.F90
index cb694c908..5bfb4c50a 100644
--- a/src/camp_solver_data.F90
+++ b/src/camp_solver_data.F90
@@ -888,7 +888,8 @@ subroutine solve(this, camp_state, t_initial, t_final, solver_stats)
     end if
 
     ! Reset the solver function timers
-    call this%reset_timers( )
+    !Move this to intialization: I want to print only 1 time the deriv, not all times we call solve...
+    !call this%reset_timers( )
 #endif
 
     ! Run the solver
diff --git a/src/cuda/camp_gpu_solver.cu b/src/cuda/camp_gpu_solver.cu
index a05cbe017..990bbe7c7 100644
--- a/src/cuda/camp_gpu_solver.cu
+++ b/src/cuda/camp_gpu_solver.cu
@@ -27,30 +27,12 @@ extern "C" {
 int max_n_gpu_thread;
 int max_n_gpu_blocks;
 
-//General data
-/*double *model_data->deriv_gpu_data;
-double *model_data->deriv_cpu;
-double *model_data->jac_gpu_data;
-double *model_data->jac_cpu;
-//size_t model_data->deriv_size;
-size_t model_data->jac_size;
-size_t model_data->state_size;
-size_t model_data->env_size;
-size_t model_data->rate_constants_size;
-size_t model_data->rate_constants_idx_size;
-
-
-bool model_data->few_data;
-bool model_data->implemented_all;
-int *model_data->int_pointer;
-int *model_data->int_pointer_gpu;
-double *model_data->double_pointer;
-double *model_data->double_pointer_gpu;
-double *model_data->state_gpu;
-double *model_data->env_gpu;
-double *model_data->rate_constants_gpu;
-int *model_data->rate_constants_idx_gpu;
-*/
+//Debug info
+int counterDeriv;       // Total calls to f()
+int counterJac;         // Total calls to Jac()
+clock_t timeDeriv;      // Compute time for calls to f()
+clock_t timeJac;        // Compute time for calls to Jac()
+
 
 static void HandleError(cudaError_t err,
                         const char *file,
@@ -85,9 +67,6 @@ void solver_new_gpu_cu(ModelData *model_data, int n_dep_var,
                        int n_state_var, int n_rxn,
                        int n_rxn_int_param, int n_rxn_float_param, int n_rxn_env_param,
                        int n_cells) {
-//Get Global gpu data
-  //DataGPU *dgpu = (DataGPU *)malloc(sizeof(DataGPU));
-
   //Lengths
   model_data->state_size = n_state_var * n_cells * sizeof(double);
   model_data->deriv_size = n_dep_var * n_cells * sizeof(double);
@@ -96,14 +75,13 @@ void solver_new_gpu_cu(ModelData *model_data, int n_dep_var,
   model_data->rate_constants_idx_size = (n_rxn+1) * sizeof(int);
   model_data->few_data = 0;
   model_data->implemented_all = true;
-
-  printf("new deriv_lenght %d, cells %d\n", model_data->deriv_size/sizeof(double), n_cells);
+  counterDeriv=0;
+  counterJac=0;
 
   //Detect if we are working with few data values
   if (n_dep_var*n_cells < DATA_SIZE_LIMIT_OPT){
     model_data->few_data = 1;
   }
- //model_data->few_data = 1;
 
   //Set working GPU: we have 4 gpu available on power9. as default, it should be assign to gpu 0
   int device=0;
@@ -233,8 +211,6 @@ void solver_set_rxn_data_gpu(ModelData *model_data) {
 
   }
 
-  //todo checkout chem_mod after 99 merge, save the changed files of this branch (like gpu) copy into checkout chem_mod
-
   //Add a for to search the biggest distance int_max_length (ptrs[i] - ptrs[i-1]
 
   //Total lengths of rxn structure
@@ -407,6 +383,14 @@ __global__ void solveDerivative(double *state_init, double *deriv_init,
 
 }
 
+static void print_derivative_3(N_Vector deriv) {
+  // printf(" deriv length: %d\n", NV_LENGTH_S(deriv));
+  for (int i = 0; i < NV_LENGTH_S(deriv); i++) {  // NV_LENGTH_S(deriv)
+    printf(" deriv: % -le", NV_DATA_S(deriv)[i]);
+    printf(" index: %d \n", i);
+  }
+}
+
 /** \brief Calculate the time derivative \f$f(t,y)\f$ on GPU
  *
  * \param model_data Pointer to the model data
@@ -443,6 +427,7 @@ void rxn_calc_deriv_gpu(ModelData *model_data, N_Vector deriv, realtype time_ste
 
   HANDLE_ERROR(cudaMemset(model_data->deriv_gpu_data, 0.0, model_data->deriv_size));
 
+  //TODO: execute this asyncrhonous and let CPU thinks be computed in the meanwhile
   solveDerivative << < (n_blocks), max_n_gpu_thread >> >
      (model_data->state_gpu, model_data->deriv_gpu_data, time_step, model_data->n_per_cell_dep_var,
      model_data->n_per_cell_state_var, model_data->n_rxn_env_data,
@@ -460,7 +445,12 @@ void rxn_calc_deriv_gpu(ModelData *model_data, N_Vector deriv, realtype time_ste
     HANDLE_ERROR(cudaMemcpy(deriv_data, model_data->deriv_gpu_data, model_data->deriv_size, cudaMemcpyDeviceToHost));
   }
 
-  //printf("deriv deriv_lenght %d, cells %d\n", model_data->deriv_size/sizeof(double), n_cells);
+  //if(counterDeriv==0) print_derivative_3(deriv);
+
+  //counterDeriv++;
+
+//todo: add mock_monarch as test gpu?
+//todo: fix debug things... this camp_debug.h is horrible, and probably it will be a better option to print timederiv or counter...
 
 }
 
@@ -477,7 +467,6 @@ void rxn_calc_deriv_gpu(ModelData *model_data, N_Vector deriv, realtype time_ste
  * \param model_data->double_pointer Pointer to double reaction data
  * \param rate_constants_init Pointer to first value of reaction rates
  */
-//TODO: fix jacGPU once matt let me modify rxn_solver and reduce jacobians (in v2.0 or before if needed)
 __global__ void solveJacobian(double *state_init, double *jac_init,
                               double time_step, int jac_length_cell, int state_size_cell,
                               int rate_constants_size_cell, int n_rxn,
@@ -515,8 +504,8 @@ __global__ void solveJacobian(double *state_init, double *jac_init,
         //        state, jac_data, (void *) rxn_data, float_data, time_step, n_rxn);
         break;
       case RXN_ARRHENIUS :
-        //rxn_gpu_arrhenius_calc_jac_contrib(rate_constants,
-        //                                   state, jac_data, (void *) rxn_data, float_data, time_step,n_rxn);
+        rxn_gpu_arrhenius_calc_jac_contrib(rate_constants,
+                                           state, jac_data, (void *) rxn_data, float_data, time_step,n_rxn);
         break;
       case RXN_CMAQ_H2O2 :
         rxn_gpu_CMAQ_H2O2_calc_jac_contrib(rate_constants,
@@ -563,15 +552,29 @@ __global__ void solveJacobian(double *state_init, double *jac_init,
   }
 }
 
+static void print_jacobian_3(SUNMatrix M) {
+  printf("\n NNZ JAC: %lld \n", SM_NNZ_S(M));
+  printf("DATA | INDEXVALS:\n");
+  for (int i = 0; i < SM_NNZ_S(M); i++) {
+    printf("% -le \n", (SM_DATA_S(M))[i]);
+  //  printf("%lld \n", (SM_INDEXVALS_S(M))[i]);
+  }
+  printf("PTRS:\n");
+  for (int i = 0; i <= SM_NP_S(M); i++) {
+    //printf("%lld \n", (SM_INDEXPTRS_S(M))[i]);
+  }
+}
+
 /** \brief Calculate the Jacobian on GPU
  *
  * \param model_data Pointer to the model data
  * \param J Jacobian to be calculated
  * \param time_step Current model time step (s)
  */
-
 void rxn_calc_jac_gpu(ModelData *model_data, SUNMatrix jac, realtype time_step) {
 
+  //TODO: Fix jacobian with jac_ids...
+
   // Get a pointer to the jacobian data
   double *jac_data = SM_DATA_S(jac);
   int n_cells = model_data->n_cells;
@@ -611,7 +614,9 @@ void rxn_calc_jac_gpu(ModelData *model_data, SUNMatrix jac, realtype time_step)
     HANDLE_ERROR(cudaMemcpy(jac_data, model_data->jac_gpu_data, model_data->jac_size, cudaMemcpyDeviceToHost));
   }
 
+  if(counterJac==0)print_jacobian_3(jac);
 
+  counterJac++;
 
 }
 
diff --git a/src/cuda/camp_gpu_solver.h b/src/cuda/camp_gpu_solver.h
index bf51da0cc..e531fab94 100644
--- a/src/cuda/camp_gpu_solver.h
+++ b/src/cuda/camp_gpu_solver.h
@@ -12,6 +12,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include "../camp_common.h"
+#include "../debug_and_stats/camp_debug_2.h"
 
 //Value to consider data size too big -> Memory optimization will change below and under the limit
 #define DATA_SIZE_LIMIT_OPT 2000
diff --git a/src/cuda/rxns_gpu/rxn_SIMPOL_phase_transfer.cu b/src/cuda/rxns_gpu/rxn_SIMPOL_phase_transfer.cu
index a944e0f6d..85220dcec 100644
--- a/src/cuda/rxns_gpu/rxn_SIMPOL_phase_transfer.cu
+++ b/src/cuda/rxns_gpu/rxn_SIMPOL_phase_transfer.cu
@@ -17,7 +17,6 @@ extern "C"{
 
 #define TEMPERATURE_K_ env_data[0]
 #define PRESSURE_PA_ env_data[1]
- //todo fix
 // Universal gas constant (J/mol/K)
 #define UNIV_GAS_CONST_ 8.314472
 // Small number for ignoring low concentrations
diff --git a/src/cuda/rxns_gpu/rxn_aqueous_equilibrium.cu b/src/cuda/rxns_gpu/rxn_aqueous_equilibrium.cu
index e6dd78f45..4746ba708 100644
--- a/src/cuda/rxns_gpu/rxn_aqueous_equilibrium.cu
+++ b/src/cuda/rxns_gpu/rxn_aqueous_equilibrium.cu
@@ -143,8 +143,7 @@ __device__ double rxn_gpu_aqueous_equilibrium_calc_overall_rate(int *rxn_data,
  */
 #ifdef PMC_USE_SUNDIALS
 
-//TODO: seems this implementation ifdef raise a warning of host__ device__ function with new tests gpu
-//TODO: O a lo mejor es xk en el .h no esta asi (diria que si)
+//TODO: Apply this to cpu reactions, merging cpu and gpu code
 //#ifdef PMC_USE_GPU
 //__device__
 //#endif
diff --git a/src/cuda/rxns_gpu/rxn_arrhenius.cu b/src/cuda/rxns_gpu/rxn_arrhenius.cu
index fe478c2c4..22504e4b5 100644
--- a/src/cuda/rxns_gpu/rxn_arrhenius.cu
+++ b/src/cuda/rxns_gpu/rxn_arrhenius.cu
@@ -244,15 +244,14 @@ __device__ void rxn_gpu_arrhenius_calc_jac_contrib(double *rate_constants, doubl
   int n_rxn=n_rxn2;
   int *int_data = (int*) rxn_data;
   double *float_data = double_pointer_gpu;
-  double rate;
 
   // Calculate the reaction rate
   //double rate = RATE_CONSTANT_;
-//TODO FIX THIS, ITS NOT WORKING WITH NEW UNIT TEST IDK WHYYYYYYY
+//TODO FIX THIS, ITS NOT WORKING NOW WHYYYY (i guess in branch 114 it was working )
   // Add contributions to the Jacobian
   int i_elem = 0;
   for (int i_ind=0; i_ind<NUM_REACT_; i_ind++) {
-    rate = rate_constants[0];
+    double rate = rate_constants[0];
     for (int i_spec = 0; i_spec < NUM_REACT_; i_spec++)
       if (i_spec != i_ind) rate *= state[REACT_(i_spec)];
 
diff --git a/src/debug_and_stats/camp_debug_2.c b/src/debug_and_stats/camp_debug_2.c
new file mode 100644
index 000000000..0fd548d1e
--- /dev/null
+++ b/src/debug_and_stats/camp_debug_2.c
@@ -0,0 +1,24 @@
+/* Copyright (C) 2019 Christian Guzman
+ * Licensed under the GNU General Public License version 1 or (at your
+ * option) any later version. See the file COPYING for details.
+ *
+ * Debug and stats functions
+ *
+ */
+//todo fix this...move all to this folder and compile correctly...
+//todo: i think is not necessary include all time stdio.h
+#include <stdio.h>
+#include <stdlib.h>
+#include "camp_debug_2.h"
+
+/** \brief Print derivative array
+ *
+ * \param deriv Derivative array
+ */
+static void print_derivative_2(N_Vector deriv) {
+  // printf(" deriv length: %d\n", NV_LENGTH_S(deriv));
+  for (int i = 0; i < NV_LENGTH_S(deriv); i++) {  // NV_LENGTH_S(deriv)
+    printf(" deriv: % -le", NV_DATA_S(deriv)[i]);
+    printf(" index: %d \n", i);
+  }
+}
\ No newline at end of file
diff --git a/src/debug_and_stats/camp_debug_2.h b/src/debug_and_stats/camp_debug_2.h
new file mode 100644
index 000000000..2744e341c
--- /dev/null
+++ b/src/debug_and_stats/camp_debug_2.h
@@ -0,0 +1,12 @@
+//
+// Created by cguzman on 10/24/19.
+//
+
+#ifndef CAMP_DEBUG_2_H
+#define CAMP_DEBUG_2_H
+
+#include "../camp_common.h"
+
+static void print_derivative_2(N_Vector deriv);
+
+#endif //CAMP_DEBUG_2_H
diff --git a/test/chemistry/cb05cl_ae5/test_cb05cl_ae5.F90 b/test/chemistry/cb05cl_ae5/test_cb05cl_ae5.F90
index a80c9d9b4..c46101cc3 100644
--- a/test/chemistry/cb05cl_ae5/test_cb05cl_ae5.F90
+++ b/test/chemistry/cb05cl_ae5/test_cb05cl_ae5.F90
@@ -42,7 +42,7 @@ program pmc_test_cb05cl_ae5
   ! CAMP-chem output file unit
   integer(kind=i_kind), parameter :: CAMP_FILE_UNIT = 12
   ! Number of timesteps to integrate over
-  integer(kind=i_kind), parameter :: NUM_TIME_STEPS = 2!100
+  integer(kind=i_kind), parameter :: NUM_TIME_STEPS = 100
   ! Number of EBI-solver species
   integer(kind=i_kind), parameter :: NUM_EBI_SPEC = 72
   ! Number of EBI-solever photolysis reactions

From ea78eecd9e1c8bed5c1aa3755d2e738e77de534e Mon Sep 17 00:00:00 2001
From: cguzman <christian.guzman@bsc.es>
Date: Sat, 26 Oct 2019 21:38:28 +0200
Subject: [PATCH 4/4] Fixing format

---
 src/camp_common.h                  |  8 ++++----
 src/camp_solver.c                  | 17 +++++++++--------
 src/debug_and_stats/camp_debug_2.c |  6 +++---
 src/debug_and_stats/camp_debug_2.h |  2 +-
 src/rxn_solver.c                   |  2 +-
 5 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/src/camp_common.h b/src/camp_common.h
index d8e63724d..68ecee258 100644
--- a/src/camp_common.h
+++ b/src/camp_common.h
@@ -61,7 +61,7 @@ typedef struct {
 
 /* GPU data structure */
 typedef struct {
-    size_t deriv_size;
+  size_t deriv_size;
 
 } DataGPU;
 
@@ -173,8 +173,8 @@ typedef struct {
                                  // for the current grid cell
   int n_sub_model_env_data;      // Number of sub model environmental parameters
                                  // from all sub models
-//#ifdef CUDA_FOUND
-// GPU data
+                                 //#ifdef CUDA_FOUND
+                                 // GPU data
   double *deriv_gpu_data;
   double *deriv_cpu;
   double *jac_gpu_data;
@@ -196,7 +196,7 @@ typedef struct {
   double *rate_constants_gpu;
   int *rate_constants_idx_gpu;
 
-//#endif
+  //#endif
 } ModelData;
 
 /* Solver data structure */
diff --git a/src/camp_solver.c b/src/camp_solver.c
index 08e11f641..bdae8594b 100644
--- a/src/camp_solver.c
+++ b/src/camp_solver.c
@@ -337,8 +337,9 @@ void *solver_new(int n_state_var, int n_cells, int *var_type, int n_rxn,
   sd->model_data.sub_model_env_idx[0] = 0;
 
 #ifdef PMC_USE_GPU
-  solver_new_gpu_cu(&(sd->model_data), n_dep_var, n_state_var, n_rxn, n_rxn_int_param,
-                    n_rxn_float_param, n_rxn_env_param, n_cells);
+  solver_new_gpu_cu(&(sd->model_data), n_dep_var, n_state_var, n_rxn,
+                    n_rxn_int_param, n_rxn_float_param, n_rxn_env_param,
+                    n_cells);
 #endif
 
 #ifdef PMC_DEBUG
@@ -876,8 +877,8 @@ int f(realtype t, N_Vector y, N_Vector deriv, void *solver_data) {
     // Calculate the time derivative f(t,y)
     rxn_calc_deriv(md, deriv_data, (double)time_step);
 #else
-    // Add contributions from reactions not implemented on GPU
-    rxn_calc_deriv_specific_types(md, deriv_data, (double)time_step);
+      // Add contributions from reactions not implemented on GPU
+      rxn_calc_deriv_specific_types(md, deriv_data, (double)time_step);
 #endif
 
 #ifdef PMC_DEBUG
@@ -964,8 +965,8 @@ int Jac(realtype t, N_Vector y, N_Vector deriv, SUNMatrix J, void *solver_data,
 
 #ifdef PMC_USE_GPU
   // Calculate the Jacobian
-  //TODO: Fix jacobian mapping with jac_map[i_map].rxn_id
-  //rxn_calc_jac_gpu(md, J, time_step);
+  // TODO: Fix jacobian mapping with jac_map[i_map].rxn_id
+  // rxn_calc_jac_gpu(md, J, time_step);
 #endif
 
 #ifdef PMC_DEBUG
@@ -1004,7 +1005,7 @@ int Jac(realtype t, N_Vector y, N_Vector deriv, SUNMatrix J, void *solver_data,
     clock_t start = clock();
 #endif
 
-//#ifdef PMC_USE_GPU
+    //#ifdef PMC_USE_GPU
 
     // Calculate the reaction Jacobian
     rxn_calc_jac(md, J_rxn_data, time_step);
@@ -1865,7 +1866,7 @@ void error_handler(int error_code, const char *module, const char *function,
  */
 void model_free(ModelData model_data) {
 #ifdef PMC_USE_GPU
-   free_gpu_cu();
+  free_gpu_cu();
 #endif
 
 #ifdef PMC_USE_SUNDIALS
diff --git a/src/debug_and_stats/camp_debug_2.c b/src/debug_and_stats/camp_debug_2.c
index 0fd548d1e..6fdcabc0a 100644
--- a/src/debug_and_stats/camp_debug_2.c
+++ b/src/debug_and_stats/camp_debug_2.c
@@ -5,11 +5,11 @@
  * Debug and stats functions
  *
  */
-//todo fix this...move all to this folder and compile correctly...
-//todo: i think is not necessary include all time stdio.h
+// todo fix this...move all to this folder and compile correctly...
+// todo: i think is not necessary include all time stdio.h
+#include "camp_debug_2.h"
 #include <stdio.h>
 #include <stdlib.h>
-#include "camp_debug_2.h"
 
 /** \brief Print derivative array
  *
diff --git a/src/debug_and_stats/camp_debug_2.h b/src/debug_and_stats/camp_debug_2.h
index 2744e341c..71bf7a09e 100644
--- a/src/debug_and_stats/camp_debug_2.h
+++ b/src/debug_and_stats/camp_debug_2.h
@@ -9,4 +9,4 @@
 
 static void print_derivative_2(N_Vector deriv);
 
-#endif //CAMP_DEBUG_2_H
+#endif  // CAMP_DEBUG_2_H
diff --git a/src/rxn_solver.c b/src/rxn_solver.c
index 6809e970a..6889ea301 100644
--- a/src/rxn_solver.c
+++ b/src/rxn_solver.c
@@ -510,7 +510,7 @@ void rxn_calc_jac_specific_types(ModelData *model_data, double *J_data,
                                                  rxn_int_data, rxn_float_data,
                                                  rxn_env_data, time_step);
         break;
-      case RXN_ARRHENIUS: //TODO delete arrhenius when it works
+      case RXN_ARRHENIUS:  // TODO delete arrhenius when it works
         rxn_arrhenius_calc_jac_contrib(model_data, J_data, rxn_int_data,
                                        rxn_float_data, rxn_env_data, time_step);
         break;