plumed · Iximiel · Nov 24, 2025 · Dec 11, 2025 · Dec 11, 2025 · Dec 9, 2025
diff --git a/plugins/cudaCoord/Coordination.cu b/plugins/cudaCoord/Coordination.cu
diff --git a/plugins/cudaCoord/Coordination.cuh b/plugins/cudaCoord/Coordination.cuh
@@ -34,15 +34,18 @@ using std::cerr;
 
 
 #define hdbg(...) __LINE__ << ":" #__VA_ARGS__ " = " << (__VA_ARGS__) << '\n'
-// #define vdbg(...) std::cerr << __LINE__ << ":" << #__VA_ARGS__ << " " << (__VA_ARGS__) << '\n'
 //#define vdbg(...) std::cerr << hdbg(__VA_ARGS__)
 #define vdbg(...)
 
+//#define dprintf(...) printf(__VA_ARGS__)
+#define dprintf(...)
+
 namespace PLMD {
 namespace GPU {
 
 // these constant will be used within the kernels
-template <typename calculateFloat> struct rationalSwitchParameters {
+template <typename calculateFloat>
+struct rationalSwitchParameters {
   calculateFloat dmaxSQ = std::numeric_limits<calculateFloat>::max();
   calculateFloat invr0_2 = 1.0;
   calculateFloat d0 = 0.0;
@@ -53,7 +56,20 @@ template <typename calculateFloat> struct rationalSwitchParameters {
   bool calcSquared=false;
 };
 
-template <typename T> struct invData {
+template <typename T>
+std::ostream& operator<<(std::ostream& os, const rationalSwitchParameters<T>& rsp) {
+  os << "dmaxSQ: "      << rsp.dmaxSQ << "\n";
+  os << "invr0_2: "     << rsp.invr0_2 << "\n";
+  os << "d0: "          << rsp.d0 << "\n";
+  os << "stretch: "     << rsp.stretch << "\n";
+  os << "nn: "          << rsp.nn << "\n";
+  os << "mm: "          << rsp.mm << "\n";
+  os << "calcSquared: " << ((rsp.calcSquared)?"true":"false") << "\n";
+  return os;
+}
+
+template <typename T>
+struct invData {
   T val = 1.0;
   T inv = 1.0;
   // this makes the `X = x;` work like "X.val=x;X.inv=1/x;"
@@ -65,18 +81,29 @@ template <typename T> struct invData {
     return *this;
   }
 };
-template <typename calculateFloat> struct ortoPBCs {
+
+template <typename calculateFloat>
+struct ortoPBCs {
   invData<calculateFloat> X{1.0};
   invData<calculateFloat> Y{1.0};
   invData<calculateFloat> Z{1.0};
 };
 
+template <typename T>
+std::ostream& operator<<(std::ostream& os, const ortoPBCs<T>& opc) {
+  os << "X: " << opc.X.val << " " << opc.X.inv << "\n";
+  os << "Y: " << opc.Y.val << " " << opc.Y.inv << "\n";
+  os << "Z: " << opc.Z.val << " " << opc.Z.inv << "\n";
+  return os;
+}
+
 template <typename calculateFloat>
 __device__ calculateFloat pbcClamp (calculateFloat x) {
   return 0.0;
 }
 
-template <> __device__ __forceinline__ double pbcClamp<double> (double x) {
+template <>
+__device__ __forceinline__ double pbcClamp<double> (double x) {
   // convert a double to a signed int in round-to-nearest-even mode.
   // return __double2int_rn (x) - x;
   // return x - floor (x + 0.5);
@@ -86,7 +113,8 @@ template <> __device__ __forceinline__ double pbcClamp<double> (double x) {
   return x - nearbyint (x);
 }
 
-template <> __device__ __forceinline__ float pbcClamp<float> (float x) {
+template <>
+__device__ __forceinline__ float pbcClamp<float> (float x) {
   // convert a double to a signed int in round-to-nearest-even mode.
   // return __float2int_rn (x) - x;
   // return x - floorf (x + 0.5f);
@@ -111,17 +139,22 @@ __device__ __forceinline__ calculateFloat pcuda_fastpow (calculateFloat base,
   return result;
 }
 
-template <typename calculateFloat> __device__ calculateFloat pcuda_eps() {
+template <typename calculateFloat>
+__device__ calculateFloat pcuda_eps() {
   return 0;
 }
 
-template <> constexpr __device__ float pcuda_eps<float>() {
+template <>
+constexpr __device__ float pcuda_eps<float>() {
   return FLT_EPSILON * 10.0f;
 }
-template <> constexpr __device__ double pcuda_eps<double>() {
+
+template <>
+constexpr __device__ double pcuda_eps<double>() {
   return DBL_EPSILON * 10.0;
 }
 
+//Use this as a base to implement different new switching functions, inclued this in the make_worker function
 struct Rational {
   template <typename calculateFloat>
   static __device__ __forceinline__ calculateFloat
@@ -168,7 +201,7 @@ __global__ void getpcuda_func (const calculateFloat *rdists,
   // printf("stretch: %i: %f -> %f\n",i,rdists[i],res[i]);
 }
 
-template <typename calculateFloat>
+template <typename mySwitch, typename calculateFloat>
 __device__ __forceinline__ calculateFloat calculate (
   calculateFloat distance,
   const rationalSwitchParameters<calculateFloat> switchingParameters,
@@ -177,7 +210,7 @@ __device__ __forceinline__ calculateFloat calculate (
   dfunc = 0.0;
   //if (distance < switchingParameters.dmaxSQ) { already tested in caclulateSqr
   const calculateFloat rdist_2 = (distance-switchingParameters.d0) * switchingParameters.invr0_2;
-  result = Rational::pcuda_func (
+  result = mySwitch::pcuda_func (
              rdist_2, switchingParameters, dfunc);
   // chain rule:
   dfunc *=  switchingParameters.invr0_2;
@@ -187,25 +220,26 @@ __device__ __forceinline__ calculateFloat calculate (
   return result;
 }
 
-template <typename calculateFloat>
+template <typename mySwitch, typename calculateFloat>
 __device__ __forceinline__ calculateFloat calculateSqr (
   const calculateFloat distancesq,
   const rationalSwitchParameters<calculateFloat> switchingParameters,
   calculateFloat &dfunc) {
   calculateFloat result = 0.0;
   dfunc = 0.0;
+  dprintf("distancesqr %f\n",distancesq);
   if (distancesq < switchingParameters.dmaxSQ) {
     if(switchingParameters.calcSquared)  {
       const calculateFloat rdist_2 = distancesq * switchingParameters.invr0_2;
-      result = Rational::pcuda_func (
+      result = mySwitch::pcuda_func (
                  rdist_2, switchingParameters, dfunc);
       // chain rule:
       dfunc *= 2 * switchingParameters.invr0_2;
       // cu_stretch:
       result = result * switchingParameters.stretch + switchingParameters.shift;
       dfunc *= switchingParameters.stretch;
     } else {
-      result = calculate(std::sqrt(distancesq),switchingParameters,dfunc);
+      result = calculate<mySwitch>(std::sqrt(distancesq),switchingParameters,dfunc);
     }
   }
   return result;

diff --git a/plugins/cudaCoord/Readme.md b/plugins/cudaCoord/Readme.md
@@ -7,6 +7,13 @@ This is the optimized version of the lesson that I presented in the [plumed-scho
 The files `cudaHelpers.cuh` and `cudaHelpers.cu` contains a few support functions for helping in interfacing `PLMD::Vector` and `PLMD::Tensor` with Cuda's thrust,
 along with the reduction functions baked with Cuda's cub building blocks and their drivers.
 
+>[!WARNING]
+>Plumed may refuse to to the calculations if the GPU does not allow for the calculations to be done.
+>We are currently working on a solution for this
+>
+>A workaround is reducing the size of the cutoff of the neigbor list or to increase the number of threads with THREADS
+>Plumed will present the user wiht an error message with the suggestion.
+
 ### Compile
 
 

diff --git a/plugins/cudaCoord/cudaHelpers.cuh b/plugins/cudaCoord/cudaHelpers.cuh
@@ -23,6 +23,7 @@
 
 #include "plumed/tools/Tensor.h"
 #include "plumed/tools/Vector.h"
+#include "plumed/tools/View.h"
 #include <thrust/device_vector.h>
 #include <vector>
 namespace CUDAHELPERS {
@@ -107,6 +108,30 @@ inline void plmdDataToGPU (thrust::device_vector<double> &dvmem,
                    stream);
 }
 
+/// @brief the specialized asyncronous function for tranferring the data in a view to the GPUtemplate <typename T>
+template <typename T>
+inline void plmdDataToGPU (thrust::device_vector<std::remove_const_t<T>> &dvmem,
+                           PLMD::View<T> data,
+                           cudaStream_t stream) {
+  dvmem.resize (data.size());
+  cudaMemcpyAsync (thrust::raw_pointer_cast (dvmem.data()),
+                   data.data(),
+                   data.size() * sizeof (T),
+                   cudaMemcpyHostToDevice,
+                   stream);
+}
+
+/// @brief the specialized function for tranferring the data in a view to the GPUtemplate <typename T>
+template <typename T>
+inline void plmdDataToGPU (thrust::device_vector<std::remove_const_t<T>> &dvmem,
+                           PLMD::View<T> data) {
+  dvmem.resize (data.size());
+  cudaMemcpyAsync (thrust::raw_pointer_cast (dvmem.data()),
+                   data.data(),
+                   data.size() * sizeof (T),
+                   cudaMemcpyHostToDevice);
+}
+
 /// @brief the specialized function for getting single precision data from the
 /// gpu to a PLMD container
 /// @param dvmem the cuda interface to the data on the device
@@ -405,5 +430,25 @@ void doReductionND (T *inputArray,
   doReductionND_t<DATAPERTHREAD> (
     inputArray, outputArray, len, blocks, nthreads, stream);
 }
+
+///A simple struct to use the allocated shared memory in multiple ways
+///
+///Usage: declare this at the begininnig of your kernel (a single instance per kernel)
+///and then "allocate" memory by calling the `get_shared_memory method`
+class sharedArena {
+  //TODO: this costs a register, it may be a better idea to use an external register and then override it for another usage
+  unsigned allocated=0;
+public:
+  template <typename T>
+  __device__ T *get_shared_memory(unsigned const size) {
+    // do we need an __align__() here?
+    extern __shared__ unsigned char memory[];
+    auto ptr = reinterpret_cast<T *> (&memory[allocated]);
+    allocated+=size * sizeof(T);
+    return ptr;
+
+  }
+};
+
 } // namespace CUDAHELPERS
 #endif //__PLUMED_cuda_helpers_cuh
diff --git a/plugins/cudaCoord/nvcc-MakeFile.sh b/plugins/cudaCoord/nvcc-MakeFile.sh
@@ -20,12 +20,13 @@ fi
 #pendantic adds a unuseful FOR EACH line with
 #"" warning: style of line directive is a GCC extension"
 {
-  grep CXXFLAGS Make.tmp |
-    sed -e 's/-f/-Xcompiler -f/g' \
+  grep CXXFLAGS Make.tmp \
+    | sed -E -e 's/-f([a-zA-Z-]*)\>/-Xcompiler -f\1/g' \
+      -e 's/-gdwarf/-Xcompiler -gdwarf/g' \
       -e 's/-pedantic//g' \
       -e 's/-W/-Xcompiler -W/g'
-  grep -eDYNAMIC_LIBS -eLDFLAGS Make.tmp |
-    sed -e 's/-rdynamic/-Xcompiler -rdynamic/g' \
+  grep -eDYNAMIC_LIBS -eLDFLAGS Make.tmp \
+    | sed -e 's/-rdynamic/-Xcompiler -rdynamic/g' \
       -e 's/-Wl,/-Xlinker /g' \
       -e 's/-f/-Xcompiler -f/g'
   #prints the rest of the file

diff --git a/plugins/cudaCoord/regtest/cudatest/rt-double-bigbox/Makefile b/plugins/cudaCoord/regtest/cudatest/rt-double-bigbox/Makefile
@@ -0,0 +1 @@
+include ../../scripts/test.make
diff --git a/plugins/cudaCoord/regtest/cudatest/rt-double-bigbox/colvar.reference b/plugins/cudaCoord/regtest/cudatest/rt-double-bigbox/colvar.reference
@@ -0,0 +1,3 @@
+#! FIELDS time diff
+ 0.000000  -0.0000
+ 1.000000  -0.0000
diff --git a/plugins/cudaCoord/regtest/cudatest/rt-double-bigbox/config b/plugins/cudaCoord/regtest/cudatest/rt-double-bigbox/config
@@ -0,0 +1,11 @@
+type=driver
+# this is to test a different name
+arg="--plumed plumed.dat --ixyz trajbig.xyz"
+
+plumed_regtest_after() {
+  {
+    echo '#! FIELDS time parameter cpu-gpu'
+    awk 'NR>1{print $1, $2, $4-$3} ' <deriv
+  } >deriv_delta
+}
+extra_files="../../trajectories/trajbig.xyz"