Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,886 changes: 1,204 additions & 682 deletions plugins/cudaCoord/Coordination.cu

Large diffs are not rendered by default.

62 changes: 48 additions & 14 deletions plugins/cudaCoord/Coordination.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -34,15 +34,18 @@ using std::cerr;


#define hdbg(...) __LINE__ << ":" #__VA_ARGS__ " = " << (__VA_ARGS__) << '\n'
// #define vdbg(...) std::cerr << __LINE__ << ":" << #__VA_ARGS__ << " " << (__VA_ARGS__) << '\n'
//#define vdbg(...) std::cerr << hdbg(__VA_ARGS__)
#define vdbg(...)

//#define dprintf(...) printf(__VA_ARGS__)
#define dprintf(...)

namespace PLMD {
namespace GPU {

// these constant will be used within the kernels
template <typename calculateFloat> struct rationalSwitchParameters {
template <typename calculateFloat>
struct rationalSwitchParameters {
calculateFloat dmaxSQ = std::numeric_limits<calculateFloat>::max();
calculateFloat invr0_2 = 1.0;
calculateFloat d0 = 0.0;
Expand All @@ -53,7 +56,20 @@ template <typename calculateFloat> struct rationalSwitchParameters {
bool calcSquared=false;
};

template <typename T> struct invData {
template <typename T>
std::ostream& operator<<(std::ostream& os, const rationalSwitchParameters<T>& rsp) {
os << "dmaxSQ: " << rsp.dmaxSQ << "\n";
os << "invr0_2: " << rsp.invr0_2 << "\n";
os << "d0: " << rsp.d0 << "\n";
os << "stretch: " << rsp.stretch << "\n";
os << "nn: " << rsp.nn << "\n";
os << "mm: " << rsp.mm << "\n";
os << "calcSquared: " << ((rsp.calcSquared)?"true":"false") << "\n";
return os;
}

template <typename T>
struct invData {
T val = 1.0;
T inv = 1.0;
// this makes the `X = x;` work like "X.val=x;X.inv=1/x;"
Expand All @@ -65,18 +81,29 @@ template <typename T> struct invData {
return *this;
}
};
template <typename calculateFloat> struct ortoPBCs {

template <typename calculateFloat>
struct ortoPBCs {
invData<calculateFloat> X{1.0};
invData<calculateFloat> Y{1.0};
invData<calculateFloat> Z{1.0};
};

template <typename T>
std::ostream& operator<<(std::ostream& os, const ortoPBCs<T>& opc) {
os << "X: " << opc.X.val << " " << opc.X.inv << "\n";
os << "Y: " << opc.Y.val << " " << opc.Y.inv << "\n";
os << "Z: " << opc.Z.val << " " << opc.Z.inv << "\n";
return os;
}

template <typename calculateFloat>
__device__ calculateFloat pbcClamp (calculateFloat x) {
return 0.0;
}

template <> __device__ __forceinline__ double pbcClamp<double> (double x) {
template <>
__device__ __forceinline__ double pbcClamp<double> (double x) {
// convert a double to a signed int in round-to-nearest-even mode.
// return __double2int_rn (x) - x;
// return x - floor (x + 0.5);
Expand All @@ -86,7 +113,8 @@ template <> __device__ __forceinline__ double pbcClamp<double> (double x) {
return x - nearbyint (x);
}

template <> __device__ __forceinline__ float pbcClamp<float> (float x) {
template <>
__device__ __forceinline__ float pbcClamp<float> (float x) {
// convert a double to a signed int in round-to-nearest-even mode.
// return __float2int_rn (x) - x;
// return x - floorf (x + 0.5f);
Expand All @@ -111,17 +139,22 @@ __device__ __forceinline__ calculateFloat pcuda_fastpow (calculateFloat base,
return result;
}

template <typename calculateFloat> __device__ calculateFloat pcuda_eps() {
template <typename calculateFloat>
__device__ calculateFloat pcuda_eps() {
return 0;
}

template <> constexpr __device__ float pcuda_eps<float>() {
template <>
constexpr __device__ float pcuda_eps<float>() {
return FLT_EPSILON * 10.0f;
}
template <> constexpr __device__ double pcuda_eps<double>() {

template <>
constexpr __device__ double pcuda_eps<double>() {
return DBL_EPSILON * 10.0;
}

//Use this as a base to implement different new switching functions, inclued this in the make_worker function
struct Rational {
template <typename calculateFloat>
static __device__ __forceinline__ calculateFloat
Expand Down Expand Up @@ -168,7 +201,7 @@ __global__ void getpcuda_func (const calculateFloat *rdists,
// printf("stretch: %i: %f -> %f\n",i,rdists[i],res[i]);
}

template <typename calculateFloat>
template <typename mySwitch, typename calculateFloat>
__device__ __forceinline__ calculateFloat calculate (
calculateFloat distance,
const rationalSwitchParameters<calculateFloat> switchingParameters,
Expand All @@ -177,7 +210,7 @@ __device__ __forceinline__ calculateFloat calculate (
dfunc = 0.0;
//if (distance < switchingParameters.dmaxSQ) { already tested in caclulateSqr
const calculateFloat rdist_2 = (distance-switchingParameters.d0) * switchingParameters.invr0_2;
result = Rational::pcuda_func (
result = mySwitch::pcuda_func (
rdist_2, switchingParameters, dfunc);
// chain rule:
dfunc *= switchingParameters.invr0_2;
Expand All @@ -187,25 +220,26 @@ __device__ __forceinline__ calculateFloat calculate (
return result;
}

template <typename calculateFloat>
template <typename mySwitch, typename calculateFloat>
__device__ __forceinline__ calculateFloat calculateSqr (
const calculateFloat distancesq,
const rationalSwitchParameters<calculateFloat> switchingParameters,
calculateFloat &dfunc) {
calculateFloat result = 0.0;
dfunc = 0.0;
dprintf("distancesqr %f\n",distancesq);
if (distancesq < switchingParameters.dmaxSQ) {
if(switchingParameters.calcSquared) {
const calculateFloat rdist_2 = distancesq * switchingParameters.invr0_2;
result = Rational::pcuda_func (
result = mySwitch::pcuda_func (
rdist_2, switchingParameters, dfunc);
// chain rule:
dfunc *= 2 * switchingParameters.invr0_2;
// cu_stretch:
result = result * switchingParameters.stretch + switchingParameters.shift;
dfunc *= switchingParameters.stretch;
} else {
result = calculate(std::sqrt(distancesq),switchingParameters,dfunc);
result = calculate<mySwitch>(std::sqrt(distancesq),switchingParameters,dfunc);
}
}
return result;
Expand Down
7 changes: 7 additions & 0 deletions plugins/cudaCoord/Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,13 @@ This is the optimized version of the lesson that I presented in the [plumed-scho
The files `cudaHelpers.cuh` and `cudaHelpers.cu` contains a few support functions for helping in interfacing `PLMD::Vector` and `PLMD::Tensor` with Cuda's thrust,
along with the reduction functions baked with Cuda's cub building blocks and their drivers.

>[!WARNING]
>Plumed may refuse to to the calculations if the GPU does not allow for the calculations to be done.
>We are currently working on a solution for this
>
>A workaround is reducing the size of the cutoff of the neigbor list or to increase the number of threads with THREADS
>Plumed will present the user wiht an error message with the suggestion.

### Compile


Expand Down
45 changes: 45 additions & 0 deletions plugins/cudaCoord/cudaHelpers.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

#include "plumed/tools/Tensor.h"
#include "plumed/tools/Vector.h"
#include "plumed/tools/View.h"
#include <thrust/device_vector.h>
#include <vector>
namespace CUDAHELPERS {
Expand Down Expand Up @@ -107,6 +108,30 @@ inline void plmdDataToGPU (thrust::device_vector<double> &dvmem,
stream);
}

/// @brief the specialized asyncronous function for tranferring the data in a view to the GPUtemplate <typename T>
template <typename T>
inline void plmdDataToGPU (thrust::device_vector<std::remove_const_t<T>> &dvmem,
PLMD::View<T> data,
cudaStream_t stream) {
dvmem.resize (data.size());
cudaMemcpyAsync (thrust::raw_pointer_cast (dvmem.data()),
data.data(),
data.size() * sizeof (T),
cudaMemcpyHostToDevice,
stream);
}

/// @brief the specialized function for tranferring the data in a view to the GPUtemplate <typename T>
template <typename T>
inline void plmdDataToGPU (thrust::device_vector<std::remove_const_t<T>> &dvmem,
PLMD::View<T> data) {
dvmem.resize (data.size());
cudaMemcpyAsync (thrust::raw_pointer_cast (dvmem.data()),
data.data(),
data.size() * sizeof (T),
cudaMemcpyHostToDevice);
}

/// @brief the specialized function for getting single precision data from the
/// gpu to a PLMD container
/// @param dvmem the cuda interface to the data on the device
Expand Down Expand Up @@ -405,5 +430,25 @@ void doReductionND (T *inputArray,
doReductionND_t<DATAPERTHREAD> (
inputArray, outputArray, len, blocks, nthreads, stream);
}

///A simple struct to use the allocated shared memory in multiple ways
///
///Usage: declare this at the begininnig of your kernel (a single instance per kernel)
///and then "allocate" memory by calling the `get_shared_memory method`
class sharedArena {
//TODO: this costs a register, it may be a better idea to use an external register and then override it for another usage
unsigned allocated=0;
public:
template <typename T>
__device__ T *get_shared_memory(unsigned const size) {
// do we need an __align__() here?
extern __shared__ unsigned char memory[];
auto ptr = reinterpret_cast<T *> (&memory[allocated]);
allocated+=size * sizeof(T);
return ptr;

}
};

} // namespace CUDAHELPERS
#endif //__PLUMED_cuda_helpers_cuh
9 changes: 5 additions & 4 deletions plugins/cudaCoord/nvcc-MakeFile.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,13 @@ fi
#pendantic adds a unuseful FOR EACH line with
#"" warning: style of line directive is a GCC extension"
{
grep CXXFLAGS Make.tmp |
sed -e 's/-f/-Xcompiler -f/g' \
grep CXXFLAGS Make.tmp \
| sed -E -e 's/-f([a-zA-Z-]*)\>/-Xcompiler -f\1/g' \
-e 's/-gdwarf/-Xcompiler -gdwarf/g' \
-e 's/-pedantic//g' \
-e 's/-W/-Xcompiler -W/g'
grep -eDYNAMIC_LIBS -eLDFLAGS Make.tmp |
sed -e 's/-rdynamic/-Xcompiler -rdynamic/g' \
grep -eDYNAMIC_LIBS -eLDFLAGS Make.tmp \
| sed -e 's/-rdynamic/-Xcompiler -rdynamic/g' \
-e 's/-Wl,/-Xlinker /g' \
-e 's/-f/-Xcompiler -f/g'
#prints the rest of the file
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
include ../../scripts/test.make
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#! FIELDS time diff
0.000000 -0.0000
1.000000 -0.0000
11 changes: 11 additions & 0 deletions plugins/cudaCoord/regtest/cudatest/rt-double-bigbox/config
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
type=driver
# this is to test a different name
arg="--plumed plumed.dat --ixyz trajbig.xyz"

plumed_regtest_after() {
{
echo '#! FIELDS time parameter cpu-gpu'
awk 'NR>1{print $1, $2, $4-$3} ' <deriv
} >deriv_delta
}
extra_files="../../trajectories/trajbig.xyz"
Loading
Loading