From ce2dda0407443acfc6a92a374ba7591737d56b77 Mon Sep 17 00:00:00 2001 From: "shen.guo" Date: Tue, 6 Jan 2026 16:33:20 +0100 Subject: [PATCH 1/5] optimize nonbonded pp force using openmm method --- src/core/cuda/src/CudaNonbondedPPForce.cu | 290 ++++++++++++++++++---- src/core/parse.cu | 4 + 2 files changed, 247 insertions(+), 47 deletions(-) diff --git a/src/core/cuda/src/CudaNonbondedPPForce.cu b/src/core/cuda/src/CudaNonbondedPPForce.cu index d32db32b..e5a78ee8 100644 --- a/src/core/cuda/src/CudaNonbondedPPForce.cu +++ b/src/core/cuda/src/CudaNonbondedPPForce.cu @@ -1,11 +1,11 @@ +#include + #include "cuda/include/CudaContext.cuh" #include "cuda/include/CudaNonbondedPPForce.cuh" #include "utils.h" namespace CudaNonbondedPPForce { bool is_initialized = false; -dvel_t* PP_MAT; -double *D_PP_Evdw, *D_PP_Ecoul; double *D_PP_evdw_TOT, *D_PP_ecoul_TOT; __device__ void calc_pp_dvel_matrix_incr(int row, int pi, int column, int pj, @@ -65,6 +65,231 @@ __device__ void calc_pp_dvel_matrix_incr(int row, int pi, int column, int pj, *Evdw += (V_a - V_b); } +__device__ __forceinline__ void idx2xy(int n, int t, int& x, int& y) { + x = (int)floorf((2 * n + 1 - sqrtf((2 * n + 1) * (2 * n + 1) - 8 * t)) * 0.5f); + y = t - (x * n - (x * (x - 1) >> 1)); + if (y < 0) { + x--; + y += (n - x); + } + y += x; +} + +__device__ __forceinline__ double shfl(double v, int srcLane, unsigned mask = 0xffffffffu) { + int2 a = *reinterpret_cast(&v); + a.x = __shfl_sync(mask, a.x, srcLane); + a.y = __shfl_sync(mask, a.y, srcLane); + return *reinterpret_cast(&a); +} + +__device__ __forceinline__ coord_t shfl_coord(coord_t v, int srcLane, unsigned mask = 0xffffffffu) { + v.x = shfl(v.x, srcLane, mask); + v.y = shfl(v.y, srcLane, mask); + v.z = shfl(v.z, srcLane, mask); + return v; +} + +__device__ void calculate_unforce_bound( + // const int x_idx, + // const int y_idx, + + const coord_t& x, + const coord_t& y, + + const double x_charge, + const double y_charge, + + const double x_aii, + const double y_aii, + + const double x_bii, + const double y_bii, + + const double coulomb_constant, + + const double scaling, + + double& evdw, + double& ecoul, + double& dv) { + double3 d = {y.x - x.x, y.y - x.y, y.z - x.z}; + + double r2 = 1.0 / (d.x * d.x + d.y * d.y + d.z * d.z); + double r = sqrt(r2); + double r6 = r2 * r2 * r2; + + ecoul = scaling * coulomb_constant * x_charge * y_charge * r; + + double v_a = r6 * r6 * x_aii * y_aii; + double v_b = r6 * x_bii * y_bii; + evdw = v_a - v_b; + dv = r2 * (-ecoul - 12.0 * v_a + 6.0 * v_b); +} + +__global__ void calc_pp( + const int N, + const int n_atoms_solute, + + const charge_t* D_charges, // charge type of each atom + const ccharge_t* D_ccharges, // charge value of each charge type + + const atype_t* D_atypes, + const catype_t* D_catypes, + + const topo_t D_topo, + + const bool* D_excluded, + + const int* D_LJ_matrix, + + const p_atom_t* D_patoms, + + coord_t* X, + dvel_t* DV_X, + + double* Evdw_TOT, + double* Ecoul_TOT) { + const int nBlocks = (N + 31) >> 5; // ceil(N/32) + const int totalTiles = nBlocks * (nBlocks + 1) / 2; + + const int warpsPerBlock = blockDim.x >> 5; // THREAD_BLOCK_SIZE / 32 + const int tid = threadIdx.x; + const int lane = tid & 31; + const int warpInBlock = tid >> 5; + + const int tile = blockIdx.x * warpsPerBlock + warpInBlock; + if (tile >= totalTiles) { + return; // extra warps in last block + } + + // Map tile -> (x,y) with y>=x (upper triangle) + int tile_x, tile_y; + idx2xy(nBlocks, tile, tile_x, tile_y); + + const int baseX = tile_x << 5; // x*32 + const int baseY = tile_y << 5; // y*32 + + int x_idx = baseX + lane; + int y_idx = baseY + lane; + + int x_atom = (x_idx < N) ? (D_patoms[x_idx].a - 1) : -1; + int y_atom = (y_idx < N) ? (D_patoms[y_idx].a - 1) : -1; + + coord_t invalid = {-1e9, -1e9, -1e9}; + coord_t x_coord = (x_atom >= 0) ? X[x_atom] : invalid; + coord_t y_coord = (y_atom >= 0) ? X[y_atom] : invalid; + + bool x_excluded = (x_atom >= 0) ? D_excluded[x_atom] : true; + bool y_excluded = (y_atom >= 0) ? D_excluded[y_atom] : true; + + double x_charge = (x_atom >= 0) ? D_ccharges[D_charges[x_atom].code - 1].charge : 0.0; + double y_charge = (y_atom >= 0) ? D_ccharges[D_charges[y_atom].code - 1].charge : 0.0; + + catype_t x_type = (x_atom >= 0) ? D_catypes[D_atypes[x_atom].code - 1] : catype_t{}; + catype_t y_type = (y_atom >= 0) ? D_catypes[D_atypes[y_atom].code - 1] : catype_t{}; + double x_aii_normal = x_type.aii_normal; + double y_aii_normal = y_type.aii_normal; + double x_aii_14 = x_type.aii_1_4; + double y_aii_14 = y_type.aii_1_4; + double x_bii_normal = x_type.bii_normal; + double y_bii_normal = y_type.bii_normal; + double x_bii_14 = x_type.bii_1_4; + double y_bii_14 = y_type.bii_1_4; + + double3 x_force = {0.0, 0.0, 0.0}; + double3 y_force = {0.0, 0.0, 0.0}; + + double evdw_sum = 0; + double ecoul_sum = 0; + + const unsigned mask = 0xffffffffu; + + auto is_valid = [&]() -> bool { + if (x_idx >= N || y_idx >= N) { + return false; + } + if (tile_x == tile_y && x_idx <= y_idx) { + return false; + } + if (x_excluded || y_excluded) { + return false; + } + + + bool bond23 = (D_LJ_matrix[y_atom * n_atoms_solute + x_atom] == 3); + if (bond23) { + return false; + } + + return true; + }; + + // shuffle to get next y + auto do_shuffle = [&]() { + const int src = (lane + 1) & 31; + y_idx = __shfl_sync(mask, y_idx, src); + y_atom = __shfl_sync(mask, y_atom, src); + y_coord = shfl_coord(y_coord, src, mask); + y_excluded = __shfl_sync(mask, y_excluded, src); + y_charge = __shfl_sync(mask, y_charge, src); + y_aii_normal = __shfl_sync(mask, y_aii_normal, src); + y_aii_14 = __shfl_sync(mask, y_aii_14, src); + y_bii_normal = __shfl_sync(mask, y_bii_normal, src); + y_bii_14 = __shfl_sync(mask, y_bii_14, src); + + // shuffle y_force + y_force.x = shfl(y_force.x, src, mask); + y_force.y = shfl(y_force.y, src, mask); + y_force.z = shfl(y_force.z, src, mask); + }; + + for (int i = 0; i < 32; i++) { + if (is_valid()) { + bool bond14 = (D_LJ_matrix[y_atom * n_atoms_solute + x_atom] == 1); + double scaling = bond14 ? D_topo.el14_scale : 1.0; + double ai_aii = bond14 ? x_aii_14 : x_aii_normal; + double aj_aii = bond14 ? y_aii_14 : y_aii_normal; + double ai_bii = bond14 ? x_bii_14 : x_bii_normal; + double aj_bii = bond14 ? y_bii_14 : y_bii_normal; + double evdw = 0.0, ecoul = 0.0, dv = 0.0; + calculate_unforce_bound( + x_coord, y_coord, x_charge, y_charge, ai_aii, aj_aii, ai_bii, aj_bii, + D_topo.coulomb_constant, scaling, evdw, ecoul, dv); + evdw_sum += evdw; + ecoul_sum += ecoul; + double3 d = {y_coord.x - x_coord.x, y_coord.y - x_coord.y, y_coord.z - x_coord.z}; + x_force.x += dv * d.x; + x_force.y += dv * d.y; + x_force.z += dv * d.z; + + y_force.x -= dv * d.x; + y_force.y -= dv * d.y; + y_force.z -= dv * d.z; + } + do_shuffle(); + } + + if (x_atom >= 0) { + atomicAdd(&DV_X[x_atom].x, x_force.x); + atomicAdd(&DV_X[x_atom].y, x_force.y); + atomicAdd(&DV_X[x_atom].z, x_force.z); + } + if (y_atom >= 0) { + atomicAdd(&DV_X[y_atom].x, y_force.x); + atomicAdd(&DV_X[y_atom].y, y_force.y); + atomicAdd(&DV_X[y_atom].z, y_force.z); + } + + for (int offset = 16; offset > 0; offset >>= 1) { + evdw_sum += __shfl_down_sync(0xffffffffu, evdw_sum, offset); + ecoul_sum += __shfl_down_sync(0xffffffffu, ecoul_sum, offset); + } + if (lane == 0) { + atomicAdd(Evdw_TOT, evdw_sum); + atomicAdd(Ecoul_TOT, ecoul_sum); + } +} + __global__ void calc_pp_dvel_matrix(int n_patoms, int n_atoms_solute, coord_t* X, double* Evdw, double* Ecoul, dvel_t* PP_MAT, ccharge_t* D_ccharges, charge_t* D_charges, catype_t* D_catypes, atype_t* D_atypes, p_atom_t* D_patoms, int* D_LJ_matrix, bool* D_excluded, topo_t D_topo) { @@ -221,8 +446,6 @@ void calc_nonbonded_pp_forces_host_v2() { int mem_size_DV_X = n_atoms_solute * sizeof(dvel_t); - int n_blocks = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE; - CudaContext& ctx = CudaContext::instance(); auto X = ctx.d_coords; auto DV_X = ctx.d_dvelocities; @@ -234,32 +457,27 @@ void calc_nonbonded_pp_forces_host_v2() { auto D_LJ_matrix = ctx.d_LJ_matrix; auto D_excluded = ctx.d_excluded; - dim3 threads, grid; + cudaMemset(D_PP_evdw_TOT, 0, sizeof(double)); + cudaMemset(D_PP_ecoul_TOT, 0, sizeof(double)); - threads = dim3(BLOCK_SIZE, BLOCK_SIZE); - grid = dim3((n_patoms + BLOCK_SIZE - 1) / threads.x, (n_patoms + BLOCK_SIZE - 1) / threads.y); + const int thread_num = 128; + dim3 block_sz = dim3(thread_num); - calc_pp_dvel_matrix<<>>(n_patoms, n_atoms_solute, X, D_PP_Evdw, D_PP_Ecoul, PP_MAT, D_ccharges, D_charges, D_catypes, D_atypes, D_patoms, D_LJ_matrix, D_excluded, topo); - calc_pp_dvel_vector<<<((n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_patoms, DV_X, PP_MAT, D_patoms); + int tile_num_per_block = thread_num / 32; + int nBlocks = (n_patoms + 31) / 32; + int total_tiles = nBlocks * (nBlocks + 1) / 2; - cudaMemcpy(dvelocities, DV_X, mem_size_DV_X, cudaMemcpyDeviceToHost); + int grid_sz = (total_tiles + tile_num_per_block - 1) / tile_num_per_block; + dim3 grid = dim3(grid_sz); -#ifdef DEBUG - cudaMemcpy(h_PP_MAT, PP_MAT, mem_size_PP_MAT, cudaMemcpyDeviceToHost); -#endif + calc_pp<<>>(n_patoms, n_atoms_solute, D_charges, D_ccharges, + D_atypes, D_catypes, topo, D_excluded, + D_LJ_matrix, D_patoms, X, DV_X, D_PP_evdw_TOT, + D_PP_ecoul_TOT); -#ifdef DEBUG - for (int i = 0; i < n_patoms; i++) { - for (int j = 0; j < n_patoms; j++) { - if (h_PP_MAT[i * n_patoms + j].x > 100) - printf("PP_MAT[%d][%d] = %f %f %f\n", i, j, h_PP_MAT[i * n_patoms + j].x, h_PP_MAT[i * n_patoms + j].y, h_PP_MAT[i * n_patoms + j].z); - } - } -#endif - cudaMemset(D_PP_evdw_TOT, 0, sizeof(double)); - cudaMemset(D_PP_ecoul_TOT, 0, sizeof(double)); + cudaDeviceSynchronize(); + cudaMemcpy(dvelocities, DV_X, mem_size_DV_X, cudaMemcpyDeviceToHost); - calc_energy_sum<<<1, threads>>>(n_blocks, n_blocks, D_PP_evdw_TOT, D_PP_ecoul_TOT, D_PP_Evdw, D_PP_Ecoul, true); double PP_evdw_TOT, PP_ecoul_TOT; cudaMemcpy(&PP_evdw_TOT, D_PP_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost); cudaMemcpy(&PP_ecoul_TOT, D_PP_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost); @@ -272,25 +490,6 @@ void init_nonbonded_pp_force_kernel_data() { using namespace CudaNonbondedPPForce; if (!is_initialized) { - int n_blocks = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE; - - int mem_size_PP_MAT = n_patoms * n_patoms * sizeof(dvel_t); - - int mem_size_PP_Evdw = n_blocks * n_blocks * sizeof(double); - int mem_size_PP_Ecoul = n_blocks * n_blocks * sizeof(double); -#ifdef DEBUG - printf("Allocating PP_MAT\n"); -#endif - check_cudaMalloc((void**)&PP_MAT, mem_size_PP_MAT); -#ifdef DEBUG - printf("Allocating D_PP_Evdw\n"); -#endif - check_cudaMalloc((void**)&D_PP_Evdw, mem_size_PP_Evdw); -#ifdef DEBUG - printf("Allocating D_PP_Ecoul\n"); -#endif - check_cudaMalloc((void**)&D_PP_Ecoul, mem_size_PP_Ecoul); - check_cudaMalloc((void**)&D_PP_evdw_TOT, sizeof(double)); check_cudaMalloc((void**)&D_PP_ecoul_TOT, sizeof(double)); @@ -301,11 +500,8 @@ void init_nonbonded_pp_force_kernel_data() { void cleanup_nonbonded_pp_force() { using namespace CudaNonbondedPPForce; if (is_initialized) { - cudaFree(PP_MAT); - cudaFree(D_PP_Evdw); - cudaFree(D_PP_Ecoul); cudaFree(D_PP_evdw_TOT); cudaFree(D_PP_ecoul_TOT); is_initialized = false; } -} \ No newline at end of file +} diff --git a/src/core/parse.cu b/src/core/parse.cu index 1b9e0f8d..c5518223 100644 --- a/src/core/parse.cu +++ b/src/core/parse.cu @@ -753,6 +753,7 @@ void init_ngbrs14(const char* filename) { int jx = (lineI + i + 1) % lines; // if (ix < 100 && jx < 100) printf("i = %d j = %d\n", ix+1, jx+1); LJ_matrix[ix * n_atoms_solute + jx] = 1; + LJ_matrix[jx * n_atoms_solute + ix] = 1; } } lineI++; @@ -797,6 +798,7 @@ void init_ngbrs23(const char* filename) { int jx = (lineI + i + 1) % lines; // if (ix < 100 && jx < 100) printf("i = %d j = %d\n", ix+1, jx+1); LJ_matrix[ix * n_atoms_solute + jx] = 3; + LJ_matrix[jx * n_atoms_solute + ix] = 3; } } lineI++; @@ -819,6 +821,7 @@ void init_ngbrs14_long(const char* filename) { int ix = atoi(file.buffer[i+1][0])-1; int jx = atoi(file.buffer[i+1][1])-1; LJ_matrix[ix * n_atoms_solute + jx] = 1; + LJ_matrix[jx * n_atoms_solute + ix] = 1; } clean_csv(file); @@ -838,6 +841,7 @@ void init_ngbrs23_long(const char* filename) { int ix = atoi(file.buffer[i+1][0])-1; int jx = atoi(file.buffer[i+1][1])-1; LJ_matrix[ix * n_atoms_solute + jx] = 3; + LJ_matrix[jx * n_atoms_solute + ix] = 3; } clean_csv(file); From 7143986823074c194944a033abe7c0ae1acfd259 Mon Sep 17 00:00:00 2001 From: "shen.guo" Date: Tue, 6 Jan 2026 16:55:35 +0100 Subject: [PATCH 2/5] optimize nonbonded pp force using openmm method --- src/core/cuda/src/CudaNonbondedPPForce.cu | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/src/core/cuda/src/CudaNonbondedPPForce.cu b/src/core/cuda/src/CudaNonbondedPPForce.cu index e5a78ee8..4b2b67b2 100644 --- a/src/core/cuda/src/CudaNonbondedPPForce.cu +++ b/src/core/cuda/src/CudaNonbondedPPForce.cu @@ -214,9 +214,7 @@ __global__ void calc_pp( if (x_excluded || y_excluded) { return false; } - - - bool bond23 = (D_LJ_matrix[y_atom * n_atoms_solute + x_atom] == 3); + bool bond23 = (D_LJ_matrix[x_atom * n_atoms_solute + y_atom] == 3); if (bond23) { return false; } @@ -258,13 +256,13 @@ __global__ void calc_pp( evdw_sum += evdw; ecoul_sum += ecoul; double3 d = {y_coord.x - x_coord.x, y_coord.y - x_coord.y, y_coord.z - x_coord.z}; - x_force.x += dv * d.x; - x_force.y += dv * d.y; - x_force.z += dv * d.z; + x_force.x -= dv * d.x; + x_force.y -= dv * d.y; + x_force.z -= dv * d.z; - y_force.x -= dv * d.x; - y_force.y -= dv * d.y; - y_force.z -= dv * d.z; + y_force.x += dv * d.x; + y_force.y += dv * d.y; + y_force.z += dv * d.z; } do_shuffle(); } From da70854110299bb3d181c788f1da809547f1f967 Mon Sep 17 00:00:00 2001 From: shen Date: Mon, 12 Jan 2026 14:57:48 +0100 Subject: [PATCH 3/5] optimize q lambda --- src/core/cuda/include/CudaContext.cuh | 5 + src/core/cuda/src/CudaContext.cu | 41 +++++++- src/core/cuda/src/CudaNonbondedQPForce.cu | 121 ++++++++++------------ src/core/cuda/src/CudaNonbondedQQForce.cu | 4 + src/core/cuda/src/CudaNonbondedQWForce.cu | 98 +++++++----------- src/core/system.cu | 2 +- 6 files changed, 145 insertions(+), 126 deletions(-) diff --git a/src/core/cuda/include/CudaContext.cuh b/src/core/cuda/include/CudaContext.cuh index 3c2883fa..faa3cf90 100644 --- a/src/core/cuda/include/CudaContext.cuh +++ b/src/core/cuda/include/CudaContext.cuh @@ -59,6 +59,9 @@ class CudaContext { E_nonbonded_t* d_EQ_nonbond_qq = nullptr; double* d_lambdas = nullptr; + q_charge_t* d_q_aggregate_charges = nullptr; + q_catype_t* d_q_aggregate_catypes = nullptr; + /* Used in CudaPolxWaterForce.cu */ @@ -124,6 +127,8 @@ class CudaContext { private: CudaContext() = default; + void init_data_for_q(); + void free(); ~CudaContext() { free(); } diff --git a/src/core/cuda/src/CudaContext.cu b/src/core/cuda/src/CudaContext.cu index 19f22165..0ac5f0b5 100644 --- a/src/core/cuda/src/CudaContext.cu +++ b/src/core/cuda/src/CudaContext.cu @@ -1,6 +1,7 @@ #include #include "cuda/include/CudaContext.cuh" +#include void CudaContext::init() { check_cudaMalloc((void**)&d_coords, sizeof(coord_t) * n_atoms); @@ -54,6 +55,8 @@ void CudaContext::init() { check_cudaMalloc((void**)&d_charges, sizeof(charge_t) * n_charges); check_cudaMalloc((void**)&d_p_atoms, sizeof(p_atom_t) * n_patoms); + init_data_for_q(); + sync_all_to_device(); } @@ -183,7 +186,6 @@ void CudaContext::free() { cudaFree(d_excluded); cudaFree(d_q_elscales); cudaFree(d_q_catypes); - cudaFree(d_q_atypes); cudaFree(d_EQ_nonbond_qq); cudaFree(d_lambdas); @@ -208,6 +210,9 @@ void CudaContext::free() { cudaFree(d_ccharges); cudaFree(d_charges); cudaFree(d_p_atoms); + + cudaFree(d_q_aggregate_charges); + cudaFree(d_q_aggregate_catypes); } void CudaContext::reset_energies() { @@ -215,3 +220,37 @@ void CudaContext::reset_energies() { cudaMemset(d_EQ_nonbond_qq, 0, sizeof(E_nonbonded_t) * n_lambdas); cudaMemset(d_EQ_restraint, 0, sizeof(E_restraint_t) * n_lambdas); } + +void CudaContext::init_data_for_q() { + + // Additional initialization for q-related data can be added here if needed + + // q charges + std::vector charges(n_qatoms, q_charge_t{0.0}); + for (int i = 0; i < n_qatoms; i++) { + double q = 0; + for (int state = 0; state < n_lambdas; state++) { + q += q_charges[state * n_qatoms + i].q * lambdas[state]; + } + charges[i].q = q; + } + check_cudaMalloc((void**)&d_q_aggregate_charges, sizeof(q_charge_t) * n_qatoms); + check_cuda(cudaMemcpy(d_q_aggregate_charges, charges.data(), sizeof(q_charge_t) * n_qatoms, cudaMemcpyHostToDevice)); + + + // q atypes + std::vector atypes(n_qatoms, q_catype_t{"", 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}); + for (int i = 0; i < n_qatoms; i++) { + for (int state = 0; state < n_lambdas; state++) { + atypes[i].Ai += q_catypes[q_atypes[state * n_qatoms + i].code - 1].Ai * lambdas[state]; + atypes[i].Bi += q_catypes[q_atypes[state * n_qatoms + i].code - 1].Bi * lambdas[state]; + atypes[i].Ci += q_catypes[q_atypes[state * n_qatoms + i].code - 1].Ci * lambdas[state]; + atypes[i].ai += q_catypes[q_atypes[state * n_qatoms + i].code - 1].ai * lambdas[state]; + atypes[i].Ai_14 += q_catypes[q_atypes[state * n_qatoms + i].code - 1].Ai_14 * lambdas[state]; + atypes[i].Bi_14 += q_catypes[q_atypes[state * n_qatoms + i].code - 1].Bi_14 * lambdas[state]; + atypes[i].m += q_catypes[q_atypes[state * n_qatoms + i].code - 1].m * lambdas[state]; + } + } + check_cudaMalloc((void**)&d_q_aggregate_catypes, sizeof(q_catype_t) * n_qatoms); + check_cuda(cudaMemcpy(d_q_aggregate_catypes, atypes.data(), sizeof(q_catype_t) * n_qatoms, cudaMemcpyHostToDevice)); +} \ No newline at end of file diff --git a/src/core/cuda/src/CudaNonbondedQPForce.cu b/src/core/cuda/src/CudaNonbondedQPForce.cu index 4eb53908..47f26451 100644 --- a/src/core/cuda/src/CudaNonbondedQPForce.cu +++ b/src/core/cuda/src/CudaNonbondedQPForce.cu @@ -42,6 +42,7 @@ __global__ void calc_energy_sum(int rows, int columns, double* Evdw_TOT, double* } Ecoul_S[ty][tx] = coul_TOT; Evdw_S[ty][tx] = vdw_TOT; + // printf("Thread (%d,%d) coul_TOT = %f vdw_TOT = %f\n", tx, ty, coul_TOT, vdw_TOT); __syncthreads(); @@ -111,8 +112,8 @@ __global__ void calc_qp_dvel_vector_column(int n_qatoms, int n_patoms, dvel_t* D } __device__ void calc_qp_dvel_matrix_incr(int row, int qi, int column, int pj, int n_lambdas, int n_qatoms, - coord_t* Qs, coord_t* Ps, int* LJs, bool* excluded_s, double Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE], double Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE], calc_qp_t* qp, - q_catype_t* D_qcatypes, q_atype_t* D_qatypes, q_charge_t* D_qcharges, p_atom_t* D_patoms, q_atom_t* D_qatoms, double* D_lambdas, + coord_t* Qs, coord_t* Ps, int* LJs, bool* excluded_s, double Evdw_S[BLOCK_SIZE][BLOCK_SIZE], double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE], calc_qp_t* qp, + q_catype_t* D_qcatypes, q_charge_t* D_qcharges, p_atom_t* D_patoms, q_atom_t* D_qatoms, catype_t* D_catypes, atype_t* D_atypes, ccharge_t* D_ccharges, charge_t* D_charges, topo_t D_topo) { coord_t da; double r2, r6, r; @@ -141,43 +142,40 @@ __device__ void calc_qp_dvel_matrix_incr(int row, int qi, int column, int pj, in r6 = r2 * r2 * r2; r2 = 1 / r2; r = sqrt(r2); - - for (int state = 0; state < n_lambdas; state++) { - qi_type = D_qcatypes[D_qatypes[qi + n_qatoms * state].code - 1]; - aj_type = D_catypes[D_atypes[j].code - 1]; - - ai_aii = bond14 ? qi_type.Ai_14 : qi_type.Ai; - aj_aii = bond14 ? aj_type.aii_1_4 : aj_type.aii_normal; - ai_bii = bond14 ? qi_type.Bi_14 : qi_type.Bi; - aj_bii = bond14 ? aj_type.bii_1_4 : aj_type.bii_normal; - - Vel = D_topo.coulomb_constant * scaling * D_qcharges[qi + n_qatoms * state].q * D_ccharges[D_charges[j].code - 1].charge * r; - V_a = ai_aii * aj_aii / (r6 * r6); - V_b = ai_bii * aj_bii / r6; - dv = r2 * (-Vel - (12 * V_a - 6 * V_b)) * D_lambdas[state]; - - // if (state == 0 && qi == 0 && pj == 1) { - // printf("crg_q = %f crg_j = %f r = %f\n", D_qcharges[qi + n_qatoms * state].q, D_ccharges[D_charges[pj].code - 1].charge, r); - // printf("ai_aii = %f aj_aii = %f ai_bii = %f aj_bii = %f\n", ai_aii, aj_aii, ai_bii, aj_bii); - // } - - // Update forces - qp->Q.x += dv * da.x; - qp->Q.y += dv * da.y; - qp->Q.z += dv * da.z; - qp->P.x -= dv * da.x; - qp->P.y -= dv * da.y; - qp->P.z -= dv * da.z; - - // Update Q totals - Ecoul_S[row][state * BLOCK_SIZE + column] += Vel; - Evdw_S[row][state * BLOCK_SIZE + column] += (V_a - V_b); - } + qi_type = D_qcatypes[qi]; + aj_type = D_catypes[D_atypes[j].code - 1]; + + ai_aii = bond14 ? qi_type.Ai_14 : qi_type.Ai; + aj_aii = bond14 ? aj_type.aii_1_4 : aj_type.aii_normal; + ai_bii = bond14 ? qi_type.Bi_14 : qi_type.Bi; + aj_bii = bond14 ? aj_type.bii_1_4 : aj_type.bii_normal; + + Vel = D_topo.coulomb_constant * scaling * D_qcharges[qi].q * D_ccharges[D_charges[j].code - 1].charge * r; + V_a = ai_aii * aj_aii / (r6 * r6); + V_b = ai_bii * aj_bii / r6; + dv = r2 * (-Vel - (12 * V_a - 6 * V_b)); + + // if (state == 0 && qi == 0 && pj == 1) { + // printf("crg_q = %f crg_j = %f r = %f\n", D_qcharges[qi + n_qatoms * state].q, D_ccharges[D_charges[pj].code - 1].charge, r); + // printf("ai_aii = %f aj_aii = %f ai_bii = %f aj_bii = %f\n", ai_aii, aj_aii, ai_bii, aj_bii); + // } + + // Update forces + qp->Q.x += dv * da.x; + qp->Q.y += dv * da.y; + qp->Q.z += dv * da.z; + qp->P.x -= dv * da.x; + qp->P.y -= dv * da.y; + qp->P.z -= dv * da.z; + + // Update Q totals + Ecoul_S[row][column] += Vel; + Evdw_S[row][column] += (V_a - V_b); } __global__ void calc_qp_dvel_matrix(int n_qatoms, int n_patoms, int n_lambdas, int n_atoms_solute, coord_t* X, double* Evdw, double* Ecoul, calc_qp_t* QP_MAT, - q_catype_t* D_qcatypes, q_atype_t* D_qatypes, q_charge_t* D_qcharges, p_atom_t* D_patoms, q_atom_t* D_qatoms, double* D_lambdas, + q_catype_t* D_qcatypes, q_charge_t* D_qcharges, p_atom_t* D_patoms, q_atom_t* D_qatoms, int* D_LJ_matrix, bool* D_excluded, catype_t* D_catypes, atype_t* D_atypes, ccharge_t* D_ccharges, charge_t* D_charges, topo_t D_topo) { // Block index int bx = blockIdx.x; @@ -187,9 +185,8 @@ __global__ void calc_qp_dvel_matrix(int n_qatoms, int n_patoms, int n_lambdas, i int tx = threadIdx.x; int ty = threadIdx.y; - // TODO implement >2 states on GPU - __shared__ double Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE]; - __shared__ double Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE]; + __shared__ double Evdw_S[BLOCK_SIZE][BLOCK_SIZE]; + __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE]; Ecoul_S[ty][tx] = 0; Evdw_S[ty][tx] = 0; @@ -230,29 +227,25 @@ __global__ void calc_qp_dvel_matrix(int n_qatoms, int n_patoms, int n_lambdas, i int column = bx * BLOCK_SIZE + tx; calc_qp_dvel_matrix_incr(ty, row, tx, column, n_lambdas, n_qatoms, Qs, Ps, LJs, excluded_s, Evdw_S, Ecoul_S, - &qp, D_qcatypes, D_qatypes, D_qcharges, D_patoms, D_qatoms, D_lambdas, D_catypes, D_atypes, D_ccharges, D_charges, D_topo); + &qp, D_qcatypes, D_qcharges, D_patoms, D_qatoms, D_catypes, D_atypes, D_ccharges, D_charges, D_topo); QP_MAT[n_patoms * row + column] = qp; __syncthreads(); int rowlen = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE; - int collen = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE; if (tx == 0 && ty == 0) { - // TODO implement >2 states on GPU - for (int state = 0; state < min(2, n_lambdas); state++) { - double tot_Evdw = 0; - double tot_Ecoul = 0; - for (int i = 0; i < BLOCK_SIZE; i++) { - for (int j = 0; j < BLOCK_SIZE; j++) { - tot_Evdw += Evdw_S[i][j + state * BLOCK_SIZE]; - tot_Ecoul += Ecoul_S[i][j + state * BLOCK_SIZE]; - } + double tot_Evdw = 0; + double tot_Ecoul = 0; + for (int i = 0; i < BLOCK_SIZE; i++) { + for (int j = 0; j < BLOCK_SIZE; j++) { + tot_Evdw += Evdw_S[i][j]; + tot_Ecoul += Ecoul_S[i][j]; } - Evdw[rowlen * collen * state + rowlen * by + bx] = tot_Evdw; - Ecoul[rowlen * collen * state + rowlen * by + bx] = tot_Ecoul; } + Evdw[rowlen * by + bx] = tot_Evdw; + Ecoul[rowlen * by + bx] = tot_Ecoul; } __syncthreads(); @@ -269,12 +262,10 @@ void calc_nonbonded_qp_forces_host_v2() { CudaContext& ctx = CudaContext::instance(); auto X = ctx.d_coords; auto DV_X = ctx.d_dvelocities; - auto D_qcatypes = ctx.d_q_catypes; - auto D_qatypes = ctx.d_q_atypes; - auto D_qcharges = ctx.d_q_charges; + auto D_qcatypes = ctx.d_q_aggregate_catypes; + auto D_qcharges = ctx.d_q_aggregate_charges; auto D_patoms = ctx.d_p_atoms; auto D_qatoms = ctx.d_q_atoms; - auto D_lambdas = ctx.d_lambdas; auto D_LJ_matrix = ctx.d_LJ_matrix; auto D_excluded = ctx.d_excluded; auto D_catypes = ctx.d_catypes; @@ -288,7 +279,7 @@ void calc_nonbonded_qp_forces_host_v2() { grid = dim3((n_patoms + BLOCK_SIZE - 1) / threads.x, (n_qatoms + BLOCK_SIZE - 1) / threads.y); calc_qp_dvel_matrix<<>>(n_qatoms, n_patoms, n_lambdas, n_atoms_solute, X, D_QP_Evdw, D_QP_Ecoul, QP_MAT, - D_qcatypes, D_qatypes, D_qcharges, D_patoms, D_qatoms, D_lambdas, D_LJ_matrix, D_excluded, + D_qcatypes, D_qcharges, D_patoms, D_qatoms, D_LJ_matrix, D_excluded, D_catypes, D_atypes, D_ccharges, D_charges, topo); calc_qp_dvel_vector_column<<<((n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_qatoms, n_patoms, DV_X, QP_MAT, D_patoms); calc_qp_dvel_vector_row<<<((n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_qatoms, n_patoms, DV_X, QP_MAT, D_qatoms); @@ -309,16 +300,14 @@ void calc_nonbonded_qp_forces_host_v2() { int mem_size_DV_X = n_atoms * sizeof(dvel_t); cudaMemcpy(dvelocities, DV_X, mem_size_DV_X, cudaMemcpyDeviceToHost); - // TODO: make Evdw & Ecoul work for # of states > 2 - for (int state = 0; state < min(2, n_lambdas); state++) { - calc_energy_sum<<<1, threads>>>(n_blocks_q, n_blocks_p, D_QP_evdw_TOT, D_QP_ecoul_TOT, &D_QP_Evdw[state * n_blocks_p * n_blocks_q], &D_QP_Ecoul[state * n_blocks_p * n_blocks_q], false); + calc_energy_sum<<<1, threads>>>(n_blocks_q, n_blocks_p, D_QP_evdw_TOT, D_QP_ecoul_TOT, D_QP_Evdw, D_QP_Ecoul, false); - cudaMemcpy(&QP_evdw_TOT, D_QP_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost); - cudaMemcpy(&QP_ecoul_TOT, D_QP_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(&QP_evdw_TOT, D_QP_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(&QP_ecoul_TOT, D_QP_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost); + printf("QP E_vdw: %f E_coul: %f\n", QP_evdw_TOT, QP_ecoul_TOT); - EQ_nonbond_qp[state].Uvdw += QP_evdw_TOT; - EQ_nonbond_qp[state].Ucoul += QP_ecoul_TOT; - } + E_nonbond_qx.Uvdw += QP_evdw_TOT; + E_nonbond_qx.Ucoul += QP_ecoul_TOT; } void init_nonbonded_qp_force_kernel_data() { @@ -328,8 +317,8 @@ void init_nonbonded_qp_force_kernel_data() { int n_blocks_q = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE; int n_blocks_p = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE; // TODO make Evdw & Ecoul work for # of states > 2 - int mem_size_QP_Evdw = min(n_lambdas, 2) * n_blocks_q * n_blocks_p * sizeof(double); - int mem_size_QP_Ecoul = min(n_lambdas, 2) * n_blocks_q * n_blocks_p * sizeof(double); + int mem_size_QP_Evdw = n_blocks_q * n_blocks_p * sizeof(double); + int mem_size_QP_Ecoul = n_blocks_q * n_blocks_p * sizeof(double); int mem_size_QP_MAT = n_qatoms * n_patoms * sizeof(calc_qp_t); check_cudaMalloc((void**)&D_QP_Evdw, mem_size_QP_Evdw); diff --git a/src/core/cuda/src/CudaNonbondedQQForce.cu b/src/core/cuda/src/CudaNonbondedQQForce.cu index e2138ed6..a10c315c 100644 --- a/src/core/cuda/src/CudaNonbondedQQForce.cu +++ b/src/core/cuda/src/CudaNonbondedQQForce.cu @@ -1,6 +1,7 @@ #include "cuda/include/CudaContext.cuh" #include "cuda/include/CudaNonbondedQQForce.cuh" #include "utils.h" +#include __global__ void calc_nonbonded_qq_forces_kernel( q_atom_t* q_atoms, @@ -129,6 +130,9 @@ void calc_nonbonded_qq_forces_host() { cudaDeviceSynchronize(); cudaMemcpy(EQ_nonbond_qq, d_EQ_nonbond_qq, sizeof(E_nonbonded_t) * n_lambdas, cudaMemcpyDeviceToHost); cudaMemcpy(dvelocities, d_dvelocities, sizeof(dvel_t) * n_atoms, cudaMemcpyDeviceToHost); + for (int i = 0; i < n_lambdas; i++) { + printf("QQ Energy State %d: Evdw = %f Ecoul = %f\n", i, EQ_nonbond_qq[i].Uvdw, EQ_nonbond_qq[i].Ucoul); + } } void init_nonbonded_qq_force_kernel_data() {} diff --git a/src/core/cuda/src/CudaNonbondedQWForce.cu b/src/core/cuda/src/CudaNonbondedQWForce.cu index 974c380c..68ac0f21 100644 --- a/src/core/cuda/src/CudaNonbondedQWForce.cu +++ b/src/core/cuda/src/CudaNonbondedQWForce.cu @@ -124,14 +124,13 @@ __global__ void calc_qw_dvel_vector_column(int n_qatoms, int n_waters, dvel_t* D } __device__ void calc_qw_dvel_matrix_incr(int row, int qi, int column, int n_lambdas, int n_qatoms, double crg_ow, double crg_hw, double A_O, double B_O, - coord_t* Qs, coord_t* Ws, double Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE], double Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE], calc_qw_t* qw, - q_catype_t* D_qcatypes, q_atype_t* D_qatypes, q_charge_t* D_qcharges, q_atom_t* D_qatoms, double* D_lambdas, topo_t D_topo) { + coord_t* Qs, coord_t* Ws, double Evdw_S[BLOCK_SIZE][BLOCK_SIZE], double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE], calc_qw_t* qw, + q_catype_t* D_qcatypes, q_charge_t* D_qcharges, q_atom_t* D_qatoms, topo_t D_topo) { int j; coord_t dO, dH1, dH2; double r2O, rH1, rH2, r6O, rO, r2H1, r2H2; double dvO, dvH1, dvH2; double V_a, V_b, VelO, VelH1, VelH2; - q_atype_t qa_type; q_catype_t qi_type; double ai_aii, ai_bii; @@ -160,28 +159,25 @@ __device__ void calc_qw_dvel_matrix_incr(int row, int qi, int column, int n_lamb dvH1 = 0; dvH2 = 0; - for (int state = 0; state < n_lambdas; state++) { - qa_type = D_qatypes[qi + n_qatoms * state]; - qi_type = D_qcatypes[qa_type.code - 1]; + qi_type = D_qcatypes[qi]; - ai_aii = qi_type.Ai; - ai_bii = qi_type.Bi; + ai_aii = qi_type.Ai; + ai_bii = qi_type.Bi; - V_a = ai_aii * A_O / (r6O * r6O); - V_b = ai_bii * B_O / (r6O); + V_a = ai_aii * A_O / (r6O * r6O); + V_b = ai_bii * B_O / (r6O); - VelO = D_topo.coulomb_constant * crg_ow * D_qcharges[qi + n_qatoms * state].q * rO; - VelH1 = D_topo.coulomb_constant * crg_hw * D_qcharges[qi + n_qatoms * state].q * rH1; - VelH2 = D_topo.coulomb_constant * crg_hw * D_qcharges[qi + n_qatoms * state].q * rH2; + VelO = D_topo.coulomb_constant * crg_ow * D_qcharges[qi].q * rO; + VelH1 = D_topo.coulomb_constant * crg_hw * D_qcharges[qi].q * rH1; + VelH2 = D_topo.coulomb_constant * crg_hw * D_qcharges[qi].q * rH2; - dvO += r2O * (-VelO - (12 * V_a - 6 * V_b)) * D_lambdas[state]; - dvH1 -= r2H1 * VelH1 * D_lambdas[state]; - dvH2 -= r2H2 * VelH2 * D_lambdas[state]; + dvO += r2O * (-VelO - (12 * V_a - 6 * V_b)); + dvH1 -= r2H1 * VelH1; + dvH2 -= r2H2 * VelH2; - // Update Q totals - Ecoul_S[row][state * BLOCK_SIZE + column] += (VelO + VelH1 + VelH2); - Evdw_S[row][state * BLOCK_SIZE + column] += (V_a - V_b); - } + // Update Q totals + Ecoul_S[row][column] += (VelO + VelH1 + VelH2); + Evdw_S[row][column] += (V_a - V_b); // Note r6O is not the usual 1/rO^6, but rather rO^6. be careful!!! @@ -204,7 +200,7 @@ __device__ void calc_qw_dvel_matrix_incr(int row, int qi, int column, int n_lamb __global__ void calc_qw_dvel_matrix(int n_qatoms, int n_waters, int n_lambdas, double crg_ow, double crg_hw, double A_O, double B_O, coord_t* X, coord_t* W, double* Evdw, double* Ecoul, calc_qw_t* MAT, - q_catype_t* D_qcatypes, q_atype_t* D_qatypes, q_charge_t* D_qcharges, q_atom_t* D_qatoms, double* D_lambdas, topo_t D_topo) { + q_catype_t* D_qcatypes, q_charge_t* D_qcharges, q_atom_t* D_qatoms, topo_t D_topo) { // Block index int bx = blockIdx.x; int by = blockIdx.y; @@ -214,8 +210,8 @@ __global__ void calc_qw_dvel_matrix(int n_qatoms, int n_waters, int n_lambdas, d int ty = threadIdx.y; // TODO implement >2 states on GPU - __shared__ double Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE]; - __shared__ double Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE]; + __shared__ double Evdw_S[BLOCK_SIZE][BLOCK_SIZE]; + __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE]; Ecoul_S[ty][tx] = 0; Evdw_S[ty][tx] = 0; @@ -250,29 +246,26 @@ __global__ void calc_qw_dvel_matrix(int n_qatoms, int n_waters, int n_lambdas, d int column = bx * BLOCK_SIZE + tx; calc_qw_dvel_matrix_incr(ty, aStart + ty, tx, n_lambdas, n_qatoms, crg_ow, crg_hw, A_O, B_O, Qs, Ws, Evdw_S, Ecoul_S, - &qw, D_qcatypes, D_qatypes, D_qcharges, D_qatoms, D_lambdas, D_topo); + &qw, D_qcatypes, D_qcharges, D_qatoms, D_topo); MAT[column + n_waters * row] = qw; __syncthreads(); int rowlen = (n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE; - int collen = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE; if (tx == 0 && ty == 0) { // TODO implement >2 states on GPU - for (int state = 0; state < min(2, n_lambdas); state++) { - double tot_Evdw = 0; - double tot_Ecoul = 0; - for (int i = 0; i < BLOCK_SIZE; i++) { - for (int j = 0; j < BLOCK_SIZE; j++) { - tot_Evdw += Evdw_S[i][j + state * BLOCK_SIZE]; - tot_Ecoul += Ecoul_S[i][j + state * BLOCK_SIZE]; - } + double tot_Evdw = 0; + double tot_Ecoul = 0; + for (int i = 0; i < BLOCK_SIZE; i++) { + for (int j = 0; j < BLOCK_SIZE; j++) { + tot_Evdw += Evdw_S[i][j + BLOCK_SIZE]; + tot_Ecoul += Ecoul_S[i][j + BLOCK_SIZE]; } - Evdw[rowlen * collen * state + rowlen * by + bx] = tot_Evdw; - Ecoul[rowlen * collen * state + rowlen * by + bx] = tot_Ecoul; } + Evdw[rowlen * by + bx] = tot_Evdw; + Ecoul[rowlen * by + bx] = tot_Ecoul; } __syncthreads(); @@ -282,36 +275,27 @@ __global__ void calc_qw_dvel_matrix(int n_qatoms, int n_waters, int n_lambdas, d void calc_nonbonded_qw_forces_host_v2() { using namespace CudaNonbondedQWForce; - int mem_size_X = n_atoms_solute * sizeof(coord_t); - int mem_size_W = 3 * n_waters * sizeof(coord_t); int mem_size_DV_X = n_atoms_solute * sizeof(dvel_t); int mem_size_DV_W = 3 * n_waters * sizeof(dvel_t); - int mem_size_MAT = 3 * n_waters * n_qatoms * sizeof(calc_qw_t); int n_blocks_q = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE; int n_blocks_w = (n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE; - // TODO make Evdw & Ecoul work for # of states > 2 - int mem_size_QW_Evdw = min(n_lambdas, 2) * n_blocks_q * n_blocks_w * sizeof(double); - int mem_size_QW_Ecoul = min(n_lambdas, 2) * n_blocks_q * n_blocks_w * sizeof(double); CudaContext& ctx = CudaContext::instance(); auto X = ctx.d_coords; auto DV_X = ctx.d_dvelocities; - auto D_qcatypes = ctx.d_q_catypes; - auto D_qatypes = ctx.d_q_atypes; - auto D_qcharges = ctx.d_q_charges; + auto D_qcatypes = ctx.d_q_aggregate_catypes; + auto D_qcharges = ctx.d_q_aggregate_charges; auto D_qatoms = ctx.d_q_atoms; - auto D_lambdas = ctx.d_lambdas; dim3 threads, grid; threads = dim3(BLOCK_SIZE, BLOCK_SIZE); grid = dim3((n_waters + BLOCK_SIZE - 1) / threads.x, (n_qatoms + BLOCK_SIZE - 1) / threads.y); - double evdw, ecoul; calc_qw_dvel_matrix<<>>(n_qatoms, n_waters, n_lambdas, crg_ow, crg_hw, A_O, B_O, X, X + n_atoms_solute, D_QW_Evdw, D_QW_Ecoul, - QW_MAT, D_qcatypes, D_qatypes, D_qcharges, D_qatoms, D_lambdas, topo); + QW_MAT, D_qcatypes, D_qcharges, D_qatoms, topo); calc_qw_dvel_vector_column<<<((n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_qatoms, n_waters, DV_X, DV_X + n_atoms_solute, QW_MAT); calc_qw_dvel_vector_row<<<((n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_qatoms, n_waters, DV_X, DV_X + n_atoms_solute, QW_MAT, D_qatoms); @@ -331,17 +315,15 @@ void calc_nonbonded_qw_forces_host_v2() { cudaMemcpy(dvelocities, DV_X, mem_size_DV_X, cudaMemcpyDeviceToHost); cudaMemcpy(&dvelocities[n_atoms_solute], DV_X + n_atoms_solute, mem_size_DV_W, cudaMemcpyDeviceToHost); - // TODO make Evdw & Ecoul work for # of states > 2 - for (int state = 0; state < min(2, n_lambdas); state++) { - calc_energy_sum<<<1, threads>>>(n_blocks_q, n_blocks_w, D_QW_evdw_TOT, D_QW_ecoul_TOT, &D_QW_Evdw[state * n_blocks_w * n_blocks_q], - &D_QW_Ecoul[state * n_blocks_w * n_blocks_q], false); + calc_energy_sum<<<1, threads>>>(n_blocks_q, n_blocks_w, D_QW_evdw_TOT, D_QW_ecoul_TOT, D_QW_Evdw, + D_QW_Ecoul, false); - cudaMemcpy(&QW_evdw_TOT, D_QW_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost); - cudaMemcpy(&QW_ecoul_TOT, D_QW_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(&QW_evdw_TOT, D_QW_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(&QW_ecoul_TOT, D_QW_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost); - EQ_nonbond_qw[state].Uvdw += QW_evdw_TOT; - EQ_nonbond_qw[state].Ucoul += QW_ecoul_TOT; - } + E_nonbond_qx.Uvdw += QW_evdw_TOT; + E_nonbond_qx.Ucoul += QW_ecoul_TOT; + printf("QW Energy: Evdw = %f Ecoul = %f\n", QW_evdw_TOT, QW_ecoul_TOT); } void init_nonbonded_qw_force_kernel_data() { @@ -358,8 +340,8 @@ void init_nonbonded_qw_force_kernel_data() { int n_blocks_w = (n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE; int mem_size_MAT = 3 * n_waters * n_qatoms * sizeof(calc_qw_t); - int mem_size_QW_Evdw = min(n_lambdas, 2) * n_blocks_q * n_blocks_w * sizeof(double); - int mem_size_QW_Ecoul = min(n_lambdas, 2) * n_blocks_q * n_blocks_w * sizeof(double); + int mem_size_QW_Evdw = n_blocks_q * n_blocks_w * sizeof(double); + int mem_size_QW_Ecoul = n_blocks_q * n_blocks_w * sizeof(double); #ifdef DEBUG printf("Allocating QW_MAT\n"); #endif diff --git a/src/core/system.cu b/src/core/system.cu index afb999cf..ee50ee61 100644 --- a/src/core/system.cu +++ b/src/core/system.cu @@ -1,3 +1,4 @@ +#include #include "system.h" #include "utils.h" #include "parse.h" @@ -1094,7 +1095,6 @@ void calc_integration_step(int iteration) { calc_bonded_forces(); clock_t end_bonded = clock(); - clock_t start_pp, end_pp, start_qp, end_qp; if (run_gpu) { start_qp = clock(); From f6c57c9b2bafba18068aebc6934c90f1315235ac Mon Sep 17 00:00:00 2001 From: "shen.guo" Date: Tue, 13 Jan 2026 09:33:36 +0100 Subject: [PATCH 4/5] fix array idx --- src/core/cuda/src/CudaNonbondedQWForce.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/core/cuda/src/CudaNonbondedQWForce.cu b/src/core/cuda/src/CudaNonbondedQWForce.cu index 68ac0f21..049a38aa 100644 --- a/src/core/cuda/src/CudaNonbondedQWForce.cu +++ b/src/core/cuda/src/CudaNonbondedQWForce.cu @@ -260,8 +260,8 @@ __global__ void calc_qw_dvel_matrix(int n_qatoms, int n_waters, int n_lambdas, d double tot_Ecoul = 0; for (int i = 0; i < BLOCK_SIZE; i++) { for (int j = 0; j < BLOCK_SIZE; j++) { - tot_Evdw += Evdw_S[i][j + BLOCK_SIZE]; - tot_Ecoul += Ecoul_S[i][j + BLOCK_SIZE]; + tot_Evdw += Evdw_S[i][j]; + tot_Ecoul += Ecoul_S[i][j]; } } Evdw[rowlen * by + bx] = tot_Evdw; From c585cac11bd928965ece60fd6de748a0ad087db3 Mon Sep 17 00:00:00 2001 From: "shen.guo" Date: Tue, 13 Jan 2026 09:55:28 +0100 Subject: [PATCH 5/5] revert no need commit --- src/core/cuda/src/CudaNonbondedPPForce.cu | 288 ++++------------------ src/core/parse.cu | 4 - 2 files changed, 47 insertions(+), 245 deletions(-) diff --git a/src/core/cuda/src/CudaNonbondedPPForce.cu b/src/core/cuda/src/CudaNonbondedPPForce.cu index 4b2b67b2..d32db32b 100644 --- a/src/core/cuda/src/CudaNonbondedPPForce.cu +++ b/src/core/cuda/src/CudaNonbondedPPForce.cu @@ -1,11 +1,11 @@ -#include - #include "cuda/include/CudaContext.cuh" #include "cuda/include/CudaNonbondedPPForce.cuh" #include "utils.h" namespace CudaNonbondedPPForce { bool is_initialized = false; +dvel_t* PP_MAT; +double *D_PP_Evdw, *D_PP_Ecoul; double *D_PP_evdw_TOT, *D_PP_ecoul_TOT; __device__ void calc_pp_dvel_matrix_incr(int row, int pi, int column, int pj, @@ -65,229 +65,6 @@ __device__ void calc_pp_dvel_matrix_incr(int row, int pi, int column, int pj, *Evdw += (V_a - V_b); } -__device__ __forceinline__ void idx2xy(int n, int t, int& x, int& y) { - x = (int)floorf((2 * n + 1 - sqrtf((2 * n + 1) * (2 * n + 1) - 8 * t)) * 0.5f); - y = t - (x * n - (x * (x - 1) >> 1)); - if (y < 0) { - x--; - y += (n - x); - } - y += x; -} - -__device__ __forceinline__ double shfl(double v, int srcLane, unsigned mask = 0xffffffffu) { - int2 a = *reinterpret_cast(&v); - a.x = __shfl_sync(mask, a.x, srcLane); - a.y = __shfl_sync(mask, a.y, srcLane); - return *reinterpret_cast(&a); -} - -__device__ __forceinline__ coord_t shfl_coord(coord_t v, int srcLane, unsigned mask = 0xffffffffu) { - v.x = shfl(v.x, srcLane, mask); - v.y = shfl(v.y, srcLane, mask); - v.z = shfl(v.z, srcLane, mask); - return v; -} - -__device__ void calculate_unforce_bound( - // const int x_idx, - // const int y_idx, - - const coord_t& x, - const coord_t& y, - - const double x_charge, - const double y_charge, - - const double x_aii, - const double y_aii, - - const double x_bii, - const double y_bii, - - const double coulomb_constant, - - const double scaling, - - double& evdw, - double& ecoul, - double& dv) { - double3 d = {y.x - x.x, y.y - x.y, y.z - x.z}; - - double r2 = 1.0 / (d.x * d.x + d.y * d.y + d.z * d.z); - double r = sqrt(r2); - double r6 = r2 * r2 * r2; - - ecoul = scaling * coulomb_constant * x_charge * y_charge * r; - - double v_a = r6 * r6 * x_aii * y_aii; - double v_b = r6 * x_bii * y_bii; - evdw = v_a - v_b; - dv = r2 * (-ecoul - 12.0 * v_a + 6.0 * v_b); -} - -__global__ void calc_pp( - const int N, - const int n_atoms_solute, - - const charge_t* D_charges, // charge type of each atom - const ccharge_t* D_ccharges, // charge value of each charge type - - const atype_t* D_atypes, - const catype_t* D_catypes, - - const topo_t D_topo, - - const bool* D_excluded, - - const int* D_LJ_matrix, - - const p_atom_t* D_patoms, - - coord_t* X, - dvel_t* DV_X, - - double* Evdw_TOT, - double* Ecoul_TOT) { - const int nBlocks = (N + 31) >> 5; // ceil(N/32) - const int totalTiles = nBlocks * (nBlocks + 1) / 2; - - const int warpsPerBlock = blockDim.x >> 5; // THREAD_BLOCK_SIZE / 32 - const int tid = threadIdx.x; - const int lane = tid & 31; - const int warpInBlock = tid >> 5; - - const int tile = blockIdx.x * warpsPerBlock + warpInBlock; - if (tile >= totalTiles) { - return; // extra warps in last block - } - - // Map tile -> (x,y) with y>=x (upper triangle) - int tile_x, tile_y; - idx2xy(nBlocks, tile, tile_x, tile_y); - - const int baseX = tile_x << 5; // x*32 - const int baseY = tile_y << 5; // y*32 - - int x_idx = baseX + lane; - int y_idx = baseY + lane; - - int x_atom = (x_idx < N) ? (D_patoms[x_idx].a - 1) : -1; - int y_atom = (y_idx < N) ? (D_patoms[y_idx].a - 1) : -1; - - coord_t invalid = {-1e9, -1e9, -1e9}; - coord_t x_coord = (x_atom >= 0) ? X[x_atom] : invalid; - coord_t y_coord = (y_atom >= 0) ? X[y_atom] : invalid; - - bool x_excluded = (x_atom >= 0) ? D_excluded[x_atom] : true; - bool y_excluded = (y_atom >= 0) ? D_excluded[y_atom] : true; - - double x_charge = (x_atom >= 0) ? D_ccharges[D_charges[x_atom].code - 1].charge : 0.0; - double y_charge = (y_atom >= 0) ? D_ccharges[D_charges[y_atom].code - 1].charge : 0.0; - - catype_t x_type = (x_atom >= 0) ? D_catypes[D_atypes[x_atom].code - 1] : catype_t{}; - catype_t y_type = (y_atom >= 0) ? D_catypes[D_atypes[y_atom].code - 1] : catype_t{}; - double x_aii_normal = x_type.aii_normal; - double y_aii_normal = y_type.aii_normal; - double x_aii_14 = x_type.aii_1_4; - double y_aii_14 = y_type.aii_1_4; - double x_bii_normal = x_type.bii_normal; - double y_bii_normal = y_type.bii_normal; - double x_bii_14 = x_type.bii_1_4; - double y_bii_14 = y_type.bii_1_4; - - double3 x_force = {0.0, 0.0, 0.0}; - double3 y_force = {0.0, 0.0, 0.0}; - - double evdw_sum = 0; - double ecoul_sum = 0; - - const unsigned mask = 0xffffffffu; - - auto is_valid = [&]() -> bool { - if (x_idx >= N || y_idx >= N) { - return false; - } - if (tile_x == tile_y && x_idx <= y_idx) { - return false; - } - if (x_excluded || y_excluded) { - return false; - } - bool bond23 = (D_LJ_matrix[x_atom * n_atoms_solute + y_atom] == 3); - if (bond23) { - return false; - } - - return true; - }; - - // shuffle to get next y - auto do_shuffle = [&]() { - const int src = (lane + 1) & 31; - y_idx = __shfl_sync(mask, y_idx, src); - y_atom = __shfl_sync(mask, y_atom, src); - y_coord = shfl_coord(y_coord, src, mask); - y_excluded = __shfl_sync(mask, y_excluded, src); - y_charge = __shfl_sync(mask, y_charge, src); - y_aii_normal = __shfl_sync(mask, y_aii_normal, src); - y_aii_14 = __shfl_sync(mask, y_aii_14, src); - y_bii_normal = __shfl_sync(mask, y_bii_normal, src); - y_bii_14 = __shfl_sync(mask, y_bii_14, src); - - // shuffle y_force - y_force.x = shfl(y_force.x, src, mask); - y_force.y = shfl(y_force.y, src, mask); - y_force.z = shfl(y_force.z, src, mask); - }; - - for (int i = 0; i < 32; i++) { - if (is_valid()) { - bool bond14 = (D_LJ_matrix[y_atom * n_atoms_solute + x_atom] == 1); - double scaling = bond14 ? D_topo.el14_scale : 1.0; - double ai_aii = bond14 ? x_aii_14 : x_aii_normal; - double aj_aii = bond14 ? y_aii_14 : y_aii_normal; - double ai_bii = bond14 ? x_bii_14 : x_bii_normal; - double aj_bii = bond14 ? y_bii_14 : y_bii_normal; - double evdw = 0.0, ecoul = 0.0, dv = 0.0; - calculate_unforce_bound( - x_coord, y_coord, x_charge, y_charge, ai_aii, aj_aii, ai_bii, aj_bii, - D_topo.coulomb_constant, scaling, evdw, ecoul, dv); - evdw_sum += evdw; - ecoul_sum += ecoul; - double3 d = {y_coord.x - x_coord.x, y_coord.y - x_coord.y, y_coord.z - x_coord.z}; - x_force.x -= dv * d.x; - x_force.y -= dv * d.y; - x_force.z -= dv * d.z; - - y_force.x += dv * d.x; - y_force.y += dv * d.y; - y_force.z += dv * d.z; - } - do_shuffle(); - } - - if (x_atom >= 0) { - atomicAdd(&DV_X[x_atom].x, x_force.x); - atomicAdd(&DV_X[x_atom].y, x_force.y); - atomicAdd(&DV_X[x_atom].z, x_force.z); - } - if (y_atom >= 0) { - atomicAdd(&DV_X[y_atom].x, y_force.x); - atomicAdd(&DV_X[y_atom].y, y_force.y); - atomicAdd(&DV_X[y_atom].z, y_force.z); - } - - for (int offset = 16; offset > 0; offset >>= 1) { - evdw_sum += __shfl_down_sync(0xffffffffu, evdw_sum, offset); - ecoul_sum += __shfl_down_sync(0xffffffffu, ecoul_sum, offset); - } - if (lane == 0) { - atomicAdd(Evdw_TOT, evdw_sum); - atomicAdd(Ecoul_TOT, ecoul_sum); - } -} - __global__ void calc_pp_dvel_matrix(int n_patoms, int n_atoms_solute, coord_t* X, double* Evdw, double* Ecoul, dvel_t* PP_MAT, ccharge_t* D_ccharges, charge_t* D_charges, catype_t* D_catypes, atype_t* D_atypes, p_atom_t* D_patoms, int* D_LJ_matrix, bool* D_excluded, topo_t D_topo) { @@ -444,6 +221,8 @@ void calc_nonbonded_pp_forces_host_v2() { int mem_size_DV_X = n_atoms_solute * sizeof(dvel_t); + int n_blocks = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE; + CudaContext& ctx = CudaContext::instance(); auto X = ctx.d_coords; auto DV_X = ctx.d_dvelocities; @@ -455,27 +234,32 @@ void calc_nonbonded_pp_forces_host_v2() { auto D_LJ_matrix = ctx.d_LJ_matrix; auto D_excluded = ctx.d_excluded; - cudaMemset(D_PP_evdw_TOT, 0, sizeof(double)); - cudaMemset(D_PP_ecoul_TOT, 0, sizeof(double)); + dim3 threads, grid; - const int thread_num = 128; - dim3 block_sz = dim3(thread_num); + threads = dim3(BLOCK_SIZE, BLOCK_SIZE); + grid = dim3((n_patoms + BLOCK_SIZE - 1) / threads.x, (n_patoms + BLOCK_SIZE - 1) / threads.y); - int tile_num_per_block = thread_num / 32; - int nBlocks = (n_patoms + 31) / 32; - int total_tiles = nBlocks * (nBlocks + 1) / 2; + calc_pp_dvel_matrix<<>>(n_patoms, n_atoms_solute, X, D_PP_Evdw, D_PP_Ecoul, PP_MAT, D_ccharges, D_charges, D_catypes, D_atypes, D_patoms, D_LJ_matrix, D_excluded, topo); + calc_pp_dvel_vector<<<((n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_patoms, DV_X, PP_MAT, D_patoms); - int grid_sz = (total_tiles + tile_num_per_block - 1) / tile_num_per_block; - dim3 grid = dim3(grid_sz); + cudaMemcpy(dvelocities, DV_X, mem_size_DV_X, cudaMemcpyDeviceToHost); - calc_pp<<>>(n_patoms, n_atoms_solute, D_charges, D_ccharges, - D_atypes, D_catypes, topo, D_excluded, - D_LJ_matrix, D_patoms, X, DV_X, D_PP_evdw_TOT, - D_PP_ecoul_TOT); +#ifdef DEBUG + cudaMemcpy(h_PP_MAT, PP_MAT, mem_size_PP_MAT, cudaMemcpyDeviceToHost); +#endif - cudaDeviceSynchronize(); - cudaMemcpy(dvelocities, DV_X, mem_size_DV_X, cudaMemcpyDeviceToHost); +#ifdef DEBUG + for (int i = 0; i < n_patoms; i++) { + for (int j = 0; j < n_patoms; j++) { + if (h_PP_MAT[i * n_patoms + j].x > 100) + printf("PP_MAT[%d][%d] = %f %f %f\n", i, j, h_PP_MAT[i * n_patoms + j].x, h_PP_MAT[i * n_patoms + j].y, h_PP_MAT[i * n_patoms + j].z); + } + } +#endif + cudaMemset(D_PP_evdw_TOT, 0, sizeof(double)); + cudaMemset(D_PP_ecoul_TOT, 0, sizeof(double)); + calc_energy_sum<<<1, threads>>>(n_blocks, n_blocks, D_PP_evdw_TOT, D_PP_ecoul_TOT, D_PP_Evdw, D_PP_Ecoul, true); double PP_evdw_TOT, PP_ecoul_TOT; cudaMemcpy(&PP_evdw_TOT, D_PP_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost); cudaMemcpy(&PP_ecoul_TOT, D_PP_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost); @@ -488,6 +272,25 @@ void init_nonbonded_pp_force_kernel_data() { using namespace CudaNonbondedPPForce; if (!is_initialized) { + int n_blocks = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE; + + int mem_size_PP_MAT = n_patoms * n_patoms * sizeof(dvel_t); + + int mem_size_PP_Evdw = n_blocks * n_blocks * sizeof(double); + int mem_size_PP_Ecoul = n_blocks * n_blocks * sizeof(double); +#ifdef DEBUG + printf("Allocating PP_MAT\n"); +#endif + check_cudaMalloc((void**)&PP_MAT, mem_size_PP_MAT); +#ifdef DEBUG + printf("Allocating D_PP_Evdw\n"); +#endif + check_cudaMalloc((void**)&D_PP_Evdw, mem_size_PP_Evdw); +#ifdef DEBUG + printf("Allocating D_PP_Ecoul\n"); +#endif + check_cudaMalloc((void**)&D_PP_Ecoul, mem_size_PP_Ecoul); + check_cudaMalloc((void**)&D_PP_evdw_TOT, sizeof(double)); check_cudaMalloc((void**)&D_PP_ecoul_TOT, sizeof(double)); @@ -498,8 +301,11 @@ void init_nonbonded_pp_force_kernel_data() { void cleanup_nonbonded_pp_force() { using namespace CudaNonbondedPPForce; if (is_initialized) { + cudaFree(PP_MAT); + cudaFree(D_PP_Evdw); + cudaFree(D_PP_Ecoul); cudaFree(D_PP_evdw_TOT); cudaFree(D_PP_ecoul_TOT); is_initialized = false; } -} +} \ No newline at end of file diff --git a/src/core/parse.cu b/src/core/parse.cu index c5518223..1b9e0f8d 100644 --- a/src/core/parse.cu +++ b/src/core/parse.cu @@ -753,7 +753,6 @@ void init_ngbrs14(const char* filename) { int jx = (lineI + i + 1) % lines; // if (ix < 100 && jx < 100) printf("i = %d j = %d\n", ix+1, jx+1); LJ_matrix[ix * n_atoms_solute + jx] = 1; - LJ_matrix[jx * n_atoms_solute + ix] = 1; } } lineI++; @@ -798,7 +797,6 @@ void init_ngbrs23(const char* filename) { int jx = (lineI + i + 1) % lines; // if (ix < 100 && jx < 100) printf("i = %d j = %d\n", ix+1, jx+1); LJ_matrix[ix * n_atoms_solute + jx] = 3; - LJ_matrix[jx * n_atoms_solute + ix] = 3; } } lineI++; @@ -821,7 +819,6 @@ void init_ngbrs14_long(const char* filename) { int ix = atoi(file.buffer[i+1][0])-1; int jx = atoi(file.buffer[i+1][1])-1; LJ_matrix[ix * n_atoms_solute + jx] = 1; - LJ_matrix[jx * n_atoms_solute + ix] = 1; } clean_csv(file); @@ -841,7 +838,6 @@ void init_ngbrs23_long(const char* filename) { int ix = atoi(file.buffer[i+1][0])-1; int jx = atoi(file.buffer[i+1][1])-1; LJ_matrix[ix * n_atoms_solute + jx] = 3; - LJ_matrix[jx * n_atoms_solute + ix] = 3; } clean_csv(file);