diff --git a/src/core/cuda/include/CudaContext.cuh b/src/core/cuda/include/CudaContext.cuh
index 3c2883fa..8ed0369d 100644
--- a/src/core/cuda/include/CudaContext.cuh
+++ b/src/core/cuda/include/CudaContext.cuh
@@ -2,6 +2,10 @@
 #include
 
+#include <array>
+#include <map>
+#include <vector>
+
 #include "system.h"
 #include "utils.h"
 
@@ -105,6 +109,27 @@ class CudaContext {
    charge_t* d_charges;
    p_atom_t* d_p_atoms;
 
+   /*
+   Other helper arrays
+   */
+   ccharge_t* d_charge_table_all;  // Device copy of h_charge_table_all
+   catype_t* d_catype_table_all;   // Device copy of h_catype_table_all
+   std::map<std::array<double, 6>, int> catype_to_type_host;
+   int* d_p_charge_types;
+   int* d_w_charge_types;
+   int* d_q_charge_types;  // [0, lambdas * n_qatoms) is the normal q_charge_type; [lambdas * n_qatoms, ...) is the lambda-scaled q_charge_type
+
+   int* d_p_catype_types;
+   int* d_w_catype_types;
+   int* d_q_catype_types;  // [0, lambdas * n_qatoms) is the normal q_catype_type; [lambdas * n_qatoms, ...) is the lambda-scaled q_catype_type
+
+   int *d_p_atoms_list;
+   int *d_w_atoms_list;
+   int *d_q_atoms_list;
+   std::vector<int> h_p_atoms_list;
+   std::vector<int> h_w_atoms_list;
+   std::vector<int> h_q_atoms_list;
+
    static CudaContext& instance() {
        static CudaContext ctx;
        return ctx;
@@ -121,6 +146,16 @@ class CudaContext {
    void sync_all_to_host();
    void reset_energies();
 
+   std::array<double, 6> get_catype_key(const catype_t& catype) {
+       return {
+           catype.aii_normal,
+           catype.bii_normal,
+           catype.aii_polar,
+           catype.bii_polar,
+           catype.aii_1_4,
+           catype.bii_1_4};
+   }
+
   private:
    CudaContext() = default;
 
@@ -129,6 +164,10 @@ class CudaContext {
    ~CudaContext() { free(); }
    CudaContext(const CudaContext&) = delete;
    CudaContext& operator=(const CudaContext&) = delete;
+
+   void initialize_charge_tables_host();
+   void initialize_catype_tables_host();
+   void initialize_atom_lists_host();
 };
 
 template <typename T>
 void CudaContext::sync_array_to_device(T* dst, const T* src, int count) {
diff --git a/src/core/cuda/include/CudaNonbondedForce.cuh b/src/core/cuda/include/CudaNonbondedForce.cuh
new file mode 100644
index 00000000..891820fa
--- /dev/null
+++ b/src/core/cuda/include/CudaNonbondedForce.cuh
@@ -0,0 +1,23 @@
+#pragma once
+#include <utility>
+#include "system.h"
+
+void init_nonbonded_force_kernel_data();
+
+std::pair<double, double> calc_nonbonded_force_host(
+    int nx,
+    int ny,
+    int* x_idx_list,
+    int* y_idx_list,
+    bool symmetric,
+
+    const int* x_charges_types,
+    const int* y_charges_types,
+    const ccharge_t* ccharges_table,
+
+    const int* x_atypes_types,
+    const int* y_atypes_types,
+    const catype_t* catypes_table
+);
+
+void cleanup_nonbonded_force();
\ No newline at end of file
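Note on CudaContext.cuh above: get_catype_key packs the six Lennard-Jones coefficients of a catype_t into a std::array<double, 6> so that catype_to_type_host can deduplicate parameter sets using std::map's lexicographic array ordering. The lookup relies on exact double equality, which works here only because every producer (including the lambda-scaled entries) computes the key values with identical arithmetic. A minimal standalone sketch of the same pattern; the LJParams struct and the literal values are illustrative stand-ins, not the real topology types:

```cpp
#include <array>
#include <cstdio>
#include <map>
#include <vector>

struct LJParams {  // stand-in for catype_t's six tabulated coefficients
    double aii_normal, bii_normal, aii_polar, bii_polar, aii_1_4, bii_1_4;
};

using Key = std::array<double, 6>;

static Key key_of(const LJParams& p) {
    return {p.aii_normal, p.bii_normal, p.aii_polar, p.bii_polar, p.aii_1_4, p.bii_1_4};
}

int main() {
    std::map<Key, int> seen;      // parameter set -> compact type index
    std::vector<LJParams> table;  // dense table, indexed by that type
    for (const LJParams& p : {LJParams{1, 2, 3, 4, 5, 6},
                              LJParams{1, 2, 3, 4, 5, 6},  // exact duplicate collapses
                              LJParams{2, 2, 3, 4, 5, 6}}) {
        if (seen.emplace(key_of(p), (int)table.size()).second) {
            table.push_back(p);  // first sighting: append and remember its index
        }
    }
    printf("unique types: %zu\n", table.size());  // prints 2
    return 0;
}
```

Hashing or quantizing the key would also work, but exact keys are the simplest correct choice when all entries come from the same deterministic computation.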
diff --git a/src/core/cuda/src/CudaContext.cu b/src/core/cuda/src/CudaContext.cu
index 19f22165..183172bc 100644
--- a/src/core/cuda/src/CudaContext.cu
+++ b/src/core/cuda/src/CudaContext.cu
@@ -21,7 +21,6 @@ void CudaContext::init() {
     check_cudaMalloc((void**)&d_atypes, sizeof(atype_t) * n_atypes);
     check_cudaMalloc((void**)&d_catypes, sizeof(catype_t) * n_catypes);
-
     check_cudaMalloc((void**)&d_q_atoms, sizeof(q_atom_t) * n_qatoms);
     check_cudaMalloc((void**)&d_q_charges, sizeof(q_charge_t) * n_qatoms * n_lambdas);
 
     check_cudaMalloc((void**)&d_LJ_matrix, sizeof(int) * n_atoms_solute * n_atoms_solute);
@@ -54,6 +53,10 @@ void CudaContext::init() {
     check_cudaMalloc((void**)&d_charges, sizeof(charge_t) * n_charges);
     check_cudaMalloc((void**)&d_p_atoms, sizeof(p_atom_t) * n_patoms);
 
+    initialize_atom_lists_host();
+    initialize_charge_tables_host();
+    initialize_catype_tables_host();
+
     sync_all_to_device();
 }
 
@@ -208,6 +211,23 @@ void CudaContext::free() {
    cudaFree(d_ccharges);
    cudaFree(d_charges);
    cudaFree(d_p_atoms);
+   cudaFree(d_charge_table_all);
+   cudaFree(d_catype_table_all);
+   cudaFree(d_p_charge_types);
+   cudaFree(d_w_charge_types);
+   cudaFree(d_q_charge_types);
+   cudaFree(d_p_catype_types);
+   cudaFree(d_w_catype_types);
+   cudaFree(d_q_catype_types);
+
+   d_charge_table_all = nullptr;
+   d_catype_table_all = nullptr;
+   d_p_charge_types = nullptr;
+   d_w_charge_types = nullptr;
+   d_q_charge_types = nullptr;
+   d_p_catype_types = nullptr;
+   d_w_catype_types = nullptr;
+   d_q_catype_types = nullptr;
 }
 
 void CudaContext::reset_energies() {
@@ -215,3 +235,227 @@ void CudaContext::reset_energies() {
    cudaMemset(d_EQ_nonbond_qq, 0, sizeof(E_nonbonded_t) * n_lambdas);
    cudaMemset(d_EQ_restraint, 0, sizeof(E_restraint_t) * n_lambdas);
 }
+
+void CudaContext::initialize_atom_lists_host() {
+    h_p_atoms_list.clear();
+    for (int i = 0; i < n_patoms; i++) {
+        int id = p_atoms[i].a - 1;
+        if (!excluded[id]) {
+            h_p_atoms_list.push_back(id);
+        }
+    }
+
+    h_w_atoms_list.clear();
+    for (int i = n_atoms_solute; i < n_atoms; i++) {
+        int id = i;
+        if (!excluded[id]) {
+            h_w_atoms_list.push_back(id);
+        }
+    }
+    printf("Number of water atoms: %d, number of all water atoms: %d\n", (int)h_w_atoms_list.size(), n_atoms - n_atoms_solute);
+
+    h_q_atoms_list.clear();
+    for (int i = 0; i < n_qatoms; i++) {
+        int id = q_atoms[i].a - 1;
+        if (!excluded[id]) {
+            h_q_atoms_list.push_back(id);
+        }
+    }
+
+    check_cudaMalloc((void**)&d_p_atoms_list, sizeof(int) * h_p_atoms_list.size());
+    cudaMemcpy(d_p_atoms_list, h_p_atoms_list.data(), sizeof(int) * h_p_atoms_list.size(), cudaMemcpyHostToDevice);
+    check_cudaMalloc((void**)&d_w_atoms_list, sizeof(int) * h_w_atoms_list.size());
+    cudaMemcpy(d_w_atoms_list, h_w_atoms_list.data(), sizeof(int) * h_w_atoms_list.size(), cudaMemcpyHostToDevice);
+    check_cudaMalloc((void**)&d_q_atoms_list, sizeof(int) * h_q_atoms_list.size());
+    cudaMemcpy(d_q_atoms_list, h_q_atoms_list.data(), sizeof(int) * h_q_atoms_list.size(), cudaMemcpyHostToDevice);
+}
+
+void CudaContext::initialize_charge_tables_host() {
+    std::map<double, int> charge_to_type_host;
+    std::vector<ccharge_t> h_charge_table_all;  // h_charge_table_all[charge type] = (code, charge)
+
+    auto add_charge = [&](double charge) {
+        if (charge_to_type_host.count(charge) == 0) {
+            int sz = h_charge_table_all.size();
+            ccharge_t new_ccharge;
+            new_ccharge.code = sz;
+            new_ccharge.charge = charge;
+            h_charge_table_all.push_back(new_ccharge);
+            charge_to_type_host[charge] = sz;
+        }
+    };
+
+    for (int i = 0; i < n_ccharges; i++) {
+        double charge = ccharges[i].charge;
+        add_charge(charge);
+    }
+    for (int state = 0; state < n_lambdas; state++) {
+        for (int i = 0; i < n_qatoms; i++) {
+            double charge = q_charges[i + n_qatoms * state].q;
+            add_charge(charge);
+            charge *= lambdas[state];
+            add_charge(charge);
+        }
+    }
+
+    std::vector<int> p_charge_types(h_p_atoms_list.size());
+    for (int i = 0; i < (int)h_p_atoms_list.size(); i++) {
+        int id = h_p_atoms_list[i];
+        double charge = ccharges[charges[id].code - 1].charge;
+        p_charge_types[i] = charge_to_type_host[charge];
+    }
+
+    int h_q_atoms_size = h_q_atoms_list.size();
+    std::vector<int> q_charge_types(h_q_atoms_size * n_lambdas * 2);
+    std::map<int, int> q_idx;
+    for (int i = 0; i < n_qatoms; i++) {
+        int id = q_atoms[i].a - 1;
+        if (!excluded[id]) {
+            q_idx[id] = i;
+        }
+    }
+
+    for (int state = 0; state < n_lambdas; state++) {
+        for (int i = 0; i < h_q_atoms_size; i++) {
+            int id = h_q_atoms_list[i];
+            double charge = q_charges[q_idx[id] + n_qatoms * state].q;
+            q_charge_types[state * h_q_atoms_size + i] = charge_to_type_host[charge];
+            charge *= lambdas[state];
+            q_charge_types[state * h_q_atoms_size + i + h_q_atoms_size * n_lambdas] = charge_to_type_host[charge];
+        }
+    }
+
+    int h_w_atoms_size = h_w_atoms_list.size();
+    std::vector<int> w_charge_types(h_w_atoms_size);
+    for (int i = 0; i < h_w_atoms_size; i++) {
+        int id = h_w_atoms_list[i];
+        double charge = ccharges[charges[id].code - 1].charge;
+        w_charge_types[i] = charge_to_type_host[charge];
+    }
+
+    check_cudaMalloc((void**)&d_charge_table_all, sizeof(ccharge_t) * h_charge_table_all.size());
+    cudaMemcpy(d_charge_table_all, h_charge_table_all.data(), sizeof(ccharge_t) * h_charge_table_all.size(), cudaMemcpyHostToDevice);
+    check_cudaMalloc((void**)&d_p_charge_types, sizeof(int) * p_charge_types.size());
+    cudaMemcpy(d_p_charge_types, p_charge_types.data(), sizeof(int) * p_charge_types.size(), cudaMemcpyHostToDevice);
+    check_cudaMalloc((void**)&d_q_charge_types, sizeof(int) * q_charge_types.size());
+    cudaMemcpy(d_q_charge_types, q_charge_types.data(), sizeof(int) * q_charge_types.size(), cudaMemcpyHostToDevice);
+    check_cudaMalloc((void**)&d_w_charge_types, sizeof(int) * w_charge_types.size());
+    cudaMemcpy(d_w_charge_types, w_charge_types.data(), sizeof(int) * w_charge_types.size(), cudaMemcpyHostToDevice);
+}
+
+void CudaContext::initialize_catype_tables_host() {
+    std::vector<catype_t> h_catype_table_all;  // h_catype_table_all[catype code] = catype_t
+    catype_to_type_host.clear();
+
+    auto add_catype = [&](catype_t catype) {
+        auto key = get_catype_key(catype);
+        if (catype_to_type_host.count(key) == 0) {
+            int sz = h_catype_table_all.size();
+            catype.code = sz;
+            h_catype_table_all.push_back(catype);
+            catype_to_type_host[key] = sz;
+        }
+    };
+
+    for (int i = 0; i < n_catypes; i++) {
+        add_catype(catypes[i]);
+    }
+
+    for (int state = 0; state < n_lambdas; state++) {
+        for (int i = 0; i < n_qatoms; i++) {
+            const q_atype_t& qat = q_atypes[i + n_qatoms * state];
+            const q_catype_t& catype = q_catypes[qat.code - 1];
+
+            catype_t new_catype;
+            new_catype.m = catype.m;
+            new_catype.aii_normal = catype.Ai;
+            new_catype.bii_normal = catype.Bi;
+            new_catype.aii_polar = catype.Ci;
+            new_catype.bii_polar = catype.ai;
+            new_catype.aii_1_4 = catype.Ai_14;
+            new_catype.bii_1_4 = catype.Bi_14;
+
+            add_catype(new_catype);
+
+            new_catype.m *= lambdas[state];
+            new_catype.aii_normal *= lambdas[state];
+            new_catype.bii_normal *= lambdas[state];
+            new_catype.aii_polar *= lambdas[state];
+            new_catype.bii_polar *= lambdas[state];
+            new_catype.aii_1_4 *= lambdas[state];
+            new_catype.bii_1_4 *= lambdas[state];
+            add_catype(new_catype);
+        }
+    }
+
+    std::vector<int> p_catype_types(h_p_atoms_list.size());
+    for (int i = 0; i < (int)h_p_atoms_list.size(); i++) {
+        int id = h_p_atoms_list[i];
+        auto catype = catypes[atypes[id].code - 1];
+        auto key = get_catype_key(catype);
+        p_catype_types[i] = catype_to_type_host[key];
+    }
+
+    int h_q_atoms_size = h_q_atoms_list.size();
+    std::vector<int> q_catype_types(h_q_atoms_size * n_lambdas * 2);
+    std::map<int, int> q_idx;
+    for (int i = 0; i < n_qatoms; i++) {
+        int id = q_atoms[i].a - 1;
+        if (!excluded[id]) {
+            q_idx[id] = i;
+        }
+    }
+
+    for (int state = 0; state < n_lambdas; state++) {
+        for (int i = 0; i < h_q_atoms_size; i++) {
+            int id = h_q_atoms_list[i];
+            const q_atype_t& qat = q_atypes[q_idx[id] + n_qatoms * state];
+            const q_catype_t& catype = q_catypes[qat.code - 1];
+
+            catype_t normal_catype;
+            normal_catype.m = catype.m;
+            normal_catype.aii_normal = catype.Ai;
+            normal_catype.bii_normal = catype.Bi;
+            normal_catype.aii_polar = catype.Ci;
+            normal_catype.bii_polar = catype.ai;
+            normal_catype.aii_1_4 = catype.Ai_14;
+            normal_catype.bii_1_4 = catype.Bi_14;
+
+            auto key_normal = get_catype_key(normal_catype);
+            q_catype_types[state * h_q_atoms_size + i] = catype_to_type_host[key_normal];
+
+            catype_t scaled_catype;
+            scaled_catype.m = catype.m * lambdas[state];
+            scaled_catype.aii_normal = catype.Ai * lambdas[state];
+            scaled_catype.bii_normal = catype.Bi * lambdas[state];
+            scaled_catype.aii_polar = catype.Ci * lambdas[state];
+            scaled_catype.bii_polar = catype.ai * lambdas[state];
+            scaled_catype.aii_1_4 = catype.Ai_14 * lambdas[state];
+            scaled_catype.bii_1_4 = catype.Bi_14 * lambdas[state];
+
+            auto key_scaled = get_catype_key(scaled_catype);
+            q_catype_types[state * h_q_atoms_size + i + h_q_atoms_size * n_lambdas] = catype_to_type_host[key_scaled];
+        }
+    }
+
+    int h_w_atoms_size = h_w_atoms_list.size();
+    std::vector<int> w_catype_types(h_w_atoms_size);
+    for (int i = 0; i < h_w_atoms_size; i++) {
+        int id = h_w_atoms_list[i];
+        auto catype = catypes[atypes[id].code - 1];
+        auto key = get_catype_key(catype);
+        w_catype_types[i] = catype_to_type_host[key];
+        // printf("Water atom %d: catype code %d mapped to %d, data: m=%f, aii_normal=%f, bii_normal=%f, aii_polar=%f, bii_polar=%f, aii_1_4=%f, bii_1_4=%f\n", id, atypes[id].code, w_catype_types[i], catype.m, catype.aii_normal, catype.bii_normal, catype.aii_polar, catype.bii_polar, catype.aii_1_4, catype.bii_1_4);
+    }
+    printf("Total water atom number: %d, w_catype_types size: %zu\n", h_w_atoms_size, w_catype_types.size());
+
+    check_cudaMalloc((void**)&d_catype_table_all, sizeof(catype_t) * h_catype_table_all.size());
+    cudaMemcpy(d_catype_table_all, h_catype_table_all.data(), sizeof(catype_t) * h_catype_table_all.size(), cudaMemcpyHostToDevice);
+    check_cudaMalloc((void**)&d_p_catype_types, sizeof(int) * p_catype_types.size());
+    cudaMemcpy(d_p_catype_types, p_catype_types.data(), sizeof(int) * p_catype_types.size(), cudaMemcpyHostToDevice);
+    check_cudaMalloc((void**)&d_q_catype_types, sizeof(int) * q_catype_types.size());
+    cudaMemcpy(d_q_catype_types, q_catype_types.data(), sizeof(int) * q_catype_types.size(), cudaMemcpyHostToDevice);
+    check_cudaMalloc((void**)&d_w_catype_types, sizeof(int) * w_catype_types.size());
+    cudaMemcpy(d_w_catype_types, w_catype_types.data(), sizeof(int) * w_catype_types.size(), cudaMemcpyHostToDevice);
+}
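Note on the q-type tables built above: each per-q-atom type array carries two consecutive regions, unscaled entries in [0, n_lambdas * nq) and lambda-premultiplied entries in [n_lambdas * nq, 2 * n_lambdas * nq), both grouped by state. Because the scaled region bakes lambdas[state] into the q-side parameters exactly once, an energy computed from it comes back multiplied by lambda once, which is presumably why the QP path below divides the returned energies by lambdas[state]. A small sketch of the indexing convention this implies; the helper name is hypothetical, not part of the patch:

```cpp
// Index into the two-region q-type layout (hypothetical helper for illustration).
// nq = number of non-excluded q-atoms, s = FEP state, i = atom slot within state s.
inline int q_type_index(int nq, int n_lambdas, int s, int i, bool lambda_scaled) {
    int idx = s * nq + i;                        // position inside one region
    return lambda_scaled ? n_lambdas * nq + idx  // second region: lambda-scaled
                         : idx;                  // first region: unscaled
}

// The QP launch in CudaNonbondedQPForce.cu offsets the device pointer by
// q_type_index(nq, n_lambdas, state, 0, true), i.e.
// d_q_charge_types + n_lambdas * nq + state * nq.
```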
diff --git a/src/core/cuda/src/CudaNonbondedForce.cu b/src/core/cuda/src/CudaNonbondedForce.cu
new file mode 100644
index 00000000..fa03ad9d
--- /dev/null
+++ b/src/core/cuda/src/CudaNonbondedForce.cu
@@ -0,0 +1,371 @@
+#include <utility>
+
+#include "cuda/include/CudaContext.cuh"
+#include "cuda/include/CudaNonbondedForce.cuh"
+#include "system.h"
+
+namespace CudaNonbondedForce {
+bool is_initialized = false;
+double *d_evdw_total, *d_ecoul_total;
+
+__device__ __forceinline__ void idx2xy(int n, int t, int& x, int& y) {
+    x = (int)floorf((2 * n + 1 - sqrtf((2 * n + 1) * (2 * n + 1) - 8 * t)) * 0.5f);
+    y = t - (x * n - (x * (x - 1) >> 1));
+    if (y < 0) {
+        x--;
+        y += (n - x);
+    }
+    y += x;
+}
+
+__device__ __forceinline__ double shfl(double v, int srcLane, unsigned mask = 0xffffffffu) {
+    int2 a = *reinterpret_cast<int2*>(&v);
+    a.x = __shfl_sync(mask, a.x, srcLane);
+    a.y = __shfl_sync(mask, a.y, srcLane);
+    return *reinterpret_cast<double*>(&a);
+}
+
+__device__ __forceinline__ coord_t shfl_coord(coord_t v, int srcLane, unsigned mask = 0xffffffffu) {
+    v.x = shfl(v.x, srcLane, mask);
+    v.y = shfl(v.y, srcLane, mask);
+    v.z = shfl(v.z, srcLane, mask);
+    return v;
+}
+
+__device__ __forceinline__ catype_t shfl_catype(catype_t v, int srcLane, unsigned mask = 0xffffffffu) {
+    v.code = __shfl_sync(mask, v.code, srcLane);
+    v.m = shfl(v.m, srcLane, mask);
+    v.aii_normal = shfl(v.aii_normal, srcLane, mask);
+    v.bii_normal = shfl(v.bii_normal, srcLane, mask);
+    v.aii_polar = shfl(v.aii_polar, srcLane, mask);
+    v.bii_polar = shfl(v.bii_polar, srcLane, mask);
+    v.aii_1_4 = shfl(v.aii_1_4, srcLane, mask);
+    v.bii_1_4 = shfl(v.bii_1_4, srcLane, mask);
+    return v;
+}
+
+__device__ void calculate_unforce_bound(
+    // const int x_idx,
+    // const int y_idx,
+
+    const coord_t& x,
+    const coord_t& y,
+
+    const double x_charge,
+    const double y_charge,
+
+    const double x_aii,
+    const double y_aii,
+
+    const double x_bii,
+    const double y_bii,
+
+    const double coulomb_constant,
+
+    const double scaling,
+
+    double& evdw,
+    double& ecoul,
+    double& dv) {
+    double3 d = {x.x - y.x, x.y - y.y, x.z - y.z};
+    double r = rsqrt(d.x * d.x + d.y * d.y + d.z * d.z);
+    double r2 = r * r;
+    double r6 = r2 * r2 * r2;
+    // double v_a = r6 * r6;
+    // double v_b = r6;
+    // ecoul = r;
+    // evdw = v_a - v_b;
+    // dv = r2 * (-ecoul - v_a + v_b);
+
+    ecoul = scaling * coulomb_constant * x_charge * y_charge * r;
+
+    double v_a = r6 * r6 * x_aii * y_aii;
+    double v_b = r6 * x_bii * y_bii;
+    evdw = v_a - v_b;
+    dv = r2 * (-ecoul - 12.0 * v_a + 6.0 * v_b);
+}
+
+__global__ void calc_nonbonded_force_kernel(
+    const int nx,
+    const int ny,
+
+    const int* x_charges_types,
+    const int* y_charges_types,
+    const ccharge_t* ccharges_table,
+
+    const int* x_atypes_types,
+    const int* y_atypes_types,
+    const catype_t* catypes_table,
+
+    const topo_t d_topo,
+
+    const bool* d_excluded,
+
+    const int* d_LJ_matrix,
+
+    const int* x_idx_list,
+    const int* y_idx_list,
+
+    const coord_t* d_coords,
+
+    dvel_t* d_dvelocities,
+
+    double* evdw_tot,
+    double* ecoul_tot,
+
+    bool symmetric,
+
+    // helper variables
+    const int n_atoms_solute
+
+) {
+    const int x_block_num = (nx + 31) >> 5;
+    const int y_block_num = (ny + 31) >> 5;
+
+    const int total_tiles = symmetric ? (x_block_num * (x_block_num + 1)) >> 1 : x_block_num * y_block_num;
+
+    const int warps_per_block = blockDim.x >> 5;
+    const int tid = threadIdx.x;
+    const int lane = tid & 31;
+    const int warp_in_block = tid >> 5;
+
+    const int tile = blockIdx.x * warps_per_block + warp_in_block;
+    if (tile >= total_tiles) return;
+
+    int tile_x, tile_y;
+    if (symmetric) {
+        idx2xy(x_block_num, tile, tile_x, tile_y);
+    } else {
+        tile_x = tile % x_block_num;
+        tile_y = tile / x_block_num;
+    }
+
+    const int base_x = tile_x << 5;
+    const int base_y = tile_y << 5;
+
+    int x_idx = base_x + lane;
+    int y_idx = base_y + lane;
+
+    int x_atom_idx = (x_idx < nx) ? x_idx_list[x_idx] : -1;
+    int y_atom_idx = (y_idx < ny) ? y_idx_list[y_idx] : -1;
+
+    coord_t invalid = {-1e9, -1e9, -1e9};
+    coord_t x_coord = (x_atom_idx >= 0) ? d_coords[x_atom_idx] : invalid;
+    coord_t y_coord = (y_atom_idx >= 0) ? d_coords[y_atom_idx] : invalid;
+
+    bool x_excluded = (x_atom_idx >= 0) ? d_excluded[x_atom_idx] : true;
+    bool y_excluded = (y_atom_idx >= 0) ? d_excluded[y_atom_idx] : true;
+
+    int x_charge_type_idx = (x_idx < nx) ? x_charges_types[x_idx] : -1;
+    int y_charge_type_idx = (y_idx < ny) ? y_charges_types[y_idx] : -1;
+    double x_charge = (x_atom_idx >= 0 && x_charge_type_idx >= 0) ? ccharges_table[x_charge_type_idx].charge : 0.0;
+    double y_charge = (y_atom_idx >= 0 && y_charge_type_idx >= 0) ?
ccharges_table[y_charge_type_idx].charge : 0.0; + + int x_catype_type_idx = (x_idx < nx) ? x_atypes_types[x_idx] : -1; + int y_catype_type_idx = (y_idx < ny) ? y_atypes_types[y_idx] : -1; + catype_t x_type = (x_atom_idx >= 0 && x_catype_type_idx >= 0) ? catypes_table[x_catype_type_idx] : catype_t{}; + catype_t y_type = (y_atom_idx >= 0 && y_catype_type_idx >= 0) ? catypes_table[y_catype_type_idx] : catype_t{}; + + + double3 x_force = {0.0, 0.0, 0.0}; + double3 y_force = {0.0, 0.0, 0.0}; + + double evdw_sum = 0.0; + double ecoul_sum = 0.0; + + const unsigned mask = 0xffffffffu; + + auto lj_code = [&](int ix, int iy) -> int { + if (ix < 0 || iy < 0 || ix >= n_atoms_solute || iy >= n_atoms_solute) return 0; + return d_LJ_matrix[ix * n_atoms_solute + iy]; + }; + + auto is_valid = [&]() -> bool { + if (x_idx >= nx || y_idx >= ny) return false; + + if (symmetric && tile_x == tile_y && x_idx <= y_idx) return false; + + if (x_excluded || y_excluded) return false; + + // Skip interactions within the same rigid water molecule + if (x_atom_idx >= n_atoms_solute && y_atom_idx >= n_atoms_solute) { + int wx = (x_atom_idx - n_atoms_solute) / 3; + int wy = (y_atom_idx - n_atoms_solute) / 3; + if (wx == wy) return false; + } + + bool bond23 = lj_code(x_atom_idx, y_atom_idx) == 3; + if (bond23) return false; + + return true; + }; + + auto do_shuffle = [&]() { + const int src = (lane + 1) & 31; + y_idx = __shfl_sync(mask, y_idx, src); + y_atom_idx = __shfl_sync(mask, y_atom_idx, src); + y_coord = shfl_coord(y_coord, src, mask); + y_excluded = __shfl_sync(mask, y_excluded, src); + y_charge = __shfl_sync(mask, y_charge, src); + y_type = shfl_catype(y_type, src, mask); + + y_force.x = shfl(y_force.x, src, mask); + y_force.y = shfl(y_force.y, src, mask); + y_force.z = shfl(y_force.z, src, mask); + }; + + for (int i = 0; i < 32; i++) { + if (is_valid()) { + bool bond14 = lj_code(x_atom_idx, y_atom_idx) == 1; + double scaling = bond14 ? d_topo.el14_scale : 1.0; + double ai_aii = bond14 ? x_type.aii_1_4 : x_type.aii_normal; + double bi_bii = bond14 ? x_type.bii_1_4 : x_type.bii_normal; + double aj_aii = bond14 ? y_type.aii_1_4 : y_type.aii_normal; + double bj_bii = bond14 ? 
y_type.bii_1_4 : y_type.bii_normal;
+
+            // Match legacy water-water behavior: LJ only between oxygens
+            if (x_atom_idx >= n_atoms_solute && y_atom_idx >= n_atoms_solute) {
+                bool x_is_O = ((x_atom_idx - n_atoms_solute) % 3) == 0;
+                bool y_is_O = ((y_atom_idx - n_atoms_solute) % 3) == 0;
+                if (!(x_is_O && y_is_O)) {
+                    ai_aii = bi_bii = aj_aii = bj_bii = 0.0;
+                }
+            }
+
+            double evdw = 0, ecoul = 0, dv = 0;
+
+            calculate_unforce_bound(
+                x_coord,
+                y_coord,
+                x_charge,
+                y_charge,
+                ai_aii,
+                aj_aii,
+                bi_bii,
+                bj_bii,
+                d_topo.coulomb_constant,
+                scaling,
+                evdw,
+                ecoul,
+                dv);
+
+            evdw_sum += evdw;
+            ecoul_sum += ecoul;
+
+            double3 d = {x_coord.x - y_coord.x, x_coord.y - y_coord.y, x_coord.z - y_coord.z};
+            y_force.x -= dv * d.x;
+            y_force.y -= dv * d.y;
+            y_force.z -= dv * d.z;
+
+            x_force.x += dv * d.x;
+            x_force.y += dv * d.y;
+            x_force.z += dv * d.z;
+        }
+        do_shuffle();
+    }
+
+    if (x_atom_idx >= 0) {
+        atomicAdd(&d_dvelocities[x_atom_idx].x, x_force.x);
+        atomicAdd(&d_dvelocities[x_atom_idx].y, x_force.y);
+        atomicAdd(&d_dvelocities[x_atom_idx].z, x_force.z);
+    }
+
+    if (y_atom_idx >= 0) {
+        atomicAdd(&d_dvelocities[y_atom_idx].x, y_force.x);
+        atomicAdd(&d_dvelocities[y_atom_idx].y, y_force.y);
+        atomicAdd(&d_dvelocities[y_atom_idx].z, y_force.z);
+    }
+
+    for (int offset = 16; offset > 0; offset >>= 1) {
+        evdw_sum += __shfl_down_sync(mask, evdw_sum, offset);
+        ecoul_sum += __shfl_down_sync(mask, ecoul_sum, offset);
+    }
+    if (lane == 0) {
+        atomicAdd(evdw_tot, evdw_sum);
+        atomicAdd(ecoul_tot, ecoul_sum);
+    }
+}
+
+}  // namespace CudaNonbondedForce
+
+std::pair<double, double> calc_nonbonded_force_host(
+    int nx,
+    int ny,
+    int* x_idx_list,
+    int* y_idx_list,
+    bool symmetric,
+
+    const int* x_charges_types,
+    const int* y_charges_types,
+    const ccharge_t* ccharges_table,
+
+    const int* x_atypes_types,
+    const int* y_atypes_types,
+    const catype_t* catypes_table
+) {
+    using namespace CudaNonbondedForce;
+    CudaContext& context = CudaContext::instance();
+    const int thread_num = 256;
+    dim3 block_sz = dim3(thread_num);
+
+    int tile_num_per_block = thread_num >> 5;
+
+    int x_block_num = (nx + 31) >> 5;
+    int y_block_num = (ny + 31) >> 5;
+
+    int total_tiles = symmetric ?
(x_block_num * (x_block_num + 1)) >> 1 : x_block_num * y_block_num;
+    int grid_sz = (total_tiles + tile_num_per_block - 1) / tile_num_per_block;
+
+    dim3 grid = dim3(grid_sz);
+
+    cudaMemset(d_ecoul_total, 0, sizeof(double));
+    cudaMemset(d_evdw_total, 0, sizeof(double));
+
+    calc_nonbonded_force_kernel<<<grid, block_sz>>>(
+        nx,
+        ny,
+        x_charges_types,
+        y_charges_types,
+        ccharges_table,
+        x_atypes_types,
+        y_atypes_types,
+        catypes_table,
+        topo,
+        context.d_excluded,
+        context.d_LJ_matrix,
+        x_idx_list,
+        y_idx_list,
+        context.d_coords,
+        context.d_dvelocities,
+        d_evdw_total,
+        d_ecoul_total,
+        symmetric,
+        n_atoms_solute);
+
+    cudaDeviceSynchronize();
+
+    double evdw_tot = 0, ecoul_tot = 0;
+    cudaMemcpy(&evdw_tot, d_evdw_total, sizeof(double), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&ecoul_tot, d_ecoul_total, sizeof(double), cudaMemcpyDeviceToHost);
+
+    return {evdw_tot, ecoul_tot};
+}
+
+void init_nonbonded_force_kernel_data() {
+    using namespace CudaNonbondedForce;
+    if (!is_initialized) {
+        check_cudaMalloc((void**)&d_evdw_total, sizeof(double));
+        check_cudaMalloc((void**)&d_ecoul_total, sizeof(double));
+        is_initialized = true;
+    }
+}
+
+void cleanup_nonbonded_force() {
+    using namespace CudaNonbondedForce;
+    if (is_initialized) {
+        cudaFree(d_evdw_total);
+        cudaFree(d_ecoul_total);
+        is_initialized = false;
+    }
+}
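Note on calc_nonbonded_force_kernel above: in the symmetric case the upper triangle of the tile grid (diagonal included) is flattened into total_tiles = n * (n + 1) / 2 indices, and each warp inverts its flat index with idx2xy. A quick host-side sanity check of that inversion, using a plain C++ transcription of the device function (double precision instead of the device's floorf/sqrtf, so this validates the layout rather than the exact float rounding):

```cpp
#include <cmath>
#include <cstdio>
#include <set>
#include <utility>

// Host transcription of the device-side idx2xy, for verification only.
static void idx2xy_host(int n, int t, int& x, int& y) {
    x = (int)std::floor((2 * n + 1 - std::sqrt((double)(2 * n + 1) * (2 * n + 1) - 8.0 * t)) * 0.5);
    y = t - (x * n - (x * (x - 1) >> 1));
    if (y < 0) {  // guard against rounding overshoot in the sqrt
        x--;
        y += (n - x);
    }
    y += x;
}

int main() {
    const int n = 64;  // tile grid side, i.e. 64 blocks of 32 atoms each
    std::set<std::pair<int, int>> seen;
    for (int t = 0; t < n * (n + 1) / 2; t++) {
        int x, y;
        idx2xy_host(n, t, x, y);
        if (x < 0 || x > y || y >= n || !seen.insert({x, y}).second) {
            printf("bad mapping at t=%d -> (%d,%d)\n", t, x, y);
            return 1;
        }
    }
    printf("all %d tiles mapped exactly once, upper triangle only\n", n * (n + 1) / 2);
    return 0;
}
```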
diff --git a/src/core/cuda/src/CudaNonbondedPPForce.cu b/src/core/cuda/src/CudaNonbondedPPForce.cu
index d32db32b..792abba2 100644
--- a/src/core/cuda/src/CudaNonbondedPPForce.cu
+++ b/src/core/cuda/src/CudaNonbondedPPForce.cu
@@ -1,311 +1,48 @@
+#include <cstdio>
+#include <utility>
+
 #include "cuda/include/CudaContext.cuh"
+#include "cuda/include/CudaNonbondedForce.cuh"
 #include "cuda/include/CudaNonbondedPPForce.cuh"
 #include "utils.h"
 
 namespace CudaNonbondedPPForce {
 bool is_initialized = false;
-dvel_t* PP_MAT;
-double *D_PP_Evdw, *D_PP_Ecoul;
-double *D_PP_evdw_TOT, *D_PP_ecoul_TOT;
-
-__device__ void calc_pp_dvel_matrix_incr(int row, int pi, int column, int pj,
-                                         coord_t* Xs, coord_t* Ys, int* LJs, bool* excluded_s, double* Evdw, double* Ecoul, dvel_t* patom_a, dvel_t* patom_b,
-                                         ccharge_t* D_ccharges, charge_t* D_charges, catype_t* D_catypes, atype_t* D_atypes, p_atom_t* D_patoms, topo_t D_topo) {
-    bool bond14, bond23;
-    double scaling;
-    coord_t da;
-    double r2a, ra, r6a;
-    double Vela, V_a, V_b;
-    double dva;
-    double crg_i, crg_j;
-    double ai_aii, aj_aii, ai_bii, aj_bii;
-    catype_t ai_type, aj_type;
-
-    bond23 = LJs[row * BLOCK_SIZE + column] == 3;
-    bond14 = LJs[row * BLOCK_SIZE + column] == 1;
-
-    if (bond23) return;
-    if (excluded_s[row] || excluded_s[BLOCK_SIZE + column]) return;
-
-    scaling = bond14 ? D_topo.el14_scale : 1;
-
-    crg_i = D_ccharges[D_charges[pi].code - 1].charge;
-    crg_j = D_ccharges[D_charges[pj].code - 1].charge;
-
-    ai_type = D_catypes[D_atypes[pi].code - 1];
-    aj_type = D_catypes[D_atypes[pj].code - 1];
-
-    da.x = Ys[column].x - Xs[row].x;
-    da.y = Ys[column].y - Xs[row].y;
-    da.z = Ys[column].z - Xs[row].z;
-    r2a = 1 / (pow(da.x, 2) + pow(da.y, 2) + pow(da.z, 2));
-    ra = sqrt(r2a);
-    r6a = r2a * r2a * r2a;
-
-    Vela = scaling * D_topo.coulomb_constant * crg_i * crg_j * ra;
-
-    ai_aii = bond14 ? ai_type.aii_1_4 : ai_type.aii_normal;
-    aj_aii = bond14 ? aj_type.aii_1_4 : aj_type.aii_normal;
-    ai_bii = bond14 ? ai_type.bii_1_4 : ai_type.bii_normal;
-    aj_bii = bond14 ? aj_type.bii_1_4 : aj_type.bii_normal;
-
-    V_a = r6a * r6a * ai_aii * aj_aii;
-    V_b = r6a * ai_bii * aj_bii;
-    dva = r2a * (-Vela - 12 * V_a + 6 * V_b);
-
-    patom_a->x = -dva * da.x;
-    patom_a->y = -dva * da.y;
-    patom_a->z = -dva * da.z;
-
-    patom_b->x = dva * da.x;
-    patom_b->y = dva * da.y;
-    patom_b->z = dva * da.z;
-
-    *Ecoul += Vela;
-    *Evdw += (V_a - V_b);
-}
-
-__global__ void calc_pp_dvel_matrix(int n_patoms, int n_atoms_solute,
-                                    coord_t* X, double* Evdw, double* Ecoul, dvel_t* PP_MAT,
-                                    ccharge_t* D_ccharges, charge_t* D_charges, catype_t* D_catypes, atype_t* D_atypes, p_atom_t* D_patoms, int* D_LJ_matrix, bool* D_excluded, topo_t D_topo) {
-    // Block index
-    int bx = blockIdx.x;
-    int by = blockIdx.y;
-
-    // Thread index
-    int tx = threadIdx.x;
-    int ty = threadIdx.y;
-
-    int aStart = BLOCK_SIZE * by;
-    int bStart = BLOCK_SIZE * bx;
-
-    __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE];
-    __shared__ double Evdw_S[BLOCK_SIZE][BLOCK_SIZE];
-
-    Ecoul_S[ty][tx] = 0;
-    Evdw_S[ty][tx] = 0;
-
-    if (aStart + ty >= n_patoms) return;
-    if (bStart + tx >= n_patoms) return;
-
-    __shared__ coord_t Xs[BLOCK_SIZE];
-    __shared__ coord_t Ys[BLOCK_SIZE];
-    __shared__ int LJs[BLOCK_SIZE * BLOCK_SIZE];
-    __shared__ bool excluded_s[2 * BLOCK_SIZE];
-
-    int pi = D_patoms[aStart + ty].a - 1;
-    int pj = D_patoms[bStart + tx].a - 1;
-
-    if (tx == 0) {
-        Xs[ty] = X[pi];
-        excluded_s[ty] = D_excluded[pi];
-    }
-
-    if (ty == 0) {
-        Ys[tx] = X[pj];
-        excluded_s[BLOCK_SIZE + tx] = D_excluded[pj];
-    }
-    LJs[ty * BLOCK_SIZE + tx] = D_LJ_matrix[pi * n_atoms_solute + pj];
-
-    __syncthreads();
-
-    if (bx < by || (bx == by && tx < ty)) return;
-
-    dvel_t patom_a, patom_b;
-    memset(&patom_a, 0, sizeof(dvel_t));
-    memset(&patom_b, 0, sizeof(dvel_t));
-
-    int row = by * BLOCK_SIZE + ty;
-    int column = bx * BLOCK_SIZE + tx;
-
-    if (bx != by || tx != ty) {
-        double evdw = 0, ecoul = 0;
-        calc_pp_dvel_matrix_incr(ty, pi, tx, pj, Xs, Ys, LJs, excluded_s, &evdw, &ecoul, &patom_a,
-                                 &patom_b, D_ccharges, D_charges, D_catypes, D_atypes, D_patoms, D_topo);
-        Evdw_S[ty][tx] = evdw;
-        Ecoul_S[ty][tx] = ecoul;
-    }
-
-    PP_MAT[row * n_patoms + column] = patom_a;
-    PP_MAT[column * n_patoms + row] = patom_b;
-
-    __syncthreads();
-
-    int rowlen = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
-
-    if (tx == 0 && ty == 0) {
-        double tot_Evdw = 0;
-        double tot_Ecoul = 0;
-        for (int i = 0; i < BLOCK_SIZE; i++) {
-            for (int j = 0; j < BLOCK_SIZE; j++) {
-                tot_Evdw += Evdw_S[i][j];
-                tot_Ecoul += Ecoul_S[i][j];
-            }
-        }
-        Evdw[rowlen * by + bx] = tot_Evdw;
-        Ecoul[rowlen * by + bx] = tot_Ecoul;
-
-        // printf("bx = %d by = %d tot_Evdw = %f tot_Ecoul = %f\n", bx, by, Evdw[rowlen * by + bx], Ecoul[rowlen * by + bx]);
-    }
-
-    __syncthreads();
-}
-
-__global__ void calc_pp_dvel_vector(int n_patoms, dvel_t* DV_X, dvel_t* PP_MAT, p_atom_t* D_patoms) {
-    int row = blockIdx.x * blockDim.x + threadIdx.x;
-    if (row >= n_patoms) return;
-
-    dvel_t dP;
-    dP.x = 0;
-    dP.y = 0;
-    dP.z = 0;
-
-    for (int i = 0; i < n_patoms; i++) {
-        if (i != row) {
-            dP.x += PP_MAT[i + n_patoms * row].x;
-            dP.y += PP_MAT[i + n_patoms * row].y;
-            dP.z += PP_MAT[i + n_patoms * row].z;
-        }
-    }
-
-    int p = D_patoms[row].a - 1;
-
-    DV_X[p].x += dP.x;
-    DV_X[p].y += dP.y;
-    DV_X[p].z += dP.z;
-
-    __syncthreads();
-}
-
-__global__ void calc_energy_sum(int rows, int columns, double* Evdw_TOT, double* Ecoul_TOT, double* Evdw, double* Ecoul, bool upper_diagonal) {
-    int tx = threadIdx.x;
-    int ty = threadIdx.y;
-
-    __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE];
-    __shared__ double Evdw_S[BLOCK_SIZE][BLOCK_SIZE];
-
-    // TODO: better way to distribute upper diagonal over threads? Seems like threads in left bottom corner have less work
-    double coul_TOT = 0;
-    double vdw_TOT = 0;
-    for (int i = ty; i < rows; i += BLOCK_SIZE) {
-        for (int j = tx; j < columns; j += BLOCK_SIZE) {
-            if (i <= j || !upper_diagonal) {
-                coul_TOT += Ecoul[i * columns + j];
-                vdw_TOT += Evdw[i * columns + j];
-            }
-        }
-    }
-    Ecoul_S[ty][tx] = coul_TOT;
-    Evdw_S[ty][tx] = vdw_TOT;
-
-    __syncthreads();
-
-    if (tx == 0 && ty == 0) {
-        double Evdw_temp = 0;
-        double Ecoul_temp = 0;
-
-        for (int i = 0; i < BLOCK_SIZE; i++) {
-            for (int j = 0; j < BLOCK_SIZE; j++) {
-                Evdw_temp += Evdw_S[i][j];
-                Ecoul_temp += Ecoul_S[i][j];
-            }
-        }
-
-        *Evdw_TOT = Evdw_temp;
-        *Ecoul_TOT = Ecoul_temp;
-    }
-}
 }  // namespace CudaNonbondedPPForce
 
-void calc_nonbonded_pp_forces_host_v2() {
-    using namespace CudaNonbondedPPForce;
-
-    int mem_size_DV_X = n_atoms_solute * sizeof(dvel_t);
-
-    int n_blocks = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
-
-    CudaContext& ctx = CudaContext::instance();
-    auto X = ctx.d_coords;
-    auto DV_X = ctx.d_dvelocities;
-    auto D_ccharges = ctx.d_ccharges;
-    auto D_charges = ctx.d_charges;
-    auto D_catypes = ctx.d_catypes;
-    auto D_atypes = ctx.d_atypes;
-    auto D_patoms = ctx.d_p_atoms;
-    auto D_LJ_matrix = ctx.d_LJ_matrix;
-    auto D_excluded = ctx.d_excluded;
-
-    dim3 threads, grid;
-
-    threads = dim3(BLOCK_SIZE, BLOCK_SIZE);
-    grid = dim3((n_patoms + BLOCK_SIZE - 1) / threads.x, (n_patoms + BLOCK_SIZE - 1) / threads.y);
-
-    calc_pp_dvel_matrix<<<grid, threads>>>(n_patoms, n_atoms_solute, X, D_PP_Evdw, D_PP_Ecoul, PP_MAT, D_ccharges, D_charges, D_catypes, D_atypes, D_patoms, D_LJ_matrix, D_excluded, topo);
-    calc_pp_dvel_vector<<<((n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_patoms, DV_X, PP_MAT, D_patoms);
-
-    cudaMemcpy(dvelocities, DV_X, mem_size_DV_X, cudaMemcpyDeviceToHost);
-
-#ifdef DEBUG
-    cudaMemcpy(h_PP_MAT, PP_MAT, mem_size_PP_MAT, cudaMemcpyDeviceToHost);
-#endif
-
-#ifdef DEBUG
-    for (int i = 0; i < n_patoms; i++) {
-        for (int j = 0; j < n_patoms; j++) {
-            if (h_PP_MAT[i * n_patoms + j].x > 100)
-                printf("PP_MAT[%d][%d] = %f %f %f\n", i, j, h_PP_MAT[i * n_patoms + j].x, h_PP_MAT[i * n_patoms + j].y, h_PP_MAT[i * n_patoms + j].z);
-        }
-    }
-#endif
-    cudaMemset(D_PP_evdw_TOT, 0, sizeof(double));
-    cudaMemset(D_PP_ecoul_TOT, 0, sizeof(double));
-
-    calc_energy_sum<<<1, threads>>>(n_blocks, n_blocks, D_PP_evdw_TOT, D_PP_ecoul_TOT, D_PP_Evdw, D_PP_Ecoul, true);
-    double PP_evdw_TOT, PP_ecoul_TOT;
-    cudaMemcpy(&PP_evdw_TOT, D_PP_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost);
-    cudaMemcpy(&PP_ecoul_TOT, D_PP_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost);
-
-    E_nonbond_pp.Uvdw += PP_evdw_TOT;
-    E_nonbond_pp.Ucoul += PP_ecoul_TOT;
-}
 
 void init_nonbonded_pp_force_kernel_data() {
     using namespace CudaNonbondedPPForce;
     if (!is_initialized) {
-        int n_blocks = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
-
-        int mem_size_PP_MAT = n_patoms * n_patoms * sizeof(dvel_t);
-
-        int mem_size_PP_Evdw = n_blocks * n_blocks * sizeof(double);
-        int mem_size_PP_Ecoul = n_blocks * n_blocks * sizeof(double);
-#ifdef DEBUG
-        printf("Allocating PP_MAT\n");
-#endif
-        check_cudaMalloc((void**)&PP_MAT, mem_size_PP_MAT);
-#ifdef DEBUG
-        printf("Allocating D_PP_Evdw\n");
-#endif
-        check_cudaMalloc((void**)&D_PP_Evdw, mem_size_PP_Evdw);
-#ifdef DEBUG
-        printf("Allocating D_PP_Ecoul\n");
-#endif
-        check_cudaMalloc((void**)&D_PP_Ecoul, mem_size_PP_Ecoul);
-
-        check_cudaMalloc((void**)&D_PP_evdw_TOT, sizeof(double));
-        check_cudaMalloc((void**)&D_PP_ecoul_TOT, sizeof(double));
-
         is_initialized = true;
     }
 }
 
+void calc_nonbonded_pp_forces_host_v2() {
+    int nx = CudaContext::instance().h_p_atoms_list.size();
+    int ny = nx;
+
+    auto result = calc_nonbonded_force_host(
+        nx,
+        ny,
+        CudaContext::instance().d_p_atoms_list,
+        CudaContext::instance().d_p_atoms_list,
+        true,  // symmetric
+        CudaContext::instance().d_p_charge_types,
+        CudaContext::instance().d_p_charge_types,
+        CudaContext::instance().d_charge_table_all,
+        CudaContext::instance().d_p_catype_types,
+        CudaContext::instance().d_p_catype_types,
+        CudaContext::instance().d_catype_table_all);
+    printf("Nonbonded PP Force: Uvdw = %f, Ucoul = %f\n", result.first, result.second);
+    E_nonbond_pp.Uvdw += result.first;
+    E_nonbond_pp.Ucoul += result.second;
+}
+
 void cleanup_nonbonded_pp_force() {
     using namespace CudaNonbondedPPForce;
+
     if (is_initialized) {
-        cudaFree(PP_MAT);
-        cudaFree(D_PP_Evdw);
-        cudaFree(D_PP_Ecoul);
-        cudaFree(D_PP_evdw_TOT);
-        cudaFree(D_PP_ecoul_TOT);
         is_initialized = false;
     }
-}
\ No newline at end of file
+}
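Note on the rewritten PP path above: the O(n_patoms^2) PP_MAT scratch matrix is gone. Each warp owns one 32x32 tile, rotates the y-side atoms through registers via do_shuffle(), and the diagonal-tile filter in is_valid() keeps only x_idx > y_idx. A CPU sketch that replays that enumeration and checks every unordered pair is visited exactly once; this is an illustrative test of the tile walk described above, not code from the patch:

```cpp
#include <cstdio>
#include <vector>

int main() {
    const int n = 100;  // stand-in for h_p_atoms_list.size()
    std::vector<int> hits(n * n, 0);
    int nb = (n + 31) / 32;
    for (int tx = 0; tx < nb; tx++)
        for (int ty = tx; ty < nb; ty++)  // upper-triangle tiles, diagonal included
            for (int lane = 0; lane < 32; lane++)
                for (int rot = 0; rot < 32; rot++) {
                    int x = tx * 32 + lane;                 // fixed x-side atom of this lane
                    int y = ty * 32 + ((lane + rot) & 31);  // y-side atom after rot shuffles
                    if (x >= n || y >= n) continue;         // padding lanes
                    if (tx == ty && x <= y) continue;       // diagonal-tile filter
                    hits[x * n + y]++;
                }
    long visited = 0;
    for (int h : hits) visited += h;
    printf("visited %ld ordered pairs, expected %d\n", visited, n * (n - 1) / 2);
    return 0;
}
```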
diff --git a/src/core/cuda/src/CudaNonbondedPWForce.cu b/src/core/cuda/src/CudaNonbondedPWForce.cu
index dec352db..e5e0a02f 100644
--- a/src/core/cuda/src/CudaNonbondedPWForce.cu
+++ b/src/core/cuda/src/CudaNonbondedPWForce.cu
@@ -1,367 +1,40 @@
+#include <cstdio>
+#include <utility>
+
 #include "cuda/include/CudaContext.cuh"
+#include "cuda/include/CudaNonbondedForce.cuh"
 #include "cuda/include/CudaNonbondedPWForce.cuh"
 
 namespace CudaNonbondedPWForce {
 // Declare any necessary static variables or device pointers here
 bool is_initialized = false;
-struct calc_pw_t {
-    dvel_t P;
-    dvel_t W;
-};
-
-calc_pw_t *PW_MAT, *h_PW_MAT;
-double *D_PW_Evdw, *D_PW_Ecoul;
-double *D_PW_evdw_TOT, *D_PW_ecoul_TOT, PW_evdw_TOT, PW_ecoul_TOT;
-
-__global__ void calc_energy_sum(int rows, int columns, double* Evdw_TOT, double* Ecoul_TOT, double* Evdw, double* Ecoul, bool upper_diagonal) {
-    int tx = threadIdx.x;
-    int ty = threadIdx.y;
-
-    __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE];
-    __shared__ double Evdw_S[BLOCK_SIZE][BLOCK_SIZE];
-
-    // TODO: better way to distribute upper diagonal over threads? Seems like threads in left bottom corner have less work
-    double coul_TOT = 0;
-    double vdw_TOT = 0;
-    for (int i = ty; i < rows; i += BLOCK_SIZE) {
-        for (int j = tx; j < columns; j += BLOCK_SIZE) {
-            if (i <= j || !upper_diagonal) {
-                coul_TOT += Ecoul[i * columns + j];
-                vdw_TOT += Evdw[i * columns + j];
-            }
-        }
-    }
-    Ecoul_S[ty][tx] = coul_TOT;
-    Evdw_S[ty][tx] = vdw_TOT;
-
-    __syncthreads();
-
-    if (tx == 0 && ty == 0) {
-        double Evdw_temp = 0;
-        double Ecoul_temp = 0;
-
-        for (int i = 0; i < BLOCK_SIZE; i++) {
-            for (int j = 0; j < BLOCK_SIZE; j++) {
-                Evdw_temp += Evdw_S[i][j];
-                Ecoul_temp += Ecoul_S[i][j];
-            }
-        }
-
-        *Evdw_TOT = Evdw_temp;
-        *Ecoul_TOT = Ecoul_temp;
-    }
-}
-
-__device__ void calc_pw_dvel_matrix_incr(int row, int pi, int column, int j, int n_atoms_solute,
-                                         coord_t* Xs, coord_t* Ws, bool* excluded_s, double* Evdw, double* Ecoul, calc_pw_t* pw,
-                                         ccharge_t* D_ccharges, charge_t* D_charges, catype_t* D_catypes, atype_t* D_atypes, p_atom_t* D_patoms, topo_t D_topo) {
-    coord_t da;
-    double r2a, ra, r6a;
-    double Vela, V_a, V_b;
-    double dva;
-    double qi, qj;
-    double ai_aii, aj_aii, ai_bii, aj_bii;
-    catype_t ai_type, aj_type;
-    atype_t i_type, j_type;
-
-    if (excluded_s[row]) return;
-    qi = D_ccharges[D_charges[pi].code - 1].charge;
-    qj = D_ccharges[D_charges[n_atoms_solute + j].code - 1].charge;  // TODO: FIX THIS!!! WILL NOT WORK WITH QATOMS!!!!!
-
-    // if (pi < 100 && j < 100){
-    //     printf("qi = %f qj = %f\n", qi, qj);
-    // }
-
-    ai_type = D_catypes[D_atypes[pi].code - 1];
-    aj_type = D_catypes[D_atypes[n_atoms_solute + j].code - 1];
-
-    da.x = Ws[column].x - Xs[row].x;
-    da.y = Ws[column].y - Xs[row].y;
-    da.z = Ws[column].z - Xs[row].z;
-    r2a = 1 / (pow(da.x, 2) + pow(da.y, 2) + pow(da.z, 2));
-    ra = sqrt(r2a);
-    r6a = r2a * r2a * r2a;
-
-    Vela = D_topo.coulomb_constant * qi * qj * ra;
-
-    ai_aii = ai_type.aii_normal;
-    aj_aii = aj_type.aii_normal;
-    ai_bii = ai_type.bii_normal;
-    aj_bii = aj_type.bii_normal;
-
-    V_a = r6a * r6a * ai_aii * aj_aii;
-    V_b = r6a * ai_bii * aj_bii;
-    dva = r2a * (-Vela - 12 * V_a + 6 * V_b);
-
-    pw->P.x -= dva * da.x;
-    pw->P.y -= dva * da.y;
-    pw->P.z -= dva * da.z;
-
-    pw->W.x += dva * da.x;
-    pw->W.y += dva * da.y;
-    pw->W.z += dva * da.z;
-
-    *Ecoul += Vela;
-    *Evdw += (V_a - V_b);
-
-    // if (pi == 522 && j == 175) {
-    //     printf("Vela = %f V_a = %f V_b = %f P = %f %f %f ai_aii = %f aj_aii = %f\n", Vela, V_a, V_b, pw->P.x, pw->P.y, pw->P.z, ai_aii, aj_aii);
-    // }
-
-    // if (pi < 100 && j < 100) printf("Evdw = %f Ecoul = %f\n", *Evdw, *Ecoul);
-}
-
-__global__ void calc_pw_dvel_vector_row(int n_patoms, int n_waters, dvel_t* DV_X, dvel_t* DV_W, calc_pw_t* PW_MAT, p_atom_t* D_patoms) {
-    int row = blockIdx.x * blockDim.x + threadIdx.x;
-    if (row >= n_patoms) return;
-
-    dvel_t dP;
-
-    dP.x = 0;
-    dP.y = 0;
-    dP.z = 0;
-
-    for (int i = 0; i < 3 * n_waters; i++) {
-        dP.x += PW_MAT[i + 3 * n_waters * row].P.x;
-        dP.y += PW_MAT[i + 3 * n_waters * row].P.y;
-        dP.z += PW_MAT[i + 3 * n_waters * row].P.z;
-    }
-
-    int p = D_patoms[row].a - 1;
-
-    DV_X[p].x += dP.x;
-    DV_X[p].y += dP.y;
-    DV_X[p].z += dP.z;
-
-    __syncthreads();
-}
-
-__global__ void calc_pw_dvel_vector_column(int n_patoms, int n_waters, dvel_t* DV_X, dvel_t* DV_W, calc_pw_t* PW_MAT) {
-    int column = blockIdx.x * blockDim.x + threadIdx.x;
-    if (column >= 3 * n_waters) return;
-
-    dvel_t dW;
-
-    dW.x = 0;
-    dW.y = 0;
-    dW.z = 0;
-
-    for (int i = 0; i < n_patoms; i++) {
-        dW.x += PW_MAT[column + 3 * n_waters * i].W.x;
-        dW.y += PW_MAT[column + 3 * n_waters * i].W.y;
-        dW.z += PW_MAT[column + 3 * n_waters * i].W.z;
-    }
-
-    DV_W[column].x += dW.x;
-    DV_W[column].y += dW.y;
-    DV_W[column].z += dW.z;
-
-    __syncthreads();
-}
-
-__global__ void calc_pw_dvel_matrix(int n_patoms, int n_atoms_solute, int n_waters,
-                                    coord_t* X, coord_t* W, double* Evdw, double* Ecoul, calc_pw_t* PW_MAT,
-                                    ccharge_t* D_ccharges, charge_t* D_charges, catype_t* D_catypes, atype_t* D_atypes, p_atom_t* D_patoms, bool* D_excluded, topo_t D_topo) {
-    // Block index
-    int bx = blockIdx.x;
-    int by = blockIdx.y;
-
-    // Thread index
-    int tx = threadIdx.x;
-    int ty = threadIdx.y;
-
-    __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE];
-    __shared__ double Evdw_S[BLOCK_SIZE][BLOCK_SIZE];
-
-    Ecoul_S[ty][tx] = 0;
-    Evdw_S[ty][tx] = 0;
-
-    // if (tx == 0 && ty == 0) printf("bx = %d by = %d\n", bx, by);
-
-    // if (bx == 0 && by == 0) printf("bx = %d by = %d tx = %d ty = %d\n", bx, by, tx, ty);
-
-    int aStart = BLOCK_SIZE * by;
-    int bStart = BLOCK_SIZE * bx;
-
-    if (aStart + ty >= n_patoms) return;
-    if (bStart + tx >= 3 * n_waters) return;
-
-    // if (bx == 8 && by == 1) printf("bx = %d by = %d tx = %d ty = %d\n", bx, by, tx, ty);
-
-    __shared__ coord_t Xs[BLOCK_SIZE];
-    __shared__ coord_t Ws[BLOCK_SIZE];
-    __shared__ bool excluded_s[BLOCK_SIZE];
-
-    int pi = D_patoms[aStart + ty].a - 1;
-
-    Xs[ty] = X[pi];
-    Ws[tx] = W[bStart + tx];
-
-    if (tx == 0) {
-        excluded_s[ty] = D_excluded[pi];
-    }
-
-    __syncthreads();
-
-    calc_pw_t pw;
-    memset(&pw, 0, sizeof(calc_pw_t));
-
-    int row = by * BLOCK_SIZE + ty;
-    int column = bx * BLOCK_SIZE + tx;
-
-    // if (row == 0 && column == 1) {
-    //     printf("Xs[0] = %f\n", Xs[0]);
-    //     printf("Ys[0] = %f\n", Ys[0]);
-    //     printf("Xs[1] = %f\n", Xs[1]);
-    //     printf("Ys[1] = %f\n", Ys[1]);
-    //     printf("Xs[2] = %f\n", Xs[2]);
-    //     printf("Ys[2] = %f\n", Ys[2]);
-    //     printf("Xs[3] = %f\n", Xs[3]);
-    //     printf("Ys[3] = %f\n", Ys[3]);
-    //     printf("Xs[4] = %f\n", Xs[4]);
-    //     printf("Ys[4] = %f\n", Ys[4]);
-
-    //     printf("Ys[%d] = %f Xs[%d] = %f\n", 3 * ty, Ys[3 * ty], 3 * tx, Xs[3 * tx]);
-    // }
-
-    // if (bx == 8 && by == 1) printf("bx = %d by = %d tx = %d ty = %d\n", bx, by, tx, ty);
-    // __device__ void calc_pw_dvel_matrix_incr(int row, int pi, int column, int j, int n_patoms,
-    //                                          coord_t *Ps, coord_t *Xs, double *Evdw, double *Ecoul, calc_pw_t *pw,
-    //                                          ccharge_t *D_ccharges, charge_t *D_charges, catype_t *D_catypes, atype_t *D_atypes, p_atom_t *D_patoms)
-    double evdw = 0, ecoul = 0;
-    calc_pw_dvel_matrix_incr(ty, pi, tx, bStart + tx, n_atoms_solute, Xs, Ws, excluded_s, &evdw, &ecoul, &pw, D_ccharges, D_charges, D_catypes, D_atypes, D_patoms, D_topo);
-    Evdw_S[ty][tx] = evdw;
-    Ecoul_S[ty][tx] = ecoul;
-
-    // if (row == 0 && column == 1) {
-    //     printf("water_a = %f %f %f water_b = %f %f %f\n", water_a[0].x, water_a[0].y, water_a[0].z, water_b[0].x, water_b[0].y, water_b[0].z);
-    // }
-
-    // if (bx == 8 && by == 1) printf("n_qatoms = %d\n", n_qatoms);
-    // if (bx == 8 && by == 1) printf("qi = %d j = %d charge[%d] = %f\n", row, column, row + n_qatoms, D_qcharges[row + n_qatoms * 1].q);
-
-    PW_MAT[column + 3 * n_waters * row] = pw;
-
-    __syncthreads();
-
-    int rowlen = (3 * n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE;
-
-    if (tx == 0 && ty == 0) {
-        double tot_Evdw = 0;
-        double tot_Ecoul = 0;
-        for (int i = 0; i < BLOCK_SIZE; i++) {
-            for (int j = 0; j < BLOCK_SIZE; j++) {
-                tot_Evdw += Evdw_S[i][j];
-                tot_Ecoul += Ecoul_S[i][j];
-            }
-        }
-        Evdw[rowlen * by + bx] = tot_Evdw;
-        Ecoul[rowlen * by + bx] = tot_Ecoul;
-    }
-
-    __syncthreads();
-}
 }  // namespace CudaNonbondedPWForce
 
-
 void calc_nonbonded_pw_forces_host_v2() {
     using namespace CudaNonbondedPWForce;
-    int mem_size_W = 3 * n_waters * sizeof(coord_t);
-    int mem_size_DV_W = 3 * n_waters * sizeof(dvel_t);
-    int mem_size_DV_X = n_atoms_solute * sizeof(dvel_t);
-    int mem_size_PW_MAT = 3 * n_waters * n_patoms * sizeof(calc_pw_t);
-
-    int n_blocks_p = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    int n_blocks_w = (3 * n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE;
-
-    int mem_size_PW_Evdw = n_blocks_p * n_blocks_w * sizeof(double);
-    int mem_size_PW_Ecoul = n_blocks_p * n_blocks_w * sizeof(double);
-
-    CudaContext& ctx = CudaContext::instance();
-    auto X = ctx.d_coords;
-    auto DV_X = ctx.d_dvelocities;
-    auto D_ccharges = ctx.d_ccharges;
-    auto D_charges = ctx.d_charges;
-    auto D_catypes = ctx.d_catypes;
-    auto D_atypes = ctx.d_atypes;
-    auto D_patoms = ctx.d_p_atoms;
-    auto D_excluded = ctx.d_excluded;
-    dim3 threads, grid;
-
-    threads = dim3(BLOCK_SIZE, BLOCK_SIZE);
-    grid = dim3((3 * n_waters + BLOCK_SIZE - 1) / threads.x, (n_patoms + BLOCK_SIZE - 1) / threads.y);
-
-    calc_pw_dvel_matrix<<<grid, threads>>>(n_patoms, n_atoms_solute, n_waters, X, X + n_atoms_solute, D_PW_Evdw, D_PW_Ecoul, PW_MAT,
-                                           D_ccharges, D_charges, D_catypes, D_atypes, D_patoms, D_excluded, topo);
-    calc_pw_dvel_vector_column<<<((3 * n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_patoms, n_waters, DV_X, DV_X + n_atoms_solute, PW_MAT);
-    calc_pw_dvel_vector_row<<<((n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_patoms, n_waters, DV_X, DV_X + n_atoms_solute, PW_MAT, D_patoms);
-
-#ifdef DEBUG
-    cudaMemcpy(h_PW_MAT, PW_MAT, mem_size_PW_MAT, cudaMemcpyDeviceToHost);
-#endif
-
-// for (int i = 0; i < n_waters; i++) {
-//     printf("X[%d] = %f %f %f\n", i, coords[i].x, coords[i].y, coords[i].z);
-// }
-
-// printf("n_patoms = %d n_watoms = %d\n", n_patoms, 3 * n_waters);
-#ifdef DEBUG
-    for (int i = 0; i < n_patoms; i++) {
-        for (int j = 0; j < 3 * n_waters; j++) {
-            if (h_PW_MAT[3 * i * n_waters + j].P.x > 100)
-                printf("PW_MAT[%d][%d].P = %f %f %f\n", i, j, h_PW_MAT[3 * i * n_waters + j].P.x, h_PW_MAT[3 * i * n_waters + j].P.y, h_PW_MAT[3 * i * n_waters + j].P.z);
-        }
-    }
-#endif
-
-    cudaMemcpy(dvelocities, DV_X, mem_size_DV_X, cudaMemcpyDeviceToHost);
-    cudaMemcpy(&dvelocities[n_atoms_solute], DV_X + n_atoms_solute, mem_size_DV_W, cudaMemcpyDeviceToHost);
-
-    calc_energy_sum<<<1, threads>>>(n_blocks_p, n_blocks_w, D_PW_evdw_TOT, D_PW_ecoul_TOT, D_PW_Evdw, D_PW_Ecoul, false);
-
-    cudaMemcpy(&PW_evdw_TOT, D_PW_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost);
-    cudaMemcpy(&PW_ecoul_TOT, D_PW_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost);
-
-    E_nonbond_pw.Uvdw += PW_evdw_TOT;
-    E_nonbond_pw.Ucoul += PW_ecoul_TOT;
-
-    // for (int i = 0; i < n_atoms; i++) {
-    //     printf("dvelocities[%d] = %f %f %f\n", i, dvelocities[i].x, dvelocities[i].y, dvelocities[i].z);
-    // }
+    int nx = CudaContext::instance().h_p_atoms_list.size();
+    int ny = CudaContext::instance().h_w_atoms_list.size();
+    auto result = calc_nonbonded_force_host(
+        nx, ny,
+        CudaContext::instance().d_p_atoms_list,
+        CudaContext::instance().d_w_atoms_list,
+        false,
+        CudaContext::instance().d_p_charge_types,
+        CudaContext::instance().d_w_charge_types,
+        CudaContext::instance().d_charge_table_all,
+        CudaContext::instance().d_p_catype_types,
+        CudaContext::instance().d_w_catype_types,
+        CudaContext::instance().d_catype_table_all);
+    printf("Nonbonded PW Force (Host) - VdW: %f, Coulomb: %f\n", result.first, result.second);
+
+    E_nonbond_pw.Uvdw += result.first;
+    E_nonbond_pw.Ucoul += result.second;
 }
 
 void init_nonbonded_pw_force_kernel_data() {
     using namespace CudaNonbondedPWForce;
     if (!is_initialized) {
-        int mem_size_PW_MAT = 3 * n_waters * n_patoms * sizeof(calc_pw_t);
-
-        int n_blocks_p = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
-        int n_blocks_w = (3 * n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE;
-        int mem_size_PW_Evdw = n_blocks_p * n_blocks_w * sizeof(double);
-        int mem_size_PW_Ecoul = n_blocks_p * n_blocks_w * sizeof(double);
-#ifdef DEBUG
-        printf("Allocating PW_MAT\n");
-#endif
-        check_cudaMalloc((void**)&PW_MAT, mem_size_PW_MAT);
-
-#ifdef DEBUG
-        printf("Allocating D_PW_Evdw\n");
-#endif
-        check_cudaMalloc((void**)&D_PW_Evdw, mem_size_PW_Evdw);
-#ifdef DEBUG
-        printf("Allocating D_PW_Ecoul\n");
-#endif
-        check_cudaMalloc((void**)&D_PW_Ecoul, mem_size_PW_Ecoul);
-
-        check_cudaMalloc((void**)&D_PW_evdw_TOT, sizeof(double));
-        check_cudaMalloc((void**)&D_PW_ecoul_TOT, sizeof(double));
-
-#ifdef DEBUG
-        printf("All GPU solvent memory allocated\n");
-#endif
-
-        h_PW_MAT = (calc_pw_t*)malloc(mem_size_PW_MAT);
         is_initialized = true;
     }
 }
@@ -369,12 +42,6 @@ void init_nonbonded_pw_force_kernel_data() {
 void cleanup_nonbonded_pw_force() {
     using namespace CudaNonbondedPWForce;
     if (is_initialized) {
-        cudaFree(PW_MAT);
-        cudaFree(D_PW_Evdw);
-        cudaFree(D_PW_Ecoul);
-        cudaFree(D_PW_evdw_TOT);
-        cudaFree(D_PW_ecoul_TOT);
-        free(h_PW_MAT);
        is_initialized = false;
    }
-}
\ No newline at end of file
+}
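Note relevant to the PW path above: both the intra-water skip and the oxygen-only LJ rule in calc_nonbonded_force_kernel assume solvent atoms are stored as consecutive O,H,H triplets starting at n_atoms_solute. The site arithmetic, pulled out as a standalone sketch with hypothetical helper names not present in the patch:

```cpp
// Site decoding for 3-site rigid waters laid out O,H,H after the solute block.
struct WaterSite {
    int molecule;  // water molecule index
    int site;      // 0 = O, 1 or 2 = H
};

inline WaterSite decode_water(int atom_idx, int n_atoms_solute) {
    int w = atom_idx - n_atoms_solute;  // offset into the solvent block
    return {w / 3, w % 3};
}

// The kernel's two rules, restated on top of the helper:
//  - same molecule:      pair skipped entirely (rigid geometry)
//  - different molecule: Coulomb always; LJ only when both sites are oxygens
inline bool water_pair_has_lj(int a, int b, int n_atoms_solute) {
    return decode_water(a, n_atoms_solute).site == 0 &&
           decode_water(b, n_atoms_solute).site == 0;
}
```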
diff --git a/src/core/cuda/src/CudaNonbondedQPForce.cu b/src/core/cuda/src/CudaNonbondedQPForce.cu
index 4eb53908..39a311ee 100644
--- a/src/core/cuda/src/CudaNonbondedQPForce.cu
+++ b/src/core/cuda/src/CudaNonbondedQPForce.cu
@@ -1,365 +1,52 @@
 #include
+#include <cstdio>
+#include <utility>
 
 #include "cuda/include/CudaContext.cuh"
+#include "cuda/include/CudaNonbondedForce.cuh"
 #include "cuda/include/CudaNonbondedQPForce.cuh"
 #include "system.h"
+#include "utils.h"
 
 namespace CudaNonbondedQPForce {
-struct calc_qw_t {
-    dvel_t Q;
-    dvel_t O;
-    dvel_t H1;
-    dvel_t H2;
-};
-
-struct calc_qp_t {
-    dvel_t Q;
-    dvel_t P;
-};
 
 bool is_initialized = false;
-double *D_QP_Evdw, *D_QP_Ecoul, *h_QP_Evdw, *h_QP_Ecoul;
-calc_qp_t *QP_MAT, *h_QP_MAT;
-double *D_QP_evdw_TOT, *D_QP_ecoul_TOT, QP_evdw_TOT, QP_ecoul_TOT;
-
-// General
-__global__ void calc_energy_sum(int rows, int columns, double* Evdw_TOT, double* Ecoul_TOT, double* Evdw, double* Ecoul, bool upper_diagonal) {
-    int tx = threadIdx.x;
-    int ty = threadIdx.y;
-
-    __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE];
-    __shared__ double Evdw_S[BLOCK_SIZE][BLOCK_SIZE];
-
-    // TODO: better way to distribute upper diagonal over threads? Seems like threads in left bottom corner have less work
-    double coul_TOT = 0;
-    double vdw_TOT = 0;
-    for (int i = ty; i < rows; i += BLOCK_SIZE) {
-        for (int j = tx; j < columns; j += BLOCK_SIZE) {
-            if (i <= j || !upper_diagonal) {
-                coul_TOT += Ecoul[i * columns + j];
-                vdw_TOT += Evdw[i * columns + j];
-            }
-        }
-    }
-    Ecoul_S[ty][tx] = coul_TOT;
-    Evdw_S[ty][tx] = vdw_TOT;
-
-    __syncthreads();
-
-    if (tx == 0 && ty == 0) {
-        double Evdw_temp = 0;
-        double Ecoul_temp = 0;
-
-        for (int i = 0; i < BLOCK_SIZE; i++) {
-            for (int j = 0; j < BLOCK_SIZE; j++) {
-                Evdw_temp += Evdw_S[i][j];
-                Ecoul_temp += Ecoul_S[i][j];
-            }
-        }
-
-        *Evdw_TOT = Evdw_temp;
-        *Ecoul_TOT = Ecoul_temp;
-    }
-}
-
-__global__ void calc_qp_dvel_vector_row(int n_qatoms, int n_patoms, dvel_t* DV_X, calc_qp_t* QP_MAT, q_atom_t* D_qatoms) {
-    int row = blockIdx.x * blockDim.x + threadIdx.x;
-    if (row >= n_qatoms) return;
-
-    dvel_t dQ;
-
-    dQ.x = 0;
-    dQ.y = 0;
-    dQ.z = 0;
-
-    for (int i = 0; i < n_patoms; i++) {
-        dQ.x += QP_MAT[i + n_patoms * row].Q.x;
-        dQ.y += QP_MAT[i + n_patoms * row].Q.y;
-        dQ.z += QP_MAT[i + n_patoms * row].Q.z;
-    }
-
-    int q = D_qatoms[row].a - 1;
-
-    DV_X[q].x += dQ.x;
-    DV_X[q].y += dQ.y;
-    DV_X[q].z += dQ.z;
-
-    __syncthreads();
-}
-
-__global__ void calc_qp_dvel_vector_column(int n_qatoms, int n_patoms, dvel_t* DV_X, calc_qp_t* QP_MAT, p_atom_t* D_patoms) {
-    int column = blockIdx.x * blockDim.x + threadIdx.x;
-    if (column >= n_patoms) return;
-
-    dvel_t dP;
-
-    dP.x = 0;
-    dP.y = 0;
-    dP.z = 0;
-
-    for (int i = 0; i < n_qatoms; i++) {
-        dP.x += QP_MAT[column + n_patoms * i].P.x;
-        dP.y += QP_MAT[column + n_patoms * i].P.y;
-        dP.z += QP_MAT[column + n_patoms * i].P.z;
-    }
-
-    int p = D_patoms[column].a - 1;
-
-    DV_X[p].x += dP.x;
-    DV_X[p].y += dP.y;
-    DV_X[p].z += dP.z;
-
-    __syncthreads();
-}
-
-__device__ void calc_qp_dvel_matrix_incr(int row, int qi, int column, int pj, int n_lambdas, int n_qatoms,
-                                         coord_t* Qs, coord_t* Ps, int* LJs, bool* excluded_s, double Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE], double Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE], calc_qp_t* qp,
-                                         q_catype_t* D_qcatypes, q_atype_t* D_qatypes, q_charge_t* D_qcharges, p_atom_t* D_patoms, q_atom_t* D_qatoms, double* D_lambdas,
-                                         catype_t* D_catypes, atype_t* D_atypes, ccharge_t* D_ccharges, charge_t* D_charges, topo_t D_topo)
{ - coord_t da; - double r2, r6, r; - double ai_aii, aj_aii, ai_bii, aj_bii; - q_catype_t qi_type; - catype_t aj_type; - bool bond23, bond14; - double scaling, Vel, V_a, V_b, dv; - - int j = D_patoms[pj].a - 1; - - bond23 = LJs[row * BLOCK_SIZE + column] == 3; - bond14 = LJs[row * BLOCK_SIZE + column] == 1; - - if (bond23) return; - if (excluded_s[row] || excluded_s[BLOCK_SIZE + column]) return; - - scaling = bond14 ? D_topo.el14_scale : 1; - - da.x = Qs[row].x - Ps[column].x; - da.y = Qs[row].y - Ps[column].y; - da.z = Qs[row].z - Ps[column].z; - - r2 = pow(da.x, 2) + pow(da.y, 2) + pow(da.z, 2); - - r6 = r2 * r2 * r2; - r2 = 1 / r2; - r = sqrt(r2); - - for (int state = 0; state < n_lambdas; state++) { - qi_type = D_qcatypes[D_qatypes[qi + n_qatoms * state].code - 1]; - aj_type = D_catypes[D_atypes[j].code - 1]; - - ai_aii = bond14 ? qi_type.Ai_14 : qi_type.Ai; - aj_aii = bond14 ? aj_type.aii_1_4 : aj_type.aii_normal; - ai_bii = bond14 ? qi_type.Bi_14 : qi_type.Bi; - aj_bii = bond14 ? aj_type.bii_1_4 : aj_type.bii_normal; - - Vel = D_topo.coulomb_constant * scaling * D_qcharges[qi + n_qatoms * state].q * D_ccharges[D_charges[j].code - 1].charge * r; - V_a = ai_aii * aj_aii / (r6 * r6); - V_b = ai_bii * aj_bii / r6; - dv = r2 * (-Vel - (12 * V_a - 6 * V_b)) * D_lambdas[state]; - - // if (state == 0 && qi == 0 && pj == 1) { - // printf("crg_q = %f crg_j = %f r = %f\n", D_qcharges[qi + n_qatoms * state].q, D_ccharges[D_charges[pj].code - 1].charge, r); - // printf("ai_aii = %f aj_aii = %f ai_bii = %f aj_bii = %f\n", ai_aii, aj_aii, ai_bii, aj_bii); - // } - - // Update forces - qp->Q.x += dv * da.x; - qp->Q.y += dv * da.y; - qp->Q.z += dv * da.z; - qp->P.x -= dv * da.x; - qp->P.y -= dv * da.y; - qp->P.z -= dv * da.z; - - // Update Q totals - Ecoul_S[row][state * BLOCK_SIZE + column] += Vel; - Evdw_S[row][state * BLOCK_SIZE + column] += (V_a - V_b); - } -} - -__global__ void calc_qp_dvel_matrix(int n_qatoms, int n_patoms, int n_lambdas, int n_atoms_solute, - coord_t* X, double* Evdw, double* Ecoul, calc_qp_t* QP_MAT, - q_catype_t* D_qcatypes, q_atype_t* D_qatypes, q_charge_t* D_qcharges, p_atom_t* D_patoms, q_atom_t* D_qatoms, double* D_lambdas, - int* D_LJ_matrix, bool* D_excluded, catype_t* D_catypes, atype_t* D_atypes, ccharge_t* D_ccharges, charge_t* D_charges, topo_t D_topo) { - // Block index - int bx = blockIdx.x; - int by = blockIdx.y; - - // Thread index - int tx = threadIdx.x; - int ty = threadIdx.y; - - // TODO implement >2 states on GPU - __shared__ double Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE]; - __shared__ double Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE]; - - Ecoul_S[ty][tx] = 0; - Evdw_S[ty][tx] = 0; - Ecoul_S[ty][tx + BLOCK_SIZE] = 0; - Evdw_S[ty][tx + BLOCK_SIZE] = 0; - - int aStart = BLOCK_SIZE * by; - int bStart = BLOCK_SIZE * bx; - - if (aStart + ty >= n_qatoms) return; - if (bStart + tx >= n_patoms) return; - - int qi = D_qatoms[aStart + ty].a - 1; - int pj = D_patoms[bStart + tx].a - 1; - - __shared__ coord_t Qs[BLOCK_SIZE]; - __shared__ coord_t Ps[BLOCK_SIZE]; - __shared__ int LJs[BLOCK_SIZE * BLOCK_SIZE]; - __shared__ bool excluded_s[2 * BLOCK_SIZE]; - - if (tx == 0) { - Qs[ty] = X[qi]; - excluded_s[ty] = D_excluded[qi]; - } - - if (ty == 0) { - Ps[tx] = X[pj]; - excluded_s[BLOCK_SIZE + tx] = D_excluded[pj]; - } - LJs[ty * BLOCK_SIZE + tx] = D_LJ_matrix[qi * n_atoms_solute + pj]; - - __syncthreads(); - - calc_qp_t qp; - memset(&qp, 0, sizeof(calc_qp_t)); - - int row = by * BLOCK_SIZE + ty; - int column = bx * BLOCK_SIZE + tx; - - calc_qp_dvel_matrix_incr(ty, row, tx, column, 
n_lambdas, n_qatoms, Qs, Ps, LJs, excluded_s, Evdw_S, Ecoul_S,
-                             &qp, D_qcatypes, D_qatypes, D_qcharges, D_patoms, D_qatoms, D_lambdas, D_catypes, D_atypes, D_ccharges, D_charges, D_topo);
-
-    QP_MAT[n_patoms * row + column] = qp;
-
-    __syncthreads();
-
-    int rowlen = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    int collen = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
-
-    if (tx == 0 && ty == 0) {
-        // TODO implement >2 states on GPU
-        for (int state = 0; state < min(2, n_lambdas); state++) {
-            double tot_Evdw = 0;
-            double tot_Ecoul = 0;
-            for (int i = 0; i < BLOCK_SIZE; i++) {
-                for (int j = 0; j < BLOCK_SIZE; j++) {
-                    tot_Evdw += Evdw_S[i][j + state * BLOCK_SIZE];
-                    tot_Ecoul += Ecoul_S[i][j + state * BLOCK_SIZE];
-                }
-            }
-            Evdw[rowlen * collen * state + rowlen * by + bx] = tot_Evdw;
-            Ecoul[rowlen * collen * state + rowlen * by + bx] = tot_Ecoul;
-        }
-    }
-
-    __syncthreads();
-}
 }  // namespace CudaNonbondedQPForce
 
 void calc_nonbonded_qp_forces_host_v2() {
     using namespace CudaNonbondedQPForce;
-
-    int n_blocks_q = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    int n_blocks_p = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
-
-    CudaContext& ctx = CudaContext::instance();
-    auto X = ctx.d_coords;
-    auto DV_X = ctx.d_dvelocities;
-    auto D_qcatypes = ctx.d_q_catypes;
-    auto D_qatypes = ctx.d_q_atypes;
-    auto D_qcharges = ctx.d_q_charges;
-    auto D_patoms = ctx.d_p_atoms;
-    auto D_qatoms = ctx.d_q_atoms;
-    auto D_lambdas = ctx.d_lambdas;
-    auto D_LJ_matrix = ctx.d_LJ_matrix;
-    auto D_excluded = ctx.d_excluded;
-    auto D_catypes = ctx.d_catypes;
-    auto D_atypes = ctx.d_atypes;
-    auto D_ccharges = ctx.d_ccharges;
-    auto D_charges = ctx.d_charges;
-
-    dim3 threads, grid;
-
-    threads = dim3(BLOCK_SIZE, BLOCK_SIZE);
-    grid = dim3((n_patoms + BLOCK_SIZE - 1) / threads.x, (n_qatoms + BLOCK_SIZE - 1) / threads.y);
-
-    calc_qp_dvel_matrix<<<grid, threads>>>(n_qatoms, n_patoms, n_lambdas, n_atoms_solute, X, D_QP_Evdw, D_QP_Ecoul, QP_MAT,
-                                           D_qcatypes, D_qatypes, D_qcharges, D_patoms, D_qatoms, D_lambdas, D_LJ_matrix, D_excluded,
-                                           D_catypes, D_atypes, D_ccharges, D_charges, topo);
-    calc_qp_dvel_vector_column<<<((n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_qatoms, n_patoms, DV_X, QP_MAT, D_patoms);
-    calc_qp_dvel_vector_row<<<((n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_qatoms, n_patoms, DV_X, QP_MAT, D_qatoms);
-
-#ifdef DEBUG
-    cudaMemcpy(h_QP_MAT, QP_MAT, mem_size_QP_MAT, cudaMemcpyDeviceToHost);
-#endif
-
-#ifdef DEBUG
-    for (int i = 0; i < n_qatoms; i++) {
-        for (int j = 0; j < n_patoms; j++) {
-            if (i == 0)
-                // if (h_QP_MAT[i * n_patoms + j].Q.x > 100)
-                printf("QP_MAT[%d][%d].Q = %f %f %f\n", i, j, h_QP_MAT[i * n_patoms + j].Q.x, h_QP_MAT[i * n_patoms + j].Q.y, h_QP_MAT[i * n_patoms + j].Q.z);
-        }
-    }
-#endif
-    int mem_size_DV_X = n_atoms * sizeof(dvel_t);
-    cudaMemcpy(dvelocities, DV_X, mem_size_DV_X, cudaMemcpyDeviceToHost);
-
-    // TODO: make Evdw & Ecoul work for # of states > 2
-    for (int state = 0; state < min(2, n_lambdas); state++) {
-        calc_energy_sum<<<1, threads>>>(n_blocks_q, n_blocks_p, D_QP_evdw_TOT, D_QP_ecoul_TOT, &D_QP_Evdw[state * n_blocks_p * n_blocks_q], &D_QP_Ecoul[state * n_blocks_p * n_blocks_q], false);
-
-        cudaMemcpy(&QP_evdw_TOT, D_QP_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost);
-        cudaMemcpy(&QP_ecoul_TOT, D_QP_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost);
-
-        EQ_nonbond_qp[state].Uvdw += QP_evdw_TOT;
-        EQ_nonbond_qp[state].Ucoul += QP_ecoul_TOT;
+    int nx = CudaContext::instance().h_q_atoms_list.size();
+    int ny = CudaContext::instance().h_p_atoms_list.size();
CudaContext::instance().h_p_atoms_list.size(); + for (int state = 0; state < n_lambdas; state++) { + auto result = calc_nonbonded_force_host( + nx, + ny, + CudaContext::instance().d_q_atoms_list, + CudaContext::instance().d_p_atoms_list, + false, + CudaContext::instance().d_q_charge_types + n_lambdas * nx + state * nx, + CudaContext::instance().d_p_charge_types, + CudaContext::instance().d_charge_table_all, + CudaContext::instance().d_q_catype_types + n_lambdas * nx + state * nx, + CudaContext::instance().d_p_catype_types, + CudaContext::instance().d_catype_table_all); + + EQ_nonbond_qp[state].Uvdw += result.first / lambdas[state]; + EQ_nonbond_qp[state].Ucoul += result.second / lambdas[state]; + printf("Nonbonded QP Force State %d: Uvdw = %f, Ucoul = %f\n", state, result.first, result.second); } } void init_nonbonded_qp_force_kernel_data() { using namespace CudaNonbondedQPForce; - if (!is_initialized) { - int n_blocks_q = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE; - int n_blocks_p = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE; - // TODO make Evdw & Ecoul work for # of states > 2 - int mem_size_QP_Evdw = min(n_lambdas, 2) * n_blocks_q * n_blocks_p * sizeof(double); - int mem_size_QP_Ecoul = min(n_lambdas, 2) * n_blocks_q * n_blocks_p * sizeof(double); - int mem_size_QP_MAT = n_qatoms * n_patoms * sizeof(calc_qp_t); - - check_cudaMalloc((void**)&D_QP_Evdw, mem_size_QP_Evdw); - check_cudaMalloc((void**)&D_QP_Ecoul, mem_size_QP_Ecoul); - h_QP_Evdw = (double*)malloc(mem_size_QP_Evdw); - h_QP_Ecoul = (double*)malloc(mem_size_QP_Ecoul); - - check_cudaMalloc((void**)&QP_MAT, mem_size_QP_MAT); - h_QP_MAT = (calc_qp_t*)malloc(mem_size_QP_MAT); - - check_cudaMalloc((void**)&D_QP_evdw_TOT, sizeof(double)); - check_cudaMalloc((void**)&D_QP_ecoul_TOT, sizeof(double)); - is_initialized = true; } } void cleanup_nonbonded_qp_force() { using namespace CudaNonbondedQPForce; - if (is_initialized) { - cudaFree(D_QP_Evdw); - cudaFree(D_QP_Ecoul); - free(h_QP_Evdw); - free(h_QP_Ecoul); - cudaFree(QP_MAT); - free(h_QP_MAT); - cudaFree(D_QP_evdw_TOT); - cudaFree(D_QP_ecoul_TOT); - is_initialized = false; } -} \ No newline at end of file +} diff --git a/src/core/cuda/src/CudaNonbondedQQForce.cu b/src/core/cuda/src/CudaNonbondedQQForce.cu index e2138ed6..015fd90f 100644 --- a/src/core/cuda/src/CudaNonbondedQQForce.cu +++ b/src/core/cuda/src/CudaNonbondedQQForce.cu @@ -1,135 +1,48 @@ +#include +#include + #include "cuda/include/CudaContext.cuh" +#include "cuda/include/CudaNonbondedForce.cuh" #include "cuda/include/CudaNonbondedQQForce.cuh" #include "utils.h" -__global__ void calc_nonbonded_qq_forces_kernel( - q_atom_t* q_atoms, - q_charge_t* q_charges, - int* LJ_matrix, - bool* excluded, - q_elscale_t* q_elscales, - q_catype_t* q_catypes, - q_atype_t* q_atypes, - coord_t* coords, - E_nonbonded_t* EQ_nonbond_qq, - dvel_t* dvelocities, - double* lambdas, - int n_qatoms, - int n_lambdas, - int n_atoms_solute, - topo_t topo, - int n_qelscales +namespace CudaNonbondedQQForce { +bool is_initialized = false; -) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= n_qatoms) return; - int qi = idx; +} // namespace CudaNonbondedQQForce - int ai, aj; - double crg_i, crg_j; - double elscale, scaling; - q_catype_t qi_type, qj_type; - bool bond23, bond14; - coord_t da; - double r2a, ra, r6a; - double Vela, V_a, V_b; - double dva; - double ai_aii, aj_aii, ai_bii, aj_bii; +void calc_nonbonded_qq_forces_host() { + using namespace CudaNonbondedQQForce; + int n = CudaContext::instance().h_q_atoms_list.size(); for (int state = 
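The QP path above reads from the lambda-scaled region of d_q_charge_types / d_q_catype_types (base offset n_lambdas * nx) and then divides the returned totals by lambdas[state] to recover per-state unscaled energies. A minimal sketch of the index arithmetic this implies; the helper is illustrative only and not part of the diff:

    // Illustrative only, not part of this change. Assumed layout of the
    // per-q-atom type arrays: [0, n_states * n_q) holds the unscaled
    // per-state types, [n_states * n_q, 2 * n_states * n_q) the lambda-scaled ones.
    inline int q_type_offset(int n_q, int n_states, int state, bool lambda_scaled) {
        return (lambda_scaled ? n_states * n_q : 0) + state * n_q;
    }
    // Under that assumption, the QP call above is equivalent to passing
    // d_q_charge_types + q_type_offset(nx, n_lambdas, state, true).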
diff --git a/src/core/cuda/src/CudaNonbondedQQForce.cu b/src/core/cuda/src/CudaNonbondedQQForce.cu
index e2138ed6..015fd90f 100644
--- a/src/core/cuda/src/CudaNonbondedQQForce.cu
+++ b/src/core/cuda/src/CudaNonbondedQQForce.cu
@@ -1,135 +1,48 @@
+#include <cstdio>
+#include <utility>
+
 #include "cuda/include/CudaContext.cuh"
+#include "cuda/include/CudaNonbondedForce.cuh"
 #include "cuda/include/CudaNonbondedQQForce.cuh"
 #include "utils.h"

-__global__ void calc_nonbonded_qq_forces_kernel(
-    q_atom_t* q_atoms,
-    q_charge_t* q_charges,
-    int* LJ_matrix,
-    bool* excluded,
-    q_elscale_t* q_elscales,
-    q_catype_t* q_catypes,
-    q_atype_t* q_atypes,
-    coord_t* coords,
-    E_nonbonded_t* EQ_nonbond_qq,
-    dvel_t* dvelocities,
-    double* lambdas,
-    int n_qatoms,
-    int n_lambdas,
-    int n_atoms_solute,
-    topo_t topo,
-    int n_qelscales
+namespace CudaNonbondedQQForce {
+bool is_initialized = false;
-) {
-    int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx >= n_qatoms) return;
-    int qi = idx;
+} // namespace CudaNonbondedQQForce

-    int ai, aj;
-    double crg_i, crg_j;
-    double elscale, scaling;
-    q_catype_t qi_type, qj_type;
-    bool bond23, bond14;
-    coord_t da;
-    double r2a, ra, r6a;
-    double Vela, V_a, V_b;
-    double dva;
-    double ai_aii, aj_aii, ai_bii, aj_bii;
+void calc_nonbonded_qq_forces_host() {
+    using namespace CudaNonbondedQQForce;
+    int n = CudaContext::instance().h_q_atoms_list.size();
     for (int state = 0; state < n_lambdas; state++) {
-        for (int qj = qi + 1; qj < n_qatoms; qj++) {
-            ai = q_atoms[qi].a - 1;
-            aj = q_atoms[qj].a - 1;
-
-            crg_i = q_charges[qi + n_qatoms * state].q;
-            crg_j = q_charges[qj + n_qatoms * state].q;
-
-            bond23 = LJ_matrix[ai * n_atoms_solute + aj] == 3;
-            bond14 = LJ_matrix[ai * n_atoms_solute + aj] == 1;
-
-            if (bond23) continue;
-            if (excluded[ai] || excluded[aj]) continue;
-
-            scaling = bond14 ? topo.el14_scale : 1;
-
-            elscale = 1;
-            for (int k = 0; k < n_qelscales; k++) {
-                if (q_elscales[k + n_qelscales * state].qi == qi + 1 && q_elscales[k + n_qelscales * state].qj == qj + 1) {
-                    elscale = q_elscales[k + n_qelscales * state].mu;
-                }
-            }
-
-            qi_type = q_catypes[q_atypes[qi + n_qatoms * state].code - 1];
-            qj_type = q_catypes[q_atypes[qj + n_qatoms * state].code - 1];
-
-            da.x = coords[aj].x - coords[ai].x;
-            da.y = coords[aj].y - coords[ai].y;
-            da.z = coords[aj].z - coords[ai].z;
-            r2a = 1 / (pow(da.x, 2) + pow(da.y, 2) + pow(da.z, 2));
-            ra = sqrt(r2a);
-            r6a = r2a * r2a * r2a;
-
-            Vela = scaling * topo.coulomb_constant * crg_i * crg_j * ra * elscale;
-
-            ai_aii = bond14 ? qi_type.Ai_14 : qi_type.Ai;
-            aj_aii = bond14 ? qj_type.Ai_14 : qj_type.Ai;
-            ai_bii = bond14 ? qi_type.Bi_14 : qi_type.Bi;
-            aj_bii = bond14 ? qj_type.Bi_14 : qj_type.Bi;
-
-            V_a = r6a * r6a * ai_aii * aj_aii;
-            V_b = r6a * ai_bii * aj_bii;
-            dva = r2a * (-Vela - 12 * V_a + 6 * V_b) * lambdas[state];
-
-            atomicAdd(&dvelocities[ai].x, -dva * da.x);
-            atomicAdd(&dvelocities[ai].y, -dva * da.y);
-            atomicAdd(&dvelocities[ai].z, -dva * da.z);
-            atomicAdd(&dvelocities[aj].x, dva * da.x);
-            atomicAdd(&dvelocities[aj].y, dva * da.y);
-            atomicAdd(&dvelocities[aj].z, dva * da.z);
-
-            atomicAdd(&EQ_nonbond_qq[state].Ucoul, Vela);
-            atomicAdd(&EQ_nonbond_qq[state].Uvdw, (V_a - V_b));
-        }
+        auto result = calc_nonbonded_force_host(
+            n,
+            n,
+            CudaContext::instance().d_q_atoms_list,
+            CudaContext::instance().d_q_atoms_list,
+            true,
+            CudaContext::instance().d_q_charge_types + state * n,
+            CudaContext::instance().d_q_charge_types + state * n + n * n_lambdas,
+            CudaContext::instance().d_charge_table_all,
+            CudaContext::instance().d_q_catype_types + state * n,
+            CudaContext::instance().d_q_catype_types + state * n + n * n_lambdas,
+            CudaContext::instance().d_catype_table_all);
+
+        EQ_nonbond_qq[state].Uvdw += result.first / lambdas[state];
+        EQ_nonbond_qq[state].Ucoul += result.second / lambdas[state];
+        printf("Nonbonded QQ Force State %d: Uvdw = %f, Ucoul = %f\n", state, result.first, result.second);
     }
 }

-void calc_nonbonded_qq_forces_host() {
-    CudaContext& ctx = CudaContext::instance();
-    // ctx.sync_all_to_device();
-    auto d_q_atoms = ctx.d_q_atoms;
-    auto d_q_charges = ctx.d_q_charges;
-    auto d_LJ_matrix = ctx.d_LJ_matrix;
-    auto d_excluded = ctx.d_excluded;
-    auto d_q_elscales = ctx.d_q_elscales;
-    auto d_q_catypes = ctx.d_q_catypes;
-    auto d_q_atypes = ctx.d_q_atypes;
-    auto d_coords = ctx.d_coords;
-    auto d_EQ_nonbond_qq = ctx.d_EQ_nonbond_qq;
-    auto d_dvelocities = ctx.d_dvelocities;
-    auto d_lambdas = ctx.d_lambdas;
-
-    int blockSize = 256;
-    int numBlocks = (n_qatoms + blockSize - 1) / blockSize;
-    calc_nonbonded_qq_forces_kernel<<<numBlocks, blockSize>>>(
-        d_q_atoms,
-        d_q_charges,
-        d_LJ_matrix,
-        d_excluded,
-        d_q_elscales,
-        d_q_catypes,
-        d_q_atypes,
-        d_coords,
-        d_EQ_nonbond_qq,
-        d_dvelocities,
-        d_lambdas,
-        n_qatoms,
-        n_lambdas,
-        n_atoms_solute,
-        topo, n_qelscales);
-    cudaDeviceSynchronize();
-    cudaMemcpy(EQ_nonbond_qq, d_EQ_nonbond_qq, sizeof(E_nonbonded_t) * n_lambdas, cudaMemcpyDeviceToHost);
-    cudaMemcpy(dvelocities, d_dvelocities, sizeof(dvel_t) * n_atoms, cudaMemcpyDeviceToHost);
+void init_nonbonded_qq_force_kernel_data() {
+    using namespace CudaNonbondedQQForce;
+    if (is_initialized) return;
+    is_initialized = true;
 }

-void init_nonbonded_qq_force_kernel_data() {}
-void cleanup_nonbonded_qq_force() {}
\ No newline at end of file
+void cleanup_nonbonded_qq_force() {
+    using namespace CudaNonbondedQQForce;
+    if (!is_initialized) return;
+    is_initialized = false;
+}
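The QQ call passes symmetric = true with the unscaled state types on one side and the lambda-scaled types on the other, so each q-q pair energy carries exactly one factor of lambdas[state], which the division afterwards removes. A small self-contained illustration of that identity (the coulomb constant value is a placeholder, not taken from the code):

    #include <cstdio>

    // E(qi, lambda * qj) == lambda * E(qi, qj), so result / lambda is the
    // unscaled pair energy. Illustrative only, not part of this change.
    double pair_coulomb(double qi, double qj, double inv_r, double k) {
        return k * qi * qj * inv_r;
    }

    int main() {
        const double lambda = 0.3, k = 332.0;  // placeholder constant
        double scaled = pair_coulomb(0.5, lambda * -0.4, 1.0 / 2.5, k);
        double unscaled = pair_coulomb(0.5, -0.4, 1.0 / 2.5, k);
        std::printf("%f == %f\n", scaled / lambda, unscaled);  // both -26.56
    }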
diff --git a/src/core/cuda/src/CudaNonbondedQWForce.cu b/src/core/cuda/src/CudaNonbondedQWForce.cu
index 974c380c..e39ee57f 100644
--- a/src/core/cuda/src/CudaNonbondedQWForce.cu
+++ b/src/core/cuda/src/CudaNonbondedQWForce.cu
@@ -1,404 +1,49 @@
+#include <cstdio>
 #include <math.h>

 #include "cuda/include/CudaContext.cuh"
+#include "cuda/include/CudaNonbondedForce.cuh"
 #include "cuda/include/CudaNonbondedQWForce.cuh"
 #include "utils.h"

 namespace CudaNonbondedQWForce {
-struct calc_qw_t {
-    dvel_t Q;
-    dvel_t O;
-    dvel_t H1;
-    dvel_t H2;
-};

 bool is_initialized = false;

-calc_qw_t *QW_MAT, *h_QW_MAT;
-double *D_QW_Evdw, *D_QW_Ecoul, *h_QW_Evdw, *h_QW_Ecoul;
-double *D_QW_evdw_TOT, *D_QW_ecoul_TOT, QW_evdw_TOT, QW_ecoul_TOT;
-
-// General
-__global__ void calc_energy_sum(int rows, int columns, double* Evdw_TOT, double* Ecoul_TOT, double* Evdw, double* Ecoul, bool upper_diagonal) {
-    int tx = threadIdx.x;
-    int ty = threadIdx.y;
-
-    __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE];
-    __shared__ double Evdw_S[BLOCK_SIZE][BLOCK_SIZE];
-
-    // TODO: better way to distribute upper diagonal over threads? Seems like threads in left bottom corner have less work
-    double coul_TOT = 0;
-    double vdw_TOT = 0;
-    for (int i = ty; i < rows; i += BLOCK_SIZE) {
-        for (int j = tx; j < columns; j += BLOCK_SIZE) {
-            if (i <= j || !upper_diagonal) {
-                coul_TOT += Ecoul[i * columns + j];
-                vdw_TOT += Evdw[i * columns + j];
-            }
-        }
-    }
-    Ecoul_S[ty][tx] = coul_TOT;
-    Evdw_S[ty][tx] = vdw_TOT;
-
-    __syncthreads();
-
-    if (tx == 0 && ty == 0) {
-        double Evdw_temp = 0;
-        double Ecoul_temp = 0;
-
-        for (int i = 0; i < BLOCK_SIZE; i++) {
-            for (int j = 0; j < BLOCK_SIZE; j++) {
-                Evdw_temp += Evdw_S[i][j];
-                Ecoul_temp += Ecoul_S[i][j];
-            }
-        }
-
-        *Evdw_TOT = Evdw_temp;
-        *Ecoul_TOT = Ecoul_temp;
-    }
-}
-
-__global__ void calc_qw_dvel_vector_row(int n_qatoms, int n_waters, dvel_t* DV_X, dvel_t* DV_W, calc_qw_t* MAT, q_atom_t* D_qatoms) {
-    int row = blockIdx.x * blockDim.x + threadIdx.x;
-    if (row >= n_qatoms) return;
-
-    dvel_t dQ;
-
-    dQ.x = 0;
-    dQ.y = 0;
-    dQ.z = 0;
-
-    for (int i = 0; i < n_waters; i++) {
-        dQ.x += MAT[i + n_waters * row].Q.x;
-        dQ.y += MAT[i + n_waters * row].Q.y;
-        dQ.z += MAT[i + n_waters * row].Q.z;
-    }
-
-    int q = D_qatoms[row].a - 1;
-
-    DV_X[q].x += dQ.x;
-    DV_X[q].y += dQ.y;
-    DV_X[q].z += dQ.z;
-
-    __syncthreads();
-}
-
-__global__ void calc_qw_dvel_vector_column(int n_qatoms, int n_waters, dvel_t* DV_X, dvel_t* DV_W, calc_qw_t* MAT) {
-    int column = blockIdx.x * blockDim.x + threadIdx.x;
-    if (column >= n_waters) return;
-
-    dvel_t dO, dH1, dH2;
-
-    dO.x = 0;
-    dO.y = 0;
-    dO.z = 0;
-    dH1.x = 0;
-    dH1.y = 0;
-    dH1.z = 0;
-    dH2.x = 0;
-    dH2.y = 0;
-    dH2.z = 0;
-
-    for (int i = 0; i < n_qatoms; i++) {
-        dO.x += MAT[column + n_waters * i].O.x;
-        dO.y += MAT[column + n_waters * i].O.y;
-        dO.z += MAT[column + n_waters * i].O.z;
-        dH1.x += MAT[column + n_waters * i].H1.x;
-        dH1.y += MAT[column + n_waters * i].H1.y;
-        dH1.z += MAT[column + n_waters * i].H1.z;
-        dH2.x += MAT[column + n_waters * i].H2.x;
-        dH2.y += MAT[column + n_waters * i].H2.y;
-        dH2.z += MAT[column + n_waters * i].H2.z;
-    }
-
-    DV_W[3 * column].x += dO.x;
-    DV_W[3 * column].y += dO.y;
-    DV_W[3 * column].z += dO.z;
-    DV_W[3 * column + 1].x += dH1.x;
-    DV_W[3 * column + 1].y += dH1.y;
-    DV_W[3 * column + 1].z += dH1.z;
-    DV_W[3 * column + 2].x += dH2.x;
-    DV_W[3 * column + 2].y += dH2.y;
-    DV_W[3 * column + 2].z += dH2.z;
-
-    __syncthreads();
-}
-
-__device__ void calc_qw_dvel_matrix_incr(int row, int qi, int column, int n_lambdas, int n_qatoms, double crg_ow, double crg_hw, double A_O, double B_O,
-                                         coord_t* Qs, coord_t* Ws, double Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE], double Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE], calc_qw_t* qw,
-                                         q_catype_t* D_qcatypes, q_atype_t* D_qatypes, q_charge_t* D_qcharges, q_atom_t* D_qatoms, double* D_lambdas, topo_t D_topo) {
-    int j;
-    coord_t dO, dH1, dH2;
-    double r2O, rH1, rH2, r6O, rO, r2H1, r2H2;
-    double dvO, dvH1, dvH2;
-    double V_a, V_b, VelO, VelH1, VelH2;
-    q_atype_t qa_type;
-    q_catype_t qi_type;
-    double ai_aii, ai_bii;
-
-    j = 3 * column;
-    dO.x = Ws[j].x - Qs[row].x;
-    dO.y = Ws[j].y - Qs[row].y;
-    dO.z = Ws[j].z - Qs[row].z;
-    dH1.x = Ws[j + 1].x - Qs[row].x;
-    dH1.y = Ws[j + 1].y - Qs[row].y;
-    dH1.z = Ws[j + 1].z - Qs[row].z;
-    dH2.x = Ws[j + 2].x - Qs[row].x;
-    dH2.y = Ws[j + 2].y - Qs[row].y;
-    dH2.z = Ws[j + 2].z - Qs[row].z;
-
-    r2O = pow(dO.x, 2) + pow(dO.y, 2) + pow(dO.z, 2);
-    rH1 = sqrt(1.0 / (pow(dH1.x, 2) + pow(dH1.y, 2) + pow(dH1.z, 2)));
-    rH2 = sqrt(1.0 / (pow(dH2.x, 2) + pow(dH2.y, 2) + pow(dH2.z, 2)));
-    r6O = r2O * r2O * r2O;
-    r2O = 1.0 / r2O;
-    rO = sqrt(r2O);
-    r2H1 = rH1 * rH1;
-    r2H2 = rH2 * rH2;
-
-    // Reset potential
-    dvO = 0;
-    dvH1 = 0;
-    dvH2 = 0;
-
-    for (int state = 0; state < n_lambdas; state++) {
-        qa_type = D_qatypes[qi + n_qatoms * state];
-        qi_type = D_qcatypes[qa_type.code - 1];
-
-        ai_aii = qi_type.Ai;
-        ai_bii = qi_type.Bi;
-
-        V_a = ai_aii * A_O / (r6O * r6O);
-        V_b = ai_bii * B_O / (r6O);
-
-        VelO = D_topo.coulomb_constant * crg_ow * D_qcharges[qi + n_qatoms * state].q * rO;
-        VelH1 = D_topo.coulomb_constant * crg_hw * D_qcharges[qi + n_qatoms * state].q * rH1;
-        VelH2 = D_topo.coulomb_constant * crg_hw * D_qcharges[qi + n_qatoms * state].q * rH2;
-
-        dvO += r2O * (-VelO - (12 * V_a - 6 * V_b)) * D_lambdas[state];
-        dvH1 -= r2H1 * VelH1 * D_lambdas[state];
-        dvH2 -= r2H2 * VelH2 * D_lambdas[state];
-
-        // Update Q totals
-        Ecoul_S[row][state * BLOCK_SIZE + column] += (VelO + VelH1 + VelH2);
-        Evdw_S[row][state * BLOCK_SIZE + column] += (V_a - V_b);
-    }
-
-    // Note r6O is not the usual 1/rO^6, but rather rO^6. be careful!!!
-
-    // Update forces on Q-atom
-    (*qw).Q.x -= (dvO * dO.x + dvH1 * dH1.x + dvH2 * dH2.x);
-    (*qw).Q.y -= (dvO * dO.y + dvH1 * dH1.y + dvH2 * dH2.y);
-    (*qw).Q.z -= (dvO * dO.z + dvH1 * dH1.z + dvH2 * dH2.z);
-
-    // Update forces on water
-    (*qw).O.x += dvO * dO.x;
-    (*qw).O.y += dvO * dO.y;
-    (*qw).O.z += dvO * dO.z;
-    (*qw).H1.x += dvH1 * dH1.x;
-    (*qw).H1.y += dvH1 * dH1.y;
-    (*qw).H1.z += dvH1 * dH1.z;
-    (*qw).H2.x += dvH2 * dH2.x;
-    (*qw).H2.y += dvH2 * dH2.y;
-    (*qw).H2.z += dvH2 * dH2.z;
-}
-
-__global__ void calc_qw_dvel_matrix(int n_qatoms, int n_waters, int n_lambdas, double crg_ow, double crg_hw, double A_O, double B_O,
-                                    coord_t* X, coord_t* W, double* Evdw, double* Ecoul, calc_qw_t* MAT,
-                                    q_catype_t* D_qcatypes, q_atype_t* D_qatypes, q_charge_t* D_qcharges, q_atom_t* D_qatoms, double* D_lambdas, topo_t D_topo) {
-    // Block index
-    int bx = blockIdx.x;
-    int by = blockIdx.y;
-
-    // Thread index
-    int tx = threadIdx.x;
-    int ty = threadIdx.y;
-
-    // TODO implement >2 states on GPU
-    __shared__ double Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE];
-    __shared__ double Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE];
-
-    Ecoul_S[ty][tx] = 0;
-    Evdw_S[ty][tx] = 0;
-    Ecoul_S[ty][tx + BLOCK_SIZE] = 0;
-    Evdw_S[ty][tx + BLOCK_SIZE] = 0;
-
-    int aStart = BLOCK_SIZE * by;
-    int bStart = 3 * BLOCK_SIZE * bx;
-
-    if (aStart + ty >= n_qatoms) return;
-    if (bStart + 3 * tx >= 3 * n_waters) return;
-
-    __shared__ coord_t Qs[BLOCK_SIZE];
-    __shared__ coord_t Ws[3 * BLOCK_SIZE];
-
-    if (tx == 0) {
-        Qs[ty] = X[D_qatoms[aStart + ty].a - 1];
-    }
-
-    if (ty == 0) {
-        Ws[3 * tx] = W[bStart + 3 * tx];
-        Ws[3 * tx + 1] = W[bStart + 3 * tx + 1];
-        Ws[3 * tx + 2] = W[bStart + 3 * tx + 2];
-    }
-
-    __syncthreads();
-
-    calc_qw_t qw;
-    memset(&qw, 0, sizeof(calc_qw_t));
-
-    int row = by * BLOCK_SIZE + ty;
-    int column = bx * BLOCK_SIZE + tx;
-
-    calc_qw_dvel_matrix_incr(ty, aStart + ty, tx, n_lambdas, n_qatoms, crg_ow, crg_hw, A_O, B_O, Qs, Ws, Evdw_S, Ecoul_S,
-                             &qw, D_qcatypes, D_qatypes, D_qcharges, D_qatoms, D_lambdas, D_topo);
-
-    MAT[column + n_waters * row] = qw;
-
-    __syncthreads();
-
-    int rowlen = (n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    int collen = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
-
-    if (tx == 0 && ty == 0) {
-        // TODO implement >2 states on GPU
-        for (int state = 0; state < min(2, n_lambdas); state++) {
-            double tot_Evdw = 0;
-            double tot_Ecoul = 0;
-            for (int i = 0; i < BLOCK_SIZE; i++) {
-                for (int j = 0; j < BLOCK_SIZE; j++) {
-                    tot_Evdw += Evdw_S[i][j + state * BLOCK_SIZE];
-                    tot_Ecoul += Ecoul_S[i][j + state * BLOCK_SIZE];
-                }
-            }
-            Evdw[rowlen * collen * state + rowlen * by + bx] = tot_Evdw;
-            Ecoul[rowlen * collen * state + rowlen * by + bx] = tot_Ecoul;
-        }
-    }
-
-    __syncthreads();
-}
 } // namespace CudaNonbondedQWForce

 void calc_nonbonded_qw_forces_host_v2() {
     using namespace CudaNonbondedQWForce;
+    int nx = CudaContext::instance().h_q_atoms_list.size();
+    int ny = CudaContext::instance().h_w_atoms_list.size();

-    int mem_size_X = n_atoms_solute * sizeof(coord_t);
-    int mem_size_W = 3 * n_waters * sizeof(coord_t);
-    int mem_size_DV_X = n_atoms_solute * sizeof(dvel_t);
-    int mem_size_DV_W = 3 * n_waters * sizeof(dvel_t);
-    int mem_size_MAT = 3 * n_waters * n_qatoms * sizeof(calc_qw_t);
-
-    int n_blocks_q = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    int n_blocks_w = (n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    // TODO make Evdw & Ecoul work for # of states > 2
-    int mem_size_QW_Evdw = min(n_lambdas, 2) * n_blocks_q * n_blocks_w * sizeof(double);
-    int mem_size_QW_Ecoul = min(n_lambdas, 2) * n_blocks_q * n_blocks_w * sizeof(double);
-
-    CudaContext& ctx = CudaContext::instance();
-    auto X = ctx.d_coords;
-    auto DV_X = ctx.d_dvelocities;
-    auto D_qcatypes = ctx.d_q_catypes;
-    auto D_qatypes = ctx.d_q_atypes;
-    auto D_qcharges = ctx.d_q_charges;
-    auto D_qatoms = ctx.d_q_atoms;
-    auto D_lambdas = ctx.d_lambdas;
-
-    dim3 threads, grid;
-
-    threads = dim3(BLOCK_SIZE, BLOCK_SIZE);
-    grid = dim3((n_waters + BLOCK_SIZE - 1) / threads.x, (n_qatoms + BLOCK_SIZE - 1) / threads.y);
-
-    double evdw, ecoul;
-
-    calc_qw_dvel_matrix<<<grid, threads>>>(n_qatoms, n_waters, n_lambdas, crg_ow, crg_hw, A_O, B_O, X, X + n_atoms_solute, D_QW_Evdw, D_QW_Ecoul,
-                                           QW_MAT, D_qcatypes, D_qatypes, D_qcharges, D_qatoms, D_lambdas, topo);
-    calc_qw_dvel_vector_column<<<((n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_qatoms, n_waters, DV_X, DV_X + n_atoms_solute, QW_MAT);
-    calc_qw_dvel_vector_row<<<((n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_qatoms, n_waters, DV_X, DV_X + n_atoms_solute, QW_MAT, D_qatoms);
-
-#ifdef DEBUG
-    cudaMemcpy(h_QW_MAT, QW_MAT, mem_size_MAT, cudaMemcpyDeviceToHost);
-#endif
-
-#ifdef DEBUG
-    for (int i = 0; i < n_qatoms; i++) {
-        for (int j = 0; j < 3 * n_waters; j++) {
-            if (h_QW_MAT[3 * i * n_waters + j].Q.x > 100)
-                printf("QW_MAT[%d][%d].Q = %f %f %f\n", i, j, h_QW_MAT[i * 3 * n_waters + j].Q.x, h_QW_MAT[i * 3 * n_waters + j].Q.y, h_QW_MAT[i * 3 * n_waters + j].Q.z);
-        }
-    }
-#endif
-
-    cudaMemcpy(dvelocities, DV_X, mem_size_DV_X, cudaMemcpyDeviceToHost);
-    cudaMemcpy(&dvelocities[n_atoms_solute], DV_X + n_atoms_solute, mem_size_DV_W, cudaMemcpyDeviceToHost);
-
-    // TODO make Evdw & Ecoul work for # of states > 2
-    for (int state = 0; state < min(2, n_lambdas); state++) {
-        calc_energy_sum<<<1, threads>>>(n_blocks_q, n_blocks_w, D_QW_evdw_TOT, D_QW_ecoul_TOT, &D_QW_Evdw[state * n_blocks_w * n_blocks_q],
-                                        &D_QW_Ecoul[state * n_blocks_w * n_blocks_q], false);
-
-        cudaMemcpy(&QW_evdw_TOT, D_QW_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost);
-        cudaMemcpy(&QW_ecoul_TOT, D_QW_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost);
-
-        EQ_nonbond_qw[state].Uvdw += QW_evdw_TOT;
-        EQ_nonbond_qw[state].Ucoul += QW_ecoul_TOT;
+    for (int state = 0; state < n_lambdas; state++) {
+        auto result = calc_nonbonded_force_host(
+            nx,
+            ny,
+            CudaContext::instance().d_q_atoms_list,
+            CudaContext::instance().d_w_atoms_list,
+            false,
+            CudaContext::instance().d_q_charge_types + state * nx + nx * n_lambdas,
+            CudaContext::instance().d_w_charge_types,
+            CudaContext::instance().d_charge_table_all,
+            CudaContext::instance().d_q_catype_types + state * nx + nx * n_lambdas,
+            CudaContext::instance().d_w_catype_types,
+            CudaContext::instance().d_catype_table_all);
+
+        EQ_nonbond_qw[state].Uvdw += result.first / lambdas[state];
+        EQ_nonbond_qw[state].Ucoul += result.second / lambdas[state];
+        printf("Nonbonded QW Force State %d: Uvdw = %f, Ucoul = %f\n", state, result.first, result.second);
     }
 }

 void init_nonbonded_qw_force_kernel_data() {
     using namespace CudaNonbondedQWForce;
-    if (!is_initialized) {
-        catype_t catype_ow;  // Atom type of first O atom
-
-        catype_ow = catypes[atypes[n_atoms_solute].code - 1];
-
-        A_O = catype_ow.aii_normal;
-        B_O = catype_ow.bii_normal;
-
-        int n_blocks_q = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
-        int n_blocks_w = (n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE;
-        int mem_size_MAT = 3 * n_waters * n_qatoms * sizeof(calc_qw_t);
-
-        int mem_size_QW_Evdw = min(n_lambdas, 2) * n_blocks_q * n_blocks_w * sizeof(double);
-        int mem_size_QW_Ecoul = min(n_lambdas, 2) * n_blocks_q * n_blocks_w * sizeof(double);
-#ifdef DEBUG
-        printf("Allocating QW_MAT\n");
-#endif
-        check_cudaMalloc((void**)&QW_MAT, mem_size_MAT);
-
-#ifdef DEBUG
-        printf("Allocating D_QW_Evdw\n");
-#endif
-        check_cudaMalloc((void**)&D_QW_Evdw, mem_size_QW_Evdw);
-#ifdef DEBUG
-        printf("Allocating D_QW_Ecoul\n");
-#endif
-        check_cudaMalloc((void**)&D_QW_Ecoul, mem_size_QW_Ecoul);
-
-        check_cudaMalloc((void**)&D_QW_evdw_TOT, sizeof(double));
-        check_cudaMalloc((void**)&D_QW_ecoul_TOT, sizeof(double));
-
-        h_QW_Evdw = (double*)malloc(mem_size_QW_Evdw);
-        h_QW_Ecoul = (double*)malloc(mem_size_QW_Ecoul);
-
-        h_QW_MAT = (calc_qw_t*)malloc(mem_size_MAT);
-        is_initialized = true;
-    }
+    if (is_initialized) return;
+    is_initialized = true;
 }

 void cleanup_nonbonded_qw_force() {
     using namespace CudaNonbondedQWForce;
-
-    if (is_initialized) {
-        cudaFree(QW_MAT);
-        cudaFree(D_QW_Evdw);
-        cudaFree(D_QW_Ecoul);
-        cudaFree(D_QW_evdw_TOT);
-        cudaFree(D_QW_ecoul_TOT);
-
-        free(h_QW_Evdw);
-        free(h_QW_Ecoul);
-        free(h_QW_MAT);
-
-        is_initialized = false;
-    }
-}
\ No newline at end of file
+    if (!is_initialized) return;
+    is_initialized = false;
+}
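The deleted QW kernels addressed each water molecule as an O, H1, H2 triplet at offsets 3 * w, 3 * w + 1, 3 * w + 2 past the solute block (see the DV_W indexing above); the generic path reaches the same atoms through the flat d_w_atoms_list instead. A sketch of that correspondence, with the helper name illustrative only:

    // Illustrative only: the flat atom index the deleted kernels used for
    // water w, assuming solvent atoms follow the solute block as O, H1, H2.
    inline int water_atom(int n_atoms_solute, int w, int site /* 0 = O, 1 = H1, 2 = H2 */) {
        return n_atoms_solute + 3 * w + site;
    }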
diff --git a/src/core/cuda/src/CudaNonbondedWWForce.cu b/src/core/cuda/src/CudaNonbondedWWForce.cu
index 84b6652c..e4da8f51 100644
--- a/src/core/cuda/src/CudaNonbondedWWForce.cu
+++ b/src/core/cuda/src/CudaNonbondedWWForce.cu
@@ -1,312 +1,36 @@
+#include <cstdio>
+#include <utility>
+
 #include "cuda/include/CudaContext.cuh"
+#include "cuda/include/CudaNonbondedForce.cuh"
 #include "cuda/include/CudaNonbondedWWForce.cuh"

 namespace CudaNonbondedWWForce {

 bool is_initialized = false;

-double *D_WW_evdw_TOT, *D_WW_ecoul_TOT, WW_evdw_TOT, WW_ecoul_TOT;
-
-__device__ __forceinline__ void calculate_unforce_bound(
-    const int y, const int x, const coord_t& q, const coord_t& p,
-    const topo_t& D_topo, const double& crg_ow, const double& crg_hw,
-    const double& A_OO, const double& B_OO, double& evdw, double& ecoul,
-    double& dv, double& tmpx, double& tmpy, double& tmpz) {
-    int belong_y = y / 3;
-    int belong_x = x / 3;
-    if (belong_y == belong_x) {
-        return;
-    }
-
-    bool y_is_o = (y % 3 == 0);
-    bool x_is_o = (x % 3 == 0);
-
-    // Compute distance components
-    tmpx = p.x - q.x;
-    tmpy = p.y - q.y;
-    tmpz = p.z - q.z;
-    // double inv_dis = 1.0 / sqrt(pow(tmpx, 2) + pow(tmpy, 2) + pow(tmpz, 2));
-    double inv_dis = rsqrt(tmpx * tmpx + tmpy * tmpy + tmpz * tmpz);
-    double inv_dis2 = inv_dis * inv_dis;
-
-    ecoul = inv_dis * D_topo.coulomb_constant * (y_is_o ? crg_ow : crg_hw) *
-            (x_is_o ? crg_ow : crg_hw);
-    double v_a = 0, v_b = 0;
-    if (y_is_o && x_is_o) {
-        double inv_dis6 = inv_dis2 * inv_dis2 * inv_dis2;
-        double inv_dis12 = inv_dis6 * inv_dis6;
-        v_a = A_OO * inv_dis12;
-        v_b = B_OO * inv_dis6;
-        evdw = v_a - v_b;
-        dv = inv_dis * inv_dis * (-ecoul - 12.0 * v_a + 6.0 * v_b);
-    } else {
-        dv = inv_dis * inv_dis * -ecoul;
-    }
-}
-
-template <int Block_x, int Block_y, int Thread_x, int Thread_y>
-__global__ void
-calc_ww(const int N, const double crg_ow, const double crg_hw,
-        const double A_OO, const double B_OO, const topo_t D_topo,
-        coord_t* __restrict__ W, dvel_t* __restrict__ DV_W,
-        double* __restrict__ Evdw_TOT, double* __restrict__ ecoul_TOT) {
-    // Calculate block boundaries
-    int NX = N;
-    int NY = (N + 1) / 2;
-    int x_cal_num = blockDim.x * Block_x;
-    int y_cal_num = blockDim.y * Block_y;
-    int block_x_left_begin = 1 + blockIdx.x * x_cal_num;
-    int block_y_left_begin = blockIdx.y * y_cal_num;
-    int block_x_left_end = min(block_x_left_begin + x_cal_num - 1, NX - 1);
-    int block_y_left_end = min(block_y_left_begin + y_cal_num - 1, NY - 1);
-
-    // Shared memory declarations with padding to avoid bank conflicts
-    __shared__ coord_t p[2 * Thread_x * Block_x + 1];
-    __shared__ coord_t q[2 * Thread_y * Block_y + 1];
-    __shared__ double sum_row_x[2 * Thread_y * Block_y + 1];
-    __shared__ double sum_row_y[2 * Thread_y * Block_y + 1];
-    __shared__ double sum_row_z[2 * Thread_y * Block_y + 1];
-
-    __shared__ double block_ecoul[(Thread_x * Thread_y + 31) / 32],
-        block_evdw[(Thread_x * Thread_y + 31) / 32];
-
-    // Thread indices
-    int thread_y_left_begin = block_y_left_begin + threadIdx.y * Block_y;
-    int thread_x_left_begin = block_x_left_begin + threadIdx.x * Block_x;
-    int cur_thread_num = blockDim.x * threadIdx.y + threadIdx.x;
-    int thread_num = blockDim.x * blockDim.y;
-
-// Optimized coordinate loading with coalesced memory access
-#pragma unroll
-    for (int i = cur_thread_num; i < x_cal_num && block_x_left_begin + i < NX; i += thread_num) {
-        p[i] = W[block_x_left_begin + i];
-        p[x_cal_num + i] = W[N - (block_x_left_begin + i)];
-    }
-
-#pragma unroll
-    for (int i = cur_thread_num; i < y_cal_num && block_y_left_begin + i < NY; i += thread_num) {
-        q[i] = W[block_y_left_begin + i];
-        q[y_cal_num + i] = W[N - 1 - (block_y_left_begin + i)];
-    }
-
-// Initialize sum arrays
-#pragma unroll
-    for (int i = cur_thread_num; i < y_cal_num && block_y_left_begin + i < NY; i += thread_num) {
-        sum_row_x[i] = sum_row_x[y_cal_num + i] = 0;
-        sum_row_y[i] = sum_row_y[y_cal_num + i] = 0;
-        sum_row_z[i] = sum_row_z[y_cal_num + i] = 0;
-    }
-    __syncthreads();
-
-    // Initialize column sums
-    double sum_col_x[2 * Block_x], sum_col_y[2 * Block_x], sum_col_z[2 * Block_x];
-#pragma unroll
-    for (int i = 0; i < Block_x; i++) {
-        sum_col_x[i] = sum_col_x[Block_x + i] = 0;
-        sum_col_y[i] = sum_col_y[Block_x + i] = 0;
-        sum_col_z[i] = sum_col_z[Block_x + i] = 0;
-    }
-
-    // Main computation loop with reduced thread divergence
-    double evdw_tot = 0, ecoul_tot = 0;
-#pragma unroll
-    for (int i = 0; i < Block_x; i++) {
-        int i2 = i;
-        int x = thread_x_left_begin + i2;
-        if (x >= NX) continue;
-
-        int offset_x = x - block_x_left_begin;
-        const coord_t& now_p0 = p[offset_x];
-        const coord_t& now_p1 = p[x_cal_num + offset_x];
-
-#pragma unroll
-        for (int j = 0; j < Block_y; j++) {
-            int j2 = (j + threadIdx.x) % Block_y;
-            int y = thread_y_left_begin + j2;
-            if (y >= NY) continue;
-
-            // Optimized condition check
-            if (y >= x && (N % 2 == 0 || y != NY - 1)) {
-                int offset_y = y - block_y_left_begin;
-                double evdw = 0, ecoul = 0, dv = 0;
-                double tmpx = 0, tmpy = 0, tmpz = 0;
-
-                int y2 = N - 1 - y;
-                int x2 = N - x;
-                calculate_unforce_bound(y2, x2, q[y_cal_num + offset_y], now_p1, D_topo,
-                                        crg_ow, crg_hw, A_OO, B_OO, evdw, ecoul, dv,
-                                        tmpx, tmpy, tmpz);
-
-                evdw_tot += evdw;
-                ecoul_tot += ecoul;
-                double v_x = dv * tmpx;
-                double v_y = dv * tmpy;
-                double v_z = dv * tmpz;
-
-                sum_col_x[Block_x + i2] += v_x;
-                sum_col_y[Block_x + i2] += v_y;
-                sum_col_z[Block_x + i2] += v_z;
-                sum_row_x[y_cal_num + offset_y] -= v_x;
-                sum_row_y[y_cal_num + offset_y] -= v_y;
-                sum_row_z[y_cal_num + offset_y] -= v_z;
-            } else if (y < x) {
-                int offset_y = y - block_y_left_begin;
-                double evdw = 0, ecoul = 0, dv = 0;
-                double tmpx = 0, tmpy = 0, tmpz = 0;
-
-                calculate_unforce_bound(y, x, q[offset_y], now_p0, D_topo, crg_ow,
-                                        crg_hw, A_OO, B_OO, evdw, ecoul, dv, tmpx, tmpy,
-                                        tmpz);
-
-                evdw_tot += evdw;
-                ecoul_tot += ecoul;
-                double v_x = dv * tmpx;
-                double v_y = dv * tmpy;
-                double v_z = dv * tmpz;
-
-                sum_col_x[i2] += v_x;
-                sum_col_y[i2] += v_y;
-                sum_col_z[i2] += v_z;
-                sum_row_x[offset_y] -= v_x;
-                sum_row_y[offset_y] -= v_y;
-                sum_row_z[offset_y] -= v_z;
-            }
-        }
-    }
-
-// Optimized reduction using warp-level primitives
-#pragma unroll
-    for (unsigned w = 16; w >= 1; w /= 2) {
-        ecoul_tot += __shfl_down_sync(0xffffffff, ecoul_tot, w);
-        evdw_tot += __shfl_down_sync(0xffffffff, evdw_tot, w);
-    }
-
-    // Store block results
-    int warp_id = cur_thread_num / warpSize;
-    int lane_id = cur_thread_num % warpSize;
-    if (lane_id == 0) {
-        block_evdw[warp_id] = evdw_tot;
-        block_ecoul[warp_id] = ecoul_tot;
-    }
-    __syncthreads();
-
-    // Final reduction
-    if (warp_id == 0) {
-        double val1 = (lane_id < (Thread_x * Thread_y + 31) / 32) ? block_ecoul[lane_id] : 0;
-        double val2 = (lane_id < (Thread_x * Thread_y + 31) / 32) ? block_evdw[lane_id] : 0;
-
-#pragma unroll
-        for (unsigned w = 16; w >= 1; w /= 2) {
-            val1 += __shfl_down_sync(0xffffffff, val1, w);
-            val2 += __shfl_down_sync(0xffffffff, val2, w);
-        }
-
-        if (lane_id == 0) {
-            atomicAdd(ecoul_TOT, val1);
-            atomicAdd(Evdw_TOT, val2);
-        }
-    }
-
-// Optimized row reduction
-#pragma unroll
-    for (int i = cur_thread_num; i < y_cal_num && block_y_left_begin + i < NY; i += thread_num) {
-        int idx = block_y_left_begin + i;
-        if (idx < block_x_left_end) {
-            atomicAdd(&DV_W[idx].x, sum_row_x[i]);
-            atomicAdd(&DV_W[idx].y, sum_row_y[i]);
-            atomicAdd(&DV_W[idx].z, sum_row_z[i]);
-        }
-        if (idx >= block_x_left_begin) {
-            idx = N - 1 - idx;
-            atomicAdd(&DV_W[idx].x, sum_row_x[i + y_cal_num]);
-            atomicAdd(&DV_W[idx].y, sum_row_y[i + y_cal_num]);
-            atomicAdd(&DV_W[idx].z, sum_row_z[i + y_cal_num]);
-        }
-    }
-
-// Optimized column reduction
-#pragma unroll
-    for (int i = 0; i < Block_x; i++) {
-        int i2 = (i + threadIdx.y) % Block_x;
-        int idx = thread_x_left_begin + i2;
-        if (idx < N) {
-            if (block_y_left_begin < idx) {
-                atomicAdd(&DV_W[idx].x, sum_col_x[i2]);
-                atomicAdd(&DV_W[idx].y, sum_col_y[i2]);
-                atomicAdd(&DV_W[idx].z, sum_col_z[i2]);
-            }
-            if (block_y_left_end >= idx) {
-                atomicAdd(&DV_W[N - idx].x, sum_col_x[i2 + Block_x]);
-                atomicAdd(&DV_W[N - idx].y, sum_col_y[i2 + Block_x]);
-                atomicAdd(&DV_W[N - idx].z, sum_col_z[i2 + Block_x]);
-            }
-        }
-    }
-}
 } // namespace CudaNonbondedWWForce

 void calc_nonbonded_ww_forces_host_v2() {
     using namespace CudaNonbondedWWForce;
-    int N = 3 * n_waters;
-    int mem_size_W = N * sizeof(coord_t);
-    int mem_size_DV_W = N * sizeof(dvel_t);
-
-    WW_evdw_TOT = 0;
-    WW_ecoul_TOT = 0;
-    cudaMemcpy(D_WW_evdw_TOT, &WW_evdw_TOT, sizeof(double),
-               cudaMemcpyHostToDevice);
-    cudaMemcpy(D_WW_ecoul_TOT, &WW_ecoul_TOT, sizeof(double),
-               cudaMemcpyHostToDevice);
-
-    CudaContext& ctx = CudaContext::instance();
-    auto W = ctx.d_coords + n_atoms_solute;
-    auto DV_W = ctx.d_dvelocities + n_atoms_solute;
-
-    // Optimize thread block configuration
-    const int thread_num_x = 32;  // Keep at 32 for better warp utilization
-    const int thread_num_y = 1;   // Keep at 1 for better memory coalescing
-    dim3 block_sz = dim3(thread_num_x, thread_num_y);
-
-    const int N_ITERATION_Y = 32;  // Keep at 32 for better memory access pattern
-    const int N_ITERATION_X = 1;   // Keep at 1 for better memory coalescing
-
-    int block_element_sz_x = N_ITERATION_X * block_sz.x;
-    int block_element_sz_y = N_ITERATION_Y * block_sz.y;
-
-    int grid_sz_x = (N - 1 + block_element_sz_x - 1) / (block_element_sz_x);
-    int grid_sz_y = ((N + 1) / 2 + block_element_sz_y - 1) / block_element_sz_y;
-    dim3 grid = dim3(grid_sz_x, grid_sz_y);
-    calc_ww<N_ITERATION_X, N_ITERATION_Y, thread_num_x, thread_num_y>
-        <<<grid, block_sz>>>(N, crg_ow, crg_hw, A_OO, B_OO, topo, W, DV_W,
-                             D_WW_evdw_TOT, D_WW_ecoul_TOT);
-
-    cudaDeviceSynchronize();
-    cudaMemcpy(&dvelocities[n_atoms_solute], DV_W, mem_size_DV_W,
-               cudaMemcpyDeviceToHost);
-    cudaMemcpy(&WW_evdw_TOT, D_WW_evdw_TOT, sizeof(double),
-               cudaMemcpyDeviceToHost);
-    cudaMemcpy(&WW_ecoul_TOT, D_WW_ecoul_TOT, sizeof(double),
-               cudaMemcpyDeviceToHost);
-    E_nonbond_ww.Uvdw += WW_evdw_TOT;
-    E_nonbond_ww.Ucoul += WW_ecoul_TOT;
+    int nx = CudaContext::instance().h_w_atoms_list.size();
+    int ny = nx;
+    auto result = calc_nonbonded_force_host(
+        nx, ny,
+        CudaContext::instance().d_w_atoms_list,
+        CudaContext::instance().d_w_atoms_list,
+        true,
+        CudaContext::instance().d_w_charge_types,
+        CudaContext::instance().d_w_charge_types,
+        CudaContext::instance().d_charge_table_all,
+        CudaContext::instance().d_w_catype_types,
+        CudaContext::instance().d_w_catype_types,
+        CudaContext::instance().d_catype_table_all);
+    E_nonbond_ww.Uvdw = result.first;
+    E_nonbond_ww.Ucoul = result.second;
 }

 void init_nonbonded_ww_force_kernel_data() {
     using namespace CudaNonbondedWWForce;
     if (!is_initialized) {
-        catype_t catype_ow;                // Atom type of first O, H atom
-        ccharge_t ccharge_ow, ccharge_hw;  // Charge of first O, H atom
-
-        catype_ow = catypes[atypes[n_atoms_solute].code - 1];
-        ccharge_ow = ccharges[charges[n_atoms_solute].code - 1];
-        ccharge_hw = ccharges[charges[n_atoms_solute + 1].code - 1];
-
-        A_OO = pow(catype_ow.aii_normal, 2);
-        B_OO = pow(catype_ow.bii_normal, 2);
-
-        crg_ow = ccharge_ow.charge;
-        crg_hw = ccharge_hw.charge;
-
-        check_cudaMalloc((void**)&D_WW_evdw_TOT, sizeof(double));
-        check_cudaMalloc((void**)&D_WW_ecoul_TOT, sizeof(double));
         is_initialized = true;
     }
 }

@@ -314,8 +38,6 @@ void init_nonbonded_ww_force_kernel_data() {
 void cleanup_nonbonded_ww_force() {
     using namespace CudaNonbondedWWForce;
     if (is_initialized) {
-        cudaFree(D_WW_evdw_TOT);
-        cudaFree(D_WW_ecoul_TOT);
        is_initialized = false;
     }
-}
\ No newline at end of file
+}
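The deleted WW setup precomputed A_OO and B_OO as pow(aii_normal, 2) / pow(bii_normal, 2) of the water oxygen type, which is the i == j case of the geometric combination the pairwise kernels apply elsewhere in this diff (ai_aii * aj_aii). The table-driven replacement can therefore recover the same pair parameters from per-type entries; a sketch, with function names illustrative only:

    // Geometric combination assumed by the deleted kernels: A_ij = aii_i * aii_j,
    // B_ij = bii_i * bii_j. For two water oxygens this collapses to
    // A_OO = aii_ow * aii_ow, matching the removed pow(catype_ow.aii_normal, 2).
    inline double lj_pair_A(double aii_i, double aii_j) { return aii_i * aii_j; }
    inline double lj_pair_B(double bii_i, double bii_j) { return bii_i * bii_j; }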
diff --git a/src/core/parse.cu b/src/core/parse.cu
index 1b9e0f8d..c5518223 100644
--- a/src/core/parse.cu
+++ b/src/core/parse.cu
@@ -753,6 +753,7 @@ void init_ngbrs14(const char* filename) {
             int jx = (lineI + i + 1) % lines;
             // if (ix < 100 && jx < 100) printf("i = %d j = %d\n", ix+1, jx+1);
             LJ_matrix[ix * n_atoms_solute + jx] = 1;
+            LJ_matrix[jx * n_atoms_solute + ix] = 1;
         }
     }
     lineI++;
@@ -797,6 +798,7 @@ void init_ngbrs23(const char* filename) {
             int jx = (lineI + i + 1) % lines;
             // if (ix < 100 && jx < 100) printf("i = %d j = %d\n", ix+1, jx+1);
             LJ_matrix[ix * n_atoms_solute + jx] = 3;
+            LJ_matrix[jx * n_atoms_solute + ix] = 3;
         }
     }
     lineI++;
@@ -819,6 +821,7 @@ void init_ngbrs14_long(const char* filename) {
         int ix = atoi(file.buffer[i+1][0])-1;
         int jx = atoi(file.buffer[i+1][1])-1;
         LJ_matrix[ix * n_atoms_solute + jx] = 1;
+        LJ_matrix[jx * n_atoms_solute + ix] = 1;
     }

     clean_csv(file);
@@ -838,6 +841,7 @@ void init_ngbrs23_long(const char* filename) {
         int ix = atoi(file.buffer[i+1][0])-1;
         int jx = atoi(file.buffer[i+1][1])-1;
         LJ_matrix[ix * n_atoms_solute + jx] = 3;
+        LJ_matrix[jx * n_atoms_solute + ix] = 3;
     }

     clean_csv(file);
diff --git a/src/core/system.cu b/src/core/system.cu
index afb999cf..18c15d95 100644
--- a/src/core/system.cu
+++ b/src/core/system.cu
@@ -29,6 +29,7 @@
 #include "cuda/include/CudaNonbondedQWForce.cuh"
 #include "cuda/include/CudaNonbondedPWForce.cuh"
 #include "cuda/include/CudaNonbondedWWForce.cuh"
+#include "cuda/include/CudaNonbondedForce.cuh"

 #include
 #include
@@ -1320,6 +1321,7 @@ void init_cuda_kernel_data() {
     init_bond_force_kernel_data();
     init_improper2_force_kernel_data();
     init_leapfrog_kernel_data();
+    init_nonbonded_force_kernel_data();
     init_nonbonded_pp_force_kernel_data();
     init_nonbonded_pw_force_kernel_data();
     init_nonbonded_qp_force_kernel_data();
@@ -1564,6 +1566,7 @@ void clean_variables() {
     cleanup_nonbonded_qq_force();
     cleanup_nonbonded_qw_force();
     cleanup_nonbonded_ww_force();
+    cleanup_nonbonded_force();
     cleanup_polx_water_force();
     cleanup_pshell_force();
     cleanup_radix_water_force();
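The four parse.cu hunks make the same fix: the neighbour flags were previously written only at (ix, jx), so any code indexing LJ_matrix in the transposed order, as the generic nonbonded path presumably can, would miss the 1-4 and 2-3 exclusions. A compact equivalent of the repeated addition (helper name illustrative, not part of the diff):

    // Illustrative helper equivalent to the repeated parse.cu additions:
    // record a neighbour flag (1 = 1-4 pair, 3 = 2-3 pair) symmetrically so
    // either lookup order sees it.
    inline void set_lj_flag(int* LJ_matrix, int n_atoms_solute, int ix, int jx, int flag) {
        LJ_matrix[ix * n_atoms_solute + jx] = flag;
        LJ_matrix[jx * n_atoms_solute + ix] = flag;
    }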