5 changes: 5 additions & 0 deletions src/core/cuda/include/CudaContext.cuh
@@ -59,6 +59,9 @@ class CudaContext {
E_nonbonded_t* d_EQ_nonbond_qq = nullptr;
double* d_lambdas = nullptr;

q_charge_t* d_q_aggregate_charges = nullptr;
q_catype_t* d_q_aggregate_catypes = nullptr;

/*
Used in CudaPolxWaterForce.cu
*/
@@ -124,6 +127,8 @@ class CudaContext {
private:
CudaContext() = default;

void init_data_for_q();

void free();

~CudaContext() { free(); }
41 changes: 40 additions & 1 deletion src/core/cuda/src/CudaContext.cu
@@ -1,6 +1,7 @@
#include <iostream>

#include "cuda/include/CudaContext.cuh"
#include <vector>

void CudaContext::init() {
check_cudaMalloc((void**)&d_coords, sizeof(coord_t) * n_atoms);
@@ -54,6 +55,8 @@ void CudaContext::init() {
check_cudaMalloc((void**)&d_charges, sizeof(charge_t) * n_charges);
check_cudaMalloc((void**)&d_p_atoms, sizeof(p_atom_t) * n_patoms);

init_data_for_q();

sync_all_to_device();
}

@@ -183,7 +186,6 @@ void CudaContext::free() {
cudaFree(d_excluded);
cudaFree(d_q_elscales);
cudaFree(d_q_catypes);
cudaFree(d_q_atypes);
cudaFree(d_EQ_nonbond_qq);
cudaFree(d_lambdas);

@@ -208,10 +210,47 @@ void CudaContext::free() {
cudaFree(d_ccharges);
cudaFree(d_charges);
cudaFree(d_p_atoms);

cudaFree(d_q_aggregate_charges);
cudaFree(d_q_aggregate_catypes);
}

void CudaContext::reset_energies() {
cudaMemset(d_dvelocities, 0, sizeof(dvel_t) * n_atoms);
cudaMemset(d_EQ_nonbond_qq, 0, sizeof(E_nonbonded_t) * n_lambdas);
cudaMemset(d_EQ_restraint, 0, sizeof(E_restraint_t) * n_lambdas);
}

void CudaContext::init_data_for_q() {

// Precompute lambda-weighted aggregate q charges and atom-type parameters and upload them to the device

// q charges
std::vector<q_charge_t> charges(n_qatoms, q_charge_t{0.0});
for (int i = 0; i < n_qatoms; i++) {
double q = 0;
for (int state = 0; state < n_lambdas; state++) {
q += q_charges[state * n_qatoms + i].q * lambdas[state];
}
charges[i].q = q;
}
check_cudaMalloc((void**)&d_q_aggregate_charges, sizeof(q_charge_t) * n_qatoms);
check_cuda(cudaMemcpy(d_q_aggregate_charges, charges.data(), sizeof(q_charge_t) * n_qatoms, cudaMemcpyHostToDevice));


// q atypes
std::vector<q_catype_t> atypes(n_qatoms, q_catype_t{"", 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0});
for (int i = 0; i < n_qatoms; i++) {
for (int state = 0; state < n_lambdas; state++) {
atypes[i].Ai += q_catypes[q_atypes[state * n_qatoms + i].code - 1].Ai * lambdas[state];
atypes[i].Bi += q_catypes[q_atypes[state * n_qatoms + i].code - 1].Bi * lambdas[state];
atypes[i].Ci += q_catypes[q_atypes[state * n_qatoms + i].code - 1].Ci * lambdas[state];
atypes[i].ai += q_catypes[q_atypes[state * n_qatoms + i].code - 1].ai * lambdas[state];
atypes[i].Ai_14 += q_catypes[q_atypes[state * n_qatoms + i].code - 1].Ai_14 * lambdas[state];
atypes[i].Bi_14 += q_catypes[q_atypes[state * n_qatoms + i].code - 1].Bi_14 * lambdas[state];
atypes[i].m += q_catypes[q_atypes[state * n_qatoms + i].code - 1].m * lambdas[state];
}
}
check_cudaMalloc((void**)&d_q_aggregate_catypes, sizeof(q_catype_t) * n_qatoms);
check_cuda(cudaMemcpy(d_q_aggregate_catypes, atypes.data(), sizeof(q_catype_t) * n_qatoms, cudaMemcpyHostToDevice));
}
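
For readers skimming the diff: init_data_for_q collapses the per-state FEP charges and atom-type parameters into a single lambda-weighted value per Q-atom, so the kernels no longer need to loop over states. A minimal host-side sketch of the charge reduction, using plain doubles in place of the q_charge_t struct (the state-major indexing state * n_qatoms + i is taken from the diff; everything else here is illustrative, not the repo's API):

```cpp
#include <vector>

// Sketch of the lambda-weighted charge aggregation done in init_data_for_q,
// with plain doubles instead of q_charge_t.
// Layout assumption (from the diff): index = state * n_qatoms + i.
std::vector<double> aggregate_q_charges(const std::vector<double>& per_state_charges,
                                        const std::vector<double>& lambdas,
                                        int n_qatoms, int n_lambdas) {
    std::vector<double> mixed(n_qatoms, 0.0);
    for (int i = 0; i < n_qatoms; i++) {
        for (int state = 0; state < n_lambdas; state++) {
            mixed[i] += per_state_charges[state * n_qatoms + i] * lambdas[state];
        }
    }
    return mixed;  // mixed[i] = sum over states of lambda_s * q_i^(s)
}
```

The same component-wise mixing is applied to the Lennard-Jones parameters (Ai, Bi, Ci, ai, Ai_14, Bi_14, m) to build d_q_aggregate_catypes.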
121 changes: 55 additions & 66 deletions src/core/cuda/src/CudaNonbondedQPForce.cu
@@ -42,6 +42,7 @@ __global__ void calc_energy_sum(int rows, int columns, double* Evdw_TOT, double*
}
Ecoul_S[ty][tx] = coul_TOT;
Evdw_S[ty][tx] = vdw_TOT;
// printf("Thread (%d,%d) coul_TOT = %f vdw_TOT = %f\n", tx, ty, coul_TOT, vdw_TOT);

__syncthreads();

@@ -111,8 +112,8 @@ __global__ void calc_qp_dvel_vector_column(int n_qatoms, int n_patoms, dvel_t* D
}

__device__ void calc_qp_dvel_matrix_incr(int row, int qi, int column, int pj, int n_lambdas, int n_qatoms,
coord_t* Qs, coord_t* Ps, int* LJs, bool* excluded_s, double Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE], double Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE], calc_qp_t* qp,
q_catype_t* D_qcatypes, q_atype_t* D_qatypes, q_charge_t* D_qcharges, p_atom_t* D_patoms, q_atom_t* D_qatoms, double* D_lambdas,
coord_t* Qs, coord_t* Ps, int* LJs, bool* excluded_s, double Evdw_S[BLOCK_SIZE][BLOCK_SIZE], double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE], calc_qp_t* qp,
q_catype_t* D_qcatypes, q_charge_t* D_qcharges, p_atom_t* D_patoms, q_atom_t* D_qatoms,
catype_t* D_catypes, atype_t* D_atypes, ccharge_t* D_ccharges, charge_t* D_charges, topo_t D_topo) {
coord_t da;
double r2, r6, r;
@@ -141,43 +142,40 @@ __device__ void calc_qp_dvel_matrix_incr(int row, int qi, int column, int pj, in
r6 = r2 * r2 * r2;
r2 = 1 / r2;
r = sqrt(r2);

for (int state = 0; state < n_lambdas; state++) {
qi_type = D_qcatypes[D_qatypes[qi + n_qatoms * state].code - 1];
aj_type = D_catypes[D_atypes[j].code - 1];

ai_aii = bond14 ? qi_type.Ai_14 : qi_type.Ai;
aj_aii = bond14 ? aj_type.aii_1_4 : aj_type.aii_normal;
ai_bii = bond14 ? qi_type.Bi_14 : qi_type.Bi;
aj_bii = bond14 ? aj_type.bii_1_4 : aj_type.bii_normal;

Vel = D_topo.coulomb_constant * scaling * D_qcharges[qi + n_qatoms * state].q * D_ccharges[D_charges[j].code - 1].charge * r;
V_a = ai_aii * aj_aii / (r6 * r6);
V_b = ai_bii * aj_bii / r6;
dv = r2 * (-Vel - (12 * V_a - 6 * V_b)) * D_lambdas[state];

// if (state == 0 && qi == 0 && pj == 1) {
// printf("crg_q = %f crg_j = %f r = %f\n", D_qcharges[qi + n_qatoms * state].q, D_ccharges[D_charges[pj].code - 1].charge, r);
// printf("ai_aii = %f aj_aii = %f ai_bii = %f aj_bii = %f\n", ai_aii, aj_aii, ai_bii, aj_bii);
// }

// Update forces
qp->Q.x += dv * da.x;
qp->Q.y += dv * da.y;
qp->Q.z += dv * da.z;
qp->P.x -= dv * da.x;
qp->P.y -= dv * da.y;
qp->P.z -= dv * da.z;

// Update Q totals
Ecoul_S[row][state * BLOCK_SIZE + column] += Vel;
Evdw_S[row][state * BLOCK_SIZE + column] += (V_a - V_b);
}
qi_type = D_qcatypes[qi];
aj_type = D_catypes[D_atypes[j].code - 1];

ai_aii = bond14 ? qi_type.Ai_14 : qi_type.Ai;
aj_aii = bond14 ? aj_type.aii_1_4 : aj_type.aii_normal;
ai_bii = bond14 ? qi_type.Bi_14 : qi_type.Bi;
aj_bii = bond14 ? aj_type.bii_1_4 : aj_type.bii_normal;

Vel = D_topo.coulomb_constant * scaling * D_qcharges[qi].q * D_ccharges[D_charges[j].code - 1].charge * r;
V_a = ai_aii * aj_aii / (r6 * r6);
V_b = ai_bii * aj_bii / r6;
dv = r2 * (-Vel - (12 * V_a - 6 * V_b));

// if (state == 0 && qi == 0 && pj == 1) {
// printf("crg_q = %f crg_j = %f r = %f\n", D_qcharges[qi + n_qatoms * state].q, D_ccharges[D_charges[pj].code - 1].charge, r);
// printf("ai_aii = %f aj_aii = %f ai_bii = %f aj_bii = %f\n", ai_aii, aj_aii, ai_bii, aj_bii);
// }

// Update forces
qp->Q.x += dv * da.x;
qp->Q.y += dv * da.y;
qp->Q.z += dv * da.z;
qp->P.x -= dv * da.x;
qp->P.y -= dv * da.y;
qp->P.z -= dv * da.z;

// Update Q totals
Ecoul_S[row][column] += Vel;
Evdw_S[row][column] += (V_a - V_b);
}

__global__ void calc_qp_dvel_matrix(int n_qatoms, int n_patoms, int n_lambdas, int n_atoms_solute,
coord_t* X, double* Evdw, double* Ecoul, calc_qp_t* QP_MAT,
q_catype_t* D_qcatypes, q_atype_t* D_qatypes, q_charge_t* D_qcharges, p_atom_t* D_patoms, q_atom_t* D_qatoms, double* D_lambdas,
q_catype_t* D_qcatypes, q_charge_t* D_qcharges, p_atom_t* D_patoms, q_atom_t* D_qatoms,
int* D_LJ_matrix, bool* D_excluded, catype_t* D_catypes, atype_t* D_atypes, ccharge_t* D_ccharges, charge_t* D_charges, topo_t D_topo) {
// Block index
int bx = blockIdx.x;
@@ -187,9 +185,8 @@ __global__ void calc_qp_dvel_matrix(int n_qatoms, int n_patoms, int n_lambdas, i
int tx = threadIdx.x;
int ty = threadIdx.y;

// TODO implement >2 states on GPU
__shared__ double Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE];
__shared__ double Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE];
__shared__ double Evdw_S[BLOCK_SIZE][BLOCK_SIZE];
__shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE];

Ecoul_S[ty][tx] = 0;
Evdw_S[ty][tx] = 0;
@@ -230,29 +227,25 @@ __global__ void calc_qp_dvel_matrix(int n_qatoms, int n_patoms, int n_lambdas, i
int column = bx * BLOCK_SIZE + tx;

calc_qp_dvel_matrix_incr(ty, row, tx, column, n_lambdas, n_qatoms, Qs, Ps, LJs, excluded_s, Evdw_S, Ecoul_S,
&qp, D_qcatypes, D_qatypes, D_qcharges, D_patoms, D_qatoms, D_lambdas, D_catypes, D_atypes, D_ccharges, D_charges, D_topo);
&qp, D_qcatypes, D_qcharges, D_patoms, D_qatoms, D_catypes, D_atypes, D_ccharges, D_charges, D_topo);

QP_MAT[n_patoms * row + column] = qp;

__syncthreads();

int rowlen = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
int collen = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE;

if (tx == 0 && ty == 0) {
// TODO implement >2 states on GPU
for (int state = 0; state < min(2, n_lambdas); state++) {
double tot_Evdw = 0;
double tot_Ecoul = 0;
for (int i = 0; i < BLOCK_SIZE; i++) {
for (int j = 0; j < BLOCK_SIZE; j++) {
tot_Evdw += Evdw_S[i][j + state * BLOCK_SIZE];
tot_Ecoul += Ecoul_S[i][j + state * BLOCK_SIZE];
}
double tot_Evdw = 0;
double tot_Ecoul = 0;
for (int i = 0; i < BLOCK_SIZE; i++) {
for (int j = 0; j < BLOCK_SIZE; j++) {
tot_Evdw += Evdw_S[i][j];
tot_Ecoul += Ecoul_S[i][j];
}
Evdw[rowlen * collen * state + rowlen * by + bx] = tot_Evdw;
Ecoul[rowlen * collen * state + rowlen * by + bx] = tot_Ecoul;
}
Evdw[rowlen * by + bx] = tot_Evdw;
Ecoul[rowlen * by + bx] = tot_Ecoul;
}

__syncthreads();
@@ -269,12 +262,10 @@ void calc_nonbonded_qp_forces_host_v2() {
CudaContext& ctx = CudaContext::instance();
auto X = ctx.d_coords;
auto DV_X = ctx.d_dvelocities;
auto D_qcatypes = ctx.d_q_catypes;
auto D_qatypes = ctx.d_q_atypes;
auto D_qcharges = ctx.d_q_charges;
auto D_qcatypes = ctx.d_q_aggregate_catypes;
auto D_qcharges = ctx.d_q_aggregate_charges;
auto D_patoms = ctx.d_p_atoms;
auto D_qatoms = ctx.d_q_atoms;
auto D_lambdas = ctx.d_lambdas;
auto D_LJ_matrix = ctx.d_LJ_matrix;
auto D_excluded = ctx.d_excluded;
auto D_catypes = ctx.d_catypes;
@@ -288,7 +279,7 @@ void calc_nonbonded_qp_forces_host_v2() {
grid = dim3((n_patoms + BLOCK_SIZE - 1) / threads.x, (n_qatoms + BLOCK_SIZE - 1) / threads.y);

calc_qp_dvel_matrix<<<grid, threads>>>(n_qatoms, n_patoms, n_lambdas, n_atoms_solute, X, D_QP_Evdw, D_QP_Ecoul, QP_MAT,
D_qcatypes, D_qatypes, D_qcharges, D_patoms, D_qatoms, D_lambdas, D_LJ_matrix, D_excluded,
D_qcatypes, D_qcharges, D_patoms, D_qatoms, D_LJ_matrix, D_excluded,
D_catypes, D_atypes, D_ccharges, D_charges, topo);
calc_qp_dvel_vector_column<<<((n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_qatoms, n_patoms, DV_X, QP_MAT, D_patoms);
calc_qp_dvel_vector_row<<<((n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_qatoms, n_patoms, DV_X, QP_MAT, D_qatoms);
@@ -309,16 +300,14 @@ void calc_nonbonded_qp_forces_host_v2() {
int mem_size_DV_X = n_atoms * sizeof(dvel_t);
cudaMemcpy(dvelocities, DV_X, mem_size_DV_X, cudaMemcpyDeviceToHost);

// TODO: make Evdw & Ecoul work for # of states > 2
for (int state = 0; state < min(2, n_lambdas); state++) {
calc_energy_sum<<<1, threads>>>(n_blocks_q, n_blocks_p, D_QP_evdw_TOT, D_QP_ecoul_TOT, &D_QP_Evdw[state * n_blocks_p * n_blocks_q], &D_QP_Ecoul[state * n_blocks_p * n_blocks_q], false);
calc_energy_sum<<<1, threads>>>(n_blocks_q, n_blocks_p, D_QP_evdw_TOT, D_QP_ecoul_TOT, D_QP_Evdw, D_QP_Ecoul, false);

cudaMemcpy(&QP_evdw_TOT, D_QP_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(&QP_ecoul_TOT, D_QP_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(&QP_evdw_TOT, D_QP_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(&QP_ecoul_TOT, D_QP_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost);
printf("QP E_vdw: %f E_coul: %f\n", QP_evdw_TOT, QP_ecoul_TOT);

EQ_nonbond_qp[state].Uvdw += QP_evdw_TOT;
EQ_nonbond_qp[state].Ucoul += QP_ecoul_TOT;
}
E_nonbond_qx.Uvdw += QP_evdw_TOT;
E_nonbond_qx.Ucoul += QP_ecoul_TOT;
}

void init_nonbonded_qp_force_kernel_data() {
@@ -328,8 +317,8 @@ void init_nonbonded_qp_force_kernel_data() {
int n_blocks_q = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
int n_blocks_p = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
// TODO make Evdw & Ecoul work for # of states > 2
int mem_size_QP_Evdw = min(n_lambdas, 2) * n_blocks_q * n_blocks_p * sizeof(double);
int mem_size_QP_Ecoul = min(n_lambdas, 2) * n_blocks_q * n_blocks_p * sizeof(double);
int mem_size_QP_Evdw = n_blocks_q * n_blocks_p * sizeof(double);
int mem_size_QP_Ecoul = n_blocks_q * n_blocks_p * sizeof(double);
int mem_size_QP_MAT = n_qatoms * n_patoms * sizeof(calc_qp_t);

check_cudaMalloc((void**)&D_QP_Evdw, mem_size_QP_Evdw);
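
Because the kernel now reads the pre-mixed charge and atom-type parameters, the per-state loop and the D_lambdas weighting in calc_qp_dvel_matrix_incr disappear and each Q-P pair is evaluated once. A sketch of the scalar terms that remain, under the same conventions as the kernel (r2_inv = 1/|r_ij|^2, r_inv = 1/|r_ij|, r6 = |r_ij|^6); coulomb_k and scaling stand in for D_topo.coulomb_constant and the 1-4 scaling factor, and all names here are illustrative:

```cpp
// Per-pair terms as computed by calc_qp_dvel_matrix_incr after this change.
// qi is the lambda-mixed Q charge, ai/bi the aggregated LJ parameters;
// qj, aj, bj come from the classical atom.
struct PairTerms { double Vel, Va, Vb, dv; };

inline PairTerms qp_pair_terms(double r2_inv, double r_inv, double r6,
                               double coulomb_k, double scaling,
                               double qi, double qj,
                               double ai, double aj, double bi, double bj) {
    PairTerms t;
    t.Vel = coulomb_k * scaling * qi * qj * r_inv;            // Coulomb energy
    t.Va  = ai * aj / (r6 * r6);                              // r^-12 repulsion
    t.Vb  = bi * bj / r6;                                     // r^-6 dispersion
    t.dv  = r2_inv * (-t.Vel - (12.0 * t.Va - 6.0 * t.Vb));   // force scalar applied to da
    return t;
}
```

Correspondingly, the shared-memory tiles shrink from BLOCK_SIZE x 2*BLOCK_SIZE to BLOCK_SIZE x BLOCK_SIZE, and the block sums are written at Evdw[rowlen * by + bx] without a per-state stride.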
4 changes: 4 additions & 0 deletions src/core/cuda/src/CudaNonbondedQQForce.cu
@@ -1,6 +1,7 @@
#include "cuda/include/CudaContext.cuh"
#include "cuda/include/CudaNonbondedQQForce.cuh"
#include "utils.h"
#include <iostream>

__global__ void calc_nonbonded_qq_forces_kernel(
q_atom_t* q_atoms,
@@ -129,6 +130,9 @@ void calc_nonbonded_qq_forces_host() {
cudaDeviceSynchronize();
cudaMemcpy(EQ_nonbond_qq, d_EQ_nonbond_qq, sizeof(E_nonbonded_t) * n_lambdas, cudaMemcpyDeviceToHost);
cudaMemcpy(dvelocities, d_dvelocities, sizeof(dvel_t) * n_atoms, cudaMemcpyDeviceToHost);
for (int i = 0; i < n_lambdas; i++) {
printf("QQ Energy State %d: Evdw = %f Ecoul = %f\n", i, EQ_nonbond_qq[i].Uvdw, EQ_nonbond_qq[i].Ucoul);
}
}

void init_nonbonded_qq_force_kernel_data() {}