diff --git a/src/core/cuda/include/CudaContext.cuh b/src/core/cuda/include/CudaContext.cuh
index 3c2883fa..faa3cf90 100644
--- a/src/core/cuda/include/CudaContext.cuh
+++ b/src/core/cuda/include/CudaContext.cuh
@@ -59,6 +59,9 @@ class CudaContext {
   E_nonbonded_t* d_EQ_nonbond_qq = nullptr;
   double* d_lambdas = nullptr;
 
+  q_charge_t* d_q_aggregate_charges = nullptr;
+  q_catype_t* d_q_aggregate_catypes = nullptr;
+
   /* Used in CudaPolxWaterForce.cu */
@@ -124,6 +127,8 @@ class CudaContext {
  private:
   CudaContext() = default;
 
+  void init_data_for_q();
+
   void free();
 
   ~CudaContext() { free(); }
diff --git a/src/core/cuda/src/CudaContext.cu b/src/core/cuda/src/CudaContext.cu
index 19f22165..0ac5f0b5 100644
--- a/src/core/cuda/src/CudaContext.cu
+++ b/src/core/cuda/src/CudaContext.cu
@@ -1,6 +1,7 @@
 #include
 
 #include "cuda/include/CudaContext.cuh"
+#include <vector>
 
 void CudaContext::init() {
   check_cudaMalloc((void**)&d_coords, sizeof(coord_t) * n_atoms);
@@ -54,6 +55,8 @@ void CudaContext::init() {
   check_cudaMalloc((void**)&d_charges, sizeof(charge_t) * n_charges);
   check_cudaMalloc((void**)&d_p_atoms, sizeof(p_atom_t) * n_patoms);
 
+  init_data_for_q();
+
   sync_all_to_device();
 }
 
@@ -183,7 +186,6 @@ void CudaContext::free() {
   cudaFree(d_excluded);
   cudaFree(d_q_elscales);
   cudaFree(d_q_catypes);
-  cudaFree(d_q_atypes);
   cudaFree(d_EQ_nonbond_qq);
   cudaFree(d_lambdas);
 
@@ -208,6 +210,9 @@ void CudaContext::free() {
   cudaFree(d_ccharges);
   cudaFree(d_charges);
   cudaFree(d_p_atoms);
+
+  cudaFree(d_q_aggregate_charges);
+  cudaFree(d_q_aggregate_catypes);
 }
 
 void CudaContext::reset_energies() {
@@ -215,3 +220,37 @@
   cudaMemset(d_EQ_nonbond_qq, 0, sizeof(E_nonbonded_t) * n_lambdas);
   cudaMemset(d_EQ_restraint, 0, sizeof(E_restraint_t) * n_lambdas);
 }
+
+void CudaContext::init_data_for_q() {
+
+  // Additional initialization for q-related data can be added here if needed
+
+  // q charges: collapse per-state charges into one lambda-weighted charge per q-atom
+  std::vector<q_charge_t> charges(n_qatoms, q_charge_t{0.0});
+  for (int i = 0; i < n_qatoms; i++) {
+    double q = 0;
+    for (int state = 0; state < n_lambdas; state++) {
+      q += q_charges[state * n_qatoms + i].q * lambdas[state];
+    }
+    charges[i].q = q;
+  }
+  check_cudaMalloc((void**)&d_q_aggregate_charges, sizeof(q_charge_t) * n_qatoms);
+  check_cuda(cudaMemcpy(d_q_aggregate_charges, charges.data(), sizeof(q_charge_t) * n_qatoms, cudaMemcpyHostToDevice));
+
+
+  // q atypes: lambda-weighted LJ parameters per q-atom
+  std::vector<q_catype_t> atypes(n_qatoms, q_catype_t{"", 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0});
+  for (int i = 0; i < n_qatoms; i++) {
+    for (int state = 0; state < n_lambdas; state++) {
+      atypes[i].Ai += q_catypes[q_atypes[state * n_qatoms + i].code - 1].Ai * lambdas[state];
+      atypes[i].Bi += q_catypes[q_atypes[state * n_qatoms + i].code - 1].Bi * lambdas[state];
+      atypes[i].Ci += q_catypes[q_atypes[state * n_qatoms + i].code - 1].Ci * lambdas[state];
+      atypes[i].ai += q_catypes[q_atypes[state * n_qatoms + i].code - 1].ai * lambdas[state];
+      atypes[i].Ai_14 += q_catypes[q_atypes[state * n_qatoms + i].code - 1].Ai_14 * lambdas[state];
+      atypes[i].Bi_14 += q_catypes[q_atypes[state * n_qatoms + i].code - 1].Bi_14 * lambdas[state];
+      atypes[i].m += q_catypes[q_atypes[state * n_qatoms + i].code - 1].m * lambdas[state];
+    }
+  }
+  check_cudaMalloc((void**)&d_q_aggregate_catypes, sizeof(q_catype_t) * n_qatoms);
+  check_cuda(cudaMemcpy(d_q_aggregate_catypes, atypes.data(), sizeof(q_catype_t) * n_qatoms, cudaMemcpyHostToDevice));
+}
\ No newline at end of file
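Note on the new init_data_for_q() above: instead of shipping per-state Q-atom charges and LJ parameters to every kernel and looping over lambda states on the GPU, the context now pre-mixes them on the host into a single lambda-weighted set per Q-atom. The sketch below illustrates that aggregation with simplified stand-in types (QCharge/QAType are hypothetical, not the project's q_charge_t/q_catype_t), using the same state-major layout q_charges[state * n_qatoms + i].

// Minimal, self-contained sketch of the lambda-weighted aggregation done in
// CudaContext::init_data_for_q(). Types and values are illustrative stand-ins.
#include <cstdio>
#include <vector>

struct QCharge { double q; };          // stand-in for q_charge_t
struct QAType  { double Ai, Bi; };     // stand-in for the LJ part of q_catype_t

int main() {
  const int n_qatoms = 2;
  const int n_lambdas = 2;
  const double lambdas[n_lambdas] = {0.3, 0.7};

  // Per-state inputs, laid out state-major like q_charges[state * n_qatoms + i].
  const QCharge q_charges[n_lambdas * n_qatoms] = {{-0.5}, {0.1}, {0.2}, {0.4}};
  const QAType  q_params [n_lambdas * n_qatoms] = {{900.0, 25.0}, {600.0, 20.0},
                                                   {800.0, 24.0}, {650.0, 21.0}};

  std::vector<QCharge> agg_q(n_qatoms, QCharge{0.0});
  std::vector<QAType>  agg_p(n_qatoms, QAType{0.0, 0.0});

  for (int i = 0; i < n_qatoms; i++) {
    for (int s = 0; s < n_lambdas; s++) {
      agg_q[i].q  += q_charges[s * n_qatoms + i].q  * lambdas[s];
      agg_p[i].Ai += q_params [s * n_qatoms + i].Ai * lambdas[s];
      agg_p[i].Bi += q_params [s * n_qatoms + i].Bi * lambdas[s];
    }
  }

  // These aggregated arrays correspond to what the real code uploads to
  // d_q_aggregate_charges / d_q_aggregate_catypes.
  for (int i = 0; i < n_qatoms; i++)
    printf("q-atom %d: q_eff = %.3f  Ai_eff = %.1f  Bi_eff = %.1f\n",
           i, agg_q[i].q, agg_p[i].Ai, agg_p[i].Bi);
  return 0;
}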
diff --git a/src/core/cuda/src/CudaNonbondedQPForce.cu b/src/core/cuda/src/CudaNonbondedQPForce.cu
index 4eb53908..47f26451 100644
--- a/src/core/cuda/src/CudaNonbondedQPForce.cu
+++ b/src/core/cuda/src/CudaNonbondedQPForce.cu
@@ -42,6 +42,7 @@ __global__ void calc_energy_sum(int rows, int columns, double* Evdw_TOT, double*
   }
   Ecoul_S[ty][tx] = coul_TOT;
   Evdw_S[ty][tx] = vdw_TOT;
+  // printf("Thread (%d,%d) coul_TOT = %f vdw_TOT = %f\n", tx, ty, coul_TOT, vdw_TOT);
 
   __syncthreads();
 
@@ -111,8 +112,8 @@ __global__ void calc_qp_dvel_vector_column(int n_qatoms, int n_patoms, dvel_t* D
 }
 
 __device__ void calc_qp_dvel_matrix_incr(int row, int qi, int column, int pj, int n_lambdas, int n_qatoms,
-    coord_t* Qs, coord_t* Ps, int* LJs, bool* excluded_s, double Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE], double Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE], calc_qp_t* qp,
-    q_catype_t* D_qcatypes, q_atype_t* D_qatypes, q_charge_t* D_qcharges, p_atom_t* D_patoms, q_atom_t* D_qatoms, double* D_lambdas,
+    coord_t* Qs, coord_t* Ps, int* LJs, bool* excluded_s, double Evdw_S[BLOCK_SIZE][BLOCK_SIZE], double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE], calc_qp_t* qp,
+    q_catype_t* D_qcatypes, q_charge_t* D_qcharges, p_atom_t* D_patoms, q_atom_t* D_qatoms,
     catype_t* D_catypes, atype_t* D_atypes, ccharge_t* D_ccharges, charge_t* D_charges, topo_t D_topo) {
   coord_t da;
   double r2, r6, r;
@@ -141,43 +142,40 @@ __device__ void calc_qp_dvel_matrix_incr(int row, int qi, int column, int pj, in
   r6 = r2 * r2 * r2;
   r2 = 1 / r2;
   r = sqrt(r2);
-
-  for (int state = 0; state < n_lambdas; state++) {
-    qi_type = D_qcatypes[D_qatypes[qi + n_qatoms * state].code - 1];
-    aj_type = D_catypes[D_atypes[j].code - 1];
-
-    ai_aii = bond14 ? qi_type.Ai_14 : qi_type.Ai;
-    aj_aii = bond14 ? aj_type.aii_1_4 : aj_type.aii_normal;
-    ai_bii = bond14 ? qi_type.Bi_14 : qi_type.Bi;
-    aj_bii = bond14 ? aj_type.bii_1_4 : aj_type.bii_normal;
-
-    Vel = D_topo.coulomb_constant * scaling * D_qcharges[qi + n_qatoms * state].q * D_ccharges[D_charges[j].code - 1].charge * r;
-    V_a = ai_aii * aj_aii / (r6 * r6);
-    V_b = ai_bii * aj_bii / r6;
-    dv = r2 * (-Vel - (12 * V_a - 6 * V_b)) * D_lambdas[state];
-
-    // if (state == 0 && qi == 0 && pj == 1) {
-    //   printf("crg_q = %f crg_j = %f r = %f\n", D_qcharges[qi + n_qatoms * state].q, D_ccharges[D_charges[pj].code - 1].charge, r);
-    //   printf("ai_aii = %f aj_aii = %f ai_bii = %f aj_bii = %f\n", ai_aii, aj_aii, ai_bii, aj_bii);
-    // }
-
-    // Update forces
-    qp->Q.x += dv * da.x;
-    qp->Q.y += dv * da.y;
-    qp->Q.z += dv * da.z;
-    qp->P.x -= dv * da.x;
-    qp->P.y -= dv * da.y;
-    qp->P.z -= dv * da.z;
-
-    // Update Q totals
-    Ecoul_S[row][state * BLOCK_SIZE + column] += Vel;
-    Evdw_S[row][state * BLOCK_SIZE + column] += (V_a - V_b);
-  }
+  qi_type = D_qcatypes[qi];
+  aj_type = D_catypes[D_atypes[j].code - 1];
+
+  ai_aii = bond14 ? qi_type.Ai_14 : qi_type.Ai;
+  aj_aii = bond14 ? aj_type.aii_1_4 : aj_type.aii_normal;
+  ai_bii = bond14 ? qi_type.Bi_14 : qi_type.Bi;
+  aj_bii = bond14 ? aj_type.bii_1_4 : aj_type.bii_normal;
+
+  Vel = D_topo.coulomb_constant * scaling * D_qcharges[qi].q * D_ccharges[D_charges[j].code - 1].charge * r;
+  V_a = ai_aii * aj_aii / (r6 * r6);
+  V_b = ai_bii * aj_bii / r6;
+  dv = r2 * (-Vel - (12 * V_a - 6 * V_b));
+
+  // if (state == 0 && qi == 0 && pj == 1) {
+  //   printf("crg_q = %f crg_j = %f r = %f\n", D_qcharges[qi + n_qatoms * state].q, D_ccharges[D_charges[pj].code - 1].charge, r);
+  //   printf("ai_aii = %f aj_aii = %f ai_bii = %f aj_bii = %f\n", ai_aii, aj_aii, ai_bii, aj_bii);
+  // }
+
+  // Update forces
+  qp->Q.x += dv * da.x;
+  qp->Q.y += dv * da.y;
+  qp->Q.z += dv * da.z;
+  qp->P.x -= dv * da.x;
+  qp->P.y -= dv * da.y;
+  qp->P.z -= dv * da.z;
+
+  // Update Q totals
+  Ecoul_S[row][column] += Vel;
+  Evdw_S[row][column] += (V_a - V_b);
 }
 
 __global__ void calc_qp_dvel_matrix(int n_qatoms, int n_patoms, int n_lambdas, int n_atoms_solute, coord_t* X, double* Evdw, double* Ecoul, calc_qp_t* QP_MAT,
-    q_catype_t* D_qcatypes, q_atype_t* D_qatypes, q_charge_t* D_qcharges, p_atom_t* D_patoms, q_atom_t* D_qatoms, double* D_lambdas,
+    q_catype_t* D_qcatypes, q_charge_t* D_qcharges, p_atom_t* D_patoms, q_atom_t* D_qatoms,
     int* D_LJ_matrix, bool* D_excluded, catype_t* D_catypes, atype_t* D_atypes, ccharge_t* D_ccharges, charge_t* D_charges, topo_t D_topo) {
   // Block index
   int bx = blockIdx.x;
@@ -187,9 +185,8 @@ __global__ void calc_qp_dvel_matrix(int n_qatoms, int n_patoms, int n_lambdas, i
   int tx = threadIdx.x;
   int ty = threadIdx.y;
 
-  // TODO implement >2 states on GPU
-  __shared__ double Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE];
-  __shared__ double Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE];
+  __shared__ double Evdw_S[BLOCK_SIZE][BLOCK_SIZE];
+  __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE];
 
   Ecoul_S[ty][tx] = 0;
   Evdw_S[ty][tx] = 0;
@@ -230,29 +227,25 @@ __global__ void calc_qp_dvel_matrix(int n_qatoms, int n_patoms, int n_lambdas, i
     int column = bx * BLOCK_SIZE + tx;
 
     calc_qp_dvel_matrix_incr(ty, row, tx, column, n_lambdas, n_qatoms, Qs, Ps, LJs, excluded_s, Evdw_S, Ecoul_S,
-        &qp, D_qcatypes, D_qatypes, D_qcharges, D_patoms, D_qatoms, D_lambdas, D_catypes, D_atypes, D_ccharges, D_charges, D_topo);
+        &qp, D_qcatypes, D_qcharges, D_patoms, D_qatoms, D_catypes, D_atypes, D_ccharges, D_charges, D_topo);
 
     QP_MAT[n_patoms * row + column] = qp;
 
   __syncthreads();
 
   int rowlen = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
-  int collen = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
 
   if (tx == 0 && ty == 0) {
-    // TODO implement >2 states on GPU
-    for (int state = 0; state < min(2, n_lambdas); state++) {
-      double tot_Evdw = 0;
-      double tot_Ecoul = 0;
-      for (int i = 0; i < BLOCK_SIZE; i++) {
-        for (int j = 0; j < BLOCK_SIZE; j++) {
-          tot_Evdw += Evdw_S[i][j + state * BLOCK_SIZE];
-          tot_Ecoul += Ecoul_S[i][j + state * BLOCK_SIZE];
-        }
+    double tot_Evdw = 0;
+    double tot_Ecoul = 0;
+    for (int i = 0; i < BLOCK_SIZE; i++) {
+      for (int j = 0; j < BLOCK_SIZE; j++) {
+        tot_Evdw += Evdw_S[i][j];
+        tot_Ecoul += Ecoul_S[i][j];
       }
-      Evdw[rowlen * collen * state + rowlen * by + bx] = tot_Evdw;
-      Ecoul[rowlen * collen * state + rowlen * by + bx] = tot_Ecoul;
     }
+    Evdw[rowlen * by + bx] = tot_Evdw;
+    Ecoul[rowlen * by + bx] = tot_Ecoul;
   }
 
   __syncthreads();
@@ -269,12 +262,10 @@ void calc_nonbonded_qp_forces_host_v2() {
   CudaContext& ctx = CudaContext::instance();
   auto X = ctx.d_coords;
   auto DV_X = ctx.d_dvelocities;
-  auto D_qcatypes = ctx.d_q_catypes;
-  auto D_qatypes = ctx.d_q_atypes;
-  auto D_qcharges = ctx.d_q_charges;
+  auto D_qcatypes = ctx.d_q_aggregate_catypes;
+  auto D_qcharges = ctx.d_q_aggregate_charges;
   auto D_patoms = ctx.d_p_atoms;
   auto D_qatoms = ctx.d_q_atoms;
-  auto D_lambdas = ctx.d_lambdas;
   auto D_LJ_matrix = ctx.d_LJ_matrix;
   auto D_excluded = ctx.d_excluded;
   auto D_catypes = ctx.d_catypes;
@@ -288,7 +279,7 @@ void calc_nonbonded_qp_forces_host_v2() {
   grid = dim3((n_patoms + BLOCK_SIZE - 1) / threads.x, (n_qatoms + BLOCK_SIZE - 1) / threads.y);
 
   calc_qp_dvel_matrix<<<grid, threads>>>(n_qatoms, n_patoms, n_lambdas, n_atoms_solute, X, D_QP_Evdw, D_QP_Ecoul, QP_MAT,
-      D_qcatypes, D_qatypes, D_qcharges, D_patoms, D_qatoms, D_lambdas, D_LJ_matrix, D_excluded,
+      D_qcatypes, D_qcharges, D_patoms, D_qatoms, D_LJ_matrix, D_excluded,
       D_catypes, D_atypes, D_ccharges, D_charges, topo);
   calc_qp_dvel_vector_column<<<((n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_qatoms, n_patoms, DV_X, QP_MAT, D_patoms);
   calc_qp_dvel_vector_row<<<((n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_qatoms, n_patoms, DV_X, QP_MAT, D_qatoms);
@@ -309,16 +300,14 @@ void calc_nonbonded_qp_forces_host_v2() {
   int mem_size_DV_X = n_atoms * sizeof(dvel_t);
   cudaMemcpy(dvelocities, DV_X, mem_size_DV_X, cudaMemcpyDeviceToHost);
 
-  // TODO: make Evdw & Ecoul work for # of states > 2
-  for (int state = 0; state < min(2, n_lambdas); state++) {
-    calc_energy_sum<<<1, threads>>>(n_blocks_q, n_blocks_p, D_QP_evdw_TOT, D_QP_ecoul_TOT, &D_QP_Evdw[state * n_blocks_p * n_blocks_q], &D_QP_Ecoul[state * n_blocks_p * n_blocks_q], false);
+  calc_energy_sum<<<1, threads>>>(n_blocks_q, n_blocks_p, D_QP_evdw_TOT, D_QP_ecoul_TOT, D_QP_Evdw, D_QP_Ecoul, false);
 
-    cudaMemcpy(&QP_evdw_TOT, D_QP_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost);
-    cudaMemcpy(&QP_ecoul_TOT, D_QP_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost);
+  cudaMemcpy(&QP_evdw_TOT, D_QP_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost);
+  cudaMemcpy(&QP_ecoul_TOT, D_QP_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost);
+  printf("QP E_vdw: %f E_coul: %f\n", QP_evdw_TOT, QP_ecoul_TOT);
 
-    EQ_nonbond_qp[state].Uvdw += QP_evdw_TOT;
-    EQ_nonbond_qp[state].Ucoul += QP_ecoul_TOT;
-  }
+  E_nonbond_qx.Uvdw += QP_evdw_TOT;
+  E_nonbond_qx.Ucoul += QP_ecoul_TOT;
 }
 
 void init_nonbonded_qp_force_kernel_data() {
@@ -328,8 +317,8 @@
   int n_blocks_q = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
   int n_blocks_p = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
   // TODO make Evdw & Ecoul work for # of states > 2
-  int mem_size_QP_Evdw = min(n_lambdas, 2) * n_blocks_q * n_blocks_p * sizeof(double);
-  int mem_size_QP_Ecoul = min(n_lambdas, 2) * n_blocks_q * n_blocks_p * sizeof(double);
+  int mem_size_QP_Evdw = n_blocks_q * n_blocks_p * sizeof(double);
+  int mem_size_QP_Ecoul = n_blocks_q * n_blocks_p * sizeof(double);
   int mem_size_QP_MAT = n_qatoms * n_patoms * sizeof(calc_qp_t);
 
   check_cudaMalloc((void**)&D_QP_Evdw, mem_size_QP_Evdw);
diff --git a/src/core/cuda/src/CudaNonbondedQQForce.cu b/src/core/cuda/src/CudaNonbondedQQForce.cu
index e2138ed6..a10c315c 100644
--- a/src/core/cuda/src/CudaNonbondedQQForce.cu
+++ b/src/core/cuda/src/CudaNonbondedQQForce.cu
@@ -1,6 +1,7 @@
 #include "cuda/include/CudaContext.cuh"
 #include "cuda/include/CudaNonbondedQQForce.cuh"
 #include "utils.h"
+#include <cstdio>
 
 __global__ void calc_nonbonded_qq_forces_kernel(
     q_atom_t* q_atoms,
@@ -129,6 +130,9 @@ void calc_nonbonded_qq_forces_host() {
   cudaDeviceSynchronize();
   cudaMemcpy(EQ_nonbond_qq, d_EQ_nonbond_qq, sizeof(E_nonbonded_t) * n_lambdas, cudaMemcpyDeviceToHost);
   cudaMemcpy(dvelocities, d_dvelocities, sizeof(dvel_t) * n_atoms, cudaMemcpyDeviceToHost);
+  for (int i = 0; i < n_lambdas; i++) {
+    printf("QQ Energy State %d: Evdw = %f Ecoul = %f\n", i, EQ_nonbond_qq[i].Uvdw, EQ_nonbond_qq[i].Ucoul);
+  }
 }
 
 void init_nonbonded_qq_force_kernel_data() {}
diff --git a/src/core/cuda/src/CudaNonbondedQWForce.cu b/src/core/cuda/src/CudaNonbondedQWForce.cu
index 974c380c..049a38aa 100644
--- a/src/core/cuda/src/CudaNonbondedQWForce.cu
+++ b/src/core/cuda/src/CudaNonbondedQWForce.cu
@@ -124,14 +124,13 @@ __global__ void calc_qw_dvel_vector_column(int n_qatoms, int n_waters, dvel_t* D
 }
 
 __device__ void calc_qw_dvel_matrix_incr(int row, int qi, int column, int n_lambdas, int n_qatoms, double crg_ow, double crg_hw, double A_O, double B_O,
-    coord_t* Qs, coord_t* Ws, double Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE], double Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE], calc_qw_t* qw,
-    q_catype_t* D_qcatypes, q_atype_t* D_qatypes, q_charge_t* D_qcharges, q_atom_t* D_qatoms, double* D_lambdas, topo_t D_topo) {
+    coord_t* Qs, coord_t* Ws, double Evdw_S[BLOCK_SIZE][BLOCK_SIZE], double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE], calc_qw_t* qw,
+    q_catype_t* D_qcatypes, q_charge_t* D_qcharges, q_atom_t* D_qatoms, topo_t D_topo) {
   int j;
   coord_t dO, dH1, dH2;
   double r2O, rH1, rH2, r6O, rO, r2H1, r2H2;
   double dvO, dvH1, dvH2;
   double V_a, V_b, VelO, VelH1, VelH2;
-  q_atype_t qa_type;
   q_catype_t qi_type;
   double ai_aii, ai_bii;
@@ -160,28 +159,25 @@ __device__ void calc_qw_dvel_matrix_incr(int row, int qi, int column, int n_lamb
   dvO = 0;
   dvH1 = 0;
   dvH2 = 0;
 
-  for (int state = 0; state < n_lambdas; state++) {
-    qa_type = D_qatypes[qi + n_qatoms * state];
-    qi_type = D_qcatypes[qa_type.code - 1];
+  qi_type = D_qcatypes[qi];
 
-    ai_aii = qi_type.Ai;
-    ai_bii = qi_type.Bi;
+  ai_aii = qi_type.Ai;
+  ai_bii = qi_type.Bi;
 
-    V_a = ai_aii * A_O / (r6O * r6O);
-    V_b = ai_bii * B_O / (r6O);
+  V_a = ai_aii * A_O / (r6O * r6O);
+  V_b = ai_bii * B_O / (r6O);
 
-    VelO = D_topo.coulomb_constant * crg_ow * D_qcharges[qi + n_qatoms * state].q * rO;
-    VelH1 = D_topo.coulomb_constant * crg_hw * D_qcharges[qi + n_qatoms * state].q * rH1;
-    VelH2 = D_topo.coulomb_constant * crg_hw * D_qcharges[qi + n_qatoms * state].q * rH2;
+  VelO = D_topo.coulomb_constant * crg_ow * D_qcharges[qi].q * rO;
+  VelH1 = D_topo.coulomb_constant * crg_hw * D_qcharges[qi].q * rH1;
+  VelH2 = D_topo.coulomb_constant * crg_hw * D_qcharges[qi].q * rH2;
 
-    dvO += r2O * (-VelO - (12 * V_a - 6 * V_b)) * D_lambdas[state];
-    dvH1 -= r2H1 * VelH1 * D_lambdas[state];
-    dvH2 -= r2H2 * VelH2 * D_lambdas[state];
+  dvO += r2O * (-VelO - (12 * V_a - 6 * V_b));
+  dvH1 -= r2H1 * VelH1;
+  dvH2 -= r2H2 * VelH2;
 
-    // Update Q totals
-    Ecoul_S[row][state * BLOCK_SIZE + column] += (VelO + VelH1 + VelH2);
-    Evdw_S[row][state * BLOCK_SIZE + column] += (V_a - V_b);
-  }
+  // Update Q totals
+  Ecoul_S[row][column] += (VelO + VelH1 + VelH2);
+  Evdw_S[row][column] += (V_a - V_b);
 
   // Note r6O is not the usual 1/rO^6, but rather rO^6. be careful!!!
@@ -204,7 +200,7 @@ __device__ void calc_qw_dvel_matrix_incr(int row, int qi, int column, int n_lamb
 
 __global__ void calc_qw_dvel_matrix(int n_qatoms, int n_waters, int n_lambdas, double crg_ow, double crg_hw, double A_O, double B_O,
     coord_t* X, coord_t* W, double* Evdw, double* Ecoul, calc_qw_t* MAT,
-    q_catype_t* D_qcatypes, q_atype_t* D_qatypes, q_charge_t* D_qcharges, q_atom_t* D_qatoms, double* D_lambdas, topo_t D_topo) {
+    q_catype_t* D_qcatypes, q_charge_t* D_qcharges, q_atom_t* D_qatoms, topo_t D_topo) {
   // Block index
   int bx = blockIdx.x;
   int by = blockIdx.y;
@@ -214,8 +210,8 @@ __global__ void calc_qw_dvel_matrix(int n_qatoms, int n_waters, int n_lambdas, d
   int ty = threadIdx.y;
 
   // TODO implement >2 states on GPU
-  __shared__ double Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE];
-  __shared__ double Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE];
+  __shared__ double Evdw_S[BLOCK_SIZE][BLOCK_SIZE];
+  __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE];
 
   Ecoul_S[ty][tx] = 0;
   Evdw_S[ty][tx] = 0;
@@ -250,29 +246,26 @@ __global__ void calc_qw_dvel_matrix(int n_qatoms, int n_waters, int n_lambdas, d
     int column = bx * BLOCK_SIZE + tx;
 
     calc_qw_dvel_matrix_incr(ty, aStart + ty, tx, n_lambdas, n_qatoms, crg_ow, crg_hw, A_O, B_O, Qs, Ws, Evdw_S, Ecoul_S,
-        &qw, D_qcatypes, D_qatypes, D_qcharges, D_qatoms, D_lambdas, D_topo);
+        &qw, D_qcatypes, D_qcharges, D_qatoms, D_topo);
 
     MAT[column + n_waters * row] = qw;
 
   __syncthreads();
 
   int rowlen = (n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE;
-  int collen = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
 
   if (tx == 0 && ty == 0) {
     // TODO implement >2 states on GPU
-    for (int state = 0; state < min(2, n_lambdas); state++) {
-      double tot_Evdw = 0;
-      double tot_Ecoul = 0;
-      for (int i = 0; i < BLOCK_SIZE; i++) {
-        for (int j = 0; j < BLOCK_SIZE; j++) {
-          tot_Evdw += Evdw_S[i][j + state * BLOCK_SIZE];
-          tot_Ecoul += Ecoul_S[i][j + state * BLOCK_SIZE];
-        }
+    double tot_Evdw = 0;
+    double tot_Ecoul = 0;
+    for (int i = 0; i < BLOCK_SIZE; i++) {
+      for (int j = 0; j < BLOCK_SIZE; j++) {
+        tot_Evdw += Evdw_S[i][j];
+        tot_Ecoul += Ecoul_S[i][j];
       }
-      Evdw[rowlen * collen * state + rowlen * by + bx] = tot_Evdw;
-      Ecoul[rowlen * collen * state + rowlen * by + bx] = tot_Ecoul;
     }
+    Evdw[rowlen * by + bx] = tot_Evdw;
+    Ecoul[rowlen * by + bx] = tot_Ecoul;
   }
 
   __syncthreads();
@@ -282,36 +275,27 @@ __global__ void calc_qw_dvel_matrix(int n_qatoms, int n_waters, int n_lambdas, d
 
 void calc_nonbonded_qw_forces_host_v2() {
   using namespace CudaNonbondedQWForce;
 
-  int mem_size_X = n_atoms_solute * sizeof(coord_t);
-  int mem_size_W = 3 * n_waters * sizeof(coord_t);
   int mem_size_DV_X = n_atoms_solute * sizeof(dvel_t);
   int mem_size_DV_W = 3 * n_waters * sizeof(dvel_t);
-  int mem_size_MAT = 3 * n_waters * n_qatoms * sizeof(calc_qw_t);
 
   int n_blocks_q = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
   int n_blocks_w = (n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE;
-  // TODO make Evdw & Ecoul work for # of states > 2
-  int mem_size_QW_Evdw = min(n_lambdas, 2) * n_blocks_q * n_blocks_w * sizeof(double);
-  int mem_size_QW_Ecoul = min(n_lambdas, 2) * n_blocks_q * n_blocks_w * sizeof(double);
 
   CudaContext& ctx = CudaContext::instance();
   auto X = ctx.d_coords;
   auto DV_X = ctx.d_dvelocities;
-  auto D_qcatypes = ctx.d_q_catypes;
-  auto D_qatypes = ctx.d_q_atypes;
-  auto D_qcharges = ctx.d_q_charges;
+  auto D_qcatypes = ctx.d_q_aggregate_catypes;
+  auto D_qcharges = ctx.d_q_aggregate_charges;
   auto D_qatoms = ctx.d_q_atoms;
-  auto D_lambdas = ctx.d_lambdas;
 
   dim3 threads, grid;
   threads = dim3(BLOCK_SIZE, BLOCK_SIZE);
   grid = dim3((n_waters + BLOCK_SIZE - 1) / threads.x, (n_qatoms + BLOCK_SIZE - 1) / threads.y);
 
-  double evdw, ecoul;
   calc_qw_dvel_matrix<<<grid, threads>>>(n_qatoms, n_waters, n_lambdas, crg_ow, crg_hw, A_O, B_O, X, X + n_atoms_solute, D_QW_Evdw, D_QW_Ecoul,
-      QW_MAT, D_qcatypes, D_qatypes, D_qcharges, D_qatoms, D_lambdas, topo);
+      QW_MAT, D_qcatypes, D_qcharges, D_qatoms, topo);
   calc_qw_dvel_vector_column<<<((n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_qatoms, n_waters, DV_X, DV_X + n_atoms_solute, QW_MAT);
   calc_qw_dvel_vector_row<<<((n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_qatoms, n_waters, DV_X, DV_X + n_atoms_solute, QW_MAT, D_qatoms);
@@ -331,17 +315,15 @@ void calc_nonbonded_qw_forces_host_v2() {
   cudaMemcpy(dvelocities, DV_X, mem_size_DV_X, cudaMemcpyDeviceToHost);
   cudaMemcpy(&dvelocities[n_atoms_solute], DV_X + n_atoms_solute, mem_size_DV_W, cudaMemcpyDeviceToHost);
 
-  // TODO make Evdw & Ecoul work for # of states > 2
-  for (int state = 0; state < min(2, n_lambdas); state++) {
-    calc_energy_sum<<<1, threads>>>(n_blocks_q, n_blocks_w, D_QW_evdw_TOT, D_QW_ecoul_TOT, &D_QW_Evdw[state * n_blocks_w * n_blocks_q],
-        &D_QW_Ecoul[state * n_blocks_w * n_blocks_q], false);
+  calc_energy_sum<<<1, threads>>>(n_blocks_q, n_blocks_w, D_QW_evdw_TOT, D_QW_ecoul_TOT, D_QW_Evdw,
+      D_QW_Ecoul, false);
 
-    cudaMemcpy(&QW_evdw_TOT, D_QW_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost);
-    cudaMemcpy(&QW_ecoul_TOT, D_QW_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost);
+  cudaMemcpy(&QW_evdw_TOT, D_QW_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost);
+  cudaMemcpy(&QW_ecoul_TOT, D_QW_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost);
 
-    EQ_nonbond_qw[state].Uvdw += QW_evdw_TOT;
-    EQ_nonbond_qw[state].Ucoul += QW_ecoul_TOT;
-  }
+  E_nonbond_qx.Uvdw += QW_evdw_TOT;
+  E_nonbond_qx.Ucoul += QW_ecoul_TOT;
+  printf("QW Energy: Evdw = %f Ecoul = %f\n", QW_evdw_TOT, QW_ecoul_TOT);
 }
 
 void init_nonbonded_qw_force_kernel_data() {
@@ -358,8 +340,8 @@
   int n_blocks_w = (n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE;
   int mem_size_MAT = 3 * n_waters * n_qatoms * sizeof(calc_qw_t);
-  int mem_size_QW_Evdw = min(n_lambdas, 2) * n_blocks_q * n_blocks_w * sizeof(double);
-  int mem_size_QW_Ecoul = min(n_lambdas, 2) * n_blocks_q * n_blocks_w * sizeof(double);
+  int mem_size_QW_Evdw = n_blocks_q * n_blocks_w * sizeof(double);
+  int mem_size_QW_Ecoul = n_blocks_q * n_blocks_w * sizeof(double);
 
 #ifdef DEBUG
   printf("Allocating QW_MAT\n");
 #endif
diff --git a/src/core/system.cu b/src/core/system.cu
index afb999cf..ee50ee61 100644
--- a/src/core/system.cu
+++ b/src/core/system.cu
@@ -1,3 +1,4 @@
+#include
 #include "system.h"
 #include "utils.h"
 #include "parse.h"
@@ -1094,7 +1095,6 @@ void calc_integration_step(int iteration) {
   calc_bonded_forces();
   clock_t end_bonded = clock();
 
-  clock_t start_pp, end_pp, start_qp, end_qp;
   if (run_gpu) {
     start_qp = clock();
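Why dropping the per-state loop in the QP/QW kernels is consistent with the aggregation: each Coulomb and vdW term is linear in the Q-atom's own charge and LJ parameters, so the lambda-weighted sum of per-state terms equals one evaluation with the pre-mixed parameters uploaded by init_data_for_q(). The standalone check below demonstrates that identity with toy numbers (the Coulomb constant 332.0 and all other inputs are illustrative, not project data); the single result is the lambda-weighted mix, which is what E_nonbond_qx now accumulates instead of one energy per state.

// Sketch of the linearity argument behind the aggregation: summing per-state
// Coulomb/vdW terms weighted by lambda gives the same number as evaluating the
// term once with lambda-weighted Q-atom parameters. Toy values, not project data.
#include <cassert>
#include <cmath>
#include <cstdio>

int main() {
  const double k = 332.0;                   // Coulomb constant (illustrative value)
  const double lambdas[2] = {0.3, 0.7};
  const double q_state[2] = {-0.5, 0.2};    // Q-atom charge in each state
  const double A_state[2] = {900.0, 800.0}; // Q-atom LJ A parameter in each state
  const double q_j = 0.4, A_j = 550.0, r = 3.1;
  const double r6 = std::pow(r, 6);

  // Per-state evaluation, lambda-weighted (old kernel structure).
  double coul_sum = 0, vdwA_sum = 0;
  for (int s = 0; s < 2; s++) {
    coul_sum += lambdas[s] * (k * q_state[s] * q_j / r);
    vdwA_sum += lambdas[s] * (A_state[s] * A_j / (r6 * r6));
  }

  // Single evaluation with aggregated parameters (new kernel structure).
  double q_eff = lambdas[0] * q_state[0] + lambdas[1] * q_state[1];
  double A_eff = lambdas[0] * A_state[0] + lambdas[1] * A_state[1];
  double coul_agg = k * q_eff * q_j / r;
  double vdwA_agg = A_eff * A_j / (r6 * r6);

  assert(std::fabs(coul_sum - coul_agg) < 1e-9);
  assert(std::fabs(vdwA_sum - vdwA_agg) < 1e-9);
  printf("coul: %.6f == %.6f, vdw A-term: %.6f == %.6f\n",
         coul_sum, coul_agg, vdwA_sum, vdwA_agg);
  return 0;
}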