diff --git a/src/core/cuda/src/CudaNonbondedPWForce.cu b/src/core/cuda/src/CudaNonbondedPWForce.cu index dec352db..99bbaed6 100644 --- a/src/core/cuda/src/CudaNonbondedPWForce.cu +++ b/src/core/cuda/src/CudaNonbondedPWForce.cu @@ -9,257 +9,248 @@ struct calc_pw_t { dvel_t W; }; -calc_pw_t *PW_MAT, *h_PW_MAT; -double *D_PW_Evdw, *D_PW_Ecoul; -double *D_PW_evdw_TOT, *D_PW_ecoul_TOT, PW_evdw_TOT, PW_ecoul_TOT; - -__global__ void calc_energy_sum(int rows, int columns, double* Evdw_TOT, double* Ecoul_TOT, double* Evdw, double* Ecoul, bool upper_diagonal) { - int tx = threadIdx.x; - int ty = threadIdx.y; - - __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE]; - __shared__ double Evdw_S[BLOCK_SIZE][BLOCK_SIZE]; - - // TODO: better way to distribute upper diagonal over threads? Seems like threads in left bottom corner have less work - double coul_TOT = 0; - double vdw_TOT = 0; - for (int i = ty; i < rows; i += BLOCK_SIZE) { - for (int j = tx; j < columns; j += BLOCK_SIZE) { - if (i <= j || !upper_diagonal) { - coul_TOT += Ecoul[i * columns + j]; - vdw_TOT += Evdw[i * columns + j]; - } - } - } - Ecoul_S[ty][tx] = coul_TOT; - Evdw_S[ty][tx] = vdw_TOT; - - __syncthreads(); +double *D_PW_evdw_TOT, *D_PW_ecoul_TOT; - if (tx == 0 && ty == 0) { - double Evdw_temp = 0; - double Ecoul_temp = 0; - for (int i = 0; i < BLOCK_SIZE; i++) { - for (int j = 0; j < BLOCK_SIZE; j++) { - Evdw_temp += Evdw_S[i][j]; - Ecoul_temp += Ecoul_S[i][j]; - } - } - - *Evdw_TOT = Evdw_temp; - *Ecoul_TOT = Ecoul_temp; +__device__ __forceinline__ void idx2xy(int n, int t, int& x, int& y) { + x = (int)floorf((2 * n + 1 - sqrtf((2 * n + 1) * (2 * n + 1) - 8 * t)) * 0.5f); + y = t - (x * n - (x * (x - 1) >> 1)); + if (y < 0) { + x--; + y += (n - x); } + y += x; } -__device__ void calc_pw_dvel_matrix_incr(int row, int pi, int column, int j, int n_atoms_solute, - coord_t* Xs, coord_t* Ws, bool* excluded_s, double* Evdw, double* Ecoul, calc_pw_t* pw, - ccharge_t* D_ccharges, charge_t* D_charges, catype_t* D_catypes, atype_t* D_atypes, p_atom_t* D_patoms, topo_t D_topo) { - coord_t da; - double r2a, ra, r6a; - double Vela, V_a, V_b; - double dva; - double qi, qj; - double ai_aii, aj_aii, ai_bii, aj_bii; - catype_t ai_type, aj_type; - atype_t i_type, j_type; - - if (excluded_s[row]) return; - qi = D_ccharges[D_charges[pi].code - 1].charge; - qj = D_ccharges[D_charges[n_atoms_solute + j].code - 1].charge; // TODO: FIX THIS!!! WILL NOT WORK WITH QATOMS!!!!! - - // if (pi < 100 && j < 100){ - // printf("qi = %f qj = %f\n", qi, qj); - // } - - ai_type = D_catypes[D_atypes[pi].code - 1]; - aj_type = D_catypes[D_atypes[n_atoms_solute + j].code - 1]; - - da.x = Ws[column].x - Xs[row].x; - da.y = Ws[column].y - Xs[row].y; - da.z = Ws[column].z - Xs[row].z; - r2a = 1 / (pow(da.x, 2) + pow(da.y, 2) + pow(da.z, 2)); - ra = sqrt(r2a); - r6a = r2a * r2a * r2a; - - Vela = D_topo.coulomb_constant * qi * qj * ra; - - ai_aii = ai_type.aii_normal; - aj_aii = aj_type.aii_normal; - ai_bii = ai_type.bii_normal; - aj_bii = aj_type.bii_normal; - - V_a = r6a * r6a * ai_aii * aj_aii; - V_b = r6a * ai_bii * aj_bii; - dva = r2a * (-Vela - 12 * V_a + 6 * V_b); - - pw->P.x -= dva * da.x; - pw->P.y -= dva * da.y; - pw->P.z -= dva * da.z; - - pw->W.x += dva * da.x; - pw->W.y += dva * da.y; - pw->W.z += dva * da.z; - - *Ecoul += Vela; - *Evdw += (V_a - V_b); - - // if (pi == 522 && j == 175) { - // printf("Vela = %f V_a = %f V_b = %f P = %f %f %f ai_aii = %f aj_aii = %f\n", Vela, V_a, V_b, pw->P.x, pw->P.y, pw->P.z, ai_aii, aj_aii); - // } - - // if (pi < 100 && j < 100) printf("Evdw = %f Ecoul = %f\n", *Evdw, *Ecoul); +__device__ __forceinline__ double shfl(double v, int srcLane, unsigned mask = 0xffffffffu) { + int2 a = *reinterpret_cast(&v); + a.x = __shfl_sync(mask, a.x, srcLane); + a.y = __shfl_sync(mask, a.y, srcLane); + return *reinterpret_cast(&a); } -__global__ void calc_pw_dvel_vector_row(int n_patoms, int n_waters, dvel_t* DV_X, dvel_t* DV_W, calc_pw_t* PW_MAT, p_atom_t* D_patoms) { - int row = blockIdx.x * blockDim.x + threadIdx.x; - if (row >= n_patoms) return; +__device__ __forceinline__ coord_t shfl_coord(coord_t v, int srcLane, unsigned mask = 0xffffffffu) { + v.x = shfl(v.x, srcLane, mask); + v.y = shfl(v.y, srcLane, mask); + v.z = shfl(v.z, srcLane, mask); + return v; +} - dvel_t dP; +__device__ __forceinline__ catype_t shfl_catype(catype_t v, int srcLane, unsigned mask = 0xffffffffu) { + v.code = __shfl_sync(mask, v.code, srcLane); + v.m = shfl(v.m, srcLane, mask); + v.aii_normal = shfl(v.aii_normal, srcLane, mask); + v.bii_normal = shfl(v.bii_normal, srcLane, mask); + v.aii_polar = shfl(v.aii_polar, srcLane, mask); + v.bii_polar = shfl(v.bii_polar, srcLane, mask); + v.aii_1_4 = shfl(v.aii_1_4, srcLane, mask); + v.bii_1_4 = shfl(v.bii_1_4, srcLane, mask); + return v; +} - dP.x = 0; - dP.y = 0; - dP.z = 0; +__device__ void calculate_unforce_bound( + // const int x_idx, + // const int y_idx, - for (int i = 0; i < 3 * n_waters; i++) { - dP.x += PW_MAT[i + 3 * n_waters * row].P.x; - dP.y += PW_MAT[i + 3 * n_waters * row].P.y; - dP.z += PW_MAT[i + 3 * n_waters * row].P.z; - } + const coord_t& x, + const coord_t& y, - int p = D_patoms[row].a - 1; + const double x_charge, + const double y_charge, - DV_X[p].x += dP.x; - DV_X[p].y += dP.y; - DV_X[p].z += dP.z; + const double x_aii, + const double y_aii, - __syncthreads(); -} + const double x_bii, + const double y_bii, -__global__ void calc_pw_dvel_vector_column(int n_patoms, int n_waters, dvel_t* DV_X, dvel_t* DV_W, calc_pw_t* PW_MAT) { - int column = blockIdx.x * blockDim.x + threadIdx.x; - if (column >= 3 * n_waters) return; + const double coulomb_constant, - dvel_t dW; + const double scaling, - dW.x = 0; - dW.y = 0; - dW.z = 0; + double& evdw, + double& ecoul, + double& dv) { + double3 d = {x.x - y.x, x.y - y.y, x.z - y.z}; - for (int i = 0; i < n_patoms; i++) { - dW.x += PW_MAT[column + 3 * n_waters * i].W.x; - dW.y += PW_MAT[column + 3 * n_waters * i].W.y; - dW.z += PW_MAT[column + 3 * n_waters * i].W.z; - } + double r2 = 1.0 / (d.x * d.x + d.y * d.y + d.z * d.z); + double r = sqrt(r2); + double r6 = r2 * r2 * r2; - DV_W[column].x += dW.x; - DV_W[column].y += dW.y; - DV_W[column].z += dW.z; + ecoul = scaling * coulomb_constant * x_charge * y_charge * r; - __syncthreads(); + double v_a = r6 * r6 * x_aii * y_aii; + double v_b = r6 * x_bii * y_bii; + evdw = v_a - v_b; + dv = r2 * (-ecoul - 12.0 * v_a + 6.0 * v_b); } -__global__ void calc_pw_dvel_matrix(int n_patoms, int n_atoms_solute, int n_waters, - coord_t* X, coord_t* W, double* Evdw, double* Ecoul, calc_pw_t* PW_MAT, - ccharge_t* D_ccharges, charge_t* D_charges, catype_t* D_catypes, atype_t* D_atypes, p_atom_t* D_patoms, bool* D_excluded, topo_t D_topo) { - // Block index - int bx = blockIdx.x; - int by = blockIdx.y; +__global__ void calc_pw( + const int NX, + const int NY, + + const charge_t* D_charges, + const ccharge_t* D_ccharges, // Charge information - // Thread index - int tx = threadIdx.x; - int ty = threadIdx.y; + const atype_t* D_atypes, + const catype_t* D_catypes, // Atom type information - __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE]; - __shared__ double Evdw_S[BLOCK_SIZE][BLOCK_SIZE]; + const topo_t D_topo, // Only needs coulomb_constant - Ecoul_S[ty][tx] = 0; - Evdw_S[ty][tx] = 0; + const bool* D_excluded, // Exclusion information - // if (tx == 0 && ty == 0) printf("bx = %d by = %d\n", bx, by); + const int* D_LJ_matrix, // bonding information, bond23 or bond14, for filtering and scaling - // if (bx == 0 && by == 0) printf("bx = %d by = %d tx = %d ty = %d\n", bx, by, tx, ty); + const p_atom_t* D_patoms, // the idx for X + // Here needs the idx for Y. But W does not have such idx. - int aStart = BLOCK_SIZE * by; - int bStart = BLOCK_SIZE * bx; + const coord_t* coords, // Coordinates infomation - if (aStart + ty >= n_patoms) return; - if (bStart + tx >= 3 * n_waters) return; + dvel_t* dvelocities, // Output dvelocities - // if (bx == 8 && by == 1) printf("bx = %d by = %d tx = %d ty = %d\n", bx, by, tx, ty); + // Other helper info, now we need it, but maybe we can optimize it later + const int n_atoms_solute, + bool /*x_is_solute*/, - __shared__ coord_t Xs[BLOCK_SIZE]; - __shared__ coord_t Ws[BLOCK_SIZE]; - __shared__ bool excluded_s[BLOCK_SIZE]; + double* Evdw_TOT, + double* Ecoul_TOT - int pi = D_patoms[aStart + ty].a - 1; +) { + const int x_blocks_num = (NX + 31) >> 5; // solute blocks + const int y_blocks_num = (NY + 31) >> 5; // water blocks + const int total_tiles = x_blocks_num * y_blocks_num; // full rectangle + + const int warpsPerBlock = blockDim.x >> 5; + const int tid = threadIdx.x; + const int lane = tid & 31; + const int warpInBlock = tid >> 5; - Xs[ty] = X[pi]; - Ws[tx] = W[bStart + tx]; + const int tile = blockIdx.x * warpsPerBlock + warpInBlock; - if (tx == 0) { - excluded_s[ty] = D_excluded[pi]; + if (tile >= total_tiles) { + return; // extra warps in last block } - __syncthreads(); - - calc_pw_t pw; - memset(&pw, 0, sizeof(calc_pw_t)); - - int row = by * BLOCK_SIZE + ty; - int column = bx * BLOCK_SIZE + tx; - - // if (row == 0 && column == 1) { - // printf("Xs[0] = %f\n", Xs[0]); - // printf("Ys[0] = %f\n", Ys[0]); - // printf("Xs[1] = %f\n", Xs[1]); - // printf("Ys[1] = %f\n", Ys[1]); - // printf("Xs[2] = %f\n", Xs[2]); - // printf("Ys[2] = %f\n", Ys[2]); - // printf("Xs[3] = %f\n", Xs[3]); - // printf("Ys[3] = %f\n", Ys[3]); - // printf("Xs[4] = %f\n", Xs[4]); - // printf("Ys[4] = %f\n", Ys[4]); - - // printf("Ys[%d] = %f Xs[%d] = %f\n", 3 * ty, Ys[3 * ty], 3 * tx, Xs[3 * tx]); - // } - - // if (bx == 8 && by == 1) printf("bx = %d by = %d tx = %d ty = %d\n", bx, by, tx, ty); - // __device__ void calc_pw_dvel_matrix_incr(int row, int pi, int column, int j, int n_patoms, - // coord_t *Ps, coord_t *Xs, double *Evdw, double *Ecoul, calc_pw_t *pw, - // ccharge_t *D_ccharges, charge_t *D_charges, catype_t *D_catypes, atype_t *D_atypes, p_atom_t *D_patoms) - double evdw = 0, ecoul = 0; - calc_pw_dvel_matrix_incr(ty, pi, tx, bStart + tx, n_atoms_solute, Xs, Ws, excluded_s, &evdw, &ecoul, &pw, D_ccharges, D_charges, D_catypes, D_atypes, D_patoms, D_topo); - Evdw_S[ty][tx] = evdw; - Ecoul_S[ty][tx] = ecoul; - - // if (row == 0 && column == 1) { - // printf("water_a = %f %f %f water_b = %f %f %f\n", water_a[0].x, water_a[0].y, water_a[0].z, water_b[0].x, water_b[0].y, water_b[0].z); - // } - - // if (bx == 8 && by == 1) printf("n_qatoms = %d\n", n_qatoms); - // if (bx == 8 && by == 1) printf("qi = %d j = %d charge[%d] = %f\n", row, column, row + n_qatoms, D_qcharges[row + n_qatoms * 1].q); - - PW_MAT[column + 3 * n_waters * row] = pw; - - __syncthreads(); - - int rowlen = (3 * n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE; - - if (tx == 0 && ty == 0) { - double tot_Evdw = 0; - double tot_Ecoul = 0; - for (int i = 0; i < BLOCK_SIZE; i++) { - for (int j = 0; j < BLOCK_SIZE; j++) { - tot_Evdw += Evdw_S[i][j]; - tot_Ecoul += Ecoul_S[i][j]; - } + int tile_y = tile / x_blocks_num; + int tile_x = tile - tile_y * x_blocks_num; + + const int base_x = tile_x << 5; + const int base_y = tile_y << 5; + + int x_idx = base_x + lane; + int y_idx = base_y + lane; + + int x_atom = (x_idx < NX) ? (D_patoms[x_idx].a - 1) : -1; // solute atom index + int y_atom = (y_idx < NY) ? (n_atoms_solute + y_idx) : -1; // water atom index + + coord_t invalid = {-1e9, -1e9, -1e9}; + coord_t x_coord = (x_atom >= 0) ? coords[x_atom] : invalid; + coord_t y_coord = (y_atom >= 0) ? coords[y_atom] : invalid; + + bool x_excluded = (x_atom >= 0) ? D_excluded[x_atom] : true; + bool y_excluded = (y_atom >= 0) ? D_excluded[y_atom] : true; + + double x_charge = (x_atom >= 0) ? D_ccharges[D_charges[x_atom].code - 1].charge : 0.0; + double y_charge = (y_atom >= 0) ? D_ccharges[D_charges[y_atom].code - 1].charge : 0.0; // TODO: FIX THIS!!! WILL NOT WORK WITH QATOMS!!!!!, + // For solute atom, That is fine. But for solvent atom, it is wrong. + + catype_t x_type = (x_atom >= 0) ? D_catypes[D_atypes[x_atom].code - 1] : catype_t{}; + catype_t y_type = (y_atom >= 0) ? D_catypes[D_atypes[y_atom].code - 1] : catype_t{}; // todo: Is that right for solvent? + + double3 x_force = {0.0, 0.0, 0.0}; + double3 y_force = {0.0, 0.0, 0.0}; + + double evdw_sum = 0; + double ecoul_sum = 0; + + const unsigned mask = 0xffffffffu; + + auto is_valid = [&]() -> bool { + if (x_idx >= NX || y_idx >= NY) return false; + if (x_excluded || y_excluded) return false; + bool bond23 = false; + if (bond23) return false; + return true; + }; + + auto do_shuffle = [&]() { + const int src = (lane + 1) & 31; + y_idx = __shfl_sync(mask, y_idx, src); + y_atom = __shfl_sync(mask, y_atom, src); + y_coord = shfl_coord(y_coord, src, mask); + y_excluded = __shfl_sync(mask, y_excluded, src); + y_charge = __shfl_sync(mask, y_charge, src); + y_type = shfl_catype(y_type, src, mask); + + y_force.x = __shfl_sync(mask, y_force.x, src); + y_force.y = __shfl_sync(mask, y_force.y, src); + y_force.z = __shfl_sync(mask, y_force.z, src); + }; + + for (int i = 0; i < 32; i++) { + if (is_valid()) { + // Here don't need to check LJ, LJ is only for solute-solute interactions + bool bond14 = false; + + double scaling = bond14 ? D_topo.el14_scale : 1.0; + double ai_aii = bond14 ? x_type.aii_1_4 : x_type.aii_normal; + double aj_aii = bond14 ? y_type.aii_1_4 : y_type.aii_normal; + + double ai_bii = bond14 ? x_type.bii_1_4 : x_type.bii_normal; + double aj_bii = bond14 ? y_type.bii_1_4 : y_type.bii_normal; + + double evdw = 0, ecoul = 0, dv = 0; + + calculate_unforce_bound( + x_coord, + y_coord, + x_charge, + y_charge, + ai_aii, + aj_aii, + ai_bii, + aj_bii, + D_topo.coulomb_constant, + scaling, + evdw, + ecoul, + dv); + + evdw_sum += evdw; + ecoul_sum += ecoul; + + double3 d = {x_coord.x - y_coord.x, x_coord.y - y_coord.y, x_coord.z - y_coord.z}; + y_force.x -= dv * d.x; + y_force.y -= dv * d.y; + y_force.z -= dv * d.z; + + x_force.x += dv * d.x; + x_force.y += dv * d.y; + x_force.z += dv * d.z; } - Evdw[rowlen * by + bx] = tot_Evdw; - Ecoul[rowlen * by + bx] = tot_Ecoul; + do_shuffle(); } - __syncthreads(); + if (x_atom >= 0) { + atomicAdd(&dvelocities[x_atom].x, x_force.x); + atomicAdd(&dvelocities[x_atom].y, x_force.y); + atomicAdd(&dvelocities[x_atom].z, x_force.z); + } + if (y_atom >= 0) { + atomicAdd(&dvelocities[y_atom].x, y_force.x); + atomicAdd(&dvelocities[y_atom].y, y_force.y); + atomicAdd(&dvelocities[y_atom].z, y_force.z); + } + + for (int offset = 16; offset > 0; offset >>= 1) { + evdw_sum += __shfl_down_sync(0xffffffffu, evdw_sum, offset); + ecoul_sum += __shfl_down_sync(0xffffffffu, ecoul_sum, offset); + } + if (lane == 0) { + // printf("Tile x=%d y=%d evdw_sum=%f ecoul_sum=%f\n", tile_x, tile_y, evdw_sum, ecoul_sum); + atomicAdd(Evdw_TOT, evdw_sum); + atomicAdd(Ecoul_TOT, ecoul_sum); + } } } // namespace CudaNonbondedPWForce @@ -287,81 +278,59 @@ void calc_nonbonded_pw_forces_host_v2() { auto D_patoms = ctx.d_p_atoms; auto D_excluded = ctx.d_excluded; - dim3 threads, grid; + cudaMemset(D_PW_evdw_TOT, 0, sizeof(double)); + cudaMemset(D_PW_ecoul_TOT, 0, sizeof(double)); - threads = dim3(BLOCK_SIZE, BLOCK_SIZE); - grid = dim3((3 * n_waters + BLOCK_SIZE - 1) / threads.x, (n_patoms + BLOCK_SIZE - 1) / threads.y); + const int thread_num = 128; + dim3 block_sz = dim3(thread_num); - calc_pw_dvel_matrix<<>>(n_patoms, n_atoms_solute, n_waters, X, X + n_atoms_solute, D_PW_Evdw, D_PW_Ecoul, PW_MAT, - D_ccharges, D_charges, D_catypes, D_atypes, D_patoms, D_excluded, topo); - calc_pw_dvel_vector_column<<<((3 * n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_patoms, n_waters, DV_X, DV_X + n_atoms_solute, PW_MAT); - calc_pw_dvel_vector_row<<<((n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_patoms, n_waters, DV_X, DV_X + n_atoms_solute, PW_MAT, D_patoms); + int tile_num_per_block = thread_num / 32; -#ifdef DEBUG - cudaMemcpy(h_PW_MAT, PW_MAT, mem_size_PW_MAT, cudaMemcpyDeviceToHost); -#endif + int x_blocks_num = (n_patoms + 31) / 32; // solute + int y_blocks_num = ((3 * n_waters) + 31) / 32; // water -// for (int i = 0; i < n_waters; i++) { -// printf("X[%d] = %f %f %f\n", i, coords[i].x, coords[i].y, coords[i].z); -// } + int total_tiles = x_blocks_num * y_blocks_num; // full rectangle -// printf("n_patoms = %d n_watoms = %d\n", n_patoms, 3 * n_waters); -#ifdef DEBUG - for (int i = 0; i < n_patoms; i++) { - for (int j = 0; j < 3 * n_waters; j++) { - if (h_PW_MAT[3 * i * n_waters + j].P.x > 100) - printf("PW_MAT[%d][%d].P = %f %f %f\n", i, j, h_PW_MAT[3 * i * n_waters + j].P.x, h_PW_MAT[3 * i * n_waters + j].P.y, h_PW_MAT[3 * i * n_waters + j].P.z); - } - } -#endif + int grid_sz = (total_tiles + tile_num_per_block - 1) / tile_num_per_block; - cudaMemcpy(dvelocities, DV_X, mem_size_DV_X, cudaMemcpyDeviceToHost); - cudaMemcpy(&dvelocities[n_atoms_solute], DV_X + n_atoms_solute, mem_size_DV_W, cudaMemcpyDeviceToHost); + dim3 grid = dim3(grid_sz); - calc_energy_sum<<<1, threads>>>(n_blocks_p, n_blocks_w, D_PW_evdw_TOT, D_PW_ecoul_TOT, D_PW_Evdw, D_PW_Ecoul, false); + calc_pw<<>>( + n_patoms, + 3 * n_waters, + D_charges, + D_ccharges, + D_atypes, + D_catypes, + topo, + D_excluded, + ctx.d_LJ_matrix, + D_patoms, + X, + DV_X, + n_atoms_solute, + true, + D_PW_evdw_TOT, + D_PW_ecoul_TOT); + cudaDeviceSynchronize(); + size_t mem_size_all = (n_atoms_solute + 3 * n_waters) * sizeof(dvel_t); + cudaMemcpy(dvelocities, DV_X, mem_size_all, cudaMemcpyDeviceToHost); + + double PW_evdw_TOT, PW_ecoul_TOT; cudaMemcpy(&PW_evdw_TOT, D_PW_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost); cudaMemcpy(&PW_ecoul_TOT, D_PW_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost); E_nonbond_pw.Uvdw += PW_evdw_TOT; E_nonbond_pw.Ucoul += PW_ecoul_TOT; - - // for (int i = 0; i < n_atoms; i++) { - // printf("dvelocities[%d] = %f %f %f\n", i, dvelocities[i].x, dvelocities[i].y, dvelocities[i].z); - // } } void init_nonbonded_pw_force_kernel_data() { using namespace CudaNonbondedPWForce; if (!is_initialized) { - int mem_size_PW_MAT = 3 * n_waters * n_patoms * sizeof(calc_pw_t); - - int n_blocks_p = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE; - int n_blocks_w = (3 * n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE; - int mem_size_PW_Evdw = n_blocks_p * n_blocks_w * sizeof(double); - int mem_size_PW_Ecoul = n_blocks_p * n_blocks_w * sizeof(double); -#ifdef DEBUG - printf("Allocating PW_MAT\n"); -#endif - check_cudaMalloc((void**)&PW_MAT, mem_size_PW_MAT); - -#ifdef DEBUG - printf("Allocating D_PW_Evdw\n"); -#endif - check_cudaMalloc((void**)&D_PW_Evdw, mem_size_PW_Evdw); -#ifdef DEBUG - printf("Allocating D_PW_Ecoul\n"); -#endif - check_cudaMalloc((void**)&D_PW_Ecoul, mem_size_PW_Ecoul); - check_cudaMalloc((void**)&D_PW_evdw_TOT, sizeof(double)); check_cudaMalloc((void**)&D_PW_ecoul_TOT, sizeof(double)); -#ifdef DEBUG - printf("All GPU solvent memory allocated\n"); -#endif - - h_PW_MAT = (calc_pw_t*)malloc(mem_size_PW_MAT); is_initialized = true; } } @@ -369,12 +338,8 @@ void init_nonbonded_pw_force_kernel_data() { void cleanup_nonbonded_pw_force() { using namespace CudaNonbondedPWForce; if (is_initialized) { - cudaFree(PW_MAT); - cudaFree(D_PW_Evdw); - cudaFree(D_PW_Ecoul); cudaFree(D_PW_evdw_TOT); cudaFree(D_PW_ecoul_TOT); - free(h_PW_MAT); is_initialized = false; } -} \ No newline at end of file +}