-
Notifications
You must be signed in to change notification settings - Fork 1
Feature/qgpu optimize nonbonded ww using openmm method #68
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: feature/qgpu
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -1,5 +1,6 @@ | ||||||||||||||||||||||||||
| #include "cuda/include/CudaContext.cuh" | ||||||||||||||||||||||||||
| #include "cuda/include/CudaNonbondedWWForce.cuh" | ||||||||||||||||||||||||||
| #include <iostream> | ||||||||||||||||||||||||||
| namespace CudaNonbondedWWForce { | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| bool is_initialized = false; | ||||||||||||||||||||||||||
|
|
@@ -42,210 +43,130 @@ __device__ __forceinline__ void calculate_unforce_bound( | |||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| template <const int Thread_x, const int Thread_y, const int Block_x, | ||||||||||||||||||||||||||
| const int Block_y> | ||||||||||||||||||||||||||
| __global__ void | ||||||||||||||||||||||||||
| calc_ww(const int N, const double crg_ow, const double crg_hw, | ||||||||||||||||||||||||||
| const double A_OO, const double B_OO, const topo_t D_topo, | ||||||||||||||||||||||||||
| coord_t* __restrict__ W, dvel_t* __restrict__ DV_W, | ||||||||||||||||||||||||||
| double* __restrict__ Evdw_TOT, double* __restrict__ ecoul_TOT) { | ||||||||||||||||||||||||||
| // Calculate block boundaries | ||||||||||||||||||||||||||
| int NX = N; | ||||||||||||||||||||||||||
| int NY = (N + 1) / 2; | ||||||||||||||||||||||||||
| int x_cal_num = blockDim.x * Block_x; | ||||||||||||||||||||||||||
| int y_cal_num = blockDim.y * Block_y; | ||||||||||||||||||||||||||
| int block_x_left_begin = 1 + blockIdx.x * x_cal_num; | ||||||||||||||||||||||||||
| int block_y_left_begin = blockIdx.y * y_cal_num; | ||||||||||||||||||||||||||
| int block_x_left_end = min(block_x_left_begin + x_cal_num - 1, NX - 1); | ||||||||||||||||||||||||||
| int block_y_left_end = min(block_y_left_begin + y_cal_num - 1, NY - 1); | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| // Shared memory declarations with padding to avoid bank conflicts | ||||||||||||||||||||||||||
| __shared__ coord_t p[2 * Thread_x * Block_x + 1]; | ||||||||||||||||||||||||||
| __shared__ coord_t q[2 * Thread_y * Block_y + 1]; | ||||||||||||||||||||||||||
| __shared__ double sum_row_x[2 * Thread_y * Block_y + 1]; | ||||||||||||||||||||||||||
| __shared__ double sum_row_y[2 * Thread_y * Block_y + 1]; | ||||||||||||||||||||||||||
| __shared__ double sum_row_z[2 * Thread_y * Block_y + 1]; | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| __shared__ double block_ecoul[(Thread_x * Thread_y + 31) / 32], | ||||||||||||||||||||||||||
| block_evdw[(Thread_x * Thread_y + 31) / 32]; | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| // Thread indices | ||||||||||||||||||||||||||
| int thread_y_left_begin = block_y_left_begin + threadIdx.y * Block_y; | ||||||||||||||||||||||||||
| int thread_x_left_begin = block_x_left_begin + threadIdx.x * Block_x; | ||||||||||||||||||||||||||
| int cur_thread_num = blockDim.x * threadIdx.y + threadIdx.x; | ||||||||||||||||||||||||||
| int thread_num = blockDim.x * blockDim.y; | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| // Optimized coordinate loading with coalesced memory access | ||||||||||||||||||||||||||
| #pragma unroll | ||||||||||||||||||||||||||
| for (int i = cur_thread_num; i < x_cal_num && block_x_left_begin + i < NX; i += thread_num) { | ||||||||||||||||||||||||||
| p[i] = W[block_x_left_begin + i]; | ||||||||||||||||||||||||||
| p[x_cal_num + i] = W[N - (block_x_left_begin + i)]; | ||||||||||||||||||||||||||
| __device__ __forceinline__ void idx2xy(int n, int t, int& x, int& y) { | ||||||||||||||||||||||||||
| x = (int)floorf((2 * n + 1 - sqrtf((2 * n + 1) * (2 * n + 1) - 8 * t)) * 0.5f); | ||||||||||||||||||||||||||
|
||||||||||||||||||||||||||
| x = (int)floorf((2 * n + 1 - sqrtf((2 * n + 1) * (2 * n + 1) - 8 * t)) * 0.5f); | |
| // Use double precision for the quadratic inversion to avoid precision loss for large n, t. | |
| double dn = static_cast<double>(n); | |
| double dt = static_cast<double>(t); | |
| double tmp = 2.0 * dn + 1.0; | |
| double disc = tmp * tmp - 8.0 * dt; | |
| double xd = floor((tmp - sqrt(disc)) * 0.5); | |
| x = static_cast<int>(xd); |
Copilot
AI
Jan 6, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The idx2xy function lacks documentation explaining its algorithm and parameters. Add a comment describing that this maps a linear tile index to 2D coordinates in the upper triangle of a matrix, and explain the meaning of the parameters n (number of blocks per dimension) and t (tile index).
Copilot
AI
Jan 6, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The shfl and shfl_coord helper functions lack documentation. Add comments explaining that these functions enable warp shuffle operations for double and coord_t types, respectively, which are essential for the warp-synchronous algorithm.
Copilot
AI
Jan 6, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Add a comment explaining the grid decomposition strategy. The calculation (N + 31) >> 5 computes the number of 32-atom blocks (ceiling division by 32), and idx2xy maps the linear tile index to upper-triangle coordinates. This is a key part of the algorithm and deserves documentation for maintainability.
Copilot
AI
Jan 6, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Variable name 'invalid' is vague and could be more descriptive. Consider renaming to 'invalidCoord' or 'outOfBoundsMarker' to clarify its purpose as a sentinel value for out-of-range atoms.
| coord_t invalid = {-1e9, -1e9, -1e9}; | |
| coord_t row = (rowIdx < N ? W[rowIdx] : invalid); | |
| // "col" state that will be rotated around the warp | |
| int colIdx = colIdx0; | |
| coord_t col = (colIdx < N ? W[colIdx] : invalid); | |
| coord_t invalidCoord = {-1e9, -1e9, -1e9}; | |
| coord_t row = (rowIdx < N ? W[rowIdx] : invalidCoord); | |
| // "col" state that will be rotated around the warp | |
| int colIdx = colIdx0; | |
| coord_t col = (colIdx < N ? W[colIdx] : invalidCoord); |
Copilot
AI
Jan 6, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The commented-out printf statement should be removed before merging. Debug code should not remain in production code.
| // printf("evdw_sum: %f, ecoul_sum: %f\n", evdw_sum, ecoul_sum); |
Copilot
AI
Jan 6, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The magic number 128 for thread_num should be defined as a named constant (e.g., THREAD_BLOCK_SIZE) to improve code maintainability and make it clear that this value is related to the warpsPerBlock calculation in the kernel (line 74).
Copilot
AI
Jan 6, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Multiple blank lines should be removed. This appears to be accidental whitespace that reduces code readability.
Copilot
AI
Jan 6, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The commented-out printf statement should be removed before merging. Debug code should not remain in production code.
| // printf("WW E_vdw: %f, E_coul: %f\n", WW_evdw_TOT, WW_ecoul_TOT); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The iostream header is included but not used in this file. This adds unnecessary compilation overhead and should be removed.