From ce2dda0407443acfc6a92a374ba7591737d56b77 Mon Sep 17 00:00:00 2001
From: "shen.guo" <g.shen@rug.nl>
Date: Tue, 6 Jan 2026 16:33:20 +0100
Subject: [PATCH 1/4] optimize nonbonded pp force using openmm method

---
 src/core/cuda/src/CudaNonbondedPPForce.cu | 290 ++++++++++++++++++----
 src/core/parse.cu                         |   4 +
 2 files changed, 247 insertions(+), 47 deletions(-)
diff --git a/src/core/cuda/src/CudaNonbondedPPForce.cu b/src/core/cuda/src/CudaNonbondedPPForce.cu
index d32db32b..e5a78ee8 100644
--- a/src/core/cuda/src/CudaNonbondedPPForce.cu
+++ b/src/core/cuda/src/CudaNonbondedPPForce.cu
@@ -1,11 +1,11 @@
+#include <iostream>
+
 #include "cuda/include/CudaContext.cuh"
 #include "cuda/include/CudaNonbondedPPForce.cuh"
 #include "utils.h"
 
 namespace CudaNonbondedPPForce {
 bool is_initialized = false;
-dvel_t* PP_MAT;
-double *D_PP_Evdw, *D_PP_Ecoul;
 double *D_PP_evdw_TOT, *D_PP_ecoul_TOT;
 
 __device__ void calc_pp_dvel_matrix_incr(int row, int pi, int column, int pj,
@@ -65,6 +65,231 @@ __device__ void calc_pp_dvel_matrix_incr(int row, int pi, int column, int pj,
     *Evdw += (V_a - V_b);
 }
 
+__device__ __forceinline__ void idx2xy(int n, int t, int& x, int& y) {
+    x = (int)floorf((2 * n + 1 - sqrtf((2 * n + 1) * (2 * n + 1) - 8 * t)) * 0.5f);
+    y = t - (x * n - (x * (x - 1) >> 1));
+    if (y < 0) {
+        x--;
+        y += (n - x);
+    }
+    y += x;
+}
+
+__device__ __forceinline__ double shfl(double v, int srcLane, unsigned mask = 0xffffffffu) {
+    int2 a = *reinterpret_cast<int2*>(&v);
+    a.x = __shfl_sync(mask, a.x, srcLane);
+    a.y = __shfl_sync(mask, a.y, srcLane);
+    return *reinterpret_cast<double*>(&a);
+}
+
+__device__ __forceinline__ coord_t shfl_coord(coord_t v, int srcLane, unsigned mask = 0xffffffffu) {
+    v.x = shfl(v.x, srcLane, mask);
+    v.y = shfl(v.y, srcLane, mask);
+    v.z = shfl(v.z, srcLane, mask);
+    return v;
+}
+
+__device__ void calculate_unforce_bound(
+    // const int x_idx,
+    // const int y_idx,
+
+    const coord_t& x,
+    const coord_t& y,
+
+    const double x_charge,
+    const double y_charge,
+
+    const double x_aii,
+    const double y_aii,
+
+    const double x_bii,
+    const double y_bii,
+
+    const double coulomb_constant,
+
+    const double scaling,
+
+    double& evdw,
+    double& ecoul,
+    double& dv) {
+    double3 d = {y.x - x.x, y.y - x.y, y.z - x.z};
+
+    double r2 = 1.0 / (d.x * d.x + d.y * d.y + d.z * d.z);
+    double r = sqrt(r2);
+    double r6 = r2 * r2 * r2;
+
+    ecoul = scaling * coulomb_constant * x_charge * y_charge * r;
+
+    double v_a = r6 * r6 * x_aii * y_aii;
+    double v_b = r6 * x_bii * y_bii;
+    evdw = v_a - v_b;
+    dv = r2 * (-ecoul - 12.0 * v_a + 6.0 * v_b);
+}
+
+__global__ void calc_pp(
+    const int N,
+    const int n_atoms_solute,
+
+    const charge_t* D_charges,    // charge type of each atom
+    const ccharge_t* D_ccharges,  // charge value of each charge type
+
+    const atype_t* D_atypes,
+    const catype_t* D_catypes,
+
+    const topo_t D_topo,
+
+    const bool* D_excluded,
+
+    const int* D_LJ_matrix,
+
+    const p_atom_t* D_patoms,
+
+    coord_t* X,
+    dvel_t* DV_X,
+
+    double* Evdw_TOT,
+    double* Ecoul_TOT) {
+    const int nBlocks = (N + 31) >> 5;  // ceil(N/32)
+    const int totalTiles = nBlocks * (nBlocks + 1) / 2;
+
+    const int warpsPerBlock = blockDim.x >> 5;  // THREAD_BLOCK_SIZE / 32
+    const int tid = threadIdx.x;
+    const int lane = tid & 31;
+    const int warpInBlock = tid >> 5;
+
+    const int tile = blockIdx.x * warpsPerBlock + warpInBlock;
+    if (tile >= totalTiles) {
+        return;  // extra warps in last block
+    }
+
+    // Map tile -> (x,y) with y>=x (upper triangle)
+    int tile_x, tile_y;
+    idx2xy(nBlocks, tile, tile_x, tile_y);
+
+    const int baseX = tile_x << 5;  // x*32
+    const int baseY = tile_y << 5;  // y*32
+
+    int x_idx = baseX + lane;
+    int y_idx = baseY + lane;
+
+    int x_atom = (x_idx < N) ? (D_patoms[x_idx].a - 1) : -1;
+    int y_atom = (y_idx < N) ? (D_patoms[y_idx].a - 1) : -1;
+
+    coord_t invalid = {-1e9, -1e9, -1e9};
+    coord_t x_coord = (x_atom >= 0) ? X[x_atom] : invalid;
+    coord_t y_coord = (y_atom >= 0) ? X[y_atom] : invalid;
+
+    bool x_excluded = (x_atom >= 0) ? D_excluded[x_atom] : true;
+    bool y_excluded = (y_atom >= 0) ? D_excluded[y_atom] : true;
+
+    double x_charge = (x_atom >= 0) ? D_ccharges[D_charges[x_atom].code - 1].charge : 0.0;
+    double y_charge = (y_atom >= 0) ? D_ccharges[D_charges[y_atom].code - 1].charge : 0.0;
+
+    catype_t x_type = (x_atom >= 0) ? D_catypes[D_atypes[x_atom].code - 1] : catype_t{};
+    catype_t y_type = (y_atom >= 0) ? D_catypes[D_atypes[y_atom].code - 1] : catype_t{};
+    double x_aii_normal = x_type.aii_normal;
+    double y_aii_normal = y_type.aii_normal;
+    double x_aii_14 = x_type.aii_1_4;
+    double y_aii_14 = y_type.aii_1_4;
+    double x_bii_normal = x_type.bii_normal;
+    double y_bii_normal = y_type.bii_normal;
+    double x_bii_14 = x_type.bii_1_4;
+    double y_bii_14 = y_type.bii_1_4;
+
+    double3 x_force = {0.0, 0.0, 0.0};
+    double3 y_force = {0.0, 0.0, 0.0};
+
+    double evdw_sum = 0;
+    double ecoul_sum = 0;
+
+    const unsigned mask = 0xffffffffu;
+
+    auto is_valid = [&]() -> bool {
+        if (x_idx >= N || y_idx >= N) {
+            return false;
+        }
+        if (tile_x == tile_y && x_idx <= y_idx) {
+            return false;
+        }
+        if (x_excluded || y_excluded) {
+            return false;
+        }
+
+    
+        bool bond23 = (D_LJ_matrix[y_atom * n_atoms_solute + x_atom] == 3);
+        if (bond23) {
+            return false;
+        }
+
+        return true;
+    };
+
+    // shuffle to get next y
+    auto do_shuffle = [&]() {
+        const int src = (lane + 1) & 31;
+        y_idx = __shfl_sync(mask, y_idx, src);
+        y_atom = __shfl_sync(mask, y_atom, src);
+        y_coord = shfl_coord(y_coord, src, mask);
+        y_excluded = __shfl_sync(mask, y_excluded, src);
+        y_charge = __shfl_sync(mask, y_charge, src);
+        y_aii_normal = __shfl_sync(mask, y_aii_normal, src);
+        y_aii_14 = __shfl_sync(mask, y_aii_14, src);
+        y_bii_normal = __shfl_sync(mask, y_bii_normal, src);
+        y_bii_14 = __shfl_sync(mask, y_bii_14, src);
+
+        // shuffle y_force
+        y_force.x = shfl(y_force.x, src, mask);
+        y_force.y = shfl(y_force.y, src, mask);
+        y_force.z = shfl(y_force.z, src, mask);
+    };
+
+    for (int i = 0; i < 32; i++) {
+        if (is_valid()) {
+            bool bond14 = (D_LJ_matrix[y_atom * n_atoms_solute + x_atom] == 1);
+            double scaling = bond14 ? D_topo.el14_scale : 1.0;
+            double ai_aii = bond14 ? x_aii_14 : x_aii_normal;
+            double aj_aii = bond14 ? y_aii_14 : y_aii_normal;
+            double ai_bii = bond14 ? x_bii_14 : x_bii_normal;
+            double aj_bii = bond14 ? y_bii_14 : y_bii_normal;
+            double evdw = 0.0, ecoul = 0.0, dv = 0.0;
+            calculate_unforce_bound(
+                x_coord, y_coord, x_charge, y_charge, ai_aii, aj_aii, ai_bii, aj_bii,
+                D_topo.coulomb_constant, scaling, evdw, ecoul, dv);
+            evdw_sum += evdw;
+            ecoul_sum += ecoul;
+            double3 d = {y_coord.x - x_coord.x, y_coord.y - x_coord.y, y_coord.z - x_coord.z};
+            x_force.x += dv * d.x;
+            x_force.y += dv * d.y;
+            x_force.z += dv * d.z;
+
+            y_force.x -= dv * d.x;
+            y_force.y -= dv * d.y;
+            y_force.z -= dv * d.z;
+        }
+        do_shuffle();
+    }
+
+    if (x_atom >= 0) {
+        atomicAdd(&DV_X[x_atom].x, x_force.x);
+        atomicAdd(&DV_X[x_atom].y, x_force.y);
+        atomicAdd(&DV_X[x_atom].z, x_force.z);
+    }
+    if (y_atom >= 0) {
+        atomicAdd(&DV_X[y_atom].x, y_force.x);
+        atomicAdd(&DV_X[y_atom].y, y_force.y);
+        atomicAdd(&DV_X[y_atom].z, y_force.z);
+    }
+
+    for (int offset = 16; offset > 0; offset >>= 1) {
+        evdw_sum += __shfl_down_sync(0xffffffffu, evdw_sum, offset);
+        ecoul_sum += __shfl_down_sync(0xffffffffu, ecoul_sum, offset);
+    }
+    if (lane == 0) {
+        atomicAdd(Evdw_TOT, evdw_sum);
+        atomicAdd(Ecoul_TOT, ecoul_sum);
+    }
+}
+
 __global__ void calc_pp_dvel_matrix(int n_patoms, int n_atoms_solute,
                                     coord_t* X, double* Evdw, double* Ecoul, dvel_t* PP_MAT,
                                     ccharge_t* D_ccharges, charge_t* D_charges, catype_t* D_catypes, atype_t* D_atypes, p_atom_t* D_patoms, int* D_LJ_matrix, bool* D_excluded, topo_t D_topo) {
@@ -221,8 +446,6 @@ void calc_nonbonded_pp_forces_host_v2() {
 
     int mem_size_DV_X = n_atoms_solute * sizeof(dvel_t);
 
-    int n_blocks = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
-
     CudaContext& ctx = CudaContext::instance();
     auto X = ctx.d_coords;
     auto DV_X = ctx.d_dvelocities;
@@ -234,32 +457,27 @@ void calc_nonbonded_pp_forces_host_v2() {
     auto D_LJ_matrix = ctx.d_LJ_matrix;
     auto D_excluded = ctx.d_excluded;
 
-    dim3 threads, grid;
+    cudaMemset(D_PP_evdw_TOT, 0, sizeof(double));
+    cudaMemset(D_PP_ecoul_TOT, 0, sizeof(double));
 
-    threads = dim3(BLOCK_SIZE, BLOCK_SIZE);
-    grid = dim3((n_patoms + BLOCK_SIZE - 1) / threads.x, (n_patoms + BLOCK_SIZE - 1) / threads.y);
+    const int thread_num = 128;
+    dim3 block_sz = dim3(thread_num);
 
-    calc_pp_dvel_matrix<<<grid, threads>>>(n_patoms, n_atoms_solute, X, D_PP_Evdw, D_PP_Ecoul, PP_MAT, D_ccharges, D_charges, D_catypes, D_atypes, D_patoms, D_LJ_matrix, D_excluded, topo);
-    calc_pp_dvel_vector<<<((n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_patoms, DV_X, PP_MAT, D_patoms);
+    int tile_num_per_block = thread_num / 32;
+    int nBlocks = (n_patoms + 31) / 32;
+    int total_tiles = nBlocks * (nBlocks + 1) / 2;
 
-    cudaMemcpy(dvelocities, DV_X, mem_size_DV_X, cudaMemcpyDeviceToHost);
+    int grid_sz = (total_tiles + tile_num_per_block - 1) / tile_num_per_block;
+    dim3 grid = dim3(grid_sz);
 
-#ifdef DEBUG
-    cudaMemcpy(h_PP_MAT, PP_MAT, mem_size_PP_MAT, cudaMemcpyDeviceToHost);
-#endif
+    calc_pp<<<grid, block_sz>>>(n_patoms, n_atoms_solute, D_charges, D_ccharges,
+                                D_atypes, D_catypes, topo, D_excluded,
+                                D_LJ_matrix, D_patoms, X, DV_X, D_PP_evdw_TOT,
+                                D_PP_ecoul_TOT);
 
-#ifdef DEBUG
-    for (int i = 0; i < n_patoms; i++) {
-        for (int j = 0; j < n_patoms; j++) {
-            if (h_PP_MAT[i * n_patoms + j].x > 100)
-                printf("PP_MAT[%d][%d] = %f %f %f\n", i, j, h_PP_MAT[i * n_patoms + j].x, h_PP_MAT[i * n_patoms + j].y, h_PP_MAT[i * n_patoms + j].z);
-        }
-    }
-#endif
-    cudaMemset(D_PP_evdw_TOT, 0, sizeof(double));
-    cudaMemset(D_PP_ecoul_TOT, 0, sizeof(double));
+    cudaDeviceSynchronize();
+    cudaMemcpy(dvelocities, DV_X, mem_size_DV_X, cudaMemcpyDeviceToHost);
 
-    calc_energy_sum<<<1, threads>>>(n_blocks, n_blocks, D_PP_evdw_TOT, D_PP_ecoul_TOT, D_PP_Evdw, D_PP_Ecoul, true);
     double PP_evdw_TOT, PP_ecoul_TOT;
     cudaMemcpy(&PP_evdw_TOT, D_PP_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost);
     cudaMemcpy(&PP_ecoul_TOT, D_PP_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost);
@@ -272,25 +490,6 @@ void init_nonbonded_pp_force_kernel_data() {
     using namespace CudaNonbondedPPForce;
 
     if (!is_initialized) {
-        int n_blocks = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
-
-        int mem_size_PP_MAT = n_patoms * n_patoms * sizeof(dvel_t);
-
-        int mem_size_PP_Evdw = n_blocks * n_blocks * sizeof(double);
-        int mem_size_PP_Ecoul = n_blocks * n_blocks * sizeof(double);
-#ifdef DEBUG
-        printf("Allocating PP_MAT\n");
-#endif
-        check_cudaMalloc((void**)&PP_MAT, mem_size_PP_MAT);
-#ifdef DEBUG
-        printf("Allocating D_PP_Evdw\n");
-#endif
-        check_cudaMalloc((void**)&D_PP_Evdw, mem_size_PP_Evdw);
-#ifdef DEBUG
-        printf("Allocating D_PP_Ecoul\n");
-#endif
-        check_cudaMalloc((void**)&D_PP_Ecoul, mem_size_PP_Ecoul);
-
         check_cudaMalloc((void**)&D_PP_evdw_TOT, sizeof(double));
         check_cudaMalloc((void**)&D_PP_ecoul_TOT, sizeof(double));
 
@@ -301,11 +500,8 @@ void init_nonbonded_pp_force_kernel_data() {
 void cleanup_nonbonded_pp_force() {
     using namespace CudaNonbondedPPForce;
     if (is_initialized) {
-        cudaFree(PP_MAT);
-        cudaFree(D_PP_Evdw);
-        cudaFree(D_PP_Ecoul);
         cudaFree(D_PP_evdw_TOT);
         cudaFree(D_PP_ecoul_TOT);
         is_initialized = false;
     }
-}
\ No newline at end of file
+}
diff --git a/src/core/parse.cu b/src/core/parse.cu
index 1b9e0f8d..c5518223 100644
--- a/src/core/parse.cu
+++ b/src/core/parse.cu
@@ -753,6 +753,7 @@ void init_ngbrs14(const char* filename) {
                 int jx = (lineI + i + 1) % lines;
                 // if (ix < 100 && jx < 100) printf("i = %d j = %d\n", ix+1, jx+1);
                 LJ_matrix[ix * n_atoms_solute + jx] = 1;
+                LJ_matrix[jx * n_atoms_solute + ix] = 1;
             }
         }
         lineI++;
@@ -797,6 +798,7 @@ void init_ngbrs23(const char* filename) {
                 int jx = (lineI + i + 1) % lines;
                 // if (ix < 100 && jx < 100) printf("i = %d j = %d\n", ix+1, jx+1);
                 LJ_matrix[ix * n_atoms_solute + jx] = 3;
+                LJ_matrix[jx * n_atoms_solute + ix] = 3;
             }
         }
         lineI++;
@@ -819,6 +821,7 @@ void init_ngbrs14_long(const char* filename) {
         int ix = atoi(file.buffer[i+1][0])-1;
         int jx = atoi(file.buffer[i+1][1])-1;
         LJ_matrix[ix * n_atoms_solute + jx] = 1;
+        LJ_matrix[jx * n_atoms_solute + ix] = 1;
     }
 
     clean_csv(file);
@@ -838,6 +841,7 @@ void init_ngbrs23_long(const char* filename) {
         int ix = atoi(file.buffer[i+1][0])-1;
         int jx = atoi(file.buffer[i+1][1])-1;
         LJ_matrix[ix * n_atoms_solute + jx] = 3;
+        LJ_matrix[jx * n_atoms_solute + ix] = 3;
     }
 
     clean_csv(file);

From 7143986823074c194944a033abe7c0ae1acfd259 Mon Sep 17 00:00:00 2001
From: "shen.guo" <g.shen@rug.nl>
Date: Tue, 6 Jan 2026 16:55:35 +0100
Subject: [PATCH 2/4] optimize nonbonded pp force using openmm method

---
 src/core/cuda/src/CudaNonbondedPPForce.cu | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/src/core/cuda/src/CudaNonbondedPPForce.cu b/src/core/cuda/src/CudaNonbondedPPForce.cu
index e5a78ee8..4b2b67b2 100644
--- a/src/core/cuda/src/CudaNonbondedPPForce.cu
+++ b/src/core/cuda/src/CudaNonbondedPPForce.cu
@@ -214,9 +214,7 @@ __global__ void calc_pp(
         if (x_excluded || y_excluded) {
             return false;
         }
-
-    
-        bool bond23 = (D_LJ_matrix[y_atom * n_atoms_solute + x_atom] == 3);
+        bool bond23 = (D_LJ_matrix[x_atom * n_atoms_solute + y_atom] == 3);
         if (bond23) {
             return false;
         }
@@ -258,13 +256,13 @@ __global__ void calc_pp(
             evdw_sum += evdw;
             ecoul_sum += ecoul;
             double3 d = {y_coord.x - x_coord.x, y_coord.y - x_coord.y, y_coord.z - x_coord.z};
-            x_force.x += dv * d.x;
-            x_force.y += dv * d.y;
-            x_force.z += dv * d.z;
+            x_force.x -= dv * d.x;
+            x_force.y -= dv * d.y;
+            x_force.z -= dv * d.z;
 
-            y_force.x -= dv * d.x;
-            y_force.y -= dv * d.y;
-            y_force.z -= dv * d.z;
+            y_force.x += dv * d.x;
+            y_force.y += dv * d.y;
+            y_force.z += dv * d.z;
         }
         do_shuffle();
     }

From 318a2f2f448c40598232c492c95e4b7720ddc4de Mon Sep 17 00:00:00 2001
From: "shen.guo" <g.shen@rug.nl>
Date: Thu, 8 Jan 2026 09:34:11 +0100
Subject: [PATCH 3/4] adjst the direction to make it same as before

---
 src/core/cuda/src/CudaNonbondedPPForce.cu | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/src/core/cuda/src/CudaNonbondedPPForce.cu b/src/core/cuda/src/CudaNonbondedPPForce.cu
index 4b2b67b2..d94115b4 100644
--- a/src/core/cuda/src/CudaNonbondedPPForce.cu
+++ b/src/core/cuda/src/CudaNonbondedPPForce.cu
@@ -112,7 +112,7 @@ __device__ void calculate_unforce_bound(
     double& evdw,
     double& ecoul,
     double& dv) {
-    double3 d = {y.x - x.x, y.y - x.y, y.z - x.z};
+    double3 d = {x.x - y.x, x.y - y.y, x.z - y.z};
 
     double r2 = 1.0 / (d.x * d.x + d.y * d.y + d.z * d.z);
     double r = sqrt(r2);
@@ -255,14 +255,15 @@ __global__ void calc_pp(
                 D_topo.coulomb_constant, scaling, evdw, ecoul, dv);
             evdw_sum += evdw;
             ecoul_sum += ecoul;
-            double3 d = {y_coord.x - x_coord.x, y_coord.y - x_coord.y, y_coord.z - x_coord.z};
-            x_force.x -= dv * d.x;
-            x_force.y -= dv * d.y;
-            x_force.z -= dv * d.z;
-
-            y_force.x += dv * d.x;
-            y_force.y += dv * d.y;
-            y_force.z += dv * d.z;
+            double3 d = {x_coord.x - y_coord.x, x_coord.y - y_coord.y, x_coord.z - y_coord.z};
+            y_force.x -= dv * d.x;
+            y_force.y -= dv * d.y;
+            y_force.z -= dv * d.z;
+
+            x_force.x += dv * d.x;
+            x_force.y += dv * d.y;
+            x_force.z += dv * d.z;
+
         }
         do_shuffle();
     }

From 50caeb961dbbf29aca55ea8fae2e1fd120495778 Mon Sep 17 00:00:00 2001
From: "shen.guo" <g.shen@rug.nl>
Date: Thu, 8 Jan 2026 09:35:38 +0100
Subject: [PATCH 4/4] remove old code

---
 src/core/cuda/src/CudaNonbondedPPForce.cu | 207 ----------------------
 1 file changed, 207 deletions(-)

diff --git a/src/core/cuda/src/CudaNonbondedPPForce.cu b/src/core/cuda/src/CudaNonbondedPPForce.cu
index d94115b4..0041df11 100644
--- a/src/core/cuda/src/CudaNonbondedPPForce.cu
+++ b/src/core/cuda/src/CudaNonbondedPPForce.cu
@@ -8,63 +8,6 @@ namespace CudaNonbondedPPForce {
 bool is_initialized = false;
 double *D_PP_evdw_TOT, *D_PP_ecoul_TOT;
 
-__device__ void calc_pp_dvel_matrix_incr(int row, int pi, int column, int pj,
-                                         coord_t* Xs, coord_t* Ys, int* LJs, bool* excluded_s, double* Evdw, double* Ecoul, dvel_t* patom_a, dvel_t* patom_b,
-                                         ccharge_t* D_ccharges, charge_t* D_charges, catype_t* D_catypes, atype_t* D_atypes, p_atom_t* D_patoms, topo_t D_topo) {
-    bool bond14, bond23;
-    double scaling;
-    coord_t da;
-    double r2a, ra, r6a;
-    double Vela, V_a, V_b;
-    double dva;
-    double crg_i, crg_j;
-    double ai_aii, aj_aii, ai_bii, aj_bii;
-    catype_t ai_type, aj_type;
-
-    bond23 = LJs[row * BLOCK_SIZE + column] == 3;
-    bond14 = LJs[row * BLOCK_SIZE + column] == 1;
-
-    if (bond23) return;
-    if (excluded_s[row] || excluded_s[BLOCK_SIZE + column]) return;
-
-    scaling = bond14 ? D_topo.el14_scale : 1;
-
-    crg_i = D_ccharges[D_charges[pi].code - 1].charge;
-    crg_j = D_ccharges[D_charges[pj].code - 1].charge;
-
-    ai_type = D_catypes[D_atypes[pi].code - 1];
-    aj_type = D_catypes[D_atypes[pj].code - 1];
-
-    da.x = Ys[column].x - Xs[row].x;
-    da.y = Ys[column].y - Xs[row].y;
-    da.z = Ys[column].z - Xs[row].z;
-    r2a = 1 / (pow(da.x, 2) + pow(da.y, 2) + pow(da.z, 2));
-    ra = sqrt(r2a);
-    r6a = r2a * r2a * r2a;
-
-    Vela = scaling * D_topo.coulomb_constant * crg_i * crg_j * ra;
-
-    ai_aii = bond14 ? ai_type.aii_1_4 : ai_type.aii_normal;
-    aj_aii = bond14 ? aj_type.aii_1_4 : aj_type.aii_normal;
-    ai_bii = bond14 ? ai_type.bii_1_4 : ai_type.bii_normal;
-    aj_bii = bond14 ? aj_type.bii_1_4 : aj_type.bii_normal;
-
-    V_a = r6a * r6a * ai_aii * aj_aii;
-    V_b = r6a * ai_bii * aj_bii;
-    dva = r2a * (-Vela - 12 * V_a + 6 * V_b);
-
-    patom_a->x = -dva * da.x;
-    patom_a->y = -dva * da.y;
-    patom_a->z = -dva * da.z;
-
-    patom_b->x = dva * da.x;
-    patom_b->y = dva * da.y;
-    patom_b->z = dva * da.z;
-
-    *Ecoul += Vela;
-    *Evdw += (V_a - V_b);
-}
-
 __device__ __forceinline__ void idx2xy(int n, int t, int& x, int& y) {
     x = (int)floorf((2 * n + 1 - sqrtf((2 * n + 1) * (2 * n + 1) - 8 * t)) * 0.5f);
     y = t - (x * n - (x * (x - 1) >> 1));
@@ -289,156 +232,6 @@ __global__ void calc_pp(
     }
 }
 
-__global__ void calc_pp_dvel_matrix(int n_patoms, int n_atoms_solute,
-                                    coord_t* X, double* Evdw, double* Ecoul, dvel_t* PP_MAT,
-                                    ccharge_t* D_ccharges, charge_t* D_charges, catype_t* D_catypes, atype_t* D_atypes, p_atom_t* D_patoms, int* D_LJ_matrix, bool* D_excluded, topo_t D_topo) {
-    // Block index
-    int bx = blockIdx.x;
-    int by = blockIdx.y;
-
-    // Thread index
-    int tx = threadIdx.x;
-    int ty = threadIdx.y;
-
-    int aStart = BLOCK_SIZE * by;
-    int bStart = BLOCK_SIZE * bx;
-
-    __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE];
-    __shared__ double Evdw_S[BLOCK_SIZE][BLOCK_SIZE];
-
-    Ecoul_S[ty][tx] = 0;
-    Evdw_S[ty][tx] = 0;
-
-    if (aStart + ty >= n_patoms) return;
-    if (bStart + tx >= n_patoms) return;
-
-    __shared__ coord_t Xs[BLOCK_SIZE];
-    __shared__ coord_t Ys[BLOCK_SIZE];
-    __shared__ int LJs[BLOCK_SIZE * BLOCK_SIZE];
-    __shared__ bool excluded_s[2 * BLOCK_SIZE];
-
-    int pi = D_patoms[aStart + ty].a - 1;
-    int pj = D_patoms[bStart + tx].a - 1;
-
-    if (tx == 0) {
-        Xs[ty] = X[pi];
-        excluded_s[ty] = D_excluded[pi];
-    }
-
-    if (ty == 0) {
-        Ys[tx] = X[pj];
-        excluded_s[BLOCK_SIZE + tx] = D_excluded[pj];
-    }
-    LJs[ty * BLOCK_SIZE + tx] = D_LJ_matrix[pi * n_atoms_solute + pj];
-
-    __syncthreads();
-
-    if (bx < by || (bx == by && tx < ty)) return;
-
-    dvel_t patom_a, patom_b;
-    memset(&patom_a, 0, sizeof(dvel_t));
-    memset(&patom_b, 0, sizeof(dvel_t));
-
-    int row = by * BLOCK_SIZE + ty;
-    int column = bx * BLOCK_SIZE + tx;
-
-    if (bx != by || tx != ty) {
-        double evdw = 0, ecoul = 0;
-        calc_pp_dvel_matrix_incr(ty, pi, tx, pj, Xs, Ys, LJs, excluded_s, &evdw, &ecoul, &patom_a,
-                                 &patom_b, D_ccharges, D_charges, D_catypes, D_atypes, D_patoms, D_topo);
-        Evdw_S[ty][tx] = evdw;
-        Ecoul_S[ty][tx] = ecoul;
-    }
-
-    PP_MAT[row * n_patoms + column] = patom_a;
-    PP_MAT[column * n_patoms + row] = patom_b;
-
-    __syncthreads();
-
-    int rowlen = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
-
-    if (tx == 0 && ty == 0) {
-        double tot_Evdw = 0;
-        double tot_Ecoul = 0;
-        for (int i = 0; i < BLOCK_SIZE; i++) {
-            for (int j = 0; j < BLOCK_SIZE; j++) {
-                tot_Evdw += Evdw_S[i][j];
-                tot_Ecoul += Ecoul_S[i][j];
-            }
-        }
-        Evdw[rowlen * by + bx] = tot_Evdw;
-        Ecoul[rowlen * by + bx] = tot_Ecoul;
-
-        // printf("bx = %d by = %d tot_Evdw = %f tot_Ecoul = %f\n", bx, by, Evdw[rowlen * by + bx], Ecoul[rowlen * by + bx]);
-    }
-
-    __syncthreads();
-}
-
-__global__ void calc_pp_dvel_vector(int n_patoms, dvel_t* DV_X, dvel_t* PP_MAT, p_atom_t* D_patoms) {
-    int row = blockIdx.x * blockDim.x + threadIdx.x;
-    if (row >= n_patoms) return;
-
-    dvel_t dP;
-    dP.x = 0;
-    dP.y = 0;
-    dP.z = 0;
-
-    for (int i = 0; i < n_patoms; i++) {
-        if (i != row) {
-            dP.x += PP_MAT[i + n_patoms * row].x;
-            dP.y += PP_MAT[i + n_patoms * row].y;
-            dP.z += PP_MAT[i + n_patoms * row].z;
-        }
-    }
-
-    int p = D_patoms[row].a - 1;
-
-    DV_X[p].x += dP.x;
-    DV_X[p].y += dP.y;
-    DV_X[p].z += dP.z;
-
-    __syncthreads();
-}
-
-__global__ void calc_energy_sum(int rows, int columns, double* Evdw_TOT, double* Ecoul_TOT, double* Evdw, double* Ecoul, bool upper_diagonal) {
-    int tx = threadIdx.x;
-    int ty = threadIdx.y;
-
-    __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE];
-    __shared__ double Evdw_S[BLOCK_SIZE][BLOCK_SIZE];
-
-    // TODO: better way to distribute upper diagonal over threads? Seems like threads in left bottom corner have less work
-    double coul_TOT = 0;
-    double vdw_TOT = 0;
-    for (int i = ty; i < rows; i += BLOCK_SIZE) {
-        for (int j = tx; j < columns; j += BLOCK_SIZE) {
-            if (i <= j || !upper_diagonal) {
-                coul_TOT += Ecoul[i * columns + j];
-                vdw_TOT += Evdw[i * columns + j];
-            }
-        }
-    }
-    Ecoul_S[ty][tx] = coul_TOT;
-    Evdw_S[ty][tx] = vdw_TOT;
-
-    __syncthreads();
-
-    if (tx == 0 && ty == 0) {
-        double Evdw_temp = 0;
-        double Ecoul_temp = 0;
-
-        for (int i = 0; i < BLOCK_SIZE; i++) {
-            for (int j = 0; j < BLOCK_SIZE; j++) {
-                Evdw_temp += Evdw_S[i][j];
-                Ecoul_temp += Ecoul_S[i][j];
-            }
-        }
-
-        *Evdw_TOT = Evdw_temp;
-        *Ecoul_TOT = Ecoul_temp;
-    }
-}
 }  // namespace CudaNonbondedPPForce
 void calc_nonbonded_pp_forces_host_v2() {
     using namespace CudaNonbondedPPForce;