From ce2dda0407443acfc6a92a374ba7591737d56b77 Mon Sep 17 00:00:00 2001
From: "shen.guo" <g.shen@rug.nl>
Date: Tue, 6 Jan 2026 16:33:20 +0100
Subject: [PATCH 1/5] optimize nonbonded pp force using openmm method

---
 src/core/cuda/src/CudaNonbondedPPForce.cu | 290 ++++++++++++++++++----
 src/core/parse.cu                         |   4 +
 2 files changed, 247 insertions(+), 47 deletions(-)
diff --git a/src/core/cuda/src/CudaNonbondedPPForce.cu b/src/core/cuda/src/CudaNonbondedPPForce.cu
index d32db32b..e5a78ee8 100644
--- a/src/core/cuda/src/CudaNonbondedPPForce.cu
+++ b/src/core/cuda/src/CudaNonbondedPPForce.cu
@@ -1,11 +1,11 @@
+#include <iostream>
+
 #include "cuda/include/CudaContext.cuh"
 #include "cuda/include/CudaNonbondedPPForce.cuh"
 #include "utils.h"
 
 namespace CudaNonbondedPPForce {
 bool is_initialized = false;
-dvel_t* PP_MAT;
-double *D_PP_Evdw, *D_PP_Ecoul;
 double *D_PP_evdw_TOT, *D_PP_ecoul_TOT;
 
 __device__ void calc_pp_dvel_matrix_incr(int row, int pi, int column, int pj,
@@ -65,6 +65,231 @@ __device__ void calc_pp_dvel_matrix_incr(int row, int pi, int column, int pj,
     *Evdw += (V_a - V_b);
 }
 
+__device__ __forceinline__ void idx2xy(int n, int t, int& x, int& y) {
+    x = (int)floorf((2 * n + 1 - sqrtf((2 * n + 1) * (2 * n + 1) - 8 * t)) * 0.5f);
+    y = t - (x * n - (x * (x - 1) >> 1));
+    if (y < 0) {
+        x--;
+        y += (n - x);
+    }
+    y += x;
+}
+
+__device__ __forceinline__ double shfl(double v, int srcLane, unsigned mask = 0xffffffffu) {
+    int2 a = *reinterpret_cast<int2*>(&v);
+    a.x = __shfl_sync(mask, a.x, srcLane);
+    a.y = __shfl_sync(mask, a.y, srcLane);
+    return *reinterpret_cast<double*>(&a);
+}
+
+__device__ __forceinline__ coord_t shfl_coord(coord_t v, int srcLane, unsigned mask = 0xffffffffu) {
+    v.x = shfl(v.x, srcLane, mask);
+    v.y = shfl(v.y, srcLane, mask);
+    v.z = shfl(v.z, srcLane, mask);
+    return v;
+}
+
+__device__ void calculate_unforce_bound(
+    // const int x_idx,
+    // const int y_idx,
+
+    const coord_t& x,
+    const coord_t& y,
+
+    const double x_charge,
+    const double y_charge,
+
+    const double x_aii,
+    const double y_aii,
+
+    const double x_bii,
+    const double y_bii,
+
+    const double coulomb_constant,
+
+    const double scaling,
+
+    double& evdw,
+    double& ecoul,
+    double& dv) {
+    double3 d = {y.x - x.x, y.y - x.y, y.z - x.z};
+
+    double r2 = 1.0 / (d.x * d.x + d.y * d.y + d.z * d.z);
+    double r = sqrt(r2);
+    double r6 = r2 * r2 * r2;
+
+    ecoul = scaling * coulomb_constant * x_charge * y_charge * r;
+
+    double v_a = r6 * r6 * x_aii * y_aii;
+    double v_b = r6 * x_bii * y_bii;
+    evdw = v_a - v_b;
+    dv = r2 * (-ecoul - 12.0 * v_a + 6.0 * v_b);
+}
+
+__global__ void calc_pp(
+    const int N,
+    const int n_atoms_solute,
+
+    const charge_t* D_charges,    // charge type of each atom
+    const ccharge_t* D_ccharges,  // charge value of each charge type
+
+    const atype_t* D_atypes,
+    const catype_t* D_catypes,
+
+    const topo_t D_topo,
+
+    const bool* D_excluded,
+
+    const int* D_LJ_matrix,
+
+    const p_atom_t* D_patoms,
+
+    coord_t* X,
+    dvel_t* DV_X,
+
+    double* Evdw_TOT,
+    double* Ecoul_TOT) {
+    const int nBlocks = (N + 31) >> 5;  // ceil(N/32)
+    const int totalTiles = nBlocks * (nBlocks + 1) / 2;
+
+    const int warpsPerBlock = blockDim.x >> 5;  // THREAD_BLOCK_SIZE / 32
+    const int tid = threadIdx.x;
+    const int lane = tid & 31;
+    const int warpInBlock = tid >> 5;
+
+    const int tile = blockIdx.x * warpsPerBlock + warpInBlock;
+    if (tile >= totalTiles) {
+        return;  // extra warps in last block
+    }
+
+    // Map tile -> (x,y) with y>=x (upper triangle)
+    int tile_x, tile_y;
+    idx2xy(nBlocks, tile, tile_x, tile_y);
+
+    const int baseX = tile_x << 5;  // x*32
+    const int baseY = tile_y << 5;  // y*32
+
+    int x_idx = baseX + lane;
+    int y_idx = baseY + lane;
+
+    int x_atom = (x_idx < N) ? (D_patoms[x_idx].a - 1) : -1;
+    int y_atom = (y_idx < N) ? (D_patoms[y_idx].a - 1) : -1;
+
+    coord_t invalid = {-1e9, -1e9, -1e9};
+    coord_t x_coord = (x_atom >= 0) ? X[x_atom] : invalid;
+    coord_t y_coord = (y_atom >= 0) ? X[y_atom] : invalid;
+
+    bool x_excluded = (x_atom >= 0) ? D_excluded[x_atom] : true;
+    bool y_excluded = (y_atom >= 0) ? D_excluded[y_atom] : true;
+
+    double x_charge = (x_atom >= 0) ? D_ccharges[D_charges[x_atom].code - 1].charge : 0.0;
+    double y_charge = (y_atom >= 0) ? D_ccharges[D_charges[y_atom].code - 1].charge : 0.0;
+
+    catype_t x_type = (x_atom >= 0) ? D_catypes[D_atypes[x_atom].code - 1] : catype_t{};
+    catype_t y_type = (y_atom >= 0) ? D_catypes[D_atypes[y_atom].code - 1] : catype_t{};
+    double x_aii_normal = x_type.aii_normal;
+    double y_aii_normal = y_type.aii_normal;
+    double x_aii_14 = x_type.aii_1_4;
+    double y_aii_14 = y_type.aii_1_4;
+    double x_bii_normal = x_type.bii_normal;
+    double y_bii_normal = y_type.bii_normal;
+    double x_bii_14 = x_type.bii_1_4;
+    double y_bii_14 = y_type.bii_1_4;
+
+    double3 x_force = {0.0, 0.0, 0.0};
+    double3 y_force = {0.0, 0.0, 0.0};
+
+    double evdw_sum = 0;
+    double ecoul_sum = 0;
+
+    const unsigned mask = 0xffffffffu;
+
+    auto is_valid = [&]() -> bool {
+        if (x_idx >= N || y_idx >= N) {
+            return false;
+        }
+        if (tile_x == tile_y && x_idx <= y_idx) {
+            return false;
+        }
+        if (x_excluded || y_excluded) {
+            return false;
+        }
+
+    
+        bool bond23 = (D_LJ_matrix[y_atom * n_atoms_solute + x_atom] == 3);
+        if (bond23) {
+            return false;
+        }
+
+        return true;
+    };
+
+    // shuffle to get next y
+    auto do_shuffle = [&]() {
+        const int src = (lane + 1) & 31;
+        y_idx = __shfl_sync(mask, y_idx, src);
+        y_atom = __shfl_sync(mask, y_atom, src);
+        y_coord = shfl_coord(y_coord, src, mask);
+        y_excluded = __shfl_sync(mask, y_excluded, src);
+        y_charge = __shfl_sync(mask, y_charge, src);
+        y_aii_normal = __shfl_sync(mask, y_aii_normal, src);
+        y_aii_14 = __shfl_sync(mask, y_aii_14, src);
+        y_bii_normal = __shfl_sync(mask, y_bii_normal, src);
+        y_bii_14 = __shfl_sync(mask, y_bii_14, src);
+
+        // shuffle y_force
+        y_force.x = shfl(y_force.x, src, mask);
+        y_force.y = shfl(y_force.y, src, mask);
+        y_force.z = shfl(y_force.z, src, mask);
+    };
+
+    for (int i = 0; i < 32; i++) {
+        if (is_valid()) {
+            bool bond14 = (D_LJ_matrix[y_atom * n_atoms_solute + x_atom] == 1);
+            double scaling = bond14 ? D_topo.el14_scale : 1.0;
+            double ai_aii = bond14 ? x_aii_14 : x_aii_normal;
+            double aj_aii = bond14 ? y_aii_14 : y_aii_normal;
+            double ai_bii = bond14 ? x_bii_14 : x_bii_normal;
+            double aj_bii = bond14 ? y_bii_14 : y_bii_normal;
+            double evdw = 0.0, ecoul = 0.0, dv = 0.0;
+            calculate_unforce_bound(
+                x_coord, y_coord, x_charge, y_charge, ai_aii, aj_aii, ai_bii, aj_bii,
+                D_topo.coulomb_constant, scaling, evdw, ecoul, dv);
+            evdw_sum += evdw;
+            ecoul_sum += ecoul;
+            double3 d = {y_coord.x - x_coord.x, y_coord.y - x_coord.y, y_coord.z - x_coord.z};
+            x_force.x += dv * d.x;
+            x_force.y += dv * d.y;
+            x_force.z += dv * d.z;
+
+            y_force.x -= dv * d.x;
+            y_force.y -= dv * d.y;
+            y_force.z -= dv * d.z;
+        }
+        do_shuffle();
+    }
+
+    if (x_atom >= 0) {
+        atomicAdd(&DV_X[x_atom].x, x_force.x);
+        atomicAdd(&DV_X[x_atom].y, x_force.y);
+        atomicAdd(&DV_X[x_atom].z, x_force.z);
+    }
+    if (y_atom >= 0) {
+        atomicAdd(&DV_X[y_atom].x, y_force.x);
+        atomicAdd(&DV_X[y_atom].y, y_force.y);
+        atomicAdd(&DV_X[y_atom].z, y_force.z);
+    }
+
+    for (int offset = 16; offset > 0; offset >>= 1) {
+        evdw_sum += __shfl_down_sync(0xffffffffu, evdw_sum, offset);
+        ecoul_sum += __shfl_down_sync(0xffffffffu, ecoul_sum, offset);
+    }
+    if (lane == 0) {
+        atomicAdd(Evdw_TOT, evdw_sum);
+        atomicAdd(Ecoul_TOT, ecoul_sum);
+    }
+}
+
 __global__ void calc_pp_dvel_matrix(int n_patoms, int n_atoms_solute,
                                     coord_t* X, double* Evdw, double* Ecoul, dvel_t* PP_MAT,
                                     ccharge_t* D_ccharges, charge_t* D_charges, catype_t* D_catypes, atype_t* D_atypes, p_atom_t* D_patoms, int* D_LJ_matrix, bool* D_excluded, topo_t D_topo) {
@@ -221,8 +446,6 @@ void calc_nonbonded_pp_forces_host_v2() {
 
     int mem_size_DV_X = n_atoms_solute * sizeof(dvel_t);
 
-    int n_blocks = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
-
     CudaContext& ctx = CudaContext::instance();
     auto X = ctx.d_coords;
     auto DV_X = ctx.d_dvelocities;
@@ -234,32 +457,27 @@ void calc_nonbonded_pp_forces_host_v2() {
     auto D_LJ_matrix = ctx.d_LJ_matrix;
     auto D_excluded = ctx.d_excluded;
 
-    dim3 threads, grid;
+    cudaMemset(D_PP_evdw_TOT, 0, sizeof(double));
+    cudaMemset(D_PP_ecoul_TOT, 0, sizeof(double));
 
-    threads = dim3(BLOCK_SIZE, BLOCK_SIZE);
-    grid = dim3((n_patoms + BLOCK_SIZE - 1) / threads.x, (n_patoms + BLOCK_SIZE - 1) / threads.y);
+    const int thread_num = 128;
+    dim3 block_sz = dim3(thread_num);
 
-    calc_pp_dvel_matrix<<<grid, threads>>>(n_patoms, n_atoms_solute, X, D_PP_Evdw, D_PP_Ecoul, PP_MAT, D_ccharges, D_charges, D_catypes, D_atypes, D_patoms, D_LJ_matrix, D_excluded, topo);
-    calc_pp_dvel_vector<<<((n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_patoms, DV_X, PP_MAT, D_patoms);
+    int tile_num_per_block = thread_num / 32;
+    int nBlocks = (n_patoms + 31) / 32;
+    int total_tiles = nBlocks * (nBlocks + 1) / 2;
 
-    cudaMemcpy(dvelocities, DV_X, mem_size_DV_X, cudaMemcpyDeviceToHost);
+    int grid_sz = (total_tiles + tile_num_per_block - 1) / tile_num_per_block;
+    dim3 grid = dim3(grid_sz);
 
-#ifdef DEBUG
-    cudaMemcpy(h_PP_MAT, PP_MAT, mem_size_PP_MAT, cudaMemcpyDeviceToHost);
-#endif
+    calc_pp<<<grid, block_sz>>>(n_patoms, n_atoms_solute, D_charges, D_ccharges,
+                                D_atypes, D_catypes, topo, D_excluded,
+                                D_LJ_matrix, D_patoms, X, DV_X, D_PP_evdw_TOT,
+                                D_PP_ecoul_TOT);
 
-#ifdef DEBUG
-    for (int i = 0; i < n_patoms; i++) {
-        for (int j = 0; j < n_patoms; j++) {
-            if (h_PP_MAT[i * n_patoms + j].x > 100)
-                printf("PP_MAT[%d][%d] = %f %f %f\n", i, j, h_PP_MAT[i * n_patoms + j].x, h_PP_MAT[i * n_patoms + j].y, h_PP_MAT[i * n_patoms + j].z);
-        }
-    }
-#endif
-    cudaMemset(D_PP_evdw_TOT, 0, sizeof(double));
-    cudaMemset(D_PP_ecoul_TOT, 0, sizeof(double));
+    cudaDeviceSynchronize();
+    cudaMemcpy(dvelocities, DV_X, mem_size_DV_X, cudaMemcpyDeviceToHost);
 
-    calc_energy_sum<<<1, threads>>>(n_blocks, n_blocks, D_PP_evdw_TOT, D_PP_ecoul_TOT, D_PP_Evdw, D_PP_Ecoul, true);
     double PP_evdw_TOT, PP_ecoul_TOT;
     cudaMemcpy(&PP_evdw_TOT, D_PP_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost);
     cudaMemcpy(&PP_ecoul_TOT, D_PP_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost);
@@ -272,25 +490,6 @@ void init_nonbonded_pp_force_kernel_data() {
     using namespace CudaNonbondedPPForce;
 
     if (!is_initialized) {
-        int n_blocks = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
-
-        int mem_size_PP_MAT = n_patoms * n_patoms * sizeof(dvel_t);
-
-        int mem_size_PP_Evdw = n_blocks * n_blocks * sizeof(double);
-        int mem_size_PP_Ecoul = n_blocks * n_blocks * sizeof(double);
-#ifdef DEBUG
-        printf("Allocating PP_MAT\n");
-#endif
-        check_cudaMalloc((void**)&PP_MAT, mem_size_PP_MAT);
-#ifdef DEBUG
-        printf("Allocating D_PP_Evdw\n");
-#endif
-        check_cudaMalloc((void**)&D_PP_Evdw, mem_size_PP_Evdw);
-#ifdef DEBUG
-        printf("Allocating D_PP_Ecoul\n");
-#endif
-        check_cudaMalloc((void**)&D_PP_Ecoul, mem_size_PP_Ecoul);
-
         check_cudaMalloc((void**)&D_PP_evdw_TOT, sizeof(double));
         check_cudaMalloc((void**)&D_PP_ecoul_TOT, sizeof(double));
 
@@ -301,11 +500,8 @@ void init_nonbonded_pp_force_kernel_data() {
 void cleanup_nonbonded_pp_force() {
     using namespace CudaNonbondedPPForce;
     if (is_initialized) {
-        cudaFree(PP_MAT);
-        cudaFree(D_PP_Evdw);
-        cudaFree(D_PP_Ecoul);
         cudaFree(D_PP_evdw_TOT);
         cudaFree(D_PP_ecoul_TOT);
         is_initialized = false;
     }
-}
\ No newline at end of file
+}
diff --git a/src/core/parse.cu b/src/core/parse.cu
index 1b9e0f8d..c5518223 100644
--- a/src/core/parse.cu
+++ b/src/core/parse.cu
@@ -753,6 +753,7 @@ void init_ngbrs14(const char* filename) {
                 int jx = (lineI + i + 1) % lines;
                 // if (ix < 100 && jx < 100) printf("i = %d j = %d\n", ix+1, jx+1);
                 LJ_matrix[ix * n_atoms_solute + jx] = 1;
+                LJ_matrix[jx * n_atoms_solute + ix] = 1;
             }
         }
         lineI++;
@@ -797,6 +798,7 @@ void init_ngbrs23(const char* filename) {
                 int jx = (lineI + i + 1) % lines;
                 // if (ix < 100 && jx < 100) printf("i = %d j = %d\n", ix+1, jx+1);
                 LJ_matrix[ix * n_atoms_solute + jx] = 3;
+                LJ_matrix[jx * n_atoms_solute + ix] = 3;
             }
         }
         lineI++;
@@ -819,6 +821,7 @@ void init_ngbrs14_long(const char* filename) {
         int ix = atoi(file.buffer[i+1][0])-1;
         int jx = atoi(file.buffer[i+1][1])-1;
         LJ_matrix[ix * n_atoms_solute + jx] = 1;
+        LJ_matrix[jx * n_atoms_solute + ix] = 1;
     }
 
     clean_csv(file);
@@ -838,6 +841,7 @@ void init_ngbrs23_long(const char* filename) {
         int ix = atoi(file.buffer[i+1][0])-1;
         int jx = atoi(file.buffer[i+1][1])-1;
         LJ_matrix[ix * n_atoms_solute + jx] = 3;
+        LJ_matrix[jx * n_atoms_solute + ix] = 3;
     }
 
     clean_csv(file);

From 7143986823074c194944a033abe7c0ae1acfd259 Mon Sep 17 00:00:00 2001
From: "shen.guo" <g.shen@rug.nl>
Date: Tue, 6 Jan 2026 16:55:35 +0100
Subject: [PATCH 2/5] optimize nonbonded pp force using openmm method

---
 src/core/cuda/src/CudaNonbondedPPForce.cu | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/src/core/cuda/src/CudaNonbondedPPForce.cu b/src/core/cuda/src/CudaNonbondedPPForce.cu
index e5a78ee8..4b2b67b2 100644
--- a/src/core/cuda/src/CudaNonbondedPPForce.cu
+++ b/src/core/cuda/src/CudaNonbondedPPForce.cu
@@ -214,9 +214,7 @@ __global__ void calc_pp(
         if (x_excluded || y_excluded) {
             return false;
         }
-
-    
-        bool bond23 = (D_LJ_matrix[y_atom * n_atoms_solute + x_atom] == 3);
+        bool bond23 = (D_LJ_matrix[x_atom * n_atoms_solute + y_atom] == 3);
         if (bond23) {
             return false;
         }
@@ -258,13 +256,13 @@ __global__ void calc_pp(
             evdw_sum += evdw;
             ecoul_sum += ecoul;
             double3 d = {y_coord.x - x_coord.x, y_coord.y - x_coord.y, y_coord.z - x_coord.z};
-            x_force.x += dv * d.x;
-            x_force.y += dv * d.y;
-            x_force.z += dv * d.z;
+            x_force.x -= dv * d.x;
+            x_force.y -= dv * d.y;
+            x_force.z -= dv * d.z;
 
-            y_force.x -= dv * d.x;
-            y_force.y -= dv * d.y;
-            y_force.z -= dv * d.z;
+            y_force.x += dv * d.x;
+            y_force.y += dv * d.y;
+            y_force.z += dv * d.z;
         }
         do_shuffle();
     }

From da70854110299bb3d181c788f1da809547f1f967 Mon Sep 17 00:00:00 2001
From: shen <goodstudyqaq@gmail.com>
Date: Mon, 12 Jan 2026 14:57:48 +0100
Subject: [PATCH 3/5] optimize q lambda

---
 src/core/cuda/include/CudaContext.cuh     |   5 +
 src/core/cuda/src/CudaContext.cu          |  41 +++++++-
 src/core/cuda/src/CudaNonbondedQPForce.cu | 121 ++++++++++------------
 src/core/cuda/src/CudaNonbondedQQForce.cu |   4 +
 src/core/cuda/src/CudaNonbondedQWForce.cu |  98 +++++++-----------
 src/core/system.cu                        |   2 +-
 6 files changed, 145 insertions(+), 126 deletions(-)

diff --git a/src/core/cuda/include/CudaContext.cuh b/src/core/cuda/include/CudaContext.cuh
index 3c2883fa..faa3cf90 100644
--- a/src/core/cuda/include/CudaContext.cuh
+++ b/src/core/cuda/include/CudaContext.cuh
@@ -59,6 +59,9 @@ class CudaContext {
     E_nonbonded_t* d_EQ_nonbond_qq = nullptr;
     double* d_lambdas = nullptr;
 
+    q_charge_t* d_q_aggregate_charges = nullptr;
+    q_catype_t* d_q_aggregate_catypes = nullptr;
+
     /*
     Used in CudaPolxWaterForce.cu
     */
@@ -124,6 +127,8 @@ class CudaContext {
    private:
     CudaContext() = default;
 
+    void init_data_for_q();
+
     void free();
 
     ~CudaContext() { free(); }
diff --git a/src/core/cuda/src/CudaContext.cu b/src/core/cuda/src/CudaContext.cu
index 19f22165..0ac5f0b5 100644
--- a/src/core/cuda/src/CudaContext.cu
+++ b/src/core/cuda/src/CudaContext.cu
@@ -1,6 +1,7 @@
 #include <iostream>
 
 #include "cuda/include/CudaContext.cuh"
+#include <vector>
 
 void CudaContext::init() {
     check_cudaMalloc((void**)&d_coords, sizeof(coord_t) * n_atoms);
@@ -54,6 +55,8 @@ void CudaContext::init() {
     check_cudaMalloc((void**)&d_charges, sizeof(charge_t) * n_charges);
     check_cudaMalloc((void**)&d_p_atoms, sizeof(p_atom_t) * n_patoms);
 
+    init_data_for_q();
+
     sync_all_to_device();
 }
 
@@ -183,7 +186,6 @@ void CudaContext::free() {
     cudaFree(d_excluded);
     cudaFree(d_q_elscales);
     cudaFree(d_q_catypes);
-    cudaFree(d_q_atypes);
     cudaFree(d_EQ_nonbond_qq);
     cudaFree(d_lambdas);
 
@@ -208,6 +210,9 @@ void CudaContext::free() {
     cudaFree(d_ccharges);
     cudaFree(d_charges);
     cudaFree(d_p_atoms);
+
+    cudaFree(d_q_aggregate_charges);
+    cudaFree(d_q_aggregate_catypes);
 }
 
 void CudaContext::reset_energies() {
@@ -215,3 +220,37 @@ void CudaContext::reset_energies() {
     cudaMemset(d_EQ_nonbond_qq, 0, sizeof(E_nonbonded_t) * n_lambdas);
     cudaMemset(d_EQ_restraint, 0, sizeof(E_restraint_t) * n_lambdas);
 }
+
+void CudaContext::init_data_for_q() {
+
+    // Additional initialization for q-related data can be added here if needed
+
+    // q charges
+    std::vector<q_charge_t> charges(n_qatoms, q_charge_t{0.0});
+    for (int i = 0; i < n_qatoms; i++) {
+        double q = 0;
+        for (int state = 0; state < n_lambdas; state++) {
+            q += q_charges[state * n_qatoms + i].q * lambdas[state];
+        }
+        charges[i].q = q;
+    }
+    check_cudaMalloc((void**)&d_q_aggregate_charges, sizeof(q_charge_t) * n_qatoms);
+    check_cuda(cudaMemcpy(d_q_aggregate_charges, charges.data(), sizeof(q_charge_t) * n_qatoms, cudaMemcpyHostToDevice));
+
+    
+    // q atypes
+    std::vector<q_catype_t> atypes(n_qatoms, q_catype_t{"", 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0});
+    for (int i = 0; i < n_qatoms; i++) {
+        for (int state = 0; state < n_lambdas; state++) {
+            atypes[i].Ai += q_catypes[q_atypes[state * n_qatoms + i].code - 1].Ai * lambdas[state];
+            atypes[i].Bi += q_catypes[q_atypes[state * n_qatoms + i].code - 1].Bi * lambdas[state];
+            atypes[i].Ci += q_catypes[q_atypes[state * n_qatoms + i].code - 1].Ci * lambdas[state];
+            atypes[i].ai += q_catypes[q_atypes[state * n_qatoms + i].code - 1].ai * lambdas[state];
+            atypes[i].Ai_14 += q_catypes[q_atypes[state * n_qatoms + i].code - 1].Ai_14 * lambdas[state];
+            atypes[i].Bi_14 += q_catypes[q_atypes[state * n_qatoms + i].code - 1].Bi_14 * lambdas[state];
+            atypes[i].m += q_catypes[q_atypes[state * n_qatoms + i].code - 1].m * lambdas[state];
+        }
+    }
+    check_cudaMalloc((void**)&d_q_aggregate_catypes, sizeof(q_catype_t) * n_qatoms);
+    check_cuda(cudaMemcpy(d_q_aggregate_catypes, atypes.data(), sizeof(q_catype_t) * n_qatoms, cudaMemcpyHostToDevice));
+}
\ No newline at end of file
diff --git a/src/core/cuda/src/CudaNonbondedQPForce.cu b/src/core/cuda/src/CudaNonbondedQPForce.cu
index 4eb53908..47f26451 100644
--- a/src/core/cuda/src/CudaNonbondedQPForce.cu
+++ b/src/core/cuda/src/CudaNonbondedQPForce.cu
@@ -42,6 +42,7 @@ __global__ void calc_energy_sum(int rows, int columns, double* Evdw_TOT, double*
     }
     Ecoul_S[ty][tx] = coul_TOT;
     Evdw_S[ty][tx] = vdw_TOT;
+    // printf("Thread (%d,%d) coul_TOT = %f vdw_TOT = %f\n", tx, ty, coul_TOT, vdw_TOT);
 
     __syncthreads();
 
@@ -111,8 +112,8 @@ __global__ void calc_qp_dvel_vector_column(int n_qatoms, int n_patoms, dvel_t* D
 }
 
 __device__ void calc_qp_dvel_matrix_incr(int row, int qi, int column, int pj, int n_lambdas, int n_qatoms,
-                                         coord_t* Qs, coord_t* Ps, int* LJs, bool* excluded_s, double Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE], double Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE], calc_qp_t* qp,
-                                         q_catype_t* D_qcatypes, q_atype_t* D_qatypes, q_charge_t* D_qcharges, p_atom_t* D_patoms, q_atom_t* D_qatoms, double* D_lambdas,
+                                         coord_t* Qs, coord_t* Ps, int* LJs, bool* excluded_s, double Evdw_S[BLOCK_SIZE][BLOCK_SIZE], double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE], calc_qp_t* qp,
+                                         q_catype_t* D_qcatypes, q_charge_t* D_qcharges, p_atom_t* D_patoms, q_atom_t* D_qatoms,
                                          catype_t* D_catypes, atype_t* D_atypes, ccharge_t* D_ccharges, charge_t* D_charges, topo_t D_topo) {
     coord_t da;
     double r2, r6, r;
@@ -141,43 +142,40 @@ __device__ void calc_qp_dvel_matrix_incr(int row, int qi, int column, int pj, in
     r6 = r2 * r2 * r2;
     r2 = 1 / r2;
     r = sqrt(r2);
-
-    for (int state = 0; state < n_lambdas; state++) {
-        qi_type = D_qcatypes[D_qatypes[qi + n_qatoms * state].code - 1];
-        aj_type = D_catypes[D_atypes[j].code - 1];
-
-        ai_aii = bond14 ? qi_type.Ai_14 : qi_type.Ai;
-        aj_aii = bond14 ? aj_type.aii_1_4 : aj_type.aii_normal;
-        ai_bii = bond14 ? qi_type.Bi_14 : qi_type.Bi;
-        aj_bii = bond14 ? aj_type.bii_1_4 : aj_type.bii_normal;
-
-        Vel = D_topo.coulomb_constant * scaling * D_qcharges[qi + n_qatoms * state].q * D_ccharges[D_charges[j].code - 1].charge * r;
-        V_a = ai_aii * aj_aii / (r6 * r6);
-        V_b = ai_bii * aj_bii / r6;
-        dv = r2 * (-Vel - (12 * V_a - 6 * V_b)) * D_lambdas[state];
-
-        // if (state == 0 && qi == 0 && pj == 1) {
-        //     printf("crg_q = %f crg_j = %f r = %f\n", D_qcharges[qi + n_qatoms * state].q, D_ccharges[D_charges[pj].code - 1].charge, r);
-        //     printf("ai_aii = %f aj_aii = %f ai_bii = %f aj_bii = %f\n", ai_aii, aj_aii, ai_bii, aj_bii);
-        // }
-
-        // Update forces
-        qp->Q.x += dv * da.x;
-        qp->Q.y += dv * da.y;
-        qp->Q.z += dv * da.z;
-        qp->P.x -= dv * da.x;
-        qp->P.y -= dv * da.y;
-        qp->P.z -= dv * da.z;
-
-        // Update Q totals
-        Ecoul_S[row][state * BLOCK_SIZE + column] += Vel;
-        Evdw_S[row][state * BLOCK_SIZE + column] += (V_a - V_b);
-    }
+    qi_type = D_qcatypes[qi];
+    aj_type = D_catypes[D_atypes[j].code - 1];
+
+    ai_aii = bond14 ? qi_type.Ai_14 : qi_type.Ai;
+    aj_aii = bond14 ? aj_type.aii_1_4 : aj_type.aii_normal;
+    ai_bii = bond14 ? qi_type.Bi_14 : qi_type.Bi;
+    aj_bii = bond14 ? aj_type.bii_1_4 : aj_type.bii_normal;
+
+    Vel = D_topo.coulomb_constant * scaling * D_qcharges[qi].q * D_ccharges[D_charges[j].code - 1].charge * r;
+    V_a = ai_aii * aj_aii / (r6 * r6);
+    V_b = ai_bii * aj_bii / r6;
+    dv = r2 * (-Vel - (12 * V_a - 6 * V_b));
+
+    // if (state == 0 && qi == 0 && pj == 1) {
+    //     printf("crg_q = %f crg_j = %f r = %f\n", D_qcharges[qi + n_qatoms * state].q, D_ccharges[D_charges[pj].code - 1].charge, r);
+    //     printf("ai_aii = %f aj_aii = %f ai_bii = %f aj_bii = %f\n", ai_aii, aj_aii, ai_bii, aj_bii);
+    // }
+
+    // Update forces
+    qp->Q.x += dv * da.x;
+    qp->Q.y += dv * da.y;
+    qp->Q.z += dv * da.z;
+    qp->P.x -= dv * da.x;
+    qp->P.y -= dv * da.y;
+    qp->P.z -= dv * da.z;
+
+    // Update Q totals
+    Ecoul_S[row][column] += Vel;
+    Evdw_S[row][column] += (V_a - V_b);
 }
 
 __global__ void calc_qp_dvel_matrix(int n_qatoms, int n_patoms, int n_lambdas, int n_atoms_solute,
                                     coord_t* X, double* Evdw, double* Ecoul, calc_qp_t* QP_MAT,
-                                    q_catype_t* D_qcatypes, q_atype_t* D_qatypes, q_charge_t* D_qcharges, p_atom_t* D_patoms, q_atom_t* D_qatoms, double* D_lambdas,
+                                    q_catype_t* D_qcatypes, q_charge_t* D_qcharges, p_atom_t* D_patoms, q_atom_t* D_qatoms,
                                     int* D_LJ_matrix, bool* D_excluded, catype_t* D_catypes, atype_t* D_atypes, ccharge_t* D_ccharges, charge_t* D_charges, topo_t D_topo) {
     // Block index
     int bx = blockIdx.x;
@@ -187,9 +185,8 @@ __global__ void calc_qp_dvel_matrix(int n_qatoms, int n_patoms, int n_lambdas, i
     int tx = threadIdx.x;
     int ty = threadIdx.y;
 
-    // TODO implement >2 states on GPU
-    __shared__ double Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE];
-    __shared__ double Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE];
+    __shared__ double Evdw_S[BLOCK_SIZE][BLOCK_SIZE];
+    __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE];
 
     Ecoul_S[ty][tx] = 0;
     Evdw_S[ty][tx] = 0;
@@ -230,29 +227,25 @@ __global__ void calc_qp_dvel_matrix(int n_qatoms, int n_patoms, int n_lambdas, i
     int column = bx * BLOCK_SIZE + tx;
 
     calc_qp_dvel_matrix_incr(ty, row, tx, column, n_lambdas, n_qatoms, Qs, Ps, LJs, excluded_s, Evdw_S, Ecoul_S,
-                             &qp, D_qcatypes, D_qatypes, D_qcharges, D_patoms, D_qatoms, D_lambdas, D_catypes, D_atypes, D_ccharges, D_charges, D_topo);
+                             &qp, D_qcatypes, D_qcharges, D_patoms, D_qatoms, D_catypes, D_atypes, D_ccharges, D_charges, D_topo);
 
     QP_MAT[n_patoms * row + column] = qp;
 
     __syncthreads();
 
     int rowlen = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    int collen = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
 
     if (tx == 0 && ty == 0) {
-        // TODO implement >2 states on GPU
-        for (int state = 0; state < min(2, n_lambdas); state++) {
-            double tot_Evdw = 0;
-            double tot_Ecoul = 0;
-            for (int i = 0; i < BLOCK_SIZE; i++) {
-                for (int j = 0; j < BLOCK_SIZE; j++) {
-                    tot_Evdw += Evdw_S[i][j + state * BLOCK_SIZE];
-                    tot_Ecoul += Ecoul_S[i][j + state * BLOCK_SIZE];
-                }
+        double tot_Evdw = 0;
+        double tot_Ecoul = 0;
+        for (int i = 0; i < BLOCK_SIZE; i++) {
+            for (int j = 0; j < BLOCK_SIZE; j++) {
+                tot_Evdw += Evdw_S[i][j];
+                tot_Ecoul += Ecoul_S[i][j];
             }
-            Evdw[rowlen * collen * state + rowlen * by + bx] = tot_Evdw;
-            Ecoul[rowlen * collen * state + rowlen * by + bx] = tot_Ecoul;
         }
+        Evdw[rowlen * by + bx] = tot_Evdw;
+        Ecoul[rowlen * by + bx] = tot_Ecoul;
     }
 
     __syncthreads();
@@ -269,12 +262,10 @@ void calc_nonbonded_qp_forces_host_v2() {
     CudaContext& ctx = CudaContext::instance();
     auto X = ctx.d_coords;
     auto DV_X = ctx.d_dvelocities;
-    auto D_qcatypes = ctx.d_q_catypes;
-    auto D_qatypes = ctx.d_q_atypes;
-    auto D_qcharges = ctx.d_q_charges;
+    auto D_qcatypes = ctx.d_q_aggregate_catypes;
+    auto D_qcharges = ctx.d_q_aggregate_charges;
     auto D_patoms = ctx.d_p_atoms;
     auto D_qatoms = ctx.d_q_atoms;
-    auto D_lambdas = ctx.d_lambdas;
     auto D_LJ_matrix = ctx.d_LJ_matrix;
     auto D_excluded = ctx.d_excluded;
     auto D_catypes = ctx.d_catypes;
@@ -288,7 +279,7 @@ void calc_nonbonded_qp_forces_host_v2() {
     grid = dim3((n_patoms + BLOCK_SIZE - 1) / threads.x, (n_qatoms + BLOCK_SIZE - 1) / threads.y);
 
     calc_qp_dvel_matrix<<<grid, threads>>>(n_qatoms, n_patoms, n_lambdas, n_atoms_solute, X, D_QP_Evdw, D_QP_Ecoul, QP_MAT,
-                                           D_qcatypes, D_qatypes, D_qcharges, D_patoms, D_qatoms, D_lambdas, D_LJ_matrix, D_excluded,
+                                           D_qcatypes, D_qcharges, D_patoms, D_qatoms, D_LJ_matrix, D_excluded,
                                            D_catypes, D_atypes, D_ccharges, D_charges, topo);
     calc_qp_dvel_vector_column<<<((n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_qatoms, n_patoms, DV_X, QP_MAT, D_patoms);
     calc_qp_dvel_vector_row<<<((n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_qatoms, n_patoms, DV_X, QP_MAT, D_qatoms);
@@ -309,16 +300,14 @@ void calc_nonbonded_qp_forces_host_v2() {
     int mem_size_DV_X = n_atoms * sizeof(dvel_t);
     cudaMemcpy(dvelocities, DV_X, mem_size_DV_X, cudaMemcpyDeviceToHost);
 
-    // TODO: make Evdw & Ecoul work for # of states > 2
-    for (int state = 0; state < min(2, n_lambdas); state++) {
-        calc_energy_sum<<<1, threads>>>(n_blocks_q, n_blocks_p, D_QP_evdw_TOT, D_QP_ecoul_TOT, &D_QP_Evdw[state * n_blocks_p * n_blocks_q], &D_QP_Ecoul[state * n_blocks_p * n_blocks_q], false);
+    calc_energy_sum<<<1, threads>>>(n_blocks_q, n_blocks_p, D_QP_evdw_TOT, D_QP_ecoul_TOT, D_QP_Evdw, D_QP_Ecoul, false);
 
-        cudaMemcpy(&QP_evdw_TOT, D_QP_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost);
-        cudaMemcpy(&QP_ecoul_TOT, D_QP_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&QP_evdw_TOT, D_QP_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&QP_ecoul_TOT, D_QP_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost);
+    printf("QP E_vdw: %f E_coul: %f\n", QP_evdw_TOT, QP_ecoul_TOT);
 
-        EQ_nonbond_qp[state].Uvdw += QP_evdw_TOT;
-        EQ_nonbond_qp[state].Ucoul += QP_ecoul_TOT;
-    }
+    E_nonbond_qx.Uvdw += QP_evdw_TOT;
+    E_nonbond_qx.Ucoul += QP_ecoul_TOT;
 }
 
 void init_nonbonded_qp_force_kernel_data() {
@@ -328,8 +317,8 @@ void init_nonbonded_qp_force_kernel_data() {
         int n_blocks_q = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
         int n_blocks_p = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
         // TODO make Evdw & Ecoul work for # of states > 2
-        int mem_size_QP_Evdw = min(n_lambdas, 2) * n_blocks_q * n_blocks_p * sizeof(double);
-        int mem_size_QP_Ecoul = min(n_lambdas, 2) * n_blocks_q * n_blocks_p * sizeof(double);
+        int mem_size_QP_Evdw = n_blocks_q * n_blocks_p * sizeof(double);
+        int mem_size_QP_Ecoul = n_blocks_q * n_blocks_p * sizeof(double);
         int mem_size_QP_MAT = n_qatoms * n_patoms * sizeof(calc_qp_t);
 
         check_cudaMalloc((void**)&D_QP_Evdw, mem_size_QP_Evdw);
diff --git a/src/core/cuda/src/CudaNonbondedQQForce.cu b/src/core/cuda/src/CudaNonbondedQQForce.cu
index e2138ed6..a10c315c 100644
--- a/src/core/cuda/src/CudaNonbondedQQForce.cu
+++ b/src/core/cuda/src/CudaNonbondedQQForce.cu
@@ -1,6 +1,7 @@
 #include "cuda/include/CudaContext.cuh"
 #include "cuda/include/CudaNonbondedQQForce.cuh"
 #include "utils.h"
+#include <iostream>
 
 __global__ void calc_nonbonded_qq_forces_kernel(
     q_atom_t* q_atoms,
@@ -129,6 +130,9 @@ void calc_nonbonded_qq_forces_host() {
     cudaDeviceSynchronize();
     cudaMemcpy(EQ_nonbond_qq, d_EQ_nonbond_qq, sizeof(E_nonbonded_t) * n_lambdas, cudaMemcpyDeviceToHost);
     cudaMemcpy(dvelocities, d_dvelocities, sizeof(dvel_t) * n_atoms, cudaMemcpyDeviceToHost);
+    for (int i = 0; i < n_lambdas; i++) {
+        printf("QQ Energy State %d: Evdw = %f Ecoul = %f\n", i, EQ_nonbond_qq[i].Uvdw, EQ_nonbond_qq[i].Ucoul);
+    }
 }
 
 void init_nonbonded_qq_force_kernel_data() {}
diff --git a/src/core/cuda/src/CudaNonbondedQWForce.cu b/src/core/cuda/src/CudaNonbondedQWForce.cu
index 974c380c..68ac0f21 100644
--- a/src/core/cuda/src/CudaNonbondedQWForce.cu
+++ b/src/core/cuda/src/CudaNonbondedQWForce.cu
@@ -124,14 +124,13 @@ __global__ void calc_qw_dvel_vector_column(int n_qatoms, int n_waters, dvel_t* D
 }
 
 __device__ void calc_qw_dvel_matrix_incr(int row, int qi, int column, int n_lambdas, int n_qatoms, double crg_ow, double crg_hw, double A_O, double B_O,
-                                         coord_t* Qs, coord_t* Ws, double Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE], double Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE], calc_qw_t* qw,
-                                         q_catype_t* D_qcatypes, q_atype_t* D_qatypes, q_charge_t* D_qcharges, q_atom_t* D_qatoms, double* D_lambdas, topo_t D_topo) {
+                                         coord_t* Qs, coord_t* Ws, double Evdw_S[BLOCK_SIZE][BLOCK_SIZE], double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE], calc_qw_t* qw,
+                                         q_catype_t* D_qcatypes, q_charge_t* D_qcharges, q_atom_t* D_qatoms, topo_t D_topo) {
     int j;
     coord_t dO, dH1, dH2;
     double r2O, rH1, rH2, r6O, rO, r2H1, r2H2;
     double dvO, dvH1, dvH2;
     double V_a, V_b, VelO, VelH1, VelH2;
-    q_atype_t qa_type;
     q_catype_t qi_type;
     double ai_aii, ai_bii;
 
@@ -160,28 +159,25 @@ __device__ void calc_qw_dvel_matrix_incr(int row, int qi, int column, int n_lamb
     dvH1 = 0;
     dvH2 = 0;
 
-    for (int state = 0; state < n_lambdas; state++) {
-        qa_type = D_qatypes[qi + n_qatoms * state];
-        qi_type = D_qcatypes[qa_type.code - 1];
+    qi_type = D_qcatypes[qi];
 
-        ai_aii = qi_type.Ai;
-        ai_bii = qi_type.Bi;
+    ai_aii = qi_type.Ai;
+    ai_bii = qi_type.Bi;
 
-        V_a = ai_aii * A_O / (r6O * r6O);
-        V_b = ai_bii * B_O / (r6O);
+    V_a = ai_aii * A_O / (r6O * r6O);
+    V_b = ai_bii * B_O / (r6O);
 
-        VelO = D_topo.coulomb_constant * crg_ow * D_qcharges[qi + n_qatoms * state].q * rO;
-        VelH1 = D_topo.coulomb_constant * crg_hw * D_qcharges[qi + n_qatoms * state].q * rH1;
-        VelH2 = D_topo.coulomb_constant * crg_hw * D_qcharges[qi + n_qatoms * state].q * rH2;
+    VelO = D_topo.coulomb_constant * crg_ow * D_qcharges[qi].q * rO;
+    VelH1 = D_topo.coulomb_constant * crg_hw * D_qcharges[qi].q * rH1;
+    VelH2 = D_topo.coulomb_constant * crg_hw * D_qcharges[qi].q * rH2;
 
-        dvO += r2O * (-VelO - (12 * V_a - 6 * V_b)) * D_lambdas[state];
-        dvH1 -= r2H1 * VelH1 * D_lambdas[state];
-        dvH2 -= r2H2 * VelH2 * D_lambdas[state];
+    dvO += r2O * (-VelO - (12 * V_a - 6 * V_b));
+    dvH1 -= r2H1 * VelH1;
+    dvH2 -= r2H2 * VelH2;
 
-        // Update Q totals
-        Ecoul_S[row][state * BLOCK_SIZE + column] += (VelO + VelH1 + VelH2);
-        Evdw_S[row][state * BLOCK_SIZE + column] += (V_a - V_b);
-    }
+    // Update Q totals
+    Ecoul_S[row][column] += (VelO + VelH1 + VelH2);
+    Evdw_S[row][column] += (V_a - V_b);
 
     // Note r6O is not the usual 1/rO^6, but rather rO^6. be careful!!!
 
@@ -204,7 +200,7 @@ __device__ void calc_qw_dvel_matrix_incr(int row, int qi, int column, int n_lamb
 
 __global__ void calc_qw_dvel_matrix(int n_qatoms, int n_waters, int n_lambdas, double crg_ow, double crg_hw, double A_O, double B_O,
                                     coord_t* X, coord_t* W, double* Evdw, double* Ecoul, calc_qw_t* MAT,
-                                    q_catype_t* D_qcatypes, q_atype_t* D_qatypes, q_charge_t* D_qcharges, q_atom_t* D_qatoms, double* D_lambdas, topo_t D_topo) {
+                                    q_catype_t* D_qcatypes, q_charge_t* D_qcharges, q_atom_t* D_qatoms, topo_t D_topo) {
     // Block index
     int bx = blockIdx.x;
     int by = blockIdx.y;
@@ -214,8 +210,8 @@ __global__ void calc_qw_dvel_matrix(int n_qatoms, int n_waters, int n_lambdas, d
     int ty = threadIdx.y;
 
     // TODO implement >2 states on GPU
-    __shared__ double Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE];
-    __shared__ double Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE];
+    __shared__ double Evdw_S[BLOCK_SIZE][BLOCK_SIZE];
+    __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE];
 
     Ecoul_S[ty][tx] = 0;
     Evdw_S[ty][tx] = 0;
@@ -250,29 +246,26 @@ __global__ void calc_qw_dvel_matrix(int n_qatoms, int n_waters, int n_lambdas, d
     int column = bx * BLOCK_SIZE + tx;
 
     calc_qw_dvel_matrix_incr(ty, aStart + ty, tx, n_lambdas, n_qatoms, crg_ow, crg_hw, A_O, B_O, Qs, Ws, Evdw_S, Ecoul_S,
-                             &qw, D_qcatypes, D_qatypes, D_qcharges, D_qatoms, D_lambdas, D_topo);
+                             &qw, D_qcatypes, D_qcharges, D_qatoms, D_topo);
 
     MAT[column + n_waters * row] = qw;
 
     __syncthreads();
 
     int rowlen = (n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    int collen = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
 
     if (tx == 0 && ty == 0) {
         // TODO implement >2 states on GPU
-        for (int state = 0; state < min(2, n_lambdas); state++) {
-            double tot_Evdw = 0;
-            double tot_Ecoul = 0;
-            for (int i = 0; i < BLOCK_SIZE; i++) {
-                for (int j = 0; j < BLOCK_SIZE; j++) {
-                    tot_Evdw += Evdw_S[i][j + state * BLOCK_SIZE];
-                    tot_Ecoul += Ecoul_S[i][j + state * BLOCK_SIZE];
-                }
+        double tot_Evdw = 0;
+        double tot_Ecoul = 0;
+        for (int i = 0; i < BLOCK_SIZE; i++) {
+            for (int j = 0; j < BLOCK_SIZE; j++) {
+                tot_Evdw += Evdw_S[i][j + BLOCK_SIZE];
+                tot_Ecoul += Ecoul_S[i][j + BLOCK_SIZE];
             }
-            Evdw[rowlen * collen * state + rowlen * by + bx] = tot_Evdw;
-            Ecoul[rowlen * collen * state + rowlen * by + bx] = tot_Ecoul;
         }
+        Evdw[rowlen * by + bx] = tot_Evdw;
+        Ecoul[rowlen * by + bx] = tot_Ecoul;
     }
 
     __syncthreads();
@@ -282,36 +275,27 @@ __global__ void calc_qw_dvel_matrix(int n_qatoms, int n_waters, int n_lambdas, d
 void calc_nonbonded_qw_forces_host_v2() {
     using namespace CudaNonbondedQWForce;
 
-    int mem_size_X = n_atoms_solute * sizeof(coord_t);
-    int mem_size_W = 3 * n_waters * sizeof(coord_t);
     int mem_size_DV_X = n_atoms_solute * sizeof(dvel_t);
     int mem_size_DV_W = 3 * n_waters * sizeof(dvel_t);
-    int mem_size_MAT = 3 * n_waters * n_qatoms * sizeof(calc_qw_t);
 
     int n_blocks_q = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
     int n_blocks_w = (n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    // TODO make Evdw & Ecoul work for # of states > 2
-    int mem_size_QW_Evdw = min(n_lambdas, 2) * n_blocks_q * n_blocks_w * sizeof(double);
-    int mem_size_QW_Ecoul = min(n_lambdas, 2) * n_blocks_q * n_blocks_w * sizeof(double);
 
     CudaContext& ctx = CudaContext::instance();
     auto X = ctx.d_coords;
     auto DV_X = ctx.d_dvelocities;
-    auto D_qcatypes = ctx.d_q_catypes;
-    auto D_qatypes = ctx.d_q_atypes;
-    auto D_qcharges = ctx.d_q_charges;
+    auto D_qcatypes = ctx.d_q_aggregate_catypes;
+    auto D_qcharges = ctx.d_q_aggregate_charges;
     auto D_qatoms = ctx.d_q_atoms;
-    auto D_lambdas = ctx.d_lambdas;
 
     dim3 threads, grid;
 
     threads = dim3(BLOCK_SIZE, BLOCK_SIZE);
     grid = dim3((n_waters + BLOCK_SIZE - 1) / threads.x, (n_qatoms + BLOCK_SIZE - 1) / threads.y);
 
-    double evdw, ecoul;
 
     calc_qw_dvel_matrix<<<grid, threads>>>(n_qatoms, n_waters, n_lambdas, crg_ow, crg_hw, A_O, B_O, X, X + n_atoms_solute, D_QW_Evdw, D_QW_Ecoul,
-                                           QW_MAT, D_qcatypes, D_qatypes, D_qcharges, D_qatoms, D_lambdas, topo);
+                                           QW_MAT, D_qcatypes, D_qcharges, D_qatoms, topo);
     calc_qw_dvel_vector_column<<<((n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_qatoms, n_waters, DV_X, DV_X + n_atoms_solute, QW_MAT);
     calc_qw_dvel_vector_row<<<((n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_qatoms, n_waters, DV_X, DV_X + n_atoms_solute, QW_MAT, D_qatoms);
 
@@ -331,17 +315,15 @@ void calc_nonbonded_qw_forces_host_v2() {
     cudaMemcpy(dvelocities, DV_X, mem_size_DV_X, cudaMemcpyDeviceToHost);
     cudaMemcpy(&dvelocities[n_atoms_solute], DV_X + n_atoms_solute, mem_size_DV_W, cudaMemcpyDeviceToHost);
 
-    // TODO make Evdw & Ecoul work for # of states > 2
-    for (int state = 0; state < min(2, n_lambdas); state++) {
-        calc_energy_sum<<<1, threads>>>(n_blocks_q, n_blocks_w, D_QW_evdw_TOT, D_QW_ecoul_TOT, &D_QW_Evdw[state * n_blocks_w * n_blocks_q],
-                                        &D_QW_Ecoul[state * n_blocks_w * n_blocks_q], false);
+    calc_energy_sum<<<1, threads>>>(n_blocks_q, n_blocks_w, D_QW_evdw_TOT, D_QW_ecoul_TOT, D_QW_Evdw,
+                                    D_QW_Ecoul, false);
 
-        cudaMemcpy(&QW_evdw_TOT, D_QW_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost);
-        cudaMemcpy(&QW_ecoul_TOT, D_QW_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&QW_evdw_TOT, D_QW_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&QW_ecoul_TOT, D_QW_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost);
 
-        EQ_nonbond_qw[state].Uvdw += QW_evdw_TOT;
-        EQ_nonbond_qw[state].Ucoul += QW_ecoul_TOT;
-    }
+    E_nonbond_qx.Uvdw += QW_evdw_TOT;
+    E_nonbond_qx.Ucoul += QW_ecoul_TOT;
+    printf("QW Energy: Evdw = %f Ecoul = %f\n", QW_evdw_TOT, QW_ecoul_TOT);
 }
 
 void init_nonbonded_qw_force_kernel_data() {
@@ -358,8 +340,8 @@ void init_nonbonded_qw_force_kernel_data() {
         int n_blocks_w = (n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE;
         int mem_size_MAT = 3 * n_waters * n_qatoms * sizeof(calc_qw_t);
 
-        int mem_size_QW_Evdw = min(n_lambdas, 2) * n_blocks_q * n_blocks_w * sizeof(double);
-        int mem_size_QW_Ecoul = min(n_lambdas, 2) * n_blocks_q * n_blocks_w * sizeof(double);
+        int mem_size_QW_Evdw = n_blocks_q * n_blocks_w * sizeof(double);
+        int mem_size_QW_Ecoul = n_blocks_q * n_blocks_w * sizeof(double);
 #ifdef DEBUG
         printf("Allocating QW_MAT\n");
 #endif
diff --git a/src/core/system.cu b/src/core/system.cu
index afb999cf..ee50ee61 100644
--- a/src/core/system.cu
+++ b/src/core/system.cu
@@ -1,3 +1,4 @@
+#include <iostream>
 #include "system.h"
 #include "utils.h"
 #include "parse.h"
@@ -1094,7 +1095,6 @@ void calc_integration_step(int iteration) {
     calc_bonded_forces();
 
     clock_t end_bonded = clock();
-
     clock_t start_pp, end_pp, start_qp, end_qp;
     if (run_gpu) {
         start_qp = clock();

From f6c57c9b2bafba18068aebc6934c90f1315235ac Mon Sep 17 00:00:00 2001
From: "shen.guo" <g.shen@rug.nl>
Date: Tue, 13 Jan 2026 09:33:36 +0100
Subject: [PATCH 4/5] fix array idx

---
 src/core/cuda/src/CudaNonbondedQWForce.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/core/cuda/src/CudaNonbondedQWForce.cu b/src/core/cuda/src/CudaNonbondedQWForce.cu
index 68ac0f21..049a38aa 100644
--- a/src/core/cuda/src/CudaNonbondedQWForce.cu
+++ b/src/core/cuda/src/CudaNonbondedQWForce.cu
@@ -260,8 +260,8 @@ __global__ void calc_qw_dvel_matrix(int n_qatoms, int n_waters, int n_lambdas, d
         double tot_Ecoul = 0;
         for (int i = 0; i < BLOCK_SIZE; i++) {
             for (int j = 0; j < BLOCK_SIZE; j++) {
-                tot_Evdw += Evdw_S[i][j + BLOCK_SIZE];
-                tot_Ecoul += Ecoul_S[i][j + BLOCK_SIZE];
+                tot_Evdw += Evdw_S[i][j];
+                tot_Ecoul += Ecoul_S[i][j];
             }
         }
         Evdw[rowlen * by + bx] = tot_Evdw;

From c585cac11bd928965ece60fd6de748a0ad087db3 Mon Sep 17 00:00:00 2001
From: "shen.guo" <g.shen@rug.nl>
Date: Tue, 13 Jan 2026 09:55:28 +0100
Subject: [PATCH 5/5] revert no need commit

---
 src/core/cuda/src/CudaNonbondedPPForce.cu | 288 ++++------------------
 src/core/parse.cu                         |   4 -
 2 files changed, 47 insertions(+), 245 deletions(-)

diff --git a/src/core/cuda/src/CudaNonbondedPPForce.cu b/src/core/cuda/src/CudaNonbondedPPForce.cu
index 4b2b67b2..d32db32b 100644
--- a/src/core/cuda/src/CudaNonbondedPPForce.cu
+++ b/src/core/cuda/src/CudaNonbondedPPForce.cu
@@ -1,11 +1,11 @@
-#include <iostream>
-
 #include "cuda/include/CudaContext.cuh"
 #include "cuda/include/CudaNonbondedPPForce.cuh"
 #include "utils.h"
 
 namespace CudaNonbondedPPForce {
 bool is_initialized = false;
+dvel_t* PP_MAT;
+double *D_PP_Evdw, *D_PP_Ecoul;
 double *D_PP_evdw_TOT, *D_PP_ecoul_TOT;
 
 __device__ void calc_pp_dvel_matrix_incr(int row, int pi, int column, int pj,
@@ -65,229 +65,6 @@ __device__ void calc_pp_dvel_matrix_incr(int row, int pi, int column, int pj,
     *Evdw += (V_a - V_b);
 }
 
-__device__ __forceinline__ void idx2xy(int n, int t, int& x, int& y) {
-    x = (int)floorf((2 * n + 1 - sqrtf((2 * n + 1) * (2 * n + 1) - 8 * t)) * 0.5f);
-    y = t - (x * n - (x * (x - 1) >> 1));
-    if (y < 0) {
-        x--;
-        y += (n - x);
-    }
-    y += x;
-}
-
-__device__ __forceinline__ double shfl(double v, int srcLane, unsigned mask = 0xffffffffu) {
-    int2 a = *reinterpret_cast<int2*>(&v);
-    a.x = __shfl_sync(mask, a.x, srcLane);
-    a.y = __shfl_sync(mask, a.y, srcLane);
-    return *reinterpret_cast<double*>(&a);
-}
-
-__device__ __forceinline__ coord_t shfl_coord(coord_t v, int srcLane, unsigned mask = 0xffffffffu) {
-    v.x = shfl(v.x, srcLane, mask);
-    v.y = shfl(v.y, srcLane, mask);
-    v.z = shfl(v.z, srcLane, mask);
-    return v;
-}
-
-__device__ void calculate_unforce_bound(
-    // const int x_idx,
-    // const int y_idx,
-
-    const coord_t& x,
-    const coord_t& y,
-
-    const double x_charge,
-    const double y_charge,
-
-    const double x_aii,
-    const double y_aii,
-
-    const double x_bii,
-    const double y_bii,
-
-    const double coulomb_constant,
-
-    const double scaling,
-
-    double& evdw,
-    double& ecoul,
-    double& dv) {
-    double3 d = {y.x - x.x, y.y - x.y, y.z - x.z};
-
-    double r2 = 1.0 / (d.x * d.x + d.y * d.y + d.z * d.z);
-    double r = sqrt(r2);
-    double r6 = r2 * r2 * r2;
-
-    ecoul = scaling * coulomb_constant * x_charge * y_charge * r;
-
-    double v_a = r6 * r6 * x_aii * y_aii;
-    double v_b = r6 * x_bii * y_bii;
-    evdw = v_a - v_b;
-    dv = r2 * (-ecoul - 12.0 * v_a + 6.0 * v_b);
-}
-
-__global__ void calc_pp(
-    const int N,
-    const int n_atoms_solute,
-
-    const charge_t* D_charges,    // charge type of each atom
-    const ccharge_t* D_ccharges,  // charge value of each charge type
-
-    const atype_t* D_atypes,
-    const catype_t* D_catypes,
-
-    const topo_t D_topo,
-
-    const bool* D_excluded,
-
-    const int* D_LJ_matrix,
-
-    const p_atom_t* D_patoms,
-
-    coord_t* X,
-    dvel_t* DV_X,
-
-    double* Evdw_TOT,
-    double* Ecoul_TOT) {
-    const int nBlocks = (N + 31) >> 5;  // ceil(N/32)
-    const int totalTiles = nBlocks * (nBlocks + 1) / 2;
-
-    const int warpsPerBlock = blockDim.x >> 5;  // THREAD_BLOCK_SIZE / 32
-    const int tid = threadIdx.x;
-    const int lane = tid & 31;
-    const int warpInBlock = tid >> 5;
-
-    const int tile = blockIdx.x * warpsPerBlock + warpInBlock;
-    if (tile >= totalTiles) {
-        return;  // extra warps in last block
-    }
-
-    // Map tile -> (x,y) with y>=x (upper triangle)
-    int tile_x, tile_y;
-    idx2xy(nBlocks, tile, tile_x, tile_y);
-
-    const int baseX = tile_x << 5;  // x*32
-    const int baseY = tile_y << 5;  // y*32
-
-    int x_idx = baseX + lane;
-    int y_idx = baseY + lane;
-
-    int x_atom = (x_idx < N) ? (D_patoms[x_idx].a - 1) : -1;
-    int y_atom = (y_idx < N) ? (D_patoms[y_idx].a - 1) : -1;
-
-    coord_t invalid = {-1e9, -1e9, -1e9};
-    coord_t x_coord = (x_atom >= 0) ? X[x_atom] : invalid;
-    coord_t y_coord = (y_atom >= 0) ? X[y_atom] : invalid;
-
-    bool x_excluded = (x_atom >= 0) ? D_excluded[x_atom] : true;
-    bool y_excluded = (y_atom >= 0) ? D_excluded[y_atom] : true;
-
-    double x_charge = (x_atom >= 0) ? D_ccharges[D_charges[x_atom].code - 1].charge : 0.0;
-    double y_charge = (y_atom >= 0) ? D_ccharges[D_charges[y_atom].code - 1].charge : 0.0;
-
-    catype_t x_type = (x_atom >= 0) ? D_catypes[D_atypes[x_atom].code - 1] : catype_t{};
-    catype_t y_type = (y_atom >= 0) ? D_catypes[D_atypes[y_atom].code - 1] : catype_t{};
-    double x_aii_normal = x_type.aii_normal;
-    double y_aii_normal = y_type.aii_normal;
-    double x_aii_14 = x_type.aii_1_4;
-    double y_aii_14 = y_type.aii_1_4;
-    double x_bii_normal = x_type.bii_normal;
-    double y_bii_normal = y_type.bii_normal;
-    double x_bii_14 = x_type.bii_1_4;
-    double y_bii_14 = y_type.bii_1_4;
-
-    double3 x_force = {0.0, 0.0, 0.0};
-    double3 y_force = {0.0, 0.0, 0.0};
-
-    double evdw_sum = 0;
-    double ecoul_sum = 0;
-
-    const unsigned mask = 0xffffffffu;
-
-    auto is_valid = [&]() -> bool {
-        if (x_idx >= N || y_idx >= N) {
-            return false;
-        }
-        if (tile_x == tile_y && x_idx <= y_idx) {
-            return false;
-        }
-        if (x_excluded || y_excluded) {
-            return false;
-        }
-        bool bond23 = (D_LJ_matrix[x_atom * n_atoms_solute + y_atom] == 3);
-        if (bond23) {
-            return false;
-        }
-
-        return true;
-    };
-
-    // shuffle to get next y
-    auto do_shuffle = [&]() {
-        const int src = (lane + 1) & 31;
-        y_idx = __shfl_sync(mask, y_idx, src);
-        y_atom = __shfl_sync(mask, y_atom, src);
-        y_coord = shfl_coord(y_coord, src, mask);
-        y_excluded = __shfl_sync(mask, y_excluded, src);
-        y_charge = __shfl_sync(mask, y_charge, src);
-        y_aii_normal = __shfl_sync(mask, y_aii_normal, src);
-        y_aii_14 = __shfl_sync(mask, y_aii_14, src);
-        y_bii_normal = __shfl_sync(mask, y_bii_normal, src);
-        y_bii_14 = __shfl_sync(mask, y_bii_14, src);
-
-        // shuffle y_force
-        y_force.x = shfl(y_force.x, src, mask);
-        y_force.y = shfl(y_force.y, src, mask);
-        y_force.z = shfl(y_force.z, src, mask);
-    };
-
-    for (int i = 0; i < 32; i++) {
-        if (is_valid()) {
-            bool bond14 = (D_LJ_matrix[y_atom * n_atoms_solute + x_atom] == 1);
-            double scaling = bond14 ? D_topo.el14_scale : 1.0;
-            double ai_aii = bond14 ? x_aii_14 : x_aii_normal;
-            double aj_aii = bond14 ? y_aii_14 : y_aii_normal;
-            double ai_bii = bond14 ? x_bii_14 : x_bii_normal;
-            double aj_bii = bond14 ? y_bii_14 : y_bii_normal;
-            double evdw = 0.0, ecoul = 0.0, dv = 0.0;
-            calculate_unforce_bound(
-                x_coord, y_coord, x_charge, y_charge, ai_aii, aj_aii, ai_bii, aj_bii,
-                D_topo.coulomb_constant, scaling, evdw, ecoul, dv);
-            evdw_sum += evdw;
-            ecoul_sum += ecoul;
-            double3 d = {y_coord.x - x_coord.x, y_coord.y - x_coord.y, y_coord.z - x_coord.z};
-            x_force.x -= dv * d.x;
-            x_force.y -= dv * d.y;
-            x_force.z -= dv * d.z;
-
-            y_force.x += dv * d.x;
-            y_force.y += dv * d.y;
-            y_force.z += dv * d.z;
-        }
-        do_shuffle();
-    }
-
-    if (x_atom >= 0) {
-        atomicAdd(&DV_X[x_atom].x, x_force.x);
-        atomicAdd(&DV_X[x_atom].y, x_force.y);
-        atomicAdd(&DV_X[x_atom].z, x_force.z);
-    }
-    if (y_atom >= 0) {
-        atomicAdd(&DV_X[y_atom].x, y_force.x);
-        atomicAdd(&DV_X[y_atom].y, y_force.y);
-        atomicAdd(&DV_X[y_atom].z, y_force.z);
-    }
-
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        evdw_sum += __shfl_down_sync(0xffffffffu, evdw_sum, offset);
-        ecoul_sum += __shfl_down_sync(0xffffffffu, ecoul_sum, offset);
-    }
-    if (lane == 0) {
-        atomicAdd(Evdw_TOT, evdw_sum);
-        atomicAdd(Ecoul_TOT, ecoul_sum);
-    }
-}
-
 __global__ void calc_pp_dvel_matrix(int n_patoms, int n_atoms_solute,
                                     coord_t* X, double* Evdw, double* Ecoul, dvel_t* PP_MAT,
                                     ccharge_t* D_ccharges, charge_t* D_charges, catype_t* D_catypes, atype_t* D_atypes, p_atom_t* D_patoms, int* D_LJ_matrix, bool* D_excluded, topo_t D_topo) {
@@ -444,6 +221,8 @@ void calc_nonbonded_pp_forces_host_v2() {
 
     int mem_size_DV_X = n_atoms_solute * sizeof(dvel_t);
 
+    int n_blocks = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
+
     CudaContext& ctx = CudaContext::instance();
     auto X = ctx.d_coords;
     auto DV_X = ctx.d_dvelocities;
@@ -455,27 +234,32 @@ void calc_nonbonded_pp_forces_host_v2() {
     auto D_LJ_matrix = ctx.d_LJ_matrix;
     auto D_excluded = ctx.d_excluded;
 
-    cudaMemset(D_PP_evdw_TOT, 0, sizeof(double));
-    cudaMemset(D_PP_ecoul_TOT, 0, sizeof(double));
+    dim3 threads, grid;
 
-    const int thread_num = 128;
-    dim3 block_sz = dim3(thread_num);
+    threads = dim3(BLOCK_SIZE, BLOCK_SIZE);
+    grid = dim3((n_patoms + BLOCK_SIZE - 1) / threads.x, (n_patoms + BLOCK_SIZE - 1) / threads.y);
 
-    int tile_num_per_block = thread_num / 32;
-    int nBlocks = (n_patoms + 31) / 32;
-    int total_tiles = nBlocks * (nBlocks + 1) / 2;
+    calc_pp_dvel_matrix<<<grid, threads>>>(n_patoms, n_atoms_solute, X, D_PP_Evdw, D_PP_Ecoul, PP_MAT, D_ccharges, D_charges, D_catypes, D_atypes, D_patoms, D_LJ_matrix, D_excluded, topo);
+    calc_pp_dvel_vector<<<((n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_patoms, DV_X, PP_MAT, D_patoms);
 
-    int grid_sz = (total_tiles + tile_num_per_block - 1) / tile_num_per_block;
-    dim3 grid = dim3(grid_sz);
+    cudaMemcpy(dvelocities, DV_X, mem_size_DV_X, cudaMemcpyDeviceToHost);
 
-    calc_pp<<<grid, block_sz>>>(n_patoms, n_atoms_solute, D_charges, D_ccharges,
-                                D_atypes, D_catypes, topo, D_excluded,
-                                D_LJ_matrix, D_patoms, X, DV_X, D_PP_evdw_TOT,
-                                D_PP_ecoul_TOT);
+#ifdef DEBUG
+    cudaMemcpy(h_PP_MAT, PP_MAT, mem_size_PP_MAT, cudaMemcpyDeviceToHost);
+#endif
 
-    cudaDeviceSynchronize();
-    cudaMemcpy(dvelocities, DV_X, mem_size_DV_X, cudaMemcpyDeviceToHost);
+#ifdef DEBUG
+    for (int i = 0; i < n_patoms; i++) {
+        for (int j = 0; j < n_patoms; j++) {
+            if (h_PP_MAT[i * n_patoms + j].x > 100)
+                printf("PP_MAT[%d][%d] = %f %f %f\n", i, j, h_PP_MAT[i * n_patoms + j].x, h_PP_MAT[i * n_patoms + j].y, h_PP_MAT[i * n_patoms + j].z);
+        }
+    }
+#endif
+    cudaMemset(D_PP_evdw_TOT, 0, sizeof(double));
+    cudaMemset(D_PP_ecoul_TOT, 0, sizeof(double));
 
+    calc_energy_sum<<<1, threads>>>(n_blocks, n_blocks, D_PP_evdw_TOT, D_PP_ecoul_TOT, D_PP_Evdw, D_PP_Ecoul, true);
     double PP_evdw_TOT, PP_ecoul_TOT;
     cudaMemcpy(&PP_evdw_TOT, D_PP_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost);
     cudaMemcpy(&PP_ecoul_TOT, D_PP_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost);
@@ -488,6 +272,25 @@ void init_nonbonded_pp_force_kernel_data() {
     using namespace CudaNonbondedPPForce;
 
     if (!is_initialized) {
+        int n_blocks = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
+
+        int mem_size_PP_MAT = n_patoms * n_patoms * sizeof(dvel_t);
+
+        int mem_size_PP_Evdw = n_blocks * n_blocks * sizeof(double);
+        int mem_size_PP_Ecoul = n_blocks * n_blocks * sizeof(double);
+#ifdef DEBUG
+        printf("Allocating PP_MAT\n");
+#endif
+        check_cudaMalloc((void**)&PP_MAT, mem_size_PP_MAT);
+#ifdef DEBUG
+        printf("Allocating D_PP_Evdw\n");
+#endif
+        check_cudaMalloc((void**)&D_PP_Evdw, mem_size_PP_Evdw);
+#ifdef DEBUG
+        printf("Allocating D_PP_Ecoul\n");
+#endif
+        check_cudaMalloc((void**)&D_PP_Ecoul, mem_size_PP_Ecoul);
+
         check_cudaMalloc((void**)&D_PP_evdw_TOT, sizeof(double));
         check_cudaMalloc((void**)&D_PP_ecoul_TOT, sizeof(double));
 
@@ -498,8 +301,11 @@ void init_nonbonded_pp_force_kernel_data() {
 void cleanup_nonbonded_pp_force() {
     using namespace CudaNonbondedPPForce;
     if (is_initialized) {
+        cudaFree(PP_MAT);
+        cudaFree(D_PP_Evdw);
+        cudaFree(D_PP_Ecoul);
         cudaFree(D_PP_evdw_TOT);
         cudaFree(D_PP_ecoul_TOT);
         is_initialized = false;
     }
-}
+}
\ No newline at end of file
diff --git a/src/core/parse.cu b/src/core/parse.cu
index c5518223..1b9e0f8d 100644
--- a/src/core/parse.cu
+++ b/src/core/parse.cu
@@ -753,7 +753,6 @@ void init_ngbrs14(const char* filename) {
                 int jx = (lineI + i + 1) % lines;
                 // if (ix < 100 && jx < 100) printf("i = %d j = %d\n", ix+1, jx+1);
                 LJ_matrix[ix * n_atoms_solute + jx] = 1;
-                LJ_matrix[jx * n_atoms_solute + ix] = 1;
             }
         }
         lineI++;
@@ -798,7 +797,6 @@ void init_ngbrs23(const char* filename) {
                 int jx = (lineI + i + 1) % lines;
                 // if (ix < 100 && jx < 100) printf("i = %d j = %d\n", ix+1, jx+1);
                 LJ_matrix[ix * n_atoms_solute + jx] = 3;
-                LJ_matrix[jx * n_atoms_solute + ix] = 3;
             }
         }
         lineI++;
@@ -821,7 +819,6 @@ void init_ngbrs14_long(const char* filename) {
         int ix = atoi(file.buffer[i+1][0])-1;
         int jx = atoi(file.buffer[i+1][1])-1;
         LJ_matrix[ix * n_atoms_solute + jx] = 1;
-        LJ_matrix[jx * n_atoms_solute + ix] = 1;
     }
 
     clean_csv(file);
@@ -841,7 +838,6 @@ void init_ngbrs23_long(const char* filename) {
         int ix = atoi(file.buffer[i+1][0])-1;
         int jx = atoi(file.buffer[i+1][1])-1;
         LJ_matrix[ix * n_atoms_solute + jx] = 3;
-        LJ_matrix[jx * n_atoms_solute + ix] = 3;
     }
 
     clean_csv(file);