diff --git a/src/core/cuda/include/CudaContext.cuh b/src/core/cuda/include/CudaContext.cuh
index 3c2883fa..8ed0369d 100644
--- a/src/core/cuda/include/CudaContext.cuh
+++ b/src/core/cuda/include/CudaContext.cuh
@@ -2,6 +2,10 @@
 #include
 
+#include <array>
+#include <map>
+#include <vector>
+
 #include "system.h"
 #include "utils.h"
 
@@ -105,6 +109,27 @@ class CudaContext {
    charge_t* d_charges;
    p_atom_t* d_p_atoms;
 
+   /*
+   Other helper arrays
+   */
+   ccharge_t* d_charge_table_all;  // Device copy of h_charge_table_all
+   catype_t* d_catype_table_all;   // Device copy of h_catype_table_all
+   std::map<std::array<double, 6>, int> catype_to_type_host;
+   int* d_p_charge_types;
+   int* d_w_charge_types;
+   int* d_q_charge_types;  // [0, lambdas * n_qatoms) is the normal q_charge_type; [lambdas * n_qatoms, ...) is the lambda-scaled q_charge_type
+
+   int* d_p_catype_types;
+   int* d_w_catype_types;
+   int* d_q_catype_types;  // [0, lambdas * n_qatoms) is the normal q_catype_type; [lambdas * n_qatoms, ...) is the lambda-scaled q_catype_type
+
+   int *d_p_atoms_list;
+   int *d_w_atoms_list;
+   int *d_q_atoms_list;
+   std::vector<int> h_p_atoms_list;
+   std::vector<int> h_w_atoms_list;
+   std::vector<int> h_q_atoms_list;
+
    static CudaContext& instance() {
        static CudaContext ctx;
        return ctx;
@@ -121,6 +146,16 @@ class CudaContext {
    void sync_all_to_host();
    void reset_energies();
 
+   std::array<double, 6> get_catype_key(const catype_t& catype) {
+       return {
+           catype.aii_normal,
+           catype.bii_normal,
+           catype.aii_polar,
+           catype.bii_polar,
+           catype.aii_1_4,
+           catype.bii_1_4};
+   }
+
   private:
    CudaContext() = default;
 
@@ -129,6 +164,10 @@ class CudaContext {
    ~CudaContext() { free(); }
    CudaContext(const CudaContext&) = delete;
    CudaContext& operator=(const CudaContext&) = delete;
+
+   void initialize_charge_tables_host();
+   void initialize_catype_tables_host();
+   void initialize_atom_lists_host();
 };
 
 template <typename T>
 void CudaContext::sync_array_to_device(T* dst, const T* src, int count) {
diff --git a/src/core/cuda/include/CudaNonbondedForce.cuh b/src/core/cuda/include/CudaNonbondedForce.cuh
new file mode 100644
index 00000000..891820fa
--- /dev/null
+++ b/src/core/cuda/include/CudaNonbondedForce.cuh
@@ -0,0 +1,23 @@
+#pragma once
+#include <utility>
+#include "system.h"
+
+void init_nonbonded_force_kernel_data();
+
+std::pair<double, double> calc_nonbonded_force_host(
+    int nx,
+    int ny,
+    int* x_idx_list,
+    int* y_idx_list,
+    bool symmetric,
+
+    const int* x_charges_types,
+    const int* y_charges_types,
+    const ccharge_t* ccharges_table,
+
+    const int* x_atypes_types,
+    const int* y_atypes_types,
+    const catype_t* catypes_table
+);
+
+void cleanup_nonbonded_force();
\ No newline at end of file
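Note on CudaContext.cuh above: get_catype_key packs the six Lennard-Jones coefficients of a catype_t into a std::array<double, 6> so that catype_to_type_host can deduplicate parameter sets using std::map's lexicographic array ordering. The lookup relies on exact double equality, which works here only because every producer (including the lambda-scaled entries) computes the key values with identical arithmetic. A minimal standalone sketch of the same pattern; the LJParams struct and the literal values are illustrative stand-ins, not the real topology types:

```cpp
#include <array>
#include <cstdio>
#include <map>
#include <vector>

struct LJParams {  // stand-in for catype_t's six tabulated coefficients
    double aii_normal, bii_normal, aii_polar, bii_polar, aii_1_4, bii_1_4;
};

using Key = std::array<double, 6>;

static Key key_of(const LJParams& p) {
    return {p.aii_normal, p.bii_normal, p.aii_polar, p.bii_polar, p.aii_1_4, p.bii_1_4};
}

int main() {
    std::map<Key, int> seen;      // parameter set -> compact type index
    std::vector<LJParams> table;  // dense table, indexed by that type
    for (const LJParams& p : {LJParams{1, 2, 3, 4, 5, 6},
                              LJParams{1, 2, 3, 4, 5, 6},  // exact duplicate collapses
                              LJParams{2, 2, 3, 4, 5, 6}}) {
        if (seen.emplace(key_of(p), (int)table.size()).second) {
            table.push_back(p);  // first sighting: append and remember its index
        }
    }
    printf("unique types: %zu\n", table.size());  // prints 2
    return 0;
}
```

Hashing or quantizing the key would also work, but exact keys are the simplest correct choice when all entries come from the same deterministic computation.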
diff --git a/src/core/cuda/src/CudaContext.cu b/src/core/cuda/src/CudaContext.cu
index 19f22165..183172bc 100644
--- a/src/core/cuda/src/CudaContext.cu
+++ b/src/core/cuda/src/CudaContext.cu
@@ -21,7 +21,6 @@ void CudaContext::init() {
     check_cudaMalloc((void**)&d_atypes, sizeof(atype_t) * n_atypes);
     check_cudaMalloc((void**)&d_catypes, sizeof(catype_t) * n_catypes);
-
     check_cudaMalloc((void**)&d_q_atoms, sizeof(q_atom_t) * n_qatoms);
     check_cudaMalloc((void**)&d_q_charges, sizeof(q_charge_t) * n_qatoms * n_lambdas);
 
     check_cudaMalloc((void**)&d_LJ_matrix, sizeof(int) * n_atoms_solute * n_atoms_solute);
@@ -54,6 +53,10 @@ void CudaContext::init() {
     check_cudaMalloc((void**)&d_charges, sizeof(charge_t) * n_charges);
     check_cudaMalloc((void**)&d_p_atoms, sizeof(p_atom_t) * n_patoms);
 
+    initialize_atom_lists_host();
+    initialize_charge_tables_host();
+    initialize_catype_tables_host();
+
     sync_all_to_device();
 }
 
@@ -208,6 +211,23 @@ void CudaContext::free() {
    cudaFree(d_ccharges);
    cudaFree(d_charges);
    cudaFree(d_p_atoms);
+   cudaFree(d_charge_table_all);
+   cudaFree(d_catype_table_all);
+   cudaFree(d_p_charge_types);
+   cudaFree(d_w_charge_types);
+   cudaFree(d_q_charge_types);
+   cudaFree(d_p_catype_types);
+   cudaFree(d_w_catype_types);
+   cudaFree(d_q_catype_types);
+
+   d_charge_table_all = nullptr;
+   d_catype_table_all = nullptr;
+   d_p_charge_types = nullptr;
+   d_w_charge_types = nullptr;
+   d_q_charge_types = nullptr;
+   d_p_catype_types = nullptr;
+   d_w_catype_types = nullptr;
+   d_q_catype_types = nullptr;
 }
 
 void CudaContext::reset_energies() {
@@ -215,3 +235,227 @@ void CudaContext::reset_energies() {
    cudaMemset(d_EQ_nonbond_qq, 0, sizeof(E_nonbonded_t) * n_lambdas);
    cudaMemset(d_EQ_restraint, 0, sizeof(E_restraint_t) * n_lambdas);
 }
+
+void CudaContext::initialize_atom_lists_host() {
+    h_p_atoms_list.clear();
+    for (int i = 0; i < n_patoms; i++) {
+        int id = p_atoms[i].a - 1;
+        if (!excluded[id]) {
+            h_p_atoms_list.push_back(id);
+        }
+    }
+
+    h_w_atoms_list.clear();
+    for (int i = n_atoms_solute; i < n_atoms; i++) {
+        int id = i;
+        if (!excluded[id]) {
+            h_w_atoms_list.push_back(id);
+        }
+    }
+    printf("Number of water atoms: %d, number of all water atoms: %d\n", (int)h_w_atoms_list.size(), n_atoms - n_atoms_solute);
+
+    h_q_atoms_list.clear();
+    for (int i = 0; i < n_qatoms; i++) {
+        int id = q_atoms[i].a - 1;
+        if (!excluded[id]) {
+            h_q_atoms_list.push_back(id);
+        }
+    }
+
+    check_cudaMalloc((void**)&d_p_atoms_list, sizeof(int) * h_p_atoms_list.size());
+    cudaMemcpy(d_p_atoms_list, h_p_atoms_list.data(), sizeof(int) * h_p_atoms_list.size(), cudaMemcpyHostToDevice);
+    check_cudaMalloc((void**)&d_w_atoms_list, sizeof(int) * h_w_atoms_list.size());
+    cudaMemcpy(d_w_atoms_list, h_w_atoms_list.data(), sizeof(int) * h_w_atoms_list.size(), cudaMemcpyHostToDevice);
+    check_cudaMalloc((void**)&d_q_atoms_list, sizeof(int) * h_q_atoms_list.size());
+    cudaMemcpy(d_q_atoms_list, h_q_atoms_list.data(), sizeof(int) * h_q_atoms_list.size(), cudaMemcpyHostToDevice);
+}
+
+void CudaContext::initialize_charge_tables_host() {
+    std::map<double, int> charge_to_type_host;
+    std::vector<ccharge_t> h_charge_table_all;  // h_charge_table_all[charge type] = (code, charge)
+
+    auto add_charge = [&](double charge) {
+        if (charge_to_type_host.count(charge) == 0) {
+            int sz = h_charge_table_all.size();
+            ccharge_t new_ccharge;
+            new_ccharge.code = sz;
+            new_ccharge.charge = charge;
+            h_charge_table_all.push_back(new_ccharge);
+            charge_to_type_host[charge] = sz;
+        }
+    };
+
+    for (int i = 0; i < n_ccharges; i++) {
+        double charge = ccharges[i].charge;
+        add_charge(charge);
+    }
+    for (int state = 0; state < n_lambdas; state++) {
+        for (int i = 0; i < n_qatoms; i++) {
+            double charge = q_charges[i + n_qatoms * state].q;
+            add_charge(charge);
+            charge *= lambdas[state];
+            add_charge(charge);
+        }
+    }
+
+    std::vector<int> p_charge_types(h_p_atoms_list.size());
+    for (int i = 0; i < (int)h_p_atoms_list.size(); i++) {
+        int id = h_p_atoms_list[i];
+        double charge = ccharges[charges[id].code - 1].charge;
+        p_charge_types[i] = charge_to_type_host[charge];
+    }
+
+    int h_q_atoms_size = h_q_atoms_list.size();
+    std::vector<int> q_charge_types(h_q_atoms_size * n_lambdas * 2);
+    std::map<int, int> q_idx;
+    for (int i = 0; i < n_qatoms; i++) {
+        int id = q_atoms[i].a - 1;
+        if (!excluded[id]) {
+            q_idx[id] = i;
+        }
+    }
+
+    for (int state = 0; state < n_lambdas; state++) {
+        for (int i = 0; i < h_q_atoms_size; i++) {
+            int id = h_q_atoms_list[i];
+            double charge = q_charges[q_idx[id] + n_qatoms * state].q;
+            q_charge_types[state * h_q_atoms_size + i] = charge_to_type_host[charge];
+            charge *= lambdas[state];
+            q_charge_types[state * h_q_atoms_size + i + h_q_atoms_size * n_lambdas] = charge_to_type_host[charge];
+        }
+    }
+
+    int h_w_atoms_size = h_w_atoms_list.size();
+    std::vector<int> w_charge_types(h_w_atoms_size);
+    for (int i = 0; i < h_w_atoms_size; i++) {
+        int id = h_w_atoms_list[i];
+        double charge = ccharges[charges[id].code - 1].charge;
+        w_charge_types[i] = charge_to_type_host[charge];
+    }
+
+    check_cudaMalloc((void**)&d_charge_table_all, sizeof(ccharge_t) * h_charge_table_all.size());
+    cudaMemcpy(d_charge_table_all, h_charge_table_all.data(), sizeof(ccharge_t) * h_charge_table_all.size(), cudaMemcpyHostToDevice);
+    check_cudaMalloc((void**)&d_p_charge_types, sizeof(int) * p_charge_types.size());
+    cudaMemcpy(d_p_charge_types, p_charge_types.data(), sizeof(int) * p_charge_types.size(), cudaMemcpyHostToDevice);
+    check_cudaMalloc((void**)&d_q_charge_types, sizeof(int) * q_charge_types.size());
+    cudaMemcpy(d_q_charge_types, q_charge_types.data(), sizeof(int) * q_charge_types.size(), cudaMemcpyHostToDevice);
+    check_cudaMalloc((void**)&d_w_charge_types, sizeof(int) * w_charge_types.size());
+    cudaMemcpy(d_w_charge_types, w_charge_types.data(), sizeof(int) * w_charge_types.size(), cudaMemcpyHostToDevice);
+}
+
+void CudaContext::initialize_catype_tables_host() {
+    std::vector<catype_t> h_catype_table_all;  // h_catype_table_all[catype code] = catype_t
+    catype_to_type_host.clear();
+
+    auto add_catype = [&](catype_t catype) {
+        auto key = get_catype_key(catype);
+        if (catype_to_type_host.count(key) == 0) {
+            int sz = h_catype_table_all.size();
+            catype.code = sz;
+            h_catype_table_all.push_back(catype);
+            catype_to_type_host[key] = sz;
+        }
+    };
+
+    for (int i = 0; i < n_catypes; i++) {
+        add_catype(catypes[i]);
+    }
+
+    for (int state = 0; state < n_lambdas; state++) {
+        for (int i = 0; i < n_qatoms; i++) {
+            const q_atype_t& qat = q_atypes[i + n_qatoms * state];
+            const q_catype_t& catype = q_catypes[qat.code - 1];
+
+            catype_t new_catype;
+            new_catype.m = catype.m;
+            new_catype.aii_normal = catype.Ai;
+            new_catype.bii_normal = catype.Bi;
+            new_catype.aii_polar = catype.Ci;
+            new_catype.bii_polar = catype.ai;
+            new_catype.aii_1_4 = catype.Ai_14;
+            new_catype.bii_1_4 = catype.Bi_14;
+
+            add_catype(new_catype);
+
+            new_catype.m *= lambdas[state];
+            new_catype.aii_normal *= lambdas[state];
+            new_catype.bii_normal *= lambdas[state];
+            new_catype.aii_polar *= lambdas[state];
+            new_catype.bii_polar *= lambdas[state];
+            new_catype.aii_1_4 *= lambdas[state];
+            new_catype.bii_1_4 *= lambdas[state];
+            add_catype(new_catype);
+        }
+    }
+
+    std::vector<int> p_catype_types(h_p_atoms_list.size());
+    for (int i = 0; i < (int)h_p_atoms_list.size(); i++) {
+        int id = h_p_atoms_list[i];
+        auto catype = catypes[atypes[id].code - 1];
+        auto key = get_catype_key(catype);
+        p_catype_types[i] = catype_to_type_host[key];
+    }
+
+    int h_q_atoms_size = h_q_atoms_list.size();
+    std::vector<int> q_catype_types(h_q_atoms_size * n_lambdas * 2);
+    std::map<int, int> q_idx;
+    for (int i = 0; i < n_qatoms; i++) {
+        int id = q_atoms[i].a - 1;
+        if (!excluded[id]) {
+            q_idx[id] = i;
+        }
+    }
+
+    for (int state = 0; state < n_lambdas; state++) {
+        for (int i = 0; i < h_q_atoms_size; i++) {
+            int id = h_q_atoms_list[i];
+            const q_atype_t& qat = q_atypes[q_idx[id] + n_qatoms * state];
+            const q_catype_t& catype = q_catypes[qat.code - 1];
+
+            catype_t normal_catype;
+            normal_catype.m = catype.m;
+            normal_catype.aii_normal = catype.Ai;
+            normal_catype.bii_normal = catype.Bi;
+            normal_catype.aii_polar = catype.Ci;
+            normal_catype.bii_polar = catype.ai;
+            normal_catype.aii_1_4 = catype.Ai_14;
+            normal_catype.bii_1_4 = catype.Bi_14;
+
+            auto key_normal = get_catype_key(normal_catype);
+            q_catype_types[state * h_q_atoms_size + i] = catype_to_type_host[key_normal];
+
+            catype_t scaled_catype;
+            scaled_catype.m = catype.m * lambdas[state];
+            scaled_catype.aii_normal = catype.Ai * lambdas[state];
+            scaled_catype.bii_normal = catype.Bi * lambdas[state];
+            scaled_catype.aii_polar = catype.Ci * lambdas[state];
+            scaled_catype.bii_polar = catype.ai * lambdas[state];
+            scaled_catype.aii_1_4 = catype.Ai_14 * lambdas[state];
+            scaled_catype.bii_1_4 = catype.Bi_14 * lambdas[state];
+
+            auto key_scaled = get_catype_key(scaled_catype);
+            q_catype_types[state * h_q_atoms_size + i + h_q_atoms_size * n_lambdas] = catype_to_type_host[key_scaled];
+        }
+    }
+
+    int h_w_atoms_size = h_w_atoms_list.size();
+    std::vector<int> w_catype_types(h_w_atoms_size);
+    for (int i = 0; i < h_w_atoms_size; i++) {
+        int id = h_w_atoms_list[i];
+        auto catype = catypes[atypes[id].code - 1];
+        auto key = get_catype_key(catype);
+        w_catype_types[i] = catype_to_type_host[key];
+        // printf("Water atom %d: catype code %d mapped to %d, data: m=%f, aii_normal=%f, bii_normal=%f, aii_polar=%f, bii_polar=%f, aii_1_4=%f, bii_1_4=%f\n", id, atypes[id].code, w_catype_types[i], catype.m, catype.aii_normal, catype.bii_normal, catype.aii_polar, catype.bii_polar, catype.aii_1_4, catype.bii_1_4);
+    }
+    printf("Total water atom number: %d, w_catype_types size: %zu\n", h_w_atoms_size, w_catype_types.size());
+
+    check_cudaMalloc((void**)&d_catype_table_all, sizeof(catype_t) * h_catype_table_all.size());
+    cudaMemcpy(d_catype_table_all, h_catype_table_all.data(), sizeof(catype_t) * h_catype_table_all.size(), cudaMemcpyHostToDevice);
+    check_cudaMalloc((void**)&d_p_catype_types, sizeof(int) * p_catype_types.size());
+    cudaMemcpy(d_p_catype_types, p_catype_types.data(), sizeof(int) * p_catype_types.size(), cudaMemcpyHostToDevice);
+    check_cudaMalloc((void**)&d_q_catype_types, sizeof(int) * q_catype_types.size());
+    cudaMemcpy(d_q_catype_types, q_catype_types.data(), sizeof(int) * q_catype_types.size(), cudaMemcpyHostToDevice);
+    check_cudaMalloc((void**)&d_w_catype_types, sizeof(int) * w_catype_types.size());
+    cudaMemcpy(d_w_catype_types, w_catype_types.data(), sizeof(int) * w_catype_types.size(), cudaMemcpyHostToDevice);
+}
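Note on the q-type tables built above: each per-q-atom type array carries two consecutive regions, unscaled entries in [0, n_lambdas * nq) and lambda-premultiplied entries in [n_lambdas * nq, 2 * n_lambdas * nq), both grouped by state. Because the scaled region bakes lambdas[state] into the q-side parameters exactly once, an energy computed from it comes back multiplied by lambda once, which is presumably why the QP path below divides the returned energies by lambdas[state]. A small sketch of the indexing convention this implies; the helper name is hypothetical, not part of the patch:

```cpp
// Index into the two-region q-type layout (hypothetical helper for illustration).
// nq = number of non-excluded q-atoms, s = FEP state, i = atom slot within state s.
inline int q_type_index(int nq, int n_lambdas, int s, int i, bool lambda_scaled) {
    int idx = s * nq + i;                        // position inside one region
    return lambda_scaled ? n_lambdas * nq + idx  // second region: lambda-scaled
                         : idx;                  // first region: unscaled
}

// The QP launch in CudaNonbondedQPForce.cu offsets the device pointer by
// q_type_index(nq, n_lambdas, state, 0, true), i.e.
// d_q_charge_types + n_lambdas * nq + state * nq.
```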
diff --git a/src/core/cuda/src/CudaNonbondedForce.cu b/src/core/cuda/src/CudaNonbondedForce.cu
new file mode 100644
index 00000000..fa03ad9d
--- /dev/null
+++ b/src/core/cuda/src/CudaNonbondedForce.cu
@@ -0,0 +1,371 @@
+#include <utility>
+
+#include "cuda/include/CudaContext.cuh"
+#include "cuda/include/CudaNonbondedForce.cuh"
+#include "system.h"
+
+namespace CudaNonbondedForce {
+bool is_initialized = false;
+double *d_evdw_total, *d_ecoul_total;
+
+__device__ __forceinline__ void idx2xy(int n, int t, int& x, int& y) {
+    x = (int)floorf((2 * n + 1 - sqrtf((2 * n + 1) * (2 * n + 1) - 8 * t)) * 0.5f);
+    y = t - (x * n - (x * (x - 1) >> 1));
+    if (y < 0) {
+        x--;
+        y += (n - x);
+    }
+    y += x;
+}
+
+__device__ __forceinline__ double shfl(double v, int srcLane, unsigned mask = 0xffffffffu) {
+    int2 a = *reinterpret_cast<int2*>(&v);
+    a.x = __shfl_sync(mask, a.x, srcLane);
+    a.y = __shfl_sync(mask, a.y, srcLane);
+    return *reinterpret_cast<double*>(&a);
+}
+
+__device__ __forceinline__ coord_t shfl_coord(coord_t v, int srcLane, unsigned mask = 0xffffffffu) {
+    v.x = shfl(v.x, srcLane, mask);
+    v.y = shfl(v.y, srcLane, mask);
+    v.z = shfl(v.z, srcLane, mask);
+    return v;
+}
+
+__device__ __forceinline__ catype_t shfl_catype(catype_t v, int srcLane, unsigned mask = 0xffffffffu) {
+    v.code = __shfl_sync(mask, v.code, srcLane);
+    v.m = shfl(v.m, srcLane, mask);
+    v.aii_normal = shfl(v.aii_normal, srcLane, mask);
+    v.bii_normal = shfl(v.bii_normal, srcLane, mask);
+    v.aii_polar = shfl(v.aii_polar, srcLane, mask);
+    v.bii_polar = shfl(v.bii_polar, srcLane, mask);
+    v.aii_1_4 = shfl(v.aii_1_4, srcLane, mask);
+    v.bii_1_4 = shfl(v.bii_1_4, srcLane, mask);
+    return v;
+}
+
+__device__ void calculate_unforce_bound(
+    // const int x_idx,
+    // const int y_idx,
+
+    const coord_t& x,
+    const coord_t& y,
+
+    const double x_charge,
+    const double y_charge,
+
+    const double x_aii,
+    const double y_aii,
+
+    const double x_bii,
+    const double y_bii,
+
+    const double coulomb_constant,
+
+    const double scaling,
+
+    double& evdw,
+    double& ecoul,
+    double& dv) {
+    double3 d = {x.x - y.x, x.y - y.y, x.z - y.z};
+    double r = rsqrt(d.x * d.x + d.y * d.y + d.z * d.z);
+    double r2 = r * r;
+    double r6 = r2 * r2 * r2;
+    // double v_a = r6 * r6;
+    // double v_b = r6;
+    // ecoul = r;
+    // evdw = v_a - v_b;
+    // dv = r2 * (-ecoul - v_a + v_b);
+
+    ecoul = scaling * coulomb_constant * x_charge * y_charge * r;
+
+    double v_a = r6 * r6 * x_aii * y_aii;
+    double v_b = r6 * x_bii * y_bii;
+    evdw = v_a - v_b;
+    dv = r2 * (-ecoul - 12.0 * v_a + 6.0 * v_b);
+}
+
+__global__ void calc_nonbonded_force_kernel(
+    const int nx,
+    const int ny,
+
+    const int* x_charges_types,
+    const int* y_charges_types,
+    const ccharge_t* ccharges_table,
+
+    const int* x_atypes_types,
+    const int* y_atypes_types,
+    const catype_t* catypes_table,
+
+    const topo_t d_topo,
+
+    const bool* d_excluded,
+
+    const int* d_LJ_matrix,
+
+    const int* x_idx_list,
+    const int* y_idx_list,
+
+    const coord_t* d_coords,
+
+    dvel_t* d_dvelocities,
+
+    double* evdw_tot,
+    double* ecoul_tot,
+
+    bool symmetric,
+
+    // helper variables
+    const int n_atoms_solute
+
+) {
+    const int x_block_num = (nx + 31) >> 5;
+    const int y_block_num = (ny + 31) >> 5;
+
+    const int total_tiles = symmetric ? (x_block_num * (x_block_num + 1)) >> 1 : x_block_num * y_block_num;
+
+    const int warps_per_block = blockDim.x >> 5;
+    const int tid = threadIdx.x;
+    const int lane = tid & 31;
+    const int warp_in_block = tid >> 5;
+
+    const int tile = blockIdx.x * warps_per_block + warp_in_block;
+    if (tile >= total_tiles) return;
+
+    int tile_x, tile_y;
+    if (symmetric) {
+        idx2xy(x_block_num, tile, tile_x, tile_y);
+    } else {
+        tile_x = tile % x_block_num;
+        tile_y = tile / x_block_num;
+    }
+
+    const int base_x = tile_x << 5;
+    const int base_y = tile_y << 5;
+
+    int x_idx = base_x + lane;
+    int y_idx = base_y + lane;
+
+    int x_atom_idx = (x_idx < nx) ? x_idx_list[x_idx] : -1;
+    int y_atom_idx = (y_idx < ny) ? y_idx_list[y_idx] : -1;
+
+    coord_t invalid = {-1e9, -1e9, -1e9};
+    coord_t x_coord = (x_atom_idx >= 0) ? d_coords[x_atom_idx] : invalid;
+    coord_t y_coord = (y_atom_idx >= 0) ? d_coords[y_atom_idx] : invalid;
+
+    bool x_excluded = (x_atom_idx >= 0) ? d_excluded[x_atom_idx] : true;
+    bool y_excluded = (y_atom_idx >= 0) ? d_excluded[y_atom_idx] : true;
+
+    int x_charge_type_idx = (x_idx < nx) ? x_charges_types[x_idx] : -1;
+    int y_charge_type_idx = (y_idx < ny) ? y_charges_types[y_idx] : -1;
+    double x_charge = (x_atom_idx >= 0 && x_charge_type_idx >= 0) ? ccharges_table[x_charge_type_idx].charge : 0.0;
+    double y_charge = (y_atom_idx >= 0 && y_charge_type_idx >= 0) ?
ccharges_table[y_charge_type_idx].charge : 0.0; + + int x_catype_type_idx = (x_idx < nx) ? x_atypes_types[x_idx] : -1; + int y_catype_type_idx = (y_idx < ny) ? y_atypes_types[y_idx] : -1; + catype_t x_type = (x_atom_idx >= 0 && x_catype_type_idx >= 0) ? catypes_table[x_catype_type_idx] : catype_t{}; + catype_t y_type = (y_atom_idx >= 0 && y_catype_type_idx >= 0) ? catypes_table[y_catype_type_idx] : catype_t{}; + + + double3 x_force = {0.0, 0.0, 0.0}; + double3 y_force = {0.0, 0.0, 0.0}; + + double evdw_sum = 0.0; + double ecoul_sum = 0.0; + + const unsigned mask = 0xffffffffu; + + auto lj_code = [&](int ix, int iy) -> int { + if (ix < 0 || iy < 0 || ix >= n_atoms_solute || iy >= n_atoms_solute) return 0; + return d_LJ_matrix[ix * n_atoms_solute + iy]; + }; + + auto is_valid = [&]() -> bool { + if (x_idx >= nx || y_idx >= ny) return false; + + if (symmetric && tile_x == tile_y && x_idx <= y_idx) return false; + + if (x_excluded || y_excluded) return false; + + // Skip interactions within the same rigid water molecule + if (x_atom_idx >= n_atoms_solute && y_atom_idx >= n_atoms_solute) { + int wx = (x_atom_idx - n_atoms_solute) / 3; + int wy = (y_atom_idx - n_atoms_solute) / 3; + if (wx == wy) return false; + } + + bool bond23 = lj_code(x_atom_idx, y_atom_idx) == 3; + if (bond23) return false; + + return true; + }; + + auto do_shuffle = [&]() { + const int src = (lane + 1) & 31; + y_idx = __shfl_sync(mask, y_idx, src); + y_atom_idx = __shfl_sync(mask, y_atom_idx, src); + y_coord = shfl_coord(y_coord, src, mask); + y_excluded = __shfl_sync(mask, y_excluded, src); + y_charge = __shfl_sync(mask, y_charge, src); + y_type = shfl_catype(y_type, src, mask); + + y_force.x = shfl(y_force.x, src, mask); + y_force.y = shfl(y_force.y, src, mask); + y_force.z = shfl(y_force.z, src, mask); + }; + + for (int i = 0; i < 32; i++) { + if (is_valid()) { + bool bond14 = lj_code(x_atom_idx, y_atom_idx) == 1; + double scaling = bond14 ? d_topo.el14_scale : 1.0; + double ai_aii = bond14 ? x_type.aii_1_4 : x_type.aii_normal; + double bi_bii = bond14 ? x_type.bii_1_4 : x_type.bii_normal; + double aj_aii = bond14 ? y_type.aii_1_4 : y_type.aii_normal; + double bj_bii = bond14 ? 
y_type.bii_1_4 : y_type.bii_normal;
+
+            // Match legacy water-water behavior: LJ only between oxygens
+            if (x_atom_idx >= n_atoms_solute && y_atom_idx >= n_atoms_solute) {
+                bool x_is_O = ((x_atom_idx - n_atoms_solute) % 3) == 0;
+                bool y_is_O = ((y_atom_idx - n_atoms_solute) % 3) == 0;
+                if (!(x_is_O && y_is_O)) {
+                    ai_aii = bi_bii = aj_aii = bj_bii = 0.0;
+                }
+            }
+
+            double evdw = 0, ecoul = 0, dv = 0;
+
+            calculate_unforce_bound(
+                x_coord,
+                y_coord,
+                x_charge,
+                y_charge,
+                ai_aii,
+                aj_aii,
+                bi_bii,
+                bj_bii,
+                d_topo.coulomb_constant,
+                scaling,
+                evdw,
+                ecoul,
+                dv);
+
+            evdw_sum += evdw;
+            ecoul_sum += ecoul;
+
+            double3 d = {x_coord.x - y_coord.x, x_coord.y - y_coord.y, x_coord.z - y_coord.z};
+            y_force.x -= dv * d.x;
+            y_force.y -= dv * d.y;
+            y_force.z -= dv * d.z;
+
+            x_force.x += dv * d.x;
+            x_force.y += dv * d.y;
+            x_force.z += dv * d.z;
+        }
+        do_shuffle();
+    }
+
+    if (x_atom_idx >= 0) {
+        atomicAdd(&d_dvelocities[x_atom_idx].x, x_force.x);
+        atomicAdd(&d_dvelocities[x_atom_idx].y, x_force.y);
+        atomicAdd(&d_dvelocities[x_atom_idx].z, x_force.z);
+    }
+
+    if (y_atom_idx >= 0) {
+        atomicAdd(&d_dvelocities[y_atom_idx].x, y_force.x);
+        atomicAdd(&d_dvelocities[y_atom_idx].y, y_force.y);
+        atomicAdd(&d_dvelocities[y_atom_idx].z, y_force.z);
+    }
+
+    for (int offset = 16; offset > 0; offset >>= 1) {
+        evdw_sum += __shfl_down_sync(mask, evdw_sum, offset);
+        ecoul_sum += __shfl_down_sync(mask, ecoul_sum, offset);
+    }
+    if (lane == 0) {
+        atomicAdd(evdw_tot, evdw_sum);
+        atomicAdd(ecoul_tot, ecoul_sum);
+    }
+}
+
+}  // namespace CudaNonbondedForce
+
+std::pair<double, double> calc_nonbonded_force_host(
+    int nx,
+    int ny,
+    int* x_idx_list,
+    int* y_idx_list,
+    bool symmetric,
+
+    const int* x_charges_types,
+    const int* y_charges_types,
+    const ccharge_t* ccharges_table,
+
+    const int* x_atypes_types,
+    const int* y_atypes_types,
+    const catype_t* catypes_table
+) {
+    using namespace CudaNonbondedForce;
+    CudaContext& context = CudaContext::instance();
+    const int thread_num = 256;
+    dim3 block_sz = dim3(thread_num);
+
+    int tile_num_per_block = thread_num >> 5;
+
+    int x_block_num = (nx + 31) >> 5;
+    int y_block_num = (ny + 31) >> 5;
+
+    int total_tiles = symmetric ?
(x_block_num * (x_block_num + 1)) >> 1 : x_block_num * y_block_num;
+    int grid_sz = (total_tiles + tile_num_per_block - 1) / tile_num_per_block;
+
+    dim3 grid = dim3(grid_sz);
+
+    cudaMemset(d_ecoul_total, 0, sizeof(double));
+    cudaMemset(d_evdw_total, 0, sizeof(double));
+
+    calc_nonbonded_force_kernel<<<grid, block_sz>>>(
+        nx,
+        ny,
+        x_charges_types,
+        y_charges_types,
+        ccharges_table,
+        x_atypes_types,
+        y_atypes_types,
+        catypes_table,
+        topo,
+        context.d_excluded,
+        context.d_LJ_matrix,
+        x_idx_list,
+        y_idx_list,
+        context.d_coords,
+        context.d_dvelocities,
+        d_evdw_total,
+        d_ecoul_total,
+        symmetric,
+        n_atoms_solute);
+
+    cudaDeviceSynchronize();
+
+    double evdw_tot = 0, ecoul_tot = 0;
+    cudaMemcpy(&evdw_tot, d_evdw_total, sizeof(double), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&ecoul_tot, d_ecoul_total, sizeof(double), cudaMemcpyDeviceToHost);
+
+    return {evdw_tot, ecoul_tot};
+}
+
+void init_nonbonded_force_kernel_data() {
+    using namespace CudaNonbondedForce;
+    if (!is_initialized) {
+        check_cudaMalloc((void**)&d_evdw_total, sizeof(double));
+        check_cudaMalloc((void**)&d_ecoul_total, sizeof(double));
+        is_initialized = true;
+    }
+}
+
+void cleanup_nonbonded_force() {
+    using namespace CudaNonbondedForce;
+    if (is_initialized) {
+        cudaFree(d_evdw_total);
+        cudaFree(d_ecoul_total);
+        is_initialized = false;
+    }
+}
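Note on calc_nonbonded_force_kernel above: in the symmetric case the upper triangle of the tile grid (diagonal included) is flattened into total_tiles = n * (n + 1) / 2 indices, and each warp inverts its flat index with idx2xy. A quick host-side sanity check of that inversion, using a plain C++ transcription of the device function (double precision instead of the device's floorf/sqrtf, so this validates the layout rather than the exact float rounding):

```cpp
#include <cmath>
#include <cstdio>
#include <set>
#include <utility>

// Host transcription of the device-side idx2xy, for verification only.
static void idx2xy_host(int n, int t, int& x, int& y) {
    x = (int)std::floor((2 * n + 1 - std::sqrt((double)(2 * n + 1) * (2 * n + 1) - 8.0 * t)) * 0.5);
    y = t - (x * n - (x * (x - 1) >> 1));
    if (y < 0) {  // guard against rounding overshoot in the sqrt
        x--;
        y += (n - x);
    }
    y += x;
}

int main() {
    const int n = 64;  // tile grid side, i.e. 64 blocks of 32 atoms each
    std::set<std::pair<int, int>> seen;
    for (int t = 0; t < n * (n + 1) / 2; t++) {
        int x, y;
        idx2xy_host(n, t, x, y);
        if (x < 0 || x > y || y >= n || !seen.insert({x, y}).second) {
            printf("bad mapping at t=%d -> (%d,%d)\n", t, x, y);
            return 1;
        }
    }
    printf("all %d tiles mapped exactly once, upper triangle only\n", n * (n + 1) / 2);
    return 0;
}
```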
diff --git a/src/core/cuda/src/CudaNonbondedPPForce.cu b/src/core/cuda/src/CudaNonbondedPPForce.cu
index d32db32b..792abba2 100644
--- a/src/core/cuda/src/CudaNonbondedPPForce.cu
+++ b/src/core/cuda/src/CudaNonbondedPPForce.cu
@@ -1,311 +1,48 @@
+#include <cstdio>
+#include <utility>
+
 #include "cuda/include/CudaContext.cuh"
+#include "cuda/include/CudaNonbondedForce.cuh"
 #include "cuda/include/CudaNonbondedPPForce.cuh"
 #include "utils.h"
 
 namespace CudaNonbondedPPForce {
 bool is_initialized = false;
-dvel_t* PP_MAT;
-double *D_PP_Evdw, *D_PP_Ecoul;
-double *D_PP_evdw_TOT, *D_PP_ecoul_TOT;
-
-__device__ void calc_pp_dvel_matrix_incr(int row, int pi, int column, int pj,
-                                         coord_t* Xs, coord_t* Ys, int* LJs, bool* excluded_s, double* Evdw, double* Ecoul, dvel_t* patom_a, dvel_t* patom_b,
-                                         ccharge_t* D_ccharges, charge_t* D_charges, catype_t* D_catypes, atype_t* D_atypes, p_atom_t* D_patoms, topo_t D_topo) {
-    bool bond14, bond23;
-    double scaling;
-    coord_t da;
-    double r2a, ra, r6a;
-    double Vela, V_a, V_b;
-    double dva;
-    double crg_i, crg_j;
-    double ai_aii, aj_aii, ai_bii, aj_bii;
-    catype_t ai_type, aj_type;
-
-    bond23 = LJs[row * BLOCK_SIZE + column] == 3;
-    bond14 = LJs[row * BLOCK_SIZE + column] == 1;
-
-    if (bond23) return;
-    if (excluded_s[row] || excluded_s[BLOCK_SIZE + column]) return;
-
-    scaling = bond14 ? D_topo.el14_scale : 1;
-
-    crg_i = D_ccharges[D_charges[pi].code - 1].charge;
-    crg_j = D_ccharges[D_charges[pj].code - 1].charge;
-
-    ai_type = D_catypes[D_atypes[pi].code - 1];
-    aj_type = D_catypes[D_atypes[pj].code - 1];
-
-    da.x = Ys[column].x - Xs[row].x;
-    da.y = Ys[column].y - Xs[row].y;
-    da.z = Ys[column].z - Xs[row].z;
-    r2a = 1 / (pow(da.x, 2) + pow(da.y, 2) + pow(da.z, 2));
-    ra = sqrt(r2a);
-    r6a = r2a * r2a * r2a;
-
-    Vela = scaling * D_topo.coulomb_constant * crg_i * crg_j * ra;
-
-    ai_aii = bond14 ? ai_type.aii_1_4 : ai_type.aii_normal;
-    aj_aii = bond14 ? aj_type.aii_1_4 : aj_type.aii_normal;
-    ai_bii = bond14 ? ai_type.bii_1_4 : ai_type.bii_normal;
-    aj_bii = bond14 ? aj_type.bii_1_4 : aj_type.bii_normal;
-
-    V_a = r6a * r6a * ai_aii * aj_aii;
-    V_b = r6a * ai_bii * aj_bii;
-    dva = r2a * (-Vela - 12 * V_a + 6 * V_b);
-
-    patom_a->x = -dva * da.x;
-    patom_a->y = -dva * da.y;
-    patom_a->z = -dva * da.z;
-
-    patom_b->x = dva * da.x;
-    patom_b->y = dva * da.y;
-    patom_b->z = dva * da.z;
-
-    *Ecoul += Vela;
-    *Evdw += (V_a - V_b);
-}
-
-__global__ void calc_pp_dvel_matrix(int n_patoms, int n_atoms_solute,
-                                    coord_t* X, double* Evdw, double* Ecoul, dvel_t* PP_MAT,
-                                    ccharge_t* D_ccharges, charge_t* D_charges, catype_t* D_catypes, atype_t* D_atypes, p_atom_t* D_patoms, int* D_LJ_matrix, bool* D_excluded, topo_t D_topo) {
-    // Block index
-    int bx = blockIdx.x;
-    int by = blockIdx.y;
-
-    // Thread index
-    int tx = threadIdx.x;
-    int ty = threadIdx.y;
-
-    int aStart = BLOCK_SIZE * by;
-    int bStart = BLOCK_SIZE * bx;
-
-    __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE];
-    __shared__ double Evdw_S[BLOCK_SIZE][BLOCK_SIZE];
-
-    Ecoul_S[ty][tx] = 0;
-    Evdw_S[ty][tx] = 0;
-
-    if (aStart + ty >= n_patoms) return;
-    if (bStart + tx >= n_patoms) return;
-
-    __shared__ coord_t Xs[BLOCK_SIZE];
-    __shared__ coord_t Ys[BLOCK_SIZE];
-    __shared__ int LJs[BLOCK_SIZE * BLOCK_SIZE];
-    __shared__ bool excluded_s[2 * BLOCK_SIZE];
-
-    int pi = D_patoms[aStart + ty].a - 1;
-    int pj = D_patoms[bStart + tx].a - 1;
-
-    if (tx == 0) {
-        Xs[ty] = X[pi];
-        excluded_s[ty] = D_excluded[pi];
-    }
-
-    if (ty == 0) {
-        Ys[tx] = X[pj];
-        excluded_s[BLOCK_SIZE + tx] = D_excluded[pj];
-    }
-    LJs[ty * BLOCK_SIZE + tx] = D_LJ_matrix[pi * n_atoms_solute + pj];
-
-    __syncthreads();
-
-    if (bx < by || (bx == by && tx < ty)) return;
-
-    dvel_t patom_a, patom_b;
-    memset(&patom_a, 0, sizeof(dvel_t));
-    memset(&patom_b, 0, sizeof(dvel_t));
-
-    int row = by * BLOCK_SIZE + ty;
-    int column = bx * BLOCK_SIZE + tx;
-
-    if (bx != by || tx != ty) {
-        double evdw = 0, ecoul = 0;
-        calc_pp_dvel_matrix_incr(ty, pi, tx, pj, Xs, Ys, LJs, excluded_s, &evdw, &ecoul, &patom_a,
-                                 &patom_b, D_ccharges, D_charges, D_catypes, D_atypes, D_patoms, D_topo);
-        Evdw_S[ty][tx] = evdw;
-        Ecoul_S[ty][tx] = ecoul;
-    }
-
-    PP_MAT[row * n_patoms + column] = patom_a;
-    PP_MAT[column * n_patoms + row] = patom_b;
-
-    __syncthreads();
-
-    int rowlen = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
-
-    if (tx == 0 && ty == 0) {
-        double tot_Evdw = 0;
-        double tot_Ecoul = 0;
-        for (int i = 0; i < BLOCK_SIZE; i++) {
-            for (int j = 0; j < BLOCK_SIZE; j++) {
-                tot_Evdw += Evdw_S[i][j];
-                tot_Ecoul += Ecoul_S[i][j];
-            }
-        }
-        Evdw[rowlen * by + bx] = tot_Evdw;
-        Ecoul[rowlen * by + bx] = tot_Ecoul;
-
-        // printf("bx = %d by = %d tot_Evdw = %f tot_Ecoul = %f\n", bx, by, Evdw[rowlen * by + bx], Ecoul[rowlen * by + bx]);
-    }
-
-    __syncthreads();
-}
-
-__global__ void calc_pp_dvel_vector(int n_patoms, dvel_t* DV_X, dvel_t* PP_MAT, p_atom_t* D_patoms) {
-    int row = blockIdx.x * blockDim.x + threadIdx.x;
-    if (row >= n_patoms) return;
-
-    dvel_t dP;
-    dP.x = 0;
-    dP.y = 0;
-    dP.z = 0;
-
-    for (int i = 0; i < n_patoms; i++) {
-        if (i != row) {
-            dP.x += PP_MAT[i + n_patoms * row].x;
-            dP.y += PP_MAT[i + n_patoms * row].y;
-            dP.z += PP_MAT[i + n_patoms * row].z;
-        }
-    }
-
-    int p = D_patoms[row].a - 1;
-
-    DV_X[p].x += dP.x;
-    DV_X[p].y += dP.y;
-    DV_X[p].z += dP.z;
-
-    __syncthreads();
-}
-
-__global__ void calc_energy_sum(int rows, int columns, double* Evdw_TOT, double* Ecoul_TOT, double* Evdw, double* Ecoul, bool upper_diagonal) {
-    int tx = threadIdx.x;
-    int ty = threadIdx.y;
-
-    __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE];
-    __shared__ double Evdw_S[BLOCK_SIZE][BLOCK_SIZE];
-
-    // TODO: better way to distribute upper diagonal over threads? Seems like threads in left bottom corner have less work
-    double coul_TOT = 0;
-    double vdw_TOT = 0;
-    for (int i = ty; i < rows; i += BLOCK_SIZE) {
-        for (int j = tx; j < columns; j += BLOCK_SIZE) {
-            if (i <= j || !upper_diagonal) {
-                coul_TOT += Ecoul[i * columns + j];
-                vdw_TOT += Evdw[i * columns + j];
-            }
-        }
-    }
-    Ecoul_S[ty][tx] = coul_TOT;
-    Evdw_S[ty][tx] = vdw_TOT;
-
-    __syncthreads();
-
-    if (tx == 0 && ty == 0) {
-        double Evdw_temp = 0;
-        double Ecoul_temp = 0;
-
-        for (int i = 0; i < BLOCK_SIZE; i++) {
-            for (int j = 0; j < BLOCK_SIZE; j++) {
-                Evdw_temp += Evdw_S[i][j];
-                Ecoul_temp += Ecoul_S[i][j];
-            }
-        }
-
-        *Evdw_TOT = Evdw_temp;
-        *Ecoul_TOT = Ecoul_temp;
-    }
-}
 }  // namespace CudaNonbondedPPForce
 
-void calc_nonbonded_pp_forces_host_v2() {
-    using namespace CudaNonbondedPPForce;
-
-    int mem_size_DV_X = n_atoms_solute * sizeof(dvel_t);
-
-    int n_blocks = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
-
-    CudaContext& ctx = CudaContext::instance();
-    auto X = ctx.d_coords;
-    auto DV_X = ctx.d_dvelocities;
-    auto D_ccharges = ctx.d_ccharges;
-    auto D_charges = ctx.d_charges;
-    auto D_catypes = ctx.d_catypes;
-    auto D_atypes = ctx.d_atypes;
-    auto D_patoms = ctx.d_p_atoms;
-    auto D_LJ_matrix = ctx.d_LJ_matrix;
-    auto D_excluded = ctx.d_excluded;
-
-    dim3 threads, grid;
-
-    threads = dim3(BLOCK_SIZE, BLOCK_SIZE);
-    grid = dim3((n_patoms + BLOCK_SIZE - 1) / threads.x, (n_patoms + BLOCK_SIZE - 1) / threads.y);
-
-    calc_pp_dvel_matrix<<<grid, threads>>>(n_patoms, n_atoms_solute, X, D_PP_Evdw, D_PP_Ecoul, PP_MAT, D_ccharges, D_charges, D_catypes, D_atypes, D_patoms, D_LJ_matrix, D_excluded, topo);
-    calc_pp_dvel_vector<<<((n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_patoms, DV_X, PP_MAT, D_patoms);
-
-    cudaMemcpy(dvelocities, DV_X, mem_size_DV_X, cudaMemcpyDeviceToHost);
-
-#ifdef DEBUG
-    cudaMemcpy(h_PP_MAT, PP_MAT, mem_size_PP_MAT, cudaMemcpyDeviceToHost);
-#endif
-
-#ifdef DEBUG
-    for (int i = 0; i < n_patoms; i++) {
-        for (int j = 0; j < n_patoms; j++) {
-            if (h_PP_MAT[i * n_patoms + j].x > 100)
-                printf("PP_MAT[%d][%d] = %f %f %f\n", i, j, h_PP_MAT[i * n_patoms + j].x, h_PP_MAT[i * n_patoms + j].y, h_PP_MAT[i * n_patoms + j].z);
-        }
-    }
-#endif
-    cudaMemset(D_PP_evdw_TOT, 0, sizeof(double));
-    cudaMemset(D_PP_ecoul_TOT, 0, sizeof(double));
-
-    calc_energy_sum<<<1, threads>>>(n_blocks, n_blocks, D_PP_evdw_TOT, D_PP_ecoul_TOT, D_PP_Evdw, D_PP_Ecoul, true);
-    double PP_evdw_TOT, PP_ecoul_TOT;
-    cudaMemcpy(&PP_evdw_TOT, D_PP_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost);
-    cudaMemcpy(&PP_ecoul_TOT, D_PP_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost);
-
-    E_nonbond_pp.Uvdw += PP_evdw_TOT;
-    E_nonbond_pp.Ucoul += PP_ecoul_TOT;
-}
 
 void init_nonbonded_pp_force_kernel_data() {
     using namespace CudaNonbondedPPForce;
     if (!is_initialized) {
-        int n_blocks = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
-
-        int mem_size_PP_MAT = n_patoms * n_patoms * sizeof(dvel_t);
-
-        int mem_size_PP_Evdw = n_blocks * n_blocks * sizeof(double);
-        int mem_size_PP_Ecoul = n_blocks * n_blocks * sizeof(double);
-#ifdef DEBUG
-        printf("Allocating PP_MAT\n");
-#endif
-        check_cudaMalloc((void**)&PP_MAT, mem_size_PP_MAT);
-#ifdef DEBUG
-        printf("Allocating D_PP_Evdw\n");
-#endif
-        check_cudaMalloc((void**)&D_PP_Evdw, mem_size_PP_Evdw);
-#ifdef DEBUG
-        printf("Allocating D_PP_Ecoul\n");
-#endif
-        check_cudaMalloc((void**)&D_PP_Ecoul, mem_size_PP_Ecoul);
-
-        check_cudaMalloc((void**)&D_PP_evdw_TOT, sizeof(double));
-        check_cudaMalloc((void**)&D_PP_ecoul_TOT, sizeof(double));
-
         is_initialized = true;
     }
 }
 
+void calc_nonbonded_pp_forces_host_v2() {
+    int nx = CudaContext::instance().h_p_atoms_list.size();
+    int ny = nx;
+
+    auto result = calc_nonbonded_force_host(
+        nx,
+        ny,
+        CudaContext::instance().d_p_atoms_list,
+        CudaContext::instance().d_p_atoms_list,
+        true,  // symmetric
+        CudaContext::instance().d_p_charge_types,
+        CudaContext::instance().d_p_charge_types,
+        CudaContext::instance().d_charge_table_all,
+        CudaContext::instance().d_p_catype_types,
+        CudaContext::instance().d_p_catype_types,
+        CudaContext::instance().d_catype_table_all);
+    printf("Nonbonded PP Force: Uvdw = %f, Ucoul = %f\n", result.first, result.second);
+    E_nonbond_pp.Uvdw += result.first;
+    E_nonbond_pp.Ucoul += result.second;
+}
+
 void cleanup_nonbonded_pp_force() {
     using namespace CudaNonbondedPPForce;
+
     if (is_initialized) {
-        cudaFree(PP_MAT);
-        cudaFree(D_PP_Evdw);
-        cudaFree(D_PP_Ecoul);
-        cudaFree(D_PP_evdw_TOT);
-        cudaFree(D_PP_ecoul_TOT);
         is_initialized = false;
     }
-}
\ No newline at end of file
+}
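Note on the rewritten PP path above: the O(n_patoms^2) PP_MAT scratch matrix is gone. Each warp owns one 32x32 tile, rotates the y-side atoms through registers via do_shuffle(), and the diagonal-tile filter in is_valid() keeps only x_idx > y_idx. A CPU sketch that replays that enumeration and checks every unordered pair is visited exactly once; this is an illustrative test of the tile walk described above, not code from the patch:

```cpp
#include <cstdio>
#include <vector>

int main() {
    const int n = 100;  // stand-in for h_p_atoms_list.size()
    std::vector<int> hits(n * n, 0);
    int nb = (n + 31) / 32;
    for (int tx = 0; tx < nb; tx++)
        for (int ty = tx; ty < nb; ty++)  // upper-triangle tiles, diagonal included
            for (int lane = 0; lane < 32; lane++)
                for (int rot = 0; rot < 32; rot++) {
                    int x = tx * 32 + lane;                 // fixed x-side atom of this lane
                    int y = ty * 32 + ((lane + rot) & 31);  // y-side atom after rot shuffles
                    if (x >= n || y >= n) continue;         // padding lanes
                    if (tx == ty && x <= y) continue;       // diagonal-tile filter
                    hits[x * n + y]++;
                }
    long visited = 0;
    for (int h : hits) visited += h;
    printf("visited %ld ordered pairs, expected %d\n", visited, n * (n - 1) / 2);
    return 0;
}
```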
diff --git a/src/core/cuda/src/CudaNonbondedPWForce.cu b/src/core/cuda/src/CudaNonbondedPWForce.cu
index dec352db..e5e0a02f 100644
--- a/src/core/cuda/src/CudaNonbondedPWForce.cu
+++ b/src/core/cuda/src/CudaNonbondedPWForce.cu
@@ -1,367 +1,40 @@
+#include <cstdio>
+#include <utility>
+
 #include "cuda/include/CudaContext.cuh"
+#include "cuda/include/CudaNonbondedForce.cuh"
 #include "cuda/include/CudaNonbondedPWForce.cuh"
 
 namespace CudaNonbondedPWForce {
 // Declare any necessary static variables or device pointers here
 bool is_initialized = false;
-struct calc_pw_t {
-    dvel_t P;
-    dvel_t W;
-};
-
-calc_pw_t *PW_MAT, *h_PW_MAT;
-double *D_PW_Evdw, *D_PW_Ecoul;
-double *D_PW_evdw_TOT, *D_PW_ecoul_TOT, PW_evdw_TOT, PW_ecoul_TOT;
-
-__global__ void calc_energy_sum(int rows, int columns, double* Evdw_TOT, double* Ecoul_TOT, double* Evdw, double* Ecoul, bool upper_diagonal) {
-    int tx = threadIdx.x;
-    int ty = threadIdx.y;
-
-    __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE];
-    __shared__ double Evdw_S[BLOCK_SIZE][BLOCK_SIZE];
-
-    // TODO: better way to distribute upper diagonal over threads? Seems like threads in left bottom corner have less work
-    double coul_TOT = 0;
-    double vdw_TOT = 0;
-    for (int i = ty; i < rows; i += BLOCK_SIZE) {
-        for (int j = tx; j < columns; j += BLOCK_SIZE) {
-            if (i <= j || !upper_diagonal) {
-                coul_TOT += Ecoul[i * columns + j];
-                vdw_TOT += Evdw[i * columns + j];
-            }
-        }
-    }
-    Ecoul_S[ty][tx] = coul_TOT;
-    Evdw_S[ty][tx] = vdw_TOT;
-
-    __syncthreads();
-
-    if (tx == 0 && ty == 0) {
-        double Evdw_temp = 0;
-        double Ecoul_temp = 0;
-
-        for (int i = 0; i < BLOCK_SIZE; i++) {
-            for (int j = 0; j < BLOCK_SIZE; j++) {
-                Evdw_temp += Evdw_S[i][j];
-                Ecoul_temp += Ecoul_S[i][j];
-            }
-        }
-
-        *Evdw_TOT = Evdw_temp;
-        *Ecoul_TOT = Ecoul_temp;
-    }
-}
-
-__device__ void calc_pw_dvel_matrix_incr(int row, int pi, int column, int j, int n_atoms_solute,
-                                         coord_t* Xs, coord_t* Ws, bool* excluded_s, double* Evdw, double* Ecoul, calc_pw_t* pw,
-                                         ccharge_t* D_ccharges, charge_t* D_charges, catype_t* D_catypes, atype_t* D_atypes, p_atom_t* D_patoms, topo_t D_topo) {
-    coord_t da;
-    double r2a, ra, r6a;
-    double Vela, V_a, V_b;
-    double dva;
-    double qi, qj;
-    double ai_aii, aj_aii, ai_bii, aj_bii;
-    catype_t ai_type, aj_type;
-    atype_t i_type, j_type;
-
-    if (excluded_s[row]) return;
-    qi = D_ccharges[D_charges[pi].code - 1].charge;
-    qj = D_ccharges[D_charges[n_atoms_solute + j].code - 1].charge;  // TODO: FIX THIS!!! WILL NOT WORK WITH QATOMS!!!!!
-
-    // if (pi < 100 && j < 100){
-    //     printf("qi = %f qj = %f\n", qi, qj);
-    // }
-
-    ai_type = D_catypes[D_atypes[pi].code - 1];
-    aj_type = D_catypes[D_atypes[n_atoms_solute + j].code - 1];
-
-    da.x = Ws[column].x - Xs[row].x;
-    da.y = Ws[column].y - Xs[row].y;
-    da.z = Ws[column].z - Xs[row].z;
-    r2a = 1 / (pow(da.x, 2) + pow(da.y, 2) + pow(da.z, 2));
-    ra = sqrt(r2a);
-    r6a = r2a * r2a * r2a;
-
-    Vela = D_topo.coulomb_constant * qi * qj * ra;
-
-    ai_aii = ai_type.aii_normal;
-    aj_aii = aj_type.aii_normal;
-    ai_bii = ai_type.bii_normal;
-    aj_bii = aj_type.bii_normal;
-
-    V_a = r6a * r6a * ai_aii * aj_aii;
-    V_b = r6a * ai_bii * aj_bii;
-    dva = r2a * (-Vela - 12 * V_a + 6 * V_b);
-
-    pw->P.x -= dva * da.x;
-    pw->P.y -= dva * da.y;
-    pw->P.z -= dva * da.z;
-
-    pw->W.x += dva * da.x;
-    pw->W.y += dva * da.y;
-    pw->W.z += dva * da.z;
-
-    *Ecoul += Vela;
-    *Evdw += (V_a - V_b);
-
-    // if (pi == 522 && j == 175) {
-    //     printf("Vela = %f V_a = %f V_b = %f P = %f %f %f ai_aii = %f aj_aii = %f\n", Vela, V_a, V_b, pw->P.x, pw->P.y, pw->P.z, ai_aii, aj_aii);
-    // }
-
-    // if (pi < 100 && j < 100) printf("Evdw = %f Ecoul = %f\n", *Evdw, *Ecoul);
-}
-
-__global__ void calc_pw_dvel_vector_row(int n_patoms, int n_waters, dvel_t* DV_X, dvel_t* DV_W, calc_pw_t* PW_MAT, p_atom_t* D_patoms) {
-    int row = blockIdx.x * blockDim.x + threadIdx.x;
-    if (row >= n_patoms) return;
-
-    dvel_t dP;
-
-    dP.x = 0;
-    dP.y = 0;
-    dP.z = 0;
-
-    for (int i = 0; i < 3 * n_waters; i++) {
-        dP.x += PW_MAT[i + 3 * n_waters * row].P.x;
-        dP.y += PW_MAT[i + 3 * n_waters * row].P.y;
-        dP.z += PW_MAT[i + 3 * n_waters * row].P.z;
-    }
-
-    int p = D_patoms[row].a - 1;
-
-    DV_X[p].x += dP.x;
-    DV_X[p].y += dP.y;
-    DV_X[p].z += dP.z;
-
-    __syncthreads();
-}
-
-__global__ void calc_pw_dvel_vector_column(int n_patoms, int n_waters, dvel_t* DV_X, dvel_t* DV_W, calc_pw_t* PW_MAT) {
-    int column = blockIdx.x * blockDim.x + threadIdx.x;
-    if (column >= 3 * n_waters) return;
-
-    dvel_t dW;
-
-    dW.x = 0;
-    dW.y = 0;
-    dW.z = 0;
-
-    for (int i = 0; i < n_patoms; i++) {
-        dW.x += PW_MAT[column + 3 * n_waters * i].W.x;
-        dW.y += PW_MAT[column + 3 * n_waters * i].W.y;
-        dW.z += PW_MAT[column + 3 * n_waters * i].W.z;
-    }
-
-    DV_W[column].x += dW.x;
-    DV_W[column].y += dW.y;
-    DV_W[column].z += dW.z;
-
-    __syncthreads();
-}
-
-__global__ void calc_pw_dvel_matrix(int n_patoms, int n_atoms_solute, int n_waters,
-                                    coord_t* X, coord_t* W, double* Evdw, double* Ecoul, calc_pw_t* PW_MAT,
-                                    ccharge_t* D_ccharges, charge_t* D_charges, catype_t* D_catypes, atype_t* D_atypes, p_atom_t* D_patoms, bool* D_excluded, topo_t D_topo) {
-    // Block index
-    int bx = blockIdx.x;
-    int by = blockIdx.y;
-
-    // Thread index
-    int tx = threadIdx.x;
-    int ty = threadIdx.y;
-
-    __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE];
-    __shared__ double Evdw_S[BLOCK_SIZE][BLOCK_SIZE];
-
-    Ecoul_S[ty][tx] = 0;
-    Evdw_S[ty][tx] = 0;
-
-    // if (tx == 0 && ty == 0) printf("bx = %d by = %d\n", bx, by);
-
-    // if (bx == 0 && by == 0) printf("bx = %d by = %d tx = %d ty = %d\n", bx, by, tx, ty);
-
-    int aStart = BLOCK_SIZE * by;
-    int bStart = BLOCK_SIZE * bx;
-
-    if (aStart + ty >= n_patoms) return;
-    if (bStart + tx >= 3 * n_waters) return;
-
-    // if (bx == 8 && by == 1) printf("bx = %d by = %d tx = %d ty = %d\n", bx, by, tx, ty);
-
-    __shared__ coord_t Xs[BLOCK_SIZE];
-    __shared__ coord_t Ws[BLOCK_SIZE];
-    __shared__ bool excluded_s[BLOCK_SIZE];
-
-    int pi = D_patoms[aStart + ty].a - 1;
-
-    Xs[ty] = X[pi];
-    Ws[tx] = W[bStart + tx];
-
-    if (tx == 0) {
-        excluded_s[ty] = D_excluded[pi];
-    }
-
-    __syncthreads();
-
-    calc_pw_t pw;
-    memset(&pw, 0, sizeof(calc_pw_t));
-
-    int row = by * BLOCK_SIZE + ty;
-    int column = bx * BLOCK_SIZE + tx;
-
-    // if (row == 0 && column == 1) {
-    //     printf("Xs[0] = %f\n", Xs[0]);
-    //     printf("Ys[0] = %f\n", Ys[0]);
-    //     printf("Xs[1] = %f\n", Xs[1]);
-    //     printf("Ys[1] = %f\n", Ys[1]);
-    //     printf("Xs[2] = %f\n", Xs[2]);
-    //     printf("Ys[2] = %f\n", Ys[2]);
-    //     printf("Xs[3] = %f\n", Xs[3]);
-    //     printf("Ys[3] = %f\n", Ys[3]);
-    //     printf("Xs[4] = %f\n", Xs[4]);
-    //     printf("Ys[4] = %f\n", Ys[4]);
-
-    //     printf("Ys[%d] = %f Xs[%d] = %f\n", 3 * ty, Ys[3 * ty], 3 * tx, Xs[3 * tx]);
-    // }
-
-    // if (bx == 8 && by == 1) printf("bx = %d by = %d tx = %d ty = %d\n", bx, by, tx, ty);
-    // __device__ void calc_pw_dvel_matrix_incr(int row, int pi, int column, int j, int n_patoms,
-    //                                          coord_t *Ps, coord_t *Xs, double *Evdw, double *Ecoul, calc_pw_t *pw,
-    //                                          ccharge_t *D_ccharges, charge_t *D_charges, catype_t *D_catypes, atype_t *D_atypes, p_atom_t *D_patoms)
-    double evdw = 0, ecoul = 0;
-    calc_pw_dvel_matrix_incr(ty, pi, tx, bStart + tx, n_atoms_solute, Xs, Ws, excluded_s, &evdw, &ecoul, &pw, D_ccharges, D_charges, D_catypes, D_atypes, D_patoms, D_topo);
-    Evdw_S[ty][tx] = evdw;
-    Ecoul_S[ty][tx] = ecoul;
-
-    // if (row == 0 && column == 1) {
-    //     printf("water_a = %f %f %f water_b = %f %f %f\n", water_a[0].x, water_a[0].y, water_a[0].z, water_b[0].x, water_b[0].y, water_b[0].z);
-    // }
-
-    // if (bx == 8 && by == 1) printf("n_qatoms = %d\n", n_qatoms);
-    // if (bx == 8 && by == 1) printf("qi = %d j = %d charge[%d] = %f\n", row, column, row + n_qatoms, D_qcharges[row + n_qatoms * 1].q);
-
-    PW_MAT[column + 3 * n_waters * row] = pw;
-
-    __syncthreads();
-
-    int rowlen = (3 * n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE;
-
-    if (tx == 0 && ty == 0) {
-        double tot_Evdw = 0;
-        double tot_Ecoul = 0;
-        for (int i = 0; i < BLOCK_SIZE; i++) {
-            for (int j = 0; j < BLOCK_SIZE; j++) {
-                tot_Evdw += Evdw_S[i][j];
-                tot_Ecoul += Ecoul_S[i][j];
-            }
-        }
-        Evdw[rowlen * by + bx] = tot_Evdw;
-        Ecoul[rowlen * by + bx] = tot_Ecoul;
-    }
-
-    __syncthreads();
-}
 }  // namespace CudaNonbondedPWForce
 
-
 void calc_nonbonded_pw_forces_host_v2() {
     using namespace CudaNonbondedPWForce;
-    int mem_size_W = 3 * n_waters * sizeof(coord_t);
-    int mem_size_DV_W = 3 * n_waters * sizeof(dvel_t);
-    int mem_size_DV_X = n_atoms_solute * sizeof(dvel_t);
-    int mem_size_PW_MAT = 3 * n_waters * n_patoms * sizeof(calc_pw_t);
-
-    int n_blocks_p = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    int n_blocks_w = (3 * n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE;
-
-    int mem_size_PW_Evdw = n_blocks_p * n_blocks_w * sizeof(double);
-    int mem_size_PW_Ecoul = n_blocks_p * n_blocks_w * sizeof(double);
-
-    CudaContext& ctx = CudaContext::instance();
-    auto X = ctx.d_coords;
-    auto DV_X = ctx.d_dvelocities;
-    auto D_ccharges = ctx.d_ccharges;
-    auto D_charges = ctx.d_charges;
-    auto D_catypes = ctx.d_catypes;
-    auto D_atypes = ctx.d_atypes;
-    auto D_patoms = ctx.d_p_atoms;
-    auto D_excluded = ctx.d_excluded;
-    dim3 threads, grid;
-
-    threads = dim3(BLOCK_SIZE, BLOCK_SIZE);
-    grid = dim3((3 * n_waters + BLOCK_SIZE - 1) / threads.x, (n_patoms + BLOCK_SIZE - 1) / threads.y);
-
-    calc_pw_dvel_matrix<<<grid, threads>>>(n_patoms, n_atoms_solute, n_waters, X, X + n_atoms_solute, D_PW_Evdw, D_PW_Ecoul, PW_MAT,
-                                           D_ccharges, D_charges, D_catypes, D_atypes, D_patoms, D_excluded, topo);
-    calc_pw_dvel_vector_column<<<((3 * n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_patoms, n_waters, DV_X, DV_X + n_atoms_solute, PW_MAT);
-    calc_pw_dvel_vector_row<<<((n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_patoms, n_waters, DV_X, DV_X + n_atoms_solute, PW_MAT, D_patoms);
-
-#ifdef DEBUG
-    cudaMemcpy(h_PW_MAT, PW_MAT, mem_size_PW_MAT, cudaMemcpyDeviceToHost);
-#endif
-
-// for (int i = 0; i < n_waters; i++) {
-//     printf("X[%d] = %f %f %f\n", i, coords[i].x, coords[i].y, coords[i].z);
-// }
-
-// printf("n_patoms = %d n_watoms = %d\n", n_patoms, 3 * n_waters);
-#ifdef DEBUG
-    for (int i = 0; i < n_patoms; i++) {
-        for (int j = 0; j < 3 * n_waters; j++) {
-            if (h_PW_MAT[3 * i * n_waters + j].P.x > 100)
-                printf("PW_MAT[%d][%d].P = %f %f %f\n", i, j, h_PW_MAT[3 * i * n_waters + j].P.x, h_PW_MAT[3 * i * n_waters + j].P.y, h_PW_MAT[3 * i * n_waters + j].P.z);
-        }
-    }
-#endif
-
-    cudaMemcpy(dvelocities, DV_X, mem_size_DV_X, cudaMemcpyDeviceToHost);
-    cudaMemcpy(&dvelocities[n_atoms_solute], DV_X + n_atoms_solute, mem_size_DV_W, cudaMemcpyDeviceToHost);
-
-    calc_energy_sum<<<1, threads>>>(n_blocks_p, n_blocks_w, D_PW_evdw_TOT, D_PW_ecoul_TOT, D_PW_Evdw, D_PW_Ecoul, false);
-
-    cudaMemcpy(&PW_evdw_TOT, D_PW_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost);
-    cudaMemcpy(&PW_ecoul_TOT, D_PW_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost);
-
-    E_nonbond_pw.Uvdw += PW_evdw_TOT;
-    E_nonbond_pw.Ucoul += PW_ecoul_TOT;
-
-    // for (int i = 0; i < n_atoms; i++) {
-    //     printf("dvelocities[%d] = %f %f %f\n", i, dvelocities[i].x, dvelocities[i].y, dvelocities[i].z);
-    // }
+    int nx = CudaContext::instance().h_p_atoms_list.size();
+    int ny = CudaContext::instance().h_w_atoms_list.size();
+    auto result = calc_nonbonded_force_host(
+        nx, ny,
+        CudaContext::instance().d_p_atoms_list,
+        CudaContext::instance().d_w_atoms_list,
+        false,
+        CudaContext::instance().d_p_charge_types,
+        CudaContext::instance().d_w_charge_types,
+        CudaContext::instance().d_charge_table_all,
+        CudaContext::instance().d_p_catype_types,
+        CudaContext::instance().d_w_catype_types,
+        CudaContext::instance().d_catype_table_all);
+    printf("Nonbonded PW Force (Host) - VdW: %f, Coulomb: %f\n", result.first, result.second);
+
+    E_nonbond_pw.Uvdw += result.first;
+    E_nonbond_pw.Ucoul += result.second;
 }
 
 void init_nonbonded_pw_force_kernel_data() {
     using namespace CudaNonbondedPWForce;
     if (!is_initialized) {
-        int mem_size_PW_MAT = 3 * n_waters * n_patoms * sizeof(calc_pw_t);
-
-        int n_blocks_p = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
-        int n_blocks_w = (3 * n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE;
-        int mem_size_PW_Evdw = n_blocks_p * n_blocks_w * sizeof(double);
-        int mem_size_PW_Ecoul = n_blocks_p * n_blocks_w * sizeof(double);
-#ifdef DEBUG
-        printf("Allocating PW_MAT\n");
-#endif
-        check_cudaMalloc((void**)&PW_MAT, mem_size_PW_MAT);
-
-#ifdef DEBUG
-        printf("Allocating D_PW_Evdw\n");
-#endif
-        check_cudaMalloc((void**)&D_PW_Evdw, mem_size_PW_Evdw);
-#ifdef DEBUG
-        printf("Allocating D_PW_Ecoul\n");
-#endif
-        check_cudaMalloc((void**)&D_PW_Ecoul, mem_size_PW_Ecoul);
-
-        check_cudaMalloc((void**)&D_PW_evdw_TOT, sizeof(double));
-        check_cudaMalloc((void**)&D_PW_ecoul_TOT, sizeof(double));
-
-#ifdef DEBUG
-        printf("All GPU solvent memory allocated\n");
-#endif
-
-        h_PW_MAT = (calc_pw_t*)malloc(mem_size_PW_MAT);
         is_initialized = true;
     }
 }
@@ -369,12 +42,6 @@ void init_nonbonded_pw_force_kernel_data() {
 void cleanup_nonbonded_pw_force() {
     using namespace CudaNonbondedPWForce;
     if (is_initialized) {
-        cudaFree(PW_MAT);
-        cudaFree(D_PW_Evdw);
-        cudaFree(D_PW_Ecoul);
-        cudaFree(D_PW_evdw_TOT);
-        cudaFree(D_PW_ecoul_TOT);
-        free(h_PW_MAT);
        is_initialized = false;
    }
-}
\ No newline at end of file
+}
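Note relevant to the PW path above: both the intra-water skip and the oxygen-only LJ rule in calc_nonbonded_force_kernel assume solvent atoms are stored as consecutive O,H,H triplets starting at n_atoms_solute. The site arithmetic, pulled out as a standalone sketch with hypothetical helper names not present in the patch:

```cpp
// Site decoding for 3-site rigid waters laid out O,H,H after the solute block.
struct WaterSite {
    int molecule;  // water molecule index
    int site;      // 0 = O, 1 or 2 = H
};

inline WaterSite decode_water(int atom_idx, int n_atoms_solute) {
    int w = atom_idx - n_atoms_solute;  // offset into the solvent block
    return {w / 3, w % 3};
}

// The kernel's two rules, restated on top of the helper:
//  - same molecule:      pair skipped entirely (rigid geometry)
//  - different molecule: Coulomb always; LJ only when both sites are oxygens
inline bool water_pair_has_lj(int a, int b, int n_atoms_solute) {
    return decode_water(a, n_atoms_solute).site == 0 &&
           decode_water(b, n_atoms_solute).site == 0;
}
```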
diff --git a/src/core/cuda/src/CudaNonbondedQPForce.cu b/src/core/cuda/src/CudaNonbondedQPForce.cu
index 4eb53908..39a311ee 100644
--- a/src/core/cuda/src/CudaNonbondedQPForce.cu
+++ b/src/core/cuda/src/CudaNonbondedQPForce.cu
@@ -1,365 +1,52 @@
 #include
+#include <cstdio>
+#include <utility>
 
 #include "cuda/include/CudaContext.cuh"
+#include "cuda/include/CudaNonbondedForce.cuh"
 #include "cuda/include/CudaNonbondedQPForce.cuh"
 #include "system.h"
+#include "utils.h"
 
 namespace CudaNonbondedQPForce {
-struct calc_qw_t {
-    dvel_t Q;
-    dvel_t O;
-    dvel_t H1;
-    dvel_t H2;
-};
-
-struct calc_qp_t {
-    dvel_t Q;
-    dvel_t P;
-};
 
 bool is_initialized = false;
-double *D_QP_Evdw, *D_QP_Ecoul, *h_QP_Evdw, *h_QP_Ecoul;
-calc_qp_t *QP_MAT, *h_QP_MAT;
-double *D_QP_evdw_TOT, *D_QP_ecoul_TOT, QP_evdw_TOT, QP_ecoul_TOT;
-
-// General
-__global__ void calc_energy_sum(int rows, int columns, double* Evdw_TOT, double* Ecoul_TOT, double* Evdw, double* Ecoul, bool upper_diagonal) {
-    int tx = threadIdx.x;
-    int ty = threadIdx.y;
-
-    __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE];
-    __shared__ double Evdw_S[BLOCK_SIZE][BLOCK_SIZE];
-
-    // TODO: better way to distribute upper diagonal over threads? Seems like threads in left bottom corner have less work
-    double coul_TOT = 0;
-    double vdw_TOT = 0;
-    for (int i = ty; i < rows; i += BLOCK_SIZE) {
-        for (int j = tx; j < columns; j += BLOCK_SIZE) {
-            if (i <= j || !upper_diagonal) {
-                coul_TOT += Ecoul[i * columns + j];
-                vdw_TOT += Evdw[i * columns + j];
-            }
-        }
-    }
-    Ecoul_S[ty][tx] = coul_TOT;
-    Evdw_S[ty][tx] = vdw_TOT;
-
-    __syncthreads();
-
-    if (tx == 0 && ty == 0) {
-        double Evdw_temp = 0;
-        double Ecoul_temp = 0;
-
-        for (int i = 0; i < BLOCK_SIZE; i++) {
-            for (int j = 0; j < BLOCK_SIZE; j++) {
-                Evdw_temp += Evdw_S[i][j];
-                Ecoul_temp += Ecoul_S[i][j];
-            }
-        }
-
-        *Evdw_TOT = Evdw_temp;
-        *Ecoul_TOT = Ecoul_temp;
-    }
-}
-
-__global__ void calc_qp_dvel_vector_row(int n_qatoms, int n_patoms, dvel_t* DV_X, calc_qp_t* QP_MAT, q_atom_t* D_qatoms) {
-    int row = blockIdx.x * blockDim.x + threadIdx.x;
-    if (row >= n_qatoms) return;
-
-    dvel_t dQ;
-
-    dQ.x = 0;
-    dQ.y = 0;
-    dQ.z = 0;
-
-    for (int i = 0; i < n_patoms; i++) {
-        dQ.x += QP_MAT[i + n_patoms * row].Q.x;
-        dQ.y += QP_MAT[i + n_patoms * row].Q.y;
-        dQ.z += QP_MAT[i + n_patoms * row].Q.z;
-    }
-
-    int q = D_qatoms[row].a - 1;
-
-    DV_X[q].x += dQ.x;
-    DV_X[q].y += dQ.y;
-    DV_X[q].z += dQ.z;
-
-    __syncthreads();
-}
-
-__global__ void calc_qp_dvel_vector_column(int n_qatoms, int n_patoms, dvel_t* DV_X, calc_qp_t* QP_MAT, p_atom_t* D_patoms) {
-    int column = blockIdx.x * blockDim.x + threadIdx.x;
-    if (column >= n_patoms) return;
-
-    dvel_t dP;
-
-    dP.x = 0;
-    dP.y = 0;
-    dP.z = 0;
-
-    for (int i = 0; i < n_qatoms; i++) {
-        dP.x += QP_MAT[column + n_patoms * i].P.x;
-        dP.y += QP_MAT[column + n_patoms * i].P.y;
-        dP.z += QP_MAT[column + n_patoms * i].P.z;
-    }
-
-    int p = D_patoms[column].a - 1;
-
-    DV_X[p].x += dP.x;
-    DV_X[p].y += dP.y;
-    DV_X[p].z += dP.z;
-
-    __syncthreads();
-}
-
-__device__ void calc_qp_dvel_matrix_incr(int row, int qi, int column, int pj, int n_lambdas, int n_qatoms,
-                                         coord_t* Qs, coord_t* Ps, int* LJs, bool* excluded_s, double Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE], double Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE], calc_qp_t* qp,
-                                         q_catype_t* D_qcatypes, q_atype_t* D_qatypes, q_charge_t* D_qcharges, p_atom_t* D_patoms, q_atom_t* D_qatoms, double* D_lambdas,
-                                         catype_t* D_catypes, atype_t* D_atypes, ccharge_t* D_ccharges, charge_t* D_charges, topo_t D_topo)
{ - coord_t da; - double r2, r6, r; - double ai_aii, aj_aii, ai_bii, aj_bii; - q_catype_t qi_type; - catype_t aj_type; - bool bond23, bond14; - double scaling, Vel, V_a, V_b, dv; - - int j = D_patoms[pj].a - 1; - - bond23 = LJs[row * BLOCK_SIZE + column] == 3; - bond14 = LJs[row * BLOCK_SIZE + column] == 1; - - if (bond23) return; - if (excluded_s[row] || excluded_s[BLOCK_SIZE + column]) return; - - scaling = bond14 ? D_topo.el14_scale : 1; - - da.x = Qs[row].x - Ps[column].x; - da.y = Qs[row].y - Ps[column].y; - da.z = Qs[row].z - Ps[column].z; - - r2 = pow(da.x, 2) + pow(da.y, 2) + pow(da.z, 2); - - r6 = r2 * r2 * r2; - r2 = 1 / r2; - r = sqrt(r2); - - for (int state = 0; state < n_lambdas; state++) { - qi_type = D_qcatypes[D_qatypes[qi + n_qatoms * state].code - 1]; - aj_type = D_catypes[D_atypes[j].code - 1]; - - ai_aii = bond14 ? qi_type.Ai_14 : qi_type.Ai; - aj_aii = bond14 ? aj_type.aii_1_4 : aj_type.aii_normal; - ai_bii = bond14 ? qi_type.Bi_14 : qi_type.Bi; - aj_bii = bond14 ? aj_type.bii_1_4 : aj_type.bii_normal; - - Vel = D_topo.coulomb_constant * scaling * D_qcharges[qi + n_qatoms * state].q * D_ccharges[D_charges[j].code - 1].charge * r; - V_a = ai_aii * aj_aii / (r6 * r6); - V_b = ai_bii * aj_bii / r6; - dv = r2 * (-Vel - (12 * V_a - 6 * V_b)) * D_lambdas[state]; - - // if (state == 0 && qi == 0 && pj == 1) { - // printf("crg_q = %f crg_j = %f r = %f\n", D_qcharges[qi + n_qatoms * state].q, D_ccharges[D_charges[pj].code - 1].charge, r); - // printf("ai_aii = %f aj_aii = %f ai_bii = %f aj_bii = %f\n", ai_aii, aj_aii, ai_bii, aj_bii); - // } - - // Update forces - qp->Q.x += dv * da.x; - qp->Q.y += dv * da.y; - qp->Q.z += dv * da.z; - qp->P.x -= dv * da.x; - qp->P.y -= dv * da.y; - qp->P.z -= dv * da.z; - - // Update Q totals - Ecoul_S[row][state * BLOCK_SIZE + column] += Vel; - Evdw_S[row][state * BLOCK_SIZE + column] += (V_a - V_b); - } -} - -__global__ void calc_qp_dvel_matrix(int n_qatoms, int n_patoms, int n_lambdas, int n_atoms_solute, - coord_t* X, double* Evdw, double* Ecoul, calc_qp_t* QP_MAT, - q_catype_t* D_qcatypes, q_atype_t* D_qatypes, q_charge_t* D_qcharges, p_atom_t* D_patoms, q_atom_t* D_qatoms, double* D_lambdas, - int* D_LJ_matrix, bool* D_excluded, catype_t* D_catypes, atype_t* D_atypes, ccharge_t* D_ccharges, charge_t* D_charges, topo_t D_topo) { - // Block index - int bx = blockIdx.x; - int by = blockIdx.y; - - // Thread index - int tx = threadIdx.x; - int ty = threadIdx.y; - - // TODO implement >2 states on GPU - __shared__ double Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE]; - __shared__ double Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE]; - - Ecoul_S[ty][tx] = 0; - Evdw_S[ty][tx] = 0; - Ecoul_S[ty][tx + BLOCK_SIZE] = 0; - Evdw_S[ty][tx + BLOCK_SIZE] = 0; - - int aStart = BLOCK_SIZE * by; - int bStart = BLOCK_SIZE * bx; - - if (aStart + ty >= n_qatoms) return; - if (bStart + tx >= n_patoms) return; - - int qi = D_qatoms[aStart + ty].a - 1; - int pj = D_patoms[bStart + tx].a - 1; - - __shared__ coord_t Qs[BLOCK_SIZE]; - __shared__ coord_t Ps[BLOCK_SIZE]; - __shared__ int LJs[BLOCK_SIZE * BLOCK_SIZE]; - __shared__ bool excluded_s[2 * BLOCK_SIZE]; - - if (tx == 0) { - Qs[ty] = X[qi]; - excluded_s[ty] = D_excluded[qi]; - } - - if (ty == 0) { - Ps[tx] = X[pj]; - excluded_s[BLOCK_SIZE + tx] = D_excluded[pj]; - } - LJs[ty * BLOCK_SIZE + tx] = D_LJ_matrix[qi * n_atoms_solute + pj]; - - __syncthreads(); - - calc_qp_t qp; - memset(&qp, 0, sizeof(calc_qp_t)); - - int row = by * BLOCK_SIZE + ty; - int column = bx * BLOCK_SIZE + tx; - - calc_qp_dvel_matrix_incr(ty, row, tx, column, 
n_lambdas, n_qatoms, Qs, Ps, LJs, excluded_s, Evdw_S, Ecoul_S,
-                             &qp, D_qcatypes, D_qatypes, D_qcharges, D_patoms, D_qatoms, D_lambdas, D_catypes, D_atypes, D_ccharges, D_charges, D_topo);
-
-    QP_MAT[n_patoms * row + column] = qp;
-
-    __syncthreads();
-
-    int rowlen = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    int collen = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
-
-    if (tx == 0 && ty == 0) {
-        // TODO implement >2 states on GPU
-        for (int state = 0; state < min(2, n_lambdas); state++) {
-            double tot_Evdw = 0;
-            double tot_Ecoul = 0;
-            for (int i = 0; i < BLOCK_SIZE; i++) {
-                for (int j = 0; j < BLOCK_SIZE; j++) {
-                    tot_Evdw += Evdw_S[i][j + state * BLOCK_SIZE];
-                    tot_Ecoul += Ecoul_S[i][j + state * BLOCK_SIZE];
-                }
-            }
-            Evdw[rowlen * collen * state + rowlen * by + bx] = tot_Evdw;
-            Ecoul[rowlen * collen * state + rowlen * by + bx] = tot_Ecoul;
-        }
-    }
-
-    __syncthreads();
-}
 }  // namespace CudaNonbondedQPForce
 
 void calc_nonbonded_qp_forces_host_v2() {
     using namespace CudaNonbondedQPForce;
-
-    int n_blocks_q = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    int n_blocks_p = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
-
-    CudaContext& ctx = CudaContext::instance();
-    auto X = ctx.d_coords;
-    auto DV_X = ctx.d_dvelocities;
-    auto D_qcatypes = ctx.d_q_catypes;
-    auto D_qatypes = ctx.d_q_atypes;
-    auto D_qcharges = ctx.d_q_charges;
-    auto D_patoms = ctx.d_p_atoms;
-    auto D_qatoms = ctx.d_q_atoms;
-    auto D_lambdas = ctx.d_lambdas;
-    auto D_LJ_matrix = ctx.d_LJ_matrix;
-    auto D_excluded = ctx.d_excluded;
-    auto D_catypes = ctx.d_catypes;
-    auto D_atypes = ctx.d_atypes;
-    auto D_ccharges = ctx.d_ccharges;
-    auto D_charges = ctx.d_charges;
-
-    dim3 threads, grid;
-
-    threads = dim3(BLOCK_SIZE, BLOCK_SIZE);
-    grid = dim3((n_patoms + BLOCK_SIZE - 1) / threads.x, (n_qatoms + BLOCK_SIZE - 1) / threads.y);
-
-    calc_qp_dvel_matrix<<<grid, threads>>>(n_qatoms, n_patoms, n_lambdas, n_atoms_solute, X, D_QP_Evdw, D_QP_Ecoul, QP_MAT,
-                                           D_qcatypes, D_qatypes, D_qcharges, D_patoms, D_qatoms, D_lambdas, D_LJ_matrix, D_excluded,
-                                           D_catypes, D_atypes, D_ccharges, D_charges, topo);
-    calc_qp_dvel_vector_column<<<((n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_qatoms, n_patoms, DV_X, QP_MAT, D_patoms);
-    calc_qp_dvel_vector_row<<<((n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_qatoms, n_patoms, DV_X, QP_MAT, D_qatoms);
-
-#ifdef DEBUG
-    cudaMemcpy(h_QP_MAT, QP_MAT, mem_size_QP_MAT, cudaMemcpyDeviceToHost);
-#endif
-
-#ifdef DEBUG
-    for (int i = 0; i < n_qatoms; i++) {
-        for (int j = 0; j < n_patoms; j++) {
-            if (i == 0)
-                // if (h_QP_MAT[i * n_patoms + j].Q.x > 100)
-                printf("QP_MAT[%d][%d].Q = %f %f %f\n", i, j, h_QP_MAT[i * n_patoms + j].Q.x, h_QP_MAT[i * n_patoms + j].Q.y, h_QP_MAT[i * n_patoms + j].Q.z);
-        }
-    }
-#endif
-    int mem_size_DV_X = n_atoms * sizeof(dvel_t);
-    cudaMemcpy(dvelocities, DV_X, mem_size_DV_X, cudaMemcpyDeviceToHost);
-
-    // TODO: make Evdw & Ecoul work for # of states > 2
-    for (int state = 0; state < min(2, n_lambdas); state++) {
-        calc_energy_sum<<<1, threads>>>(n_blocks_q, n_blocks_p, D_QP_evdw_TOT, D_QP_ecoul_TOT, &D_QP_Evdw[state * n_blocks_p * n_blocks_q], &D_QP_Ecoul[state * n_blocks_p * n_blocks_q], false);
-
-        cudaMemcpy(&QP_evdw_TOT, D_QP_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost);
-        cudaMemcpy(&QP_ecoul_TOT, D_QP_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost);
-
-        EQ_nonbond_qp[state].Uvdw += QP_evdw_TOT;
-        EQ_nonbond_qp[state].Ucoul += QP_ecoul_TOT;
+    int nx = CudaContext::instance().h_q_atoms_list.size();
+    int ny = CudaContext::instance().h_p_atoms_list.size();
CudaContext::instance().h_p_atoms_list.size(); + for (int state = 0; state < n_lambdas; state++) { + auto result = calc_nonbonded_force_host( + nx, + ny, + CudaContext::instance().d_q_atoms_list, + CudaContext::instance().d_p_atoms_list, + false, + CudaContext::instance().d_q_charge_types + n_lambdas * nx + state * nx, + CudaContext::instance().d_p_charge_types, + CudaContext::instance().d_charge_table_all, + CudaContext::instance().d_q_catype_types + n_lambdas * nx + state * nx, + CudaContext::instance().d_p_catype_types, + CudaContext::instance().d_catype_table_all); + + EQ_nonbond_qp[state].Uvdw += result.first / lambdas[state]; + EQ_nonbond_qp[state].Ucoul += result.second / lambdas[state]; + printf("Nonbonded QP Force State %d: Uvdw = %f, Ucoul = %f\n", state, result.first, result.second); } } void init_nonbonded_qp_force_kernel_data() { using namespace CudaNonbondedQPForce; - if (!is_initialized) { - int n_blocks_q = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE; - int n_blocks_p = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE; - // TODO make Evdw & Ecoul work for # of states > 2 - int mem_size_QP_Evdw = min(n_lambdas, 2) * n_blocks_q * n_blocks_p * sizeof(double); - int mem_size_QP_Ecoul = min(n_lambdas, 2) * n_blocks_q * n_blocks_p * sizeof(double); - int mem_size_QP_MAT = n_qatoms * n_patoms * sizeof(calc_qp_t); - - check_cudaMalloc((void**)&D_QP_Evdw, mem_size_QP_Evdw); - check_cudaMalloc((void**)&D_QP_Ecoul, mem_size_QP_Ecoul); - h_QP_Evdw = (double*)malloc(mem_size_QP_Evdw); - h_QP_Ecoul = (double*)malloc(mem_size_QP_Ecoul); - - check_cudaMalloc((void**)&QP_MAT, mem_size_QP_MAT); - h_QP_MAT = (calc_qp_t*)malloc(mem_size_QP_MAT); - - check_cudaMalloc((void**)&D_QP_evdw_TOT, sizeof(double)); - check_cudaMalloc((void**)&D_QP_ecoul_TOT, sizeof(double)); - is_initialized = true; } } void cleanup_nonbonded_qp_force() { using namespace CudaNonbondedQPForce; - if (is_initialized) { - cudaFree(D_QP_Evdw); - cudaFree(D_QP_Ecoul); - free(h_QP_Evdw); - free(h_QP_Ecoul); - cudaFree(QP_MAT); - free(h_QP_MAT); - cudaFree(D_QP_evdw_TOT); - cudaFree(D_QP_ecoul_TOT); - is_initialized = false; } -} \ No newline at end of file +} diff --git a/src/core/cuda/src/CudaNonbondedQQForce.cu b/src/core/cuda/src/CudaNonbondedQQForce.cu index e2138ed6..015fd90f 100644 --- a/src/core/cuda/src/CudaNonbondedQQForce.cu +++ b/src/core/cuda/src/CudaNonbondedQQForce.cu @@ -1,135 +1,48 @@ +#include +#include + #include "cuda/include/CudaContext.cuh" +#include "cuda/include/CudaNonbondedForce.cuh" #include "cuda/include/CudaNonbondedQQForce.cuh" #include "utils.h" -__global__ void calc_nonbonded_qq_forces_kernel( - q_atom_t* q_atoms, - q_charge_t* q_charges, - int* LJ_matrix, - bool* excluded, - q_elscale_t* q_elscales, - q_catype_t* q_catypes, - q_atype_t* q_atypes, - coord_t* coords, - E_nonbonded_t* EQ_nonbond_qq, - dvel_t* dvelocities, - double* lambdas, - int n_qatoms, - int n_lambdas, - int n_atoms_solute, - topo_t topo, - int n_qelscales +namespace CudaNonbondedQQForce { +bool is_initialized = false; -) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= n_qatoms) return; - int qi = idx; +} // namespace CudaNonbondedQQForce - int ai, aj; - double crg_i, crg_j; - double elscale, scaling; - q_catype_t qi_type, qj_type; - bool bond23, bond14; - coord_t da; - double r2a, ra, r6a; - double Vela, V_a, V_b; - double dva; - double ai_aii, aj_aii, ai_bii, aj_bii; +void calc_nonbonded_qq_forces_host() { + using namespace CudaNonbondedQQForce; + int n = CudaContext::instance().h_q_atoms_list.size(); for (int state = 
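The QP path above reads from the lambda-scaled region of d_q_charge_types / d_q_catype_types (base offset n_lambdas * nx) and then divides the returned totals by lambdas[state] to recover per-state unscaled energies. A minimal sketch of the index arithmetic this implies; the helper is illustrative only and not part of the diff:

    // Illustrative only, not part of this change. Assumed layout of the
    // per-q-atom type arrays: [0, n_states * n_q) holds the unscaled
    // per-state types, [n_states * n_q, 2 * n_states * n_q) the lambda-scaled ones.
    inline int q_type_offset(int n_q, int n_states, int state, bool lambda_scaled) {
        return (lambda_scaled ? n_states * n_q : 0) + state * n_q;
    }
    // Under that assumption, the QP call above is equivalent to passing
    // d_q_charge_types + q_type_offset(nx, n_lambdas, state, true).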
diff --git a/src/core/cuda/src/CudaNonbondedQQForce.cu b/src/core/cuda/src/CudaNonbondedQQForce.cu
index e2138ed6..015fd90f 100644
--- a/src/core/cuda/src/CudaNonbondedQQForce.cu
+++ b/src/core/cuda/src/CudaNonbondedQQForce.cu
@@ -1,135 +1,48 @@
+#include <cstdio>
+#include <utility>
+
 #include "cuda/include/CudaContext.cuh"
+#include "cuda/include/CudaNonbondedForce.cuh"
 #include "cuda/include/CudaNonbondedQQForce.cuh"
 #include "utils.h"

-__global__ void calc_nonbonded_qq_forces_kernel(
-    q_atom_t* q_atoms,
-    q_charge_t* q_charges,
-    int* LJ_matrix,
-    bool* excluded,
-    q_elscale_t* q_elscales,
-    q_catype_t* q_catypes,
-    q_atype_t* q_atypes,
-    coord_t* coords,
-    E_nonbonded_t* EQ_nonbond_qq,
-    dvel_t* dvelocities,
-    double* lambdas,
-    int n_qatoms,
-    int n_lambdas,
-    int n_atoms_solute,
-    topo_t topo,
-    int n_qelscales
+namespace CudaNonbondedQQForce {
+bool is_initialized = false;
-) {
-    int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx >= n_qatoms) return;
-    int qi = idx;
+} // namespace CudaNonbondedQQForce

-    int ai, aj;
-    double crg_i, crg_j;
-    double elscale, scaling;
-    q_catype_t qi_type, qj_type;
-    bool bond23, bond14;
-    coord_t da;
-    double r2a, ra, r6a;
-    double Vela, V_a, V_b;
-    double dva;
-    double ai_aii, aj_aii, ai_bii, aj_bii;
+void calc_nonbonded_qq_forces_host() {
+    using namespace CudaNonbondedQQForce;
+    int n = CudaContext::instance().h_q_atoms_list.size();
     for (int state = 0; state < n_lambdas; state++) {
-        for (int qj = qi + 1; qj < n_qatoms; qj++) {
-            ai = q_atoms[qi].a - 1;
-            aj = q_atoms[qj].a - 1;
-
-            crg_i = q_charges[qi + n_qatoms * state].q;
-            crg_j = q_charges[qj + n_qatoms * state].q;
-
-            bond23 = LJ_matrix[ai * n_atoms_solute + aj] == 3;
-            bond14 = LJ_matrix[ai * n_atoms_solute + aj] == 1;
-
-            if (bond23) continue;
-            if (excluded[ai] || excluded[aj]) continue;
-
-            scaling = bond14 ? topo.el14_scale : 1;
-
-            elscale = 1;
-            for (int k = 0; k < n_qelscales; k++) {
-                if (q_elscales[k + n_qelscales * state].qi == qi + 1 && q_elscales[k + n_qelscales * state].qj == qj + 1) {
-                    elscale = q_elscales[k + n_qelscales * state].mu;
-                }
-            }
-
-            qi_type = q_catypes[q_atypes[qi + n_qatoms * state].code - 1];
-            qj_type = q_catypes[q_atypes[qj + n_qatoms * state].code - 1];
-
-            da.x = coords[aj].x - coords[ai].x;
-            da.y = coords[aj].y - coords[ai].y;
-            da.z = coords[aj].z - coords[ai].z;
-            r2a = 1 / (pow(da.x, 2) + pow(da.y, 2) + pow(da.z, 2));
-            ra = sqrt(r2a);
-            r6a = r2a * r2a * r2a;
-
-            Vela = scaling * topo.coulomb_constant * crg_i * crg_j * ra * elscale;
-
-            ai_aii = bond14 ? qi_type.Ai_14 : qi_type.Ai;
-            aj_aii = bond14 ? qj_type.Ai_14 : qj_type.Ai;
-            ai_bii = bond14 ? qi_type.Bi_14 : qi_type.Bi;
-            aj_bii = bond14 ? qj_type.Bi_14 : qj_type.Bi;
-
-            V_a = r6a * r6a * ai_aii * aj_aii;
-            V_b = r6a * ai_bii * aj_bii;
-            dva = r2a * (-Vela - 12 * V_a + 6 * V_b) * lambdas[state];
-
-            atomicAdd(&dvelocities[ai].x, -dva * da.x);
-            atomicAdd(&dvelocities[ai].y, -dva * da.y);
-            atomicAdd(&dvelocities[ai].z, -dva * da.z);
-            atomicAdd(&dvelocities[aj].x, dva * da.x);
-            atomicAdd(&dvelocities[aj].y, dva * da.y);
-            atomicAdd(&dvelocities[aj].z, dva * da.z);
-
-            atomicAdd(&EQ_nonbond_qq[state].Ucoul, Vela);
-            atomicAdd(&EQ_nonbond_qq[state].Uvdw, (V_a - V_b));
-        }
+        auto result = calc_nonbonded_force_host(
+            n,
+            n,
+            CudaContext::instance().d_q_atoms_list,
+            CudaContext::instance().d_q_atoms_list,
+            true,
+            CudaContext::instance().d_q_charge_types + state * n,
+            CudaContext::instance().d_q_charge_types + state * n + n * n_lambdas,
+            CudaContext::instance().d_charge_table_all,
+            CudaContext::instance().d_q_catype_types + state * n,
+            CudaContext::instance().d_q_catype_types + state * n + n * n_lambdas,
+            CudaContext::instance().d_catype_table_all);
+
+        EQ_nonbond_qq[state].Uvdw += result.first / lambdas[state];
+        EQ_nonbond_qq[state].Ucoul += result.second / lambdas[state];
+        printf("Nonbonded QQ Force State %d: Uvdw = %f, Ucoul = %f\n", state, result.first, result.second);
     }
 }

-void calc_nonbonded_qq_forces_host() {
-    CudaContext& ctx = CudaContext::instance();
-    // ctx.sync_all_to_device();
-    auto d_q_atoms = ctx.d_q_atoms;
-    auto d_q_charges = ctx.d_q_charges;
-    auto d_LJ_matrix = ctx.d_LJ_matrix;
-    auto d_excluded = ctx.d_excluded;
-    auto d_q_elscales = ctx.d_q_elscales;
-    auto d_q_catypes = ctx.d_q_catypes;
-    auto d_q_atypes = ctx.d_q_atypes;
-    auto d_coords = ctx.d_coords;
-    auto d_EQ_nonbond_qq = ctx.d_EQ_nonbond_qq;
-    auto d_dvelocities = ctx.d_dvelocities;
-    auto d_lambdas = ctx.d_lambdas;
-
-    int blockSize = 256;
-    int numBlocks = (n_qatoms + blockSize - 1) / blockSize;
-    calc_nonbonded_qq_forces_kernel<<<numBlocks, blockSize>>>(
-        d_q_atoms,
-        d_q_charges,
-        d_LJ_matrix,
-        d_excluded,
-        d_q_elscales,
-        d_q_catypes,
-        d_q_atypes,
-        d_coords,
-        d_EQ_nonbond_qq,
-        d_dvelocities,
-        d_lambdas,
-        n_qatoms,
-        n_lambdas,
-        n_atoms_solute,
-        topo, n_qelscales);
-    cudaDeviceSynchronize();
-    cudaMemcpy(EQ_nonbond_qq, d_EQ_nonbond_qq, sizeof(E_nonbonded_t) * n_lambdas, cudaMemcpyDeviceToHost);
-    cudaMemcpy(dvelocities, d_dvelocities, sizeof(dvel_t) * n_atoms, cudaMemcpyDeviceToHost);
+void init_nonbonded_qq_force_kernel_data() {
+    using namespace CudaNonbondedQQForce;
+    if (is_initialized) return;
+    is_initialized = true;
 }

-void init_nonbonded_qq_force_kernel_data() {}
-void cleanup_nonbonded_qq_force() {}
\ No newline at end of file
+void cleanup_nonbonded_qq_force() {
+    using namespace CudaNonbondedQQForce;
+    if (!is_initialized) return;
+    is_initialized = false;
+}
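The QQ call passes symmetric = true with the unscaled state types on one side and the lambda-scaled types on the other, so each q-q pair energy carries exactly one factor of lambdas[state], which the division afterwards removes. A small self-contained illustration of that identity (the coulomb constant value is a placeholder, not taken from the code):

    #include <cstdio>

    // E(qi, lambda * qj) == lambda * E(qi, qj), so result / lambda is the
    // unscaled pair energy. Illustrative only, not part of this change.
    double pair_coulomb(double qi, double qj, double inv_r, double k) {
        return k * qi * qj * inv_r;
    }

    int main() {
        const double lambda = 0.3, k = 332.0;  // placeholder constant
        double scaled = pair_coulomb(0.5, lambda * -0.4, 1.0 / 2.5, k);
        double unscaled = pair_coulomb(0.5, -0.4, 1.0 / 2.5, k);
        std::printf("%f == %f\n", scaled / lambda, unscaled);  // both -26.56
    }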
diff --git a/src/core/cuda/src/CudaNonbondedQWForce.cu b/src/core/cuda/src/CudaNonbondedQWForce.cu
index 974c380c..e39ee57f 100644
--- a/src/core/cuda/src/CudaNonbondedQWForce.cu
+++ b/src/core/cuda/src/CudaNonbondedQWForce.cu
@@ -1,404 +1,49 @@
+#include <cstdio>
 #include <math.h>

 #include "cuda/include/CudaContext.cuh"
+#include "cuda/include/CudaNonbondedForce.cuh"
 #include "cuda/include/CudaNonbondedQWForce.cuh"
 #include "utils.h"

 namespace CudaNonbondedQWForce {
-struct calc_qw_t {
-    dvel_t Q;
-    dvel_t O;
-    dvel_t H1;
-    dvel_t H2;
-};

 bool is_initialized = false;

-calc_qw_t *QW_MAT, *h_QW_MAT;
-double *D_QW_Evdw, *D_QW_Ecoul, *h_QW_Evdw, *h_QW_Ecoul;
-double *D_QW_evdw_TOT, *D_QW_ecoul_TOT, QW_evdw_TOT, QW_ecoul_TOT;
-
-// General
-__global__ void calc_energy_sum(int rows, int columns, double* Evdw_TOT, double* Ecoul_TOT, double* Evdw, double* Ecoul, bool upper_diagonal) {
-    int tx = threadIdx.x;
-    int ty = threadIdx.y;
-
-    __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE];
-    __shared__ double Evdw_S[BLOCK_SIZE][BLOCK_SIZE];
-
-    // TODO: better way to distribute upper diagonal over threads? Seems like threads in left bottom corner have less work
-    double coul_TOT = 0;
-    double vdw_TOT = 0;
-    for (int i = ty; i < rows; i += BLOCK_SIZE) {
-        for (int j = tx; j < columns; j += BLOCK_SIZE) {
-            if (i <= j || !upper_diagonal) {
-                coul_TOT += Ecoul[i * columns + j];
-                vdw_TOT += Evdw[i * columns + j];
-            }
-        }
-    }
-    Ecoul_S[ty][tx] = coul_TOT;
-    Evdw_S[ty][tx] = vdw_TOT;
-
-    __syncthreads();
-
-    if (tx == 0 && ty == 0) {
-        double Evdw_temp = 0;
-        double Ecoul_temp = 0;
-
-        for (int i = 0; i < BLOCK_SIZE; i++) {
-            for (int j = 0; j < BLOCK_SIZE; j++) {
-                Evdw_temp += Evdw_S[i][j];
-                Ecoul_temp += Ecoul_S[i][j];
-            }
-        }
-
-        *Evdw_TOT = Evdw_temp;
-        *Ecoul_TOT = Ecoul_temp;
-    }
-}
-
-__global__ void calc_qw_dvel_vector_row(int n_qatoms, int n_waters, dvel_t* DV_X, dvel_t* DV_W, calc_qw_t* MAT, q_atom_t* D_qatoms) {
-    int row = blockIdx.x * blockDim.x + threadIdx.x;
-    if (row >= n_qatoms) return;
-
-    dvel_t dQ;
-
-    dQ.x = 0;
-    dQ.y = 0;
-    dQ.z = 0;
-
-    for (int i = 0; i < n_waters; i++) {
-        dQ.x += MAT[i + n_waters * row].Q.x;
-        dQ.y += MAT[i + n_waters * row].Q.y;
-        dQ.z += MAT[i + n_waters * row].Q.z;
-    }
-
-    int q = D_qatoms[row].a - 1;
-
-    DV_X[q].x += dQ.x;
-    DV_X[q].y += dQ.y;
-    DV_X[q].z += dQ.z;
-
-    __syncthreads();
-}
-
-__global__ void calc_qw_dvel_vector_column(int n_qatoms, int n_waters, dvel_t* DV_X, dvel_t* DV_W, calc_qw_t* MAT) {
-    int column = blockIdx.x * blockDim.x + threadIdx.x;
-    if (column >= n_waters) return;
-
-    dvel_t dO, dH1, dH2;
-
-    dO.x = 0;
-    dO.y = 0;
-    dO.z = 0;
-    dH1.x = 0;
-    dH1.y = 0;
-    dH1.z = 0;
-    dH2.x = 0;
-    dH2.y = 0;
-    dH2.z = 0;
-
-    for (int i = 0; i < n_qatoms; i++) {
-        dO.x += MAT[column + n_waters * i].O.x;
-        dO.y += MAT[column + n_waters * i].O.y;
-        dO.z += MAT[column + n_waters * i].O.z;
-        dH1.x += MAT[column + n_waters * i].H1.x;
-        dH1.y += MAT[column + n_waters * i].H1.y;
-        dH1.z += MAT[column + n_waters * i].H1.z;
-        dH2.x += MAT[column + n_waters * i].H2.x;
-        dH2.y += MAT[column + n_waters * i].H2.y;
-        dH2.z += MAT[column + n_waters * i].H2.z;
-    }
-
-    DV_W[3 * column].x += dO.x;
-    DV_W[3 * column].y += dO.y;
-    DV_W[3 * column].z += dO.z;
-    DV_W[3 * column + 1].x += dH1.x;
-    DV_W[3 * column + 1].y += dH1.y;
-    DV_W[3 * column + 1].z += dH1.z;
-    DV_W[3 * column + 2].x += dH2.x;
-    DV_W[3 * column + 2].y += dH2.y;
-    DV_W[3 * column + 2].z += dH2.z;
-
-    __syncthreads();
-}
-
-__device__ void calc_qw_dvel_matrix_incr(int row, int qi, int column, int n_lambdas, int n_qatoms, double crg_ow, double crg_hw, double A_O, double B_O,
-                                         coord_t* Qs, coord_t* Ws, double Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE], double Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE], calc_qw_t* qw,
-                                         q_catype_t* D_qcatypes, q_atype_t* D_qatypes, q_charge_t* D_qcharges, q_atom_t* D_qatoms, double* D_lambdas, topo_t D_topo) {
-    int j;
-    coord_t dO, dH1, dH2;
-    double r2O, rH1, rH2, r6O, rO, r2H1, r2H2;
-    double dvO, dvH1, dvH2;
-    double V_a, V_b, VelO, VelH1, VelH2;
-    q_atype_t qa_type;
-    q_catype_t qi_type;
-    double ai_aii, ai_bii;
-
-    j = 3 * column;
-    dO.x = Ws[j].x - Qs[row].x;
-    dO.y = Ws[j].y - Qs[row].y;
-    dO.z = Ws[j].z - Qs[row].z;
-    dH1.x = Ws[j + 1].x - Qs[row].x;
-    dH1.y = Ws[j + 1].y - Qs[row].y;
-    dH1.z = Ws[j + 1].z - Qs[row].z;
-    dH2.x = Ws[j + 2].x - Qs[row].x;
-    dH2.y = Ws[j + 2].y - Qs[row].y;
-    dH2.z = Ws[j + 2].z - Qs[row].z;
-
-    r2O = pow(dO.x, 2) + pow(dO.y, 2) + pow(dO.z, 2);
-    rH1 = sqrt(1.0 / (pow(dH1.x, 2) + pow(dH1.y, 2) + pow(dH1.z, 2)));
-    rH2 = sqrt(1.0 / (pow(dH2.x, 2) + pow(dH2.y, 2) + pow(dH2.z, 2)));
-    r6O = r2O * r2O * r2O;
-    r2O = 1.0 / r2O;
-    rO = sqrt(r2O);
-    r2H1 = rH1 * rH1;
-    r2H2 = rH2 * rH2;
-
-    // Reset potential
-    dvO = 0;
-    dvH1 = 0;
-    dvH2 = 0;
-
-    for (int state = 0; state < n_lambdas; state++) {
-        qa_type = D_qatypes[qi + n_qatoms * state];
-        qi_type = D_qcatypes[qa_type.code - 1];
-
-        ai_aii = qi_type.Ai;
-        ai_bii = qi_type.Bi;
-
-        V_a = ai_aii * A_O / (r6O * r6O);
-        V_b = ai_bii * B_O / (r6O);
-
-        VelO = D_topo.coulomb_constant * crg_ow * D_qcharges[qi + n_qatoms * state].q * rO;
-        VelH1 = D_topo.coulomb_constant * crg_hw * D_qcharges[qi + n_qatoms * state].q * rH1;
-        VelH2 = D_topo.coulomb_constant * crg_hw * D_qcharges[qi + n_qatoms * state].q * rH2;
-
-        dvO += r2O * (-VelO - (12 * V_a - 6 * V_b)) * D_lambdas[state];
-        dvH1 -= r2H1 * VelH1 * D_lambdas[state];
-        dvH2 -= r2H2 * VelH2 * D_lambdas[state];
-
-        // Update Q totals
-        Ecoul_S[row][state * BLOCK_SIZE + column] += (VelO + VelH1 + VelH2);
-        Evdw_S[row][state * BLOCK_SIZE + column] += (V_a - V_b);
-    }
-
-    // Note r6O is not the usual 1/rO^6, but rather rO^6. be careful!!!
-
-    // Update forces on Q-atom
-    (*qw).Q.x -= (dvO * dO.x + dvH1 * dH1.x + dvH2 * dH2.x);
-    (*qw).Q.y -= (dvO * dO.y + dvH1 * dH1.y + dvH2 * dH2.y);
-    (*qw).Q.z -= (dvO * dO.z + dvH1 * dH1.z + dvH2 * dH2.z);
-
-    // Update forces on water
-    (*qw).O.x += dvO * dO.x;
-    (*qw).O.y += dvO * dO.y;
-    (*qw).O.z += dvO * dO.z;
-    (*qw).H1.x += dvH1 * dH1.x;
-    (*qw).H1.y += dvH1 * dH1.y;
-    (*qw).H1.z += dvH1 * dH1.z;
-    (*qw).H2.x += dvH2 * dH2.x;
-    (*qw).H2.y += dvH2 * dH2.y;
-    (*qw).H2.z += dvH2 * dH2.z;
-}
-
-__global__ void calc_qw_dvel_matrix(int n_qatoms, int n_waters, int n_lambdas, double crg_ow, double crg_hw, double A_O, double B_O,
-                                    coord_t* X, coord_t* W, double* Evdw, double* Ecoul, calc_qw_t* MAT,
-                                    q_catype_t* D_qcatypes, q_atype_t* D_qatypes, q_charge_t* D_qcharges, q_atom_t* D_qatoms, double* D_lambdas, topo_t D_topo) {
-    // Block index
-    int bx = blockIdx.x;
-    int by = blockIdx.y;
-
-    // Thread index
-    int tx = threadIdx.x;
-    int ty = threadIdx.y;
-
-    // TODO implement >2 states on GPU
-    __shared__ double Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE];
-    __shared__ double Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE];
-
-    Ecoul_S[ty][tx] = 0;
-    Evdw_S[ty][tx] = 0;
-    Ecoul_S[ty][tx + BLOCK_SIZE] = 0;
-    Evdw_S[ty][tx + BLOCK_SIZE] = 0;
-
-    int aStart = BLOCK_SIZE * by;
-    int bStart = 3 * BLOCK_SIZE * bx;
-
-    if (aStart + ty >= n_qatoms) return;
-    if (bStart + 3 * tx >= 3 * n_waters) return;
-
-    __shared__ coord_t Qs[BLOCK_SIZE];
-    __shared__ coord_t Ws[3 * BLOCK_SIZE];
-
-    if (tx == 0) {
-        Qs[ty] = X[D_qatoms[aStart + ty].a - 1];
-    }
-
-    if (ty == 0) {
-        Ws[3 * tx] = W[bStart + 3 * tx];
-        Ws[3 * tx + 1] = W[bStart + 3 * tx + 1];
-        Ws[3 * tx + 2] = W[bStart + 3 * tx + 2];
-    }
-
-    __syncthreads();
-
-    calc_qw_t qw;
-    memset(&qw, 0, sizeof(calc_qw_t));
-
-    int row = by * BLOCK_SIZE + ty;
-    int column = bx * BLOCK_SIZE + tx;
-
-    calc_qw_dvel_matrix_incr(ty, aStart + ty, tx, n_lambdas, n_qatoms, crg_ow, crg_hw, A_O, B_O, Qs, Ws, Evdw_S, Ecoul_S,
-                             &qw, D_qcatypes, D_qatypes, D_qcharges, D_qatoms, D_lambdas, D_topo);
-
-    MAT[column + n_waters * row] = qw;
-
-    __syncthreads();
-
-    int rowlen = (n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    int collen = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
-
-    if (tx == 0 && ty == 0) {
-        // TODO implement >2 states on GPU
-        for (int state = 0; state < min(2, n_lambdas); state++) {
-            double tot_Evdw = 0;
-            double tot_Ecoul = 0;
-            for (int i = 0; i < BLOCK_SIZE; i++) {
-                for (int j = 0; j < BLOCK_SIZE; j++) {
-                    tot_Evdw += Evdw_S[i][j + state * BLOCK_SIZE];
-                    tot_Ecoul += Ecoul_S[i][j + state * BLOCK_SIZE];
-                }
-            }
-            Evdw[rowlen * collen * state + rowlen * by + bx] = tot_Evdw;
-            Ecoul[rowlen * collen * state + rowlen * by + bx] = tot_Ecoul;
-        }
-    }
-
-    __syncthreads();
-}
 } // namespace CudaNonbondedQWForce

 void calc_nonbonded_qw_forces_host_v2() {
     using namespace CudaNonbondedQWForce;
+    int nx = CudaContext::instance().h_q_atoms_list.size();
+    int ny = CudaContext::instance().h_w_atoms_list.size();

-    int mem_size_X = n_atoms_solute * sizeof(coord_t);
-    int mem_size_W = 3 * n_waters * sizeof(coord_t);
-    int mem_size_DV_X = n_atoms_solute * sizeof(dvel_t);
-    int mem_size_DV_W = 3 * n_waters * sizeof(dvel_t);
-    int mem_size_MAT = 3 * n_waters * n_qatoms * sizeof(calc_qw_t);
-
-    int n_blocks_q = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    int n_blocks_w = (n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    // TODO make Evdw & Ecoul work for # of states > 2
-    int mem_size_QW_Evdw = min(n_lambdas, 2) * n_blocks_q * n_blocks_w * sizeof(double);
-    int mem_size_QW_Ecoul = min(n_lambdas, 2) * n_blocks_q * n_blocks_w * sizeof(double);
-
-    CudaContext& ctx = CudaContext::instance();
-    auto X = ctx.d_coords;
-    auto DV_X = ctx.d_dvelocities;
-    auto D_qcatypes = ctx.d_q_catypes;
-    auto D_qatypes = ctx.d_q_atypes;
-    auto D_qcharges = ctx.d_q_charges;
-    auto D_qatoms = ctx.d_q_atoms;
-    auto D_lambdas = ctx.d_lambdas;
-
-    dim3 threads, grid;
-
-    threads = dim3(BLOCK_SIZE, BLOCK_SIZE);
-    grid = dim3((n_waters + BLOCK_SIZE - 1) / threads.x, (n_qatoms + BLOCK_SIZE - 1) / threads.y);
-
-    double evdw, ecoul;
-
-    calc_qw_dvel_matrix<<<grid, threads>>>(n_qatoms, n_waters, n_lambdas, crg_ow, crg_hw, A_O, B_O, X, X + n_atoms_solute, D_QW_Evdw, D_QW_Ecoul,
-                                           QW_MAT, D_qcatypes, D_qatypes, D_qcharges, D_qatoms, D_lambdas, topo);
-    calc_qw_dvel_vector_column<<<((n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_qatoms, n_waters, DV_X, DV_X + n_atoms_solute, QW_MAT);
-    calc_qw_dvel_vector_row<<<((n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_qatoms, n_waters, DV_X, DV_X + n_atoms_solute, QW_MAT, D_qatoms);
-
-#ifdef DEBUG
-    cudaMemcpy(h_QW_MAT, QW_MAT, mem_size_MAT, cudaMemcpyDeviceToHost);
-#endif
-
-#ifdef DEBUG
-    for (int i = 0; i < n_qatoms; i++) {
-        for (int j = 0; j < 3 * n_waters; j++) {
-            if (h_QW_MAT[3 * i * n_waters + j].Q.x > 100)
-                printf("QW_MAT[%d][%d].Q = %f %f %f\n", i, j, h_QW_MAT[i * 3 * n_waters + j].Q.x, h_QW_MAT[i * 3 * n_waters + j].Q.y, h_QW_MAT[i * 3 * n_waters + j].Q.z);
-        }
-    }
-#endif
-
-    cudaMemcpy(dvelocities, DV_X, mem_size_DV_X, cudaMemcpyDeviceToHost);
-    cudaMemcpy(&dvelocities[n_atoms_solute], DV_X + n_atoms_solute, mem_size_DV_W, cudaMemcpyDeviceToHost);
-
-    // TODO make Evdw & Ecoul work for # of states > 2
-    for (int state = 0; state < min(2, n_lambdas); state++) {
-        calc_energy_sum<<<1, threads>>>(n_blocks_q, n_blocks_w, D_QW_evdw_TOT, D_QW_ecoul_TOT, &D_QW_Evdw[state * n_blocks_w * n_blocks_q],
-                                        &D_QW_Ecoul[state * n_blocks_w * n_blocks_q], false);
-
-        cudaMemcpy(&QW_evdw_TOT, D_QW_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost);
-        cudaMemcpy(&QW_ecoul_TOT, D_QW_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost);
-
-        EQ_nonbond_qw[state].Uvdw += QW_evdw_TOT;
-        EQ_nonbond_qw[state].Ucoul += QW_ecoul_TOT;
+    for (int state = 0; state < n_lambdas; state++) {
+        auto result = calc_nonbonded_force_host(
+            nx,
+            ny,
+            CudaContext::instance().d_q_atoms_list,
+            CudaContext::instance().d_w_atoms_list,
+            false,
+            CudaContext::instance().d_q_charge_types + state * nx + nx * n_lambdas,
+            CudaContext::instance().d_w_charge_types,
+            CudaContext::instance().d_charge_table_all,
+            CudaContext::instance().d_q_catype_types + state * nx + nx * n_lambdas,
+            CudaContext::instance().d_w_catype_types,
+            CudaContext::instance().d_catype_table_all);
+
+        EQ_nonbond_qw[state].Uvdw += result.first / lambdas[state];
+        EQ_nonbond_qw[state].Ucoul += result.second / lambdas[state];
+        printf("Nonbonded QW Force State %d: Uvdw = %f, Ucoul = %f\n", state, result.first, result.second);
     }
 }

 void init_nonbonded_qw_force_kernel_data() {
     using namespace CudaNonbondedQWForce;
-    if (!is_initialized) {
-        catype_t catype_ow;  // Atom type of first O atom
-
-        catype_ow = catypes[atypes[n_atoms_solute].code - 1];
-
-        A_O = catype_ow.aii_normal;
-        B_O = catype_ow.bii_normal;
-
-        int n_blocks_q = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
-        int n_blocks_w = (n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE;
-        int mem_size_MAT = 3 * n_waters * n_qatoms * sizeof(calc_qw_t);
-
-        int mem_size_QW_Evdw = min(n_lambdas, 2) * n_blocks_q * n_blocks_w * sizeof(double);
-        int mem_size_QW_Ecoul = min(n_lambdas, 2) * n_blocks_q * n_blocks_w * sizeof(double);
-#ifdef DEBUG
-        printf("Allocating QW_MAT\n");
-#endif
-        check_cudaMalloc((void**)&QW_MAT, mem_size_MAT);
-
-#ifdef DEBUG
-        printf("Allocating D_QW_Evdw\n");
-#endif
-        check_cudaMalloc((void**)&D_QW_Evdw, mem_size_QW_Evdw);
-#ifdef DEBUG
-        printf("Allocating D_QW_Ecoul\n");
-#endif
-        check_cudaMalloc((void**)&D_QW_Ecoul, mem_size_QW_Ecoul);
-
-        check_cudaMalloc((void**)&D_QW_evdw_TOT, sizeof(double));
-        check_cudaMalloc((void**)&D_QW_ecoul_TOT, sizeof(double));
-
-        h_QW_Evdw = (double*)malloc(mem_size_QW_Evdw);
-        h_QW_Ecoul = (double*)malloc(mem_size_QW_Ecoul);
-
-        h_QW_MAT = (calc_qw_t*)malloc(mem_size_MAT);
-        is_initialized = true;
-    }
+    if (is_initialized) return;
+    is_initialized = true;
 }

 void cleanup_nonbonded_qw_force() {
     using namespace CudaNonbondedQWForce;
-
-    if (is_initialized) {
-        cudaFree(QW_MAT);
-        cudaFree(D_QW_Evdw);
-        cudaFree(D_QW_Ecoul);
-        cudaFree(D_QW_evdw_TOT);
-        cudaFree(D_QW_ecoul_TOT);
-
-        free(h_QW_Evdw);
-        free(h_QW_Ecoul);
-        free(h_QW_MAT);
-
-        is_initialized = false;
-    }
-}
\ No newline at end of file
+    if (!is_initialized) return;
+    is_initialized = false;
+}
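The deleted QW kernels addressed each water molecule as an O, H1, H2 triplet at offsets 3 * w, 3 * w + 1, 3 * w + 2 past the solute block (see the DV_W indexing above); the generic path reaches the same atoms through the flat d_w_atoms_list instead. A sketch of that correspondence, with the helper name illustrative only:

    // Illustrative only: the flat atom index the deleted kernels used for
    // water w, assuming solvent atoms follow the solute block as O, H1, H2.
    inline int water_atom(int n_atoms_solute, int w, int site /* 0 = O, 1 = H1, 2 = H2 */) {
        return n_atoms_solute + 3 * w + site;
    }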
diff --git a/src/core/cuda/src/CudaNonbondedWWForce.cu b/src/core/cuda/src/CudaNonbondedWWForce.cu
index 84b6652c..e4da8f51 100644
--- a/src/core/cuda/src/CudaNonbondedWWForce.cu
+++ b/src/core/cuda/src/CudaNonbondedWWForce.cu
@@ -1,312 +1,36 @@
+#include <cstdio>
+#include <utility>
+
 #include "cuda/include/CudaContext.cuh"
+#include "cuda/include/CudaNonbondedForce.cuh"
 #include "cuda/include/CudaNonbondedWWForce.cuh"

 namespace CudaNonbondedWWForce {

 bool is_initialized = false;

-double *D_WW_evdw_TOT, *D_WW_ecoul_TOT, WW_evdw_TOT, WW_ecoul_TOT;
-
-__device__ __forceinline__ void calculate_unforce_bound(
-    const int y, const int x, const coord_t& q, const coord_t& p,
-    const topo_t& D_topo, const double& crg_ow, const double& crg_hw,
-    const double& A_OO, const double& B_OO, double& evdw, double& ecoul,
-    double& dv, double& tmpx, double& tmpy, double& tmpz) {
-    int belong_y = y / 3;
-    int belong_x = x / 3;
-    if (belong_y == belong_x) {
-        return;
-    }
-
-    bool y_is_o = (y % 3 == 0);
-    bool x_is_o = (x % 3 == 0);
-
-    // Compute distance components
-    tmpx = p.x - q.x;
-    tmpy = p.y - q.y;
-    tmpz = p.z - q.z;
-    // double inv_dis = 1.0 / sqrt(pow(tmpx, 2) + pow(tmpy, 2) + pow(tmpz, 2));
-    double inv_dis = rsqrt(tmpx * tmpx + tmpy * tmpy + tmpz * tmpz);
-    double inv_dis2 = inv_dis * inv_dis;
-
-    ecoul = inv_dis * D_topo.coulomb_constant * (y_is_o ? crg_ow : crg_hw) *
-            (x_is_o ? crg_ow : crg_hw);
-    double v_a = 0, v_b = 0;
-    if (y_is_o && x_is_o) {
-        double inv_dis6 = inv_dis2 * inv_dis2 * inv_dis2;
-        double inv_dis12 = inv_dis6 * inv_dis6;
-        v_a = A_OO * inv_dis12;
-        v_b = B_OO * inv_dis6;
-        evdw = v_a - v_b;
-        dv = inv_dis * inv_dis * (-ecoul - 12.0 * v_a + 6.0 * v_b);
-    } else {
-        dv = inv_dis * inv_dis * -ecoul;
-    }
-}
-
-template <int Block_x, int Block_y, int Thread_x, int Thread_y>
-__global__ void
-calc_ww(const int N, const double crg_ow, const double crg_hw,
-        const double A_OO, const double B_OO, const topo_t D_topo,
-        coord_t* __restrict__ W, dvel_t* __restrict__ DV_W,
-        double* __restrict__ Evdw_TOT, double* __restrict__ ecoul_TOT) {
-    // Calculate block boundaries
-    int NX = N;
-    int NY = (N + 1) / 2;
-    int x_cal_num = blockDim.x * Block_x;
-    int y_cal_num = blockDim.y * Block_y;
-    int block_x_left_begin = 1 + blockIdx.x * x_cal_num;
-    int block_y_left_begin = blockIdx.y * y_cal_num;
-    int block_x_left_end = min(block_x_left_begin + x_cal_num - 1, NX - 1);
-    int block_y_left_end = min(block_y_left_begin + y_cal_num - 1, NY - 1);
-
-    // Shared memory declarations with padding to avoid bank conflicts
-    __shared__ coord_t p[2 * Thread_x * Block_x + 1];
-    __shared__ coord_t q[2 * Thread_y * Block_y + 1];
-    __shared__ double sum_row_x[2 * Thread_y * Block_y + 1];
-    __shared__ double sum_row_y[2 * Thread_y * Block_y + 1];
-    __shared__ double sum_row_z[2 * Thread_y * Block_y + 1];
-
-    __shared__ double block_ecoul[(Thread_x * Thread_y + 31) / 32],
-        block_evdw[(Thread_x * Thread_y + 31) / 32];
-
-    // Thread indices
-    int thread_y_left_begin = block_y_left_begin + threadIdx.y * Block_y;
-    int thread_x_left_begin = block_x_left_begin + threadIdx.x * Block_x;
-    int cur_thread_num = blockDim.x * threadIdx.y + threadIdx.x;
-    int thread_num = blockDim.x * blockDim.y;
-
-// Optimized coordinate loading with coalesced memory access
-#pragma unroll
-    for (int i = cur_thread_num; i < x_cal_num && block_x_left_begin + i < NX; i += thread_num) {
-        p[i] = W[block_x_left_begin + i];
-        p[x_cal_num + i] = W[N - (block_x_left_begin + i)];
-    }
-
-#pragma unroll
-    for (int i = cur_thread_num; i < y_cal_num && block_y_left_begin + i < NY; i += thread_num) {
-        q[i] = W[block_y_left_begin + i];
-        q[y_cal_num + i] = W[N - 1 - (block_y_left_begin + i)];
-    }
-
-// Initialize sum arrays
-#pragma unroll
-    for (int i = cur_thread_num; i < y_cal_num && block_y_left_begin + i < NY; i += thread_num) {
-        sum_row_x[i] = sum_row_x[y_cal_num + i] = 0;
-        sum_row_y[i] = sum_row_y[y_cal_num + i] = 0;
-        sum_row_z[i] = sum_row_z[y_cal_num + i] = 0;
-    }
-    __syncthreads();
-
-    // Initialize column sums
-    double sum_col_x[2 * Block_x], sum_col_y[2 * Block_x], sum_col_z[2 * Block_x];
-#pragma unroll
-    for (int i = 0; i < Block_x; i++) {
-        sum_col_x[i] = sum_col_x[Block_x + i] = 0;
-        sum_col_y[i] = sum_col_y[Block_x + i] = 0;
-        sum_col_z[i] = sum_col_z[Block_x + i] = 0;
-    }
-
-    // Main computation loop with reduced thread divergence
-    double evdw_tot = 0, ecoul_tot = 0;
-#pragma unroll
-    for (int i = 0; i < Block_x; i++) {
-        int i2 = i;
-        int x = thread_x_left_begin + i2;
-        if (x >= NX) continue;
-
-        int offset_x = x - block_x_left_begin;
-        const coord_t& now_p0 = p[offset_x];
-        const coord_t& now_p1 = p[x_cal_num + offset_x];
-
-#pragma unroll
-        for (int j = 0; j < Block_y; j++) {
-            int j2 = (j + threadIdx.x) % Block_y;
-            int y = thread_y_left_begin + j2;
-            if (y >= NY) continue;
-
-            // Optimized condition check
-            if (y >= x && (N % 2 == 0 || y != NY - 1)) {
-                int offset_y = y - block_y_left_begin;
-                double evdw = 0, ecoul = 0, dv = 0;
-                double tmpx = 0, tmpy = 0, tmpz = 0;
-
-                int y2 = N - 1 - y;
-                int x2 = N - x;
-                calculate_unforce_bound(y2, x2, q[y_cal_num + offset_y], now_p1, D_topo,
-                                        crg_ow, crg_hw, A_OO, B_OO, evdw, ecoul, dv,
-                                        tmpx, tmpy, tmpz);
-
-                evdw_tot += evdw;
-                ecoul_tot += ecoul;
-                double v_x = dv * tmpx;
-                double v_y = dv * tmpy;
-                double v_z = dv * tmpz;
-
-                sum_col_x[Block_x + i2] += v_x;
-                sum_col_y[Block_x + i2] += v_y;
-                sum_col_z[Block_x + i2] += v_z;
-                sum_row_x[y_cal_num + offset_y] -= v_x;
-                sum_row_y[y_cal_num + offset_y] -= v_y;
-                sum_row_z[y_cal_num + offset_y] -= v_z;
-            } else if (y < x) {
-                int offset_y = y - block_y_left_begin;
-                double evdw = 0, ecoul = 0, dv = 0;
-                double tmpx = 0, tmpy = 0, tmpz = 0;
-
-                calculate_unforce_bound(y, x, q[offset_y], now_p0, D_topo, crg_ow,
-                                        crg_hw, A_OO, B_OO, evdw, ecoul, dv, tmpx, tmpy,
-                                        tmpz);
-
-                evdw_tot += evdw;
-                ecoul_tot += ecoul;
-                double v_x = dv * tmpx;
-                double v_y = dv * tmpy;
-                double v_z = dv * tmpz;
-
-                sum_col_x[i2] += v_x;
-                sum_col_y[i2] += v_y;
-                sum_col_z[i2] += v_z;
-                sum_row_x[offset_y] -= v_x;
-                sum_row_y[offset_y] -= v_y;
-                sum_row_z[offset_y] -= v_z;
-            }
-        }
-    }
-
-// Optimized reduction using warp-level primitives
-#pragma unroll
-    for (unsigned w = 16; w >= 1; w /= 2) {
-        ecoul_tot += __shfl_down_sync(0xffffffff, ecoul_tot, w);
-        evdw_tot += __shfl_down_sync(0xffffffff, evdw_tot, w);
-    }
-
-    // Store block results
-    int warp_id = cur_thread_num / warpSize;
-    int lane_id = cur_thread_num % warpSize;
-    if (lane_id == 0) {
-        block_evdw[warp_id] = evdw_tot;
-        block_ecoul[warp_id] = ecoul_tot;
-    }
-    __syncthreads();
-
-    // Final reduction
-    if (warp_id == 0) {
-        double val1 = (lane_id < (Thread_x * Thread_y + 31) / 32) ? block_ecoul[lane_id] : 0;
-        double val2 = (lane_id < (Thread_x * Thread_y + 31) / 32) ? block_evdw[lane_id] : 0;
-
-#pragma unroll
-        for (unsigned w = 16; w >= 1; w /= 2) {
-            val1 += __shfl_down_sync(0xffffffff, val1, w);
-            val2 += __shfl_down_sync(0xffffffff, val2, w);
-        }
-
-        if (lane_id == 0) {
-            atomicAdd(ecoul_TOT, val1);
-            atomicAdd(Evdw_TOT, val2);
-        }
-    }
-
-// Optimized row reduction
-#pragma unroll
-    for (int i = cur_thread_num; i < y_cal_num && block_y_left_begin + i < NY; i += thread_num) {
-        int idx = block_y_left_begin + i;
-        if (idx < block_x_left_end) {
-            atomicAdd(&DV_W[idx].x, sum_row_x[i]);
-            atomicAdd(&DV_W[idx].y, sum_row_y[i]);
-            atomicAdd(&DV_W[idx].z, sum_row_z[i]);
-        }
-        if (idx >= block_x_left_begin) {
-            idx = N - 1 - idx;
-            atomicAdd(&DV_W[idx].x, sum_row_x[i + y_cal_num]);
-            atomicAdd(&DV_W[idx].y, sum_row_y[i + y_cal_num]);
-            atomicAdd(&DV_W[idx].z, sum_row_z[i + y_cal_num]);
-        }
-    }
-
-// Optimized column reduction
-#pragma unroll
-    for (int i = 0; i < Block_x; i++) {
-        int i2 = (i + threadIdx.y) % Block_x;
-        int idx = thread_x_left_begin + i2;
-        if (idx < N) {
-            if (block_y_left_begin < idx) {
-                atomicAdd(&DV_W[idx].x, sum_col_x[i2]);
-                atomicAdd(&DV_W[idx].y, sum_col_y[i2]);
-                atomicAdd(&DV_W[idx].z, sum_col_z[i2]);
-            }
-            if (block_y_left_end >= idx) {
-                atomicAdd(&DV_W[N - idx].x, sum_col_x[i2 + Block_x]);
-                atomicAdd(&DV_W[N - idx].y, sum_col_y[i2 + Block_x]);
-                atomicAdd(&DV_W[N - idx].z, sum_col_z[i2 + Block_x]);
-            }
-        }
-    }
-}
 } // namespace CudaNonbondedWWForce

 void calc_nonbonded_ww_forces_host_v2() {
     using namespace CudaNonbondedWWForce;
-    int N = 3 * n_waters;
-    int mem_size_W = N * sizeof(coord_t);
-    int mem_size_DV_W = N * sizeof(dvel_t);
-
-    WW_evdw_TOT = 0;
-    WW_ecoul_TOT = 0;
-    cudaMemcpy(D_WW_evdw_TOT, &WW_evdw_TOT, sizeof(double),
-               cudaMemcpyHostToDevice);
-    cudaMemcpy(D_WW_ecoul_TOT, &WW_ecoul_TOT, sizeof(double),
-               cudaMemcpyHostToDevice);
-
-    CudaContext& ctx = CudaContext::instance();
-    auto W = ctx.d_coords + n_atoms_solute;
-    auto DV_W = ctx.d_dvelocities + n_atoms_solute;
-
-    // Optimize thread block configuration
-    const int thread_num_x = 32;  // Keep at 32 for better warp utilization
-    const int thread_num_y = 1;   // Keep at 1 for better memory coalescing
-    dim3 block_sz = dim3(thread_num_x, thread_num_y);
-
-    const int N_ITERATION_Y = 32;  // Keep at 32 for better memory access pattern
-    const int N_ITERATION_X = 1;   // Keep at 1 for better memory coalescing
-
-    int block_element_sz_x = N_ITERATION_X * block_sz.x;
-    int block_element_sz_y = N_ITERATION_Y * block_sz.y;
-
-    int grid_sz_x = (N - 1 + block_element_sz_x - 1) / (block_element_sz_x);
-    int grid_sz_y = ((N + 1) / 2 + block_element_sz_y - 1) / block_element_sz_y;
-    dim3 grid = dim3(grid_sz_x, grid_sz_y);
-    calc_ww<N_ITERATION_X, N_ITERATION_Y, thread_num_x, thread_num_y>
-        <<<grid, block_sz>>>(N, crg_ow, crg_hw, A_OO, B_OO, topo, W, DV_W,
-                             D_WW_evdw_TOT, D_WW_ecoul_TOT);
-
-    cudaDeviceSynchronize();
-    cudaMemcpy(&dvelocities[n_atoms_solute], DV_W, mem_size_DV_W,
-               cudaMemcpyDeviceToHost);
-    cudaMemcpy(&WW_evdw_TOT, D_WW_evdw_TOT, sizeof(double),
-               cudaMemcpyDeviceToHost);
-    cudaMemcpy(&WW_ecoul_TOT, D_WW_ecoul_TOT, sizeof(double),
-               cudaMemcpyDeviceToHost);
-    E_nonbond_ww.Uvdw += WW_evdw_TOT;
-    E_nonbond_ww.Ucoul += WW_ecoul_TOT;
+    int nx = CudaContext::instance().h_w_atoms_list.size();
+    int ny = nx;
+    auto result = calc_nonbonded_force_host(
+        nx, ny,
+        CudaContext::instance().d_w_atoms_list,
+        CudaContext::instance().d_w_atoms_list,
+        true,
+        CudaContext::instance().d_w_charge_types,
+        CudaContext::instance().d_w_charge_types,
+        CudaContext::instance().d_charge_table_all,
+        CudaContext::instance().d_w_catype_types,
+        CudaContext::instance().d_w_catype_types,
+        CudaContext::instance().d_catype_table_all);
+    E_nonbond_ww.Uvdw = result.first;
+    E_nonbond_ww.Ucoul = result.second;
 }

 void init_nonbonded_ww_force_kernel_data() {
     using namespace CudaNonbondedWWForce;
     if (!is_initialized) {
-        catype_t catype_ow;                // Atom type of first O, H atom
-        ccharge_t ccharge_ow, ccharge_hw;  // Charge of first O, H atom
-
-        catype_ow = catypes[atypes[n_atoms_solute].code - 1];
-        ccharge_ow = ccharges[charges[n_atoms_solute].code - 1];
-        ccharge_hw = ccharges[charges[n_atoms_solute + 1].code - 1];
-
-        A_OO = pow(catype_ow.aii_normal, 2);
-        B_OO = pow(catype_ow.bii_normal, 2);
-
-        crg_ow = ccharge_ow.charge;
-        crg_hw = ccharge_hw.charge;
-
-        check_cudaMalloc((void**)&D_WW_evdw_TOT, sizeof(double));
-        check_cudaMalloc((void**)&D_WW_ecoul_TOT, sizeof(double));
         is_initialized = true;
     }
 }

@@ -314,8 +38,6 @@ void init_nonbonded_ww_force_kernel_data() {
 void cleanup_nonbonded_ww_force() {
     using namespace CudaNonbondedWWForce;
     if (is_initialized) {
-        cudaFree(D_WW_evdw_TOT);
-        cudaFree(D_WW_ecoul_TOT);
        is_initialized = false;
     }
-}
\ No newline at end of file
+}
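The deleted WW setup precomputed A_OO and B_OO as pow(aii_normal, 2) / pow(bii_normal, 2) of the water oxygen type, which is the i == j case of the geometric combination the pairwise kernels apply elsewhere in this diff (ai_aii * aj_aii). The table-driven replacement can therefore recover the same pair parameters from per-type entries; a sketch, with function names illustrative only:

    // Geometric combination assumed by the deleted kernels: A_ij = aii_i * aii_j,
    // B_ij = bii_i * bii_j. For two water oxygens this collapses to
    // A_OO = aii_ow * aii_ow, matching the removed pow(catype_ow.aii_normal, 2).
    inline double lj_pair_A(double aii_i, double aii_j) { return aii_i * aii_j; }
    inline double lj_pair_B(double bii_i, double bii_j) { return bii_i * bii_j; }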
diff --git a/src/core/parse.cu b/src/core/parse.cu
index 1b9e0f8d..c5518223 100644
--- a/src/core/parse.cu
+++ b/src/core/parse.cu
@@ -753,6 +753,7 @@ void init_ngbrs14(const char* filename) {
             int jx = (lineI + i + 1) % lines;
             // if (ix < 100 && jx < 100) printf("i = %d j = %d\n", ix+1, jx+1);
             LJ_matrix[ix * n_atoms_solute + jx] = 1;
+            LJ_matrix[jx * n_atoms_solute + ix] = 1;
         }
     }
     lineI++;
@@ -797,6 +798,7 @@ void init_ngbrs23(const char* filename) {
             int jx = (lineI + i + 1) % lines;
             // if (ix < 100 && jx < 100) printf("i = %d j = %d\n", ix+1, jx+1);
             LJ_matrix[ix * n_atoms_solute + jx] = 3;
+            LJ_matrix[jx * n_atoms_solute + ix] = 3;
         }
     }
     lineI++;
@@ -819,6 +821,7 @@ void init_ngbrs14_long(const char* filename) {
         int ix = atoi(file.buffer[i+1][0])-1;
         int jx = atoi(file.buffer[i+1][1])-1;
         LJ_matrix[ix * n_atoms_solute + jx] = 1;
+        LJ_matrix[jx * n_atoms_solute + ix] = 1;
     }

     clean_csv(file);
@@ -838,6 +841,7 @@ void init_ngbrs23_long(const char* filename) {
         int ix = atoi(file.buffer[i+1][0])-1;
         int jx = atoi(file.buffer[i+1][1])-1;
         LJ_matrix[ix * n_atoms_solute + jx] = 3;
+        LJ_matrix[jx * n_atoms_solute + ix] = 3;
     }

     clean_csv(file);
diff --git a/src/core/system.cu b/src/core/system.cu
index afb999cf..18c15d95 100644
--- a/src/core/system.cu
+++ b/src/core/system.cu
@@ -29,6 +29,7 @@
 #include "cuda/include/CudaNonbondedQWForce.cuh"
 #include "cuda/include/CudaNonbondedPWForce.cuh"
 #include "cuda/include/CudaNonbondedWWForce.cuh"
+#include "cuda/include/CudaNonbondedForce.cuh"

 #include
 #include
@@ -1320,6 +1321,7 @@ void init_cuda_kernel_data() {
     init_bond_force_kernel_data();
     init_improper2_force_kernel_data();
     init_leapfrog_kernel_data();
+    init_nonbonded_force_kernel_data();
     init_nonbonded_pp_force_kernel_data();
     init_nonbonded_pw_force_kernel_data();
     init_nonbonded_qp_force_kernel_data();
@@ -1564,6 +1566,7 @@ void clean_variables() {
     cleanup_nonbonded_qq_force();
     cleanup_nonbonded_qw_force();
     cleanup_nonbonded_ww_force();
+    cleanup_nonbonded_force();
     cleanup_polx_water_force();
     cleanup_pshell_force();
     cleanup_radix_water_force();
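The four parse.cu hunks make the same fix: the neighbour flags were previously written only at (ix, jx), so any code indexing LJ_matrix in the transposed order, as the generic nonbonded path presumably can, would miss the 1-4 and 2-3 exclusions. A compact equivalent of the repeated addition (helper name illustrative, not part of the diff):

    // Illustrative helper equivalent to the repeated parse.cu additions:
    // record a neighbour flag (1 = 1-4 pair, 3 = 2-3 pair) symmetrically so
    // either lookup order sees it.
    inline void set_lj_flag(int* LJ_matrix, int n_atoms_solute, int ix, int jx, int flag) {
        LJ_matrix[ix * n_atoms_solute + jx] = flag;
        LJ_matrix[jx * n_atoms_solute + ix] = flag;
    }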