diff --git a/src/core/cuda/include/CudaContext.cuh b/src/core/cuda/include/CudaContext.cuh
index 3c2883fa..faa3cf90 100644
--- a/src/core/cuda/include/CudaContext.cuh
+++ b/src/core/cuda/include/CudaContext.cuh
@@ -59,6 +59,9 @@ class CudaContext {
   E_nonbonded_t* d_EQ_nonbond_qq = nullptr;
   double* d_lambdas = nullptr;
 
+  q_charge_t* d_q_aggregate_charges = nullptr;
+  q_catype_t* d_q_aggregate_catypes = nullptr;
+
   /* Used in CudaPolxWaterForce.cu */
@@ -124,6 +127,8 @@ class CudaContext {
  private:
   CudaContext() = default;
 
+  void init_data_for_q();
+
   void free();
 
   ~CudaContext() { free(); }
diff --git a/src/core/cuda/src/CudaContext.cu b/src/core/cuda/src/CudaContext.cu
index 19f22165..0ac5f0b5 100644
--- a/src/core/cuda/src/CudaContext.cu
+++ b/src/core/cuda/src/CudaContext.cu
@@ -1,6 +1,7 @@
 #include
 
 #include "cuda/include/CudaContext.cuh"
+#include <vector>
 
 void CudaContext::init() {
   check_cudaMalloc((void**)&d_coords, sizeof(coord_t) * n_atoms);
@@ -54,6 +55,8 @@ void CudaContext::init() {
   check_cudaMalloc((void**)&d_charges, sizeof(charge_t) * n_charges);
   check_cudaMalloc((void**)&d_p_atoms, sizeof(p_atom_t) * n_patoms);
 
+  init_data_for_q();
+
   sync_all_to_device();
 }
 
@@ -183,7 +186,6 @@ void CudaContext::free() {
   cudaFree(d_excluded);
   cudaFree(d_q_elscales);
   cudaFree(d_q_catypes);
-  cudaFree(d_q_atypes);
   cudaFree(d_EQ_nonbond_qq);
   cudaFree(d_lambdas);
 
@@ -208,6 +210,9 @@ void CudaContext::free() {
   cudaFree(d_ccharges);
   cudaFree(d_charges);
   cudaFree(d_p_atoms);
+
+  cudaFree(d_q_aggregate_charges);
+  cudaFree(d_q_aggregate_catypes);
 }
 
 void CudaContext::reset_energies() {
@@ -215,3 +220,37 @@
   cudaMemset(d_EQ_nonbond_qq, 0, sizeof(E_nonbonded_t) * n_lambdas);
   cudaMemset(d_EQ_restraint, 0, sizeof(E_restraint_t) * n_lambdas);
 }
+
+void CudaContext::init_data_for_q() {
+
+  // Additional initialization for q-related data can be added here if needed
+
+  // q charges: collapse per-state charges into one lambda-weighted charge per q-atom
+  std::vector<q_charge_t> charges(n_qatoms, q_charge_t{0.0});
+  for (int i = 0; i < n_qatoms; i++) {
+    double q = 0;
+    for (int state = 0; state < n_lambdas; state++) {
+      q += q_charges[state * n_qatoms + i].q * lambdas[state];
+    }
+    charges[i].q = q;
+  }
+  check_cudaMalloc((void**)&d_q_aggregate_charges, sizeof(q_charge_t) * n_qatoms);
+  check_cuda(cudaMemcpy(d_q_aggregate_charges, charges.data(), sizeof(q_charge_t) * n_qatoms, cudaMemcpyHostToDevice));
+
+
+  // q atypes: lambda-weighted LJ parameters per q-atom
+  std::vector<q_catype_t> atypes(n_qatoms, q_catype_t{"", 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0});
+  for (int i = 0; i < n_qatoms; i++) {
+    for (int state = 0; state < n_lambdas; state++) {
+      atypes[i].Ai += q_catypes[q_atypes[state * n_qatoms + i].code - 1].Ai * lambdas[state];
+      atypes[i].Bi += q_catypes[q_atypes[state * n_qatoms + i].code - 1].Bi * lambdas[state];
+      atypes[i].Ci += q_catypes[q_atypes[state * n_qatoms + i].code - 1].Ci * lambdas[state];
+      atypes[i].ai += q_catypes[q_atypes[state * n_qatoms + i].code - 1].ai * lambdas[state];
+      atypes[i].Ai_14 += q_catypes[q_atypes[state * n_qatoms + i].code - 1].Ai_14 * lambdas[state];
+      atypes[i].Bi_14 += q_catypes[q_atypes[state * n_qatoms + i].code - 1].Bi_14 * lambdas[state];
+      atypes[i].m += q_catypes[q_atypes[state * n_qatoms + i].code - 1].m * lambdas[state];
+    }
+  }
+  check_cudaMalloc((void**)&d_q_aggregate_catypes, sizeof(q_catype_t) * n_qatoms);
+  check_cuda(cudaMemcpy(d_q_aggregate_catypes, atypes.data(), sizeof(q_catype_t) * n_qatoms, cudaMemcpyHostToDevice));
+}
\ No newline at end of file
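Note on the new init_data_for_q() above: instead of shipping per-state Q-atom charges and LJ parameters to every kernel and looping over lambda states on the GPU, the context now pre-mixes them on the host into a single lambda-weighted set per Q-atom. The sketch below illustrates that aggregation with simplified stand-in types (QCharge/QAType are hypothetical, not the project's q_charge_t/q_catype_t), using the same state-major layout q_charges[state * n_qatoms + i].

// Minimal, self-contained sketch of the lambda-weighted aggregation done in
// CudaContext::init_data_for_q(). Types and values are illustrative stand-ins.
#include <cstdio>
#include <vector>

struct QCharge { double q; };          // stand-in for q_charge_t
struct QAType  { double Ai, Bi; };     // stand-in for the LJ part of q_catype_t

int main() {
  const int n_qatoms = 2;
  const int n_lambdas = 2;
  const double lambdas[n_lambdas] = {0.3, 0.7};

  // Per-state inputs, laid out state-major like q_charges[state * n_qatoms + i].
  const QCharge q_charges[n_lambdas * n_qatoms] = {{-0.5}, {0.1}, {0.2}, {0.4}};
  const QAType  q_params [n_lambdas * n_qatoms] = {{900.0, 25.0}, {600.0, 20.0},
                                                   {800.0, 24.0}, {650.0, 21.0}};

  std::vector<QCharge> agg_q(n_qatoms, QCharge{0.0});
  std::vector<QAType>  agg_p(n_qatoms, QAType{0.0, 0.0});

  for (int i = 0; i < n_qatoms; i++) {
    for (int s = 0; s < n_lambdas; s++) {
      agg_q[i].q  += q_charges[s * n_qatoms + i].q  * lambdas[s];
      agg_p[i].Ai += q_params [s * n_qatoms + i].Ai * lambdas[s];
      agg_p[i].Bi += q_params [s * n_qatoms + i].Bi * lambdas[s];
    }
  }

  // These aggregated arrays correspond to what the real code uploads to
  // d_q_aggregate_charges / d_q_aggregate_catypes.
  for (int i = 0; i < n_qatoms; i++)
    printf("q-atom %d: q_eff = %.3f  Ai_eff = %.1f  Bi_eff = %.1f\n",
           i, agg_q[i].q, agg_p[i].Ai, agg_p[i].Bi);
  return 0;
}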
diff --git a/src/core/cuda/src/CudaNonbondedQPForce.cu b/src/core/cuda/src/CudaNonbondedQPForce.cu
index 4eb53908..47f26451 100644
--- a/src/core/cuda/src/CudaNonbondedQPForce.cu
+++ b/src/core/cuda/src/CudaNonbondedQPForce.cu
@@ -42,6 +42,7 @@ __global__ void calc_energy_sum(int rows, int columns, double* Evdw_TOT, double*
   }
   Ecoul_S[ty][tx] = coul_TOT;
   Evdw_S[ty][tx] = vdw_TOT;
+  // printf("Thread (%d,%d) coul_TOT = %f vdw_TOT = %f\n", tx, ty, coul_TOT, vdw_TOT);
 
   __syncthreads();
 
@@ -111,8 +112,8 @@ __global__ void calc_qp_dvel_vector_column(int n_qatoms, int n_patoms, dvel_t* D
 }
 
 __device__ void calc_qp_dvel_matrix_incr(int row, int qi, int column, int pj, int n_lambdas, int n_qatoms,
-    coord_t* Qs, coord_t* Ps, int* LJs, bool* excluded_s, double Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE], double Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE], calc_qp_t* qp,
-    q_catype_t* D_qcatypes, q_atype_t* D_qatypes, q_charge_t* D_qcharges, p_atom_t* D_patoms, q_atom_t* D_qatoms, double* D_lambdas,
+    coord_t* Qs, coord_t* Ps, int* LJs, bool* excluded_s, double Evdw_S[BLOCK_SIZE][BLOCK_SIZE], double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE], calc_qp_t* qp,
+    q_catype_t* D_qcatypes, q_charge_t* D_qcharges, p_atom_t* D_patoms, q_atom_t* D_qatoms,
     catype_t* D_catypes, atype_t* D_atypes, ccharge_t* D_ccharges, charge_t* D_charges, topo_t D_topo) {
   coord_t da;
   double r2, r6, r;
@@ -141,43 +142,40 @@ __device__ void calc_qp_dvel_matrix_incr(int row, int qi, int column, int pj, in
   r6 = r2 * r2 * r2;
   r2 = 1 / r2;
   r = sqrt(r2);
-
-  for (int state = 0; state < n_lambdas; state++) {
-    qi_type = D_qcatypes[D_qatypes[qi + n_qatoms * state].code - 1];
-    aj_type = D_catypes[D_atypes[j].code - 1];
-
-    ai_aii = bond14 ? qi_type.Ai_14 : qi_type.Ai;
-    aj_aii = bond14 ? aj_type.aii_1_4 : aj_type.aii_normal;
-    ai_bii = bond14 ? qi_type.Bi_14 : qi_type.Bi;
-    aj_bii = bond14 ? aj_type.bii_1_4 : aj_type.bii_normal;
-
-    Vel = D_topo.coulomb_constant * scaling * D_qcharges[qi + n_qatoms * state].q * D_ccharges[D_charges[j].code - 1].charge * r;
-    V_a = ai_aii * aj_aii / (r6 * r6);
-    V_b = ai_bii * aj_bii / r6;
-    dv = r2 * (-Vel - (12 * V_a - 6 * V_b)) * D_lambdas[state];
-
-    // if (state == 0 && qi == 0 && pj == 1) {
-    //   printf("crg_q = %f crg_j = %f r = %f\n", D_qcharges[qi + n_qatoms * state].q, D_ccharges[D_charges[pj].code - 1].charge, r);
-    //   printf("ai_aii = %f aj_aii = %f ai_bii = %f aj_bii = %f\n", ai_aii, aj_aii, ai_bii, aj_bii);
-    // }
-
-    // Update forces
-    qp->Q.x += dv * da.x;
-    qp->Q.y += dv * da.y;
-    qp->Q.z += dv * da.z;
-    qp->P.x -= dv * da.x;
-    qp->P.y -= dv * da.y;
-    qp->P.z -= dv * da.z;
-
-    // Update Q totals
-    Ecoul_S[row][state * BLOCK_SIZE + column] += Vel;
-    Evdw_S[row][state * BLOCK_SIZE + column] += (V_a - V_b);
-  }
+  qi_type = D_qcatypes[qi];
+  aj_type = D_catypes[D_atypes[j].code - 1];
+
+  ai_aii = bond14 ? qi_type.Ai_14 : qi_type.Ai;
+  aj_aii = bond14 ? aj_type.aii_1_4 : aj_type.aii_normal;
+  ai_bii = bond14 ? qi_type.Bi_14 : qi_type.Bi;
+  aj_bii = bond14 ? aj_type.bii_1_4 : aj_type.bii_normal;
+
+  Vel = D_topo.coulomb_constant * scaling * D_qcharges[qi].q * D_ccharges[D_charges[j].code - 1].charge * r;
+  V_a = ai_aii * aj_aii / (r6 * r6);
+  V_b = ai_bii * aj_bii / r6;
+  dv = r2 * (-Vel - (12 * V_a - 6 * V_b));
+
+  // if (state == 0 && qi == 0 && pj == 1) {
+  //   printf("crg_q = %f crg_j = %f r = %f\n", D_qcharges[qi + n_qatoms * state].q, D_ccharges[D_charges[pj].code - 1].charge, r);
+  //   printf("ai_aii = %f aj_aii = %f ai_bii = %f aj_bii = %f\n", ai_aii, aj_aii, ai_bii, aj_bii);
+  // }
+
+  // Update forces
+  qp->Q.x += dv * da.x;
+  qp->Q.y += dv * da.y;
+  qp->Q.z += dv * da.z;
+  qp->P.x -= dv * da.x;
+  qp->P.y -= dv * da.y;
+  qp->P.z -= dv * da.z;
+
+  // Update Q totals
+  Ecoul_S[row][column] += Vel;
+  Evdw_S[row][column] += (V_a - V_b);
 }
 
 __global__ void calc_qp_dvel_matrix(int n_qatoms, int n_patoms, int n_lambdas, int n_atoms_solute, coord_t* X, double* Evdw, double* Ecoul, calc_qp_t* QP_MAT,
-    q_catype_t* D_qcatypes, q_atype_t* D_qatypes, q_charge_t* D_qcharges, p_atom_t* D_patoms, q_atom_t* D_qatoms, double* D_lambdas,
+    q_catype_t* D_qcatypes, q_charge_t* D_qcharges, p_atom_t* D_patoms, q_atom_t* D_qatoms,
     int* D_LJ_matrix, bool* D_excluded, catype_t* D_catypes, atype_t* D_atypes, ccharge_t* D_ccharges, charge_t* D_charges, topo_t D_topo) {
   // Block index
   int bx = blockIdx.x;
@@ -187,9 +185,8 @@ __global__ void calc_qp_dvel_matrix(int n_qatoms, int n_patoms, int n_lambdas, i
   int tx = threadIdx.x;
   int ty = threadIdx.y;
 
-  // TODO implement >2 states on GPU
-  __shared__ double Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE];
-  __shared__ double Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE];
+  __shared__ double Evdw_S[BLOCK_SIZE][BLOCK_SIZE];
+  __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE];
 
   Ecoul_S[ty][tx] = 0;
   Evdw_S[ty][tx] = 0;
@@ -230,29 +227,25 @@ __global__ void calc_qp_dvel_matrix(int n_qatoms, int n_patoms, int n_lambdas, i
     int column = bx * BLOCK_SIZE + tx;
 
     calc_qp_dvel_matrix_incr(ty, row, tx, column, n_lambdas, n_qatoms, Qs, Ps, LJs, excluded_s, Evdw_S, Ecoul_S,
-        &qp, D_qcatypes, D_qatypes, D_qcharges, D_patoms, D_qatoms, D_lambdas, D_catypes, D_atypes, D_ccharges, D_charges, D_topo);
+        &qp, D_qcatypes, D_qcharges, D_patoms, D_qatoms, D_catypes, D_atypes, D_ccharges, D_charges, D_topo);
 
     QP_MAT[n_patoms * row + column] = qp;
 
   __syncthreads();
 
   int rowlen = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
-  int collen = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
 
   if (tx == 0 && ty == 0) {
-    // TODO implement >2 states on GPU
-    for (int state = 0; state < min(2, n_lambdas); state++) {
-      double tot_Evdw = 0;
-      double tot_Ecoul = 0;
-      for (int i = 0; i < BLOCK_SIZE; i++) {
-        for (int j = 0; j < BLOCK_SIZE; j++) {
-          tot_Evdw += Evdw_S[i][j + state * BLOCK_SIZE];
-          tot_Ecoul += Ecoul_S[i][j + state * BLOCK_SIZE];
-        }
+    double tot_Evdw = 0;
+    double tot_Ecoul = 0;
+    for (int i = 0; i < BLOCK_SIZE; i++) {
+      for (int j = 0; j < BLOCK_SIZE; j++) {
+        tot_Evdw += Evdw_S[i][j];
+        tot_Ecoul += Ecoul_S[i][j];
       }
-      Evdw[rowlen * collen * state + rowlen * by + bx] = tot_Evdw;
-      Ecoul[rowlen * collen * state + rowlen * by + bx] = tot_Ecoul;
     }
+    Evdw[rowlen * by + bx] = tot_Evdw;
+    Ecoul[rowlen * by + bx] = tot_Ecoul;
   }
 
   __syncthreads();
@@ -269,12 +262,10 @@ void calc_nonbonded_qp_forces_host_v2() {
   CudaContext& ctx = CudaContext::instance();
   auto X = ctx.d_coords;
   auto DV_X = ctx.d_dvelocities;
-  auto D_qcatypes = ctx.d_q_catypes;
-  auto D_qatypes = ctx.d_q_atypes;
-  auto D_qcharges = ctx.d_q_charges;
+  auto D_qcatypes = ctx.d_q_aggregate_catypes;
+  auto D_qcharges = ctx.d_q_aggregate_charges;
   auto D_patoms = ctx.d_p_atoms;
   auto D_qatoms = ctx.d_q_atoms;
-  auto D_lambdas = ctx.d_lambdas;
   auto D_LJ_matrix = ctx.d_LJ_matrix;
   auto D_excluded = ctx.d_excluded;
   auto D_catypes = ctx.d_catypes;
@@ -288,7 +279,7 @@ void calc_nonbonded_qp_forces_host_v2() {
   grid = dim3((n_patoms + BLOCK_SIZE - 1) / threads.x, (n_qatoms + BLOCK_SIZE - 1) / threads.y);
 
   calc_qp_dvel_matrix<<<grid, threads>>>(n_qatoms, n_patoms, n_lambdas, n_atoms_solute, X, D_QP_Evdw, D_QP_Ecoul, QP_MAT,
-      D_qcatypes, D_qatypes, D_qcharges, D_patoms, D_qatoms, D_lambdas, D_LJ_matrix, D_excluded,
+      D_qcatypes, D_qcharges, D_patoms, D_qatoms, D_LJ_matrix, D_excluded,
       D_catypes, D_atypes, D_ccharges, D_charges, topo);
   calc_qp_dvel_vector_column<<<((n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_qatoms, n_patoms, DV_X, QP_MAT, D_patoms);
   calc_qp_dvel_vector_row<<<((n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_qatoms, n_patoms, DV_X, QP_MAT, D_qatoms);
@@ -309,16 +300,14 @@ void calc_nonbonded_qp_forces_host_v2() {
   int mem_size_DV_X = n_atoms * sizeof(dvel_t);
   cudaMemcpy(dvelocities, DV_X, mem_size_DV_X, cudaMemcpyDeviceToHost);
 
-  // TODO: make Evdw & Ecoul work for # of states > 2
-  for (int state = 0; state < min(2, n_lambdas); state++) {
-    calc_energy_sum<<<1, threads>>>(n_blocks_q, n_blocks_p, D_QP_evdw_TOT, D_QP_ecoul_TOT, &D_QP_Evdw[state * n_blocks_p * n_blocks_q], &D_QP_Ecoul[state * n_blocks_p * n_blocks_q], false);
+  calc_energy_sum<<<1, threads>>>(n_blocks_q, n_blocks_p, D_QP_evdw_TOT, D_QP_ecoul_TOT, D_QP_Evdw, D_QP_Ecoul, false);
 
-    cudaMemcpy(&QP_evdw_TOT, D_QP_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost);
-    cudaMemcpy(&QP_ecoul_TOT, D_QP_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost);
+  cudaMemcpy(&QP_evdw_TOT, D_QP_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost);
+  cudaMemcpy(&QP_ecoul_TOT, D_QP_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost);
+  printf("QP E_vdw: %f E_coul: %f\n", QP_evdw_TOT, QP_ecoul_TOT);
 
-    EQ_nonbond_qp[state].Uvdw += QP_evdw_TOT;
-    EQ_nonbond_qp[state].Ucoul += QP_ecoul_TOT;
-  }
+  E_nonbond_qx.Uvdw += QP_evdw_TOT;
+  E_nonbond_qx.Ucoul += QP_ecoul_TOT;
 }
 
 void init_nonbonded_qp_force_kernel_data() {
@@ -328,8 +317,8 @@
   int n_blocks_q = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
   int n_blocks_p = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
   // TODO make Evdw & Ecoul work for # of states > 2
-  int mem_size_QP_Evdw = min(n_lambdas, 2) * n_blocks_q * n_blocks_p * sizeof(double);
-  int mem_size_QP_Ecoul = min(n_lambdas, 2) * n_blocks_q * n_blocks_p * sizeof(double);
+  int mem_size_QP_Evdw = n_blocks_q * n_blocks_p * sizeof(double);
+  int mem_size_QP_Ecoul = n_blocks_q * n_blocks_p * sizeof(double);
   int mem_size_QP_MAT = n_qatoms * n_patoms * sizeof(calc_qp_t);
 
   check_cudaMalloc((void**)&D_QP_Evdw, mem_size_QP_Evdw);
diff --git a/src/core/cuda/src/CudaNonbondedQQForce.cu b/src/core/cuda/src/CudaNonbondedQQForce.cu
index e2138ed6..a10c315c 100644
--- a/src/core/cuda/src/CudaNonbondedQQForce.cu
+++ b/src/core/cuda/src/CudaNonbondedQQForce.cu
@@ -1,6 +1,7 @@
 #include "cuda/include/CudaContext.cuh"
 #include "cuda/include/CudaNonbondedQQForce.cuh"
 #include "utils.h"
+#include <cstdio>
 
 __global__ void calc_nonbonded_qq_forces_kernel(
     q_atom_t* q_atoms,
@@ -129,6 +130,9 @@ void calc_nonbonded_qq_forces_host() {
   cudaDeviceSynchronize();
   cudaMemcpy(EQ_nonbond_qq, d_EQ_nonbond_qq, sizeof(E_nonbonded_t) * n_lambdas, cudaMemcpyDeviceToHost);
   cudaMemcpy(dvelocities, d_dvelocities, sizeof(dvel_t) * n_atoms, cudaMemcpyDeviceToHost);
+  for (int i = 0; i < n_lambdas; i++) {
+    printf("QQ Energy State %d: Evdw = %f Ecoul = %f\n", i, EQ_nonbond_qq[i].Uvdw, EQ_nonbond_qq[i].Ucoul);
+  }
 }
 
 void init_nonbonded_qq_force_kernel_data() {}
diff --git a/src/core/cuda/src/CudaNonbondedQWForce.cu b/src/core/cuda/src/CudaNonbondedQWForce.cu
index 974c380c..049a38aa 100644
--- a/src/core/cuda/src/CudaNonbondedQWForce.cu
+++ b/src/core/cuda/src/CudaNonbondedQWForce.cu
@@ -124,14 +124,13 @@ __global__ void calc_qw_dvel_vector_column(int n_qatoms, int n_waters, dvel_t* D
 }
 
 __device__ void calc_qw_dvel_matrix_incr(int row, int qi, int column, int n_lambdas, int n_qatoms, double crg_ow, double crg_hw, double A_O, double B_O,
-    coord_t* Qs, coord_t* Ws, double Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE], double Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE], calc_qw_t* qw,
-    q_catype_t* D_qcatypes, q_atype_t* D_qatypes, q_charge_t* D_qcharges, q_atom_t* D_qatoms, double* D_lambdas, topo_t D_topo) {
+    coord_t* Qs, coord_t* Ws, double Evdw_S[BLOCK_SIZE][BLOCK_SIZE], double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE], calc_qw_t* qw,
+    q_catype_t* D_qcatypes, q_charge_t* D_qcharges, q_atom_t* D_qatoms, topo_t D_topo) {
   int j;
   coord_t dO, dH1, dH2;
   double r2O, rH1, rH2, r6O, rO, r2H1, r2H2;
   double dvO, dvH1, dvH2;
   double V_a, V_b, VelO, VelH1, VelH2;
-  q_atype_t qa_type;
   q_catype_t qi_type;
   double ai_aii, ai_bii;
@@ -160,28 +159,25 @@ __device__ void calc_qw_dvel_matrix_incr(int row, int qi, int column, int n_lamb
   dvO = 0;
   dvH1 = 0;
   dvH2 = 0;
 
-  for (int state = 0; state < n_lambdas; state++) {
-    qa_type = D_qatypes[qi + n_qatoms * state];
-    qi_type = D_qcatypes[qa_type.code - 1];
+  qi_type = D_qcatypes[qi];
 
-    ai_aii = qi_type.Ai;
-    ai_bii = qi_type.Bi;
+  ai_aii = qi_type.Ai;
+  ai_bii = qi_type.Bi;
 
-    V_a = ai_aii * A_O / (r6O * r6O);
-    V_b = ai_bii * B_O / (r6O);
+  V_a = ai_aii * A_O / (r6O * r6O);
+  V_b = ai_bii * B_O / (r6O);
 
-    VelO = D_topo.coulomb_constant * crg_ow * D_qcharges[qi + n_qatoms * state].q * rO;
-    VelH1 = D_topo.coulomb_constant * crg_hw * D_qcharges[qi + n_qatoms * state].q * rH1;
-    VelH2 = D_topo.coulomb_constant * crg_hw * D_qcharges[qi + n_qatoms * state].q * rH2;
+  VelO = D_topo.coulomb_constant * crg_ow * D_qcharges[qi].q * rO;
+  VelH1 = D_topo.coulomb_constant * crg_hw * D_qcharges[qi].q * rH1;
+  VelH2 = D_topo.coulomb_constant * crg_hw * D_qcharges[qi].q * rH2;
 
-    dvO += r2O * (-VelO - (12 * V_a - 6 * V_b)) * D_lambdas[state];
-    dvH1 -= r2H1 * VelH1 * D_lambdas[state];
-    dvH2 -= r2H2 * VelH2 * D_lambdas[state];
+  dvO += r2O * (-VelO - (12 * V_a - 6 * V_b));
+  dvH1 -= r2H1 * VelH1;
+  dvH2 -= r2H2 * VelH2;
 
-    // Update Q totals
-    Ecoul_S[row][state * BLOCK_SIZE + column] += (VelO + VelH1 + VelH2);
-    Evdw_S[row][state * BLOCK_SIZE + column] += (V_a - V_b);
-  }
+  // Update Q totals
+  Ecoul_S[row][column] += (VelO + VelH1 + VelH2);
+  Evdw_S[row][column] += (V_a - V_b);
 
   // Note r6O is not the usual 1/rO^6, but rather rO^6. be careful!!!
@@ -204,7 +200,7 @@ __device__ void calc_qw_dvel_matrix_incr(int row, int qi, int column, int n_lamb
 
 __global__ void calc_qw_dvel_matrix(int n_qatoms, int n_waters, int n_lambdas, double crg_ow, double crg_hw, double A_O, double B_O,
     coord_t* X, coord_t* W, double* Evdw, double* Ecoul, calc_qw_t* MAT,
-    q_catype_t* D_qcatypes, q_atype_t* D_qatypes, q_charge_t* D_qcharges, q_atom_t* D_qatoms, double* D_lambdas, topo_t D_topo) {
+    q_catype_t* D_qcatypes, q_charge_t* D_qcharges, q_atom_t* D_qatoms, topo_t D_topo) {
   // Block index
   int bx = blockIdx.x;
   int by = blockIdx.y;
@@ -214,8 +210,8 @@ __global__ void calc_qw_dvel_matrix(int n_qatoms, int n_waters, int n_lambdas, d
   int ty = threadIdx.y;
 
   // TODO implement >2 states on GPU
-  __shared__ double Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE];
-  __shared__ double Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE];
+  __shared__ double Evdw_S[BLOCK_SIZE][BLOCK_SIZE];
+  __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE];
 
   Ecoul_S[ty][tx] = 0;
   Evdw_S[ty][tx] = 0;
@@ -250,29 +246,26 @@ __global__ void calc_qw_dvel_matrix(int n_qatoms, int n_waters, int n_lambdas, d
     int column = bx * BLOCK_SIZE + tx;
 
     calc_qw_dvel_matrix_incr(ty, aStart + ty, tx, n_lambdas, n_qatoms, crg_ow, crg_hw, A_O, B_O, Qs, Ws, Evdw_S, Ecoul_S,
-        &qw, D_qcatypes, D_qatypes, D_qcharges, D_qatoms, D_lambdas, D_topo);
+        &qw, D_qcatypes, D_qcharges, D_qatoms, D_topo);
 
     MAT[column + n_waters * row] = qw;
 
   __syncthreads();
 
   int rowlen = (n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE;
-  int collen = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
 
   if (tx == 0 && ty == 0) {
     // TODO implement >2 states on GPU
-    for (int state = 0; state < min(2, n_lambdas); state++) {
-      double tot_Evdw = 0;
-      double tot_Ecoul = 0;
-      for (int i = 0; i < BLOCK_SIZE; i++) {
-        for (int j = 0; j < BLOCK_SIZE; j++) {
-          tot_Evdw += Evdw_S[i][j + state * BLOCK_SIZE];
-          tot_Ecoul += Ecoul_S[i][j + state * BLOCK_SIZE];
-        }
+    double tot_Evdw = 0;
+    double tot_Ecoul = 0;
+    for (int i = 0; i < BLOCK_SIZE; i++) {
+      for (int j = 0; j < BLOCK_SIZE; j++) {
+        tot_Evdw += Evdw_S[i][j];
+        tot_Ecoul += Ecoul_S[i][j];
       }
-      Evdw[rowlen * collen * state + rowlen * by + bx] = tot_Evdw;
-      Ecoul[rowlen * collen * state + rowlen * by + bx] = tot_Ecoul;
     }
+    Evdw[rowlen * by + bx] = tot_Evdw;
+    Ecoul[rowlen * by + bx] = tot_Ecoul;
   }
 
   __syncthreads();
@@ -282,36 +275,27 @@ __global__ void calc_qw_dvel_matrix(int n_qatoms, int n_waters, int n_lambdas, d
 
 void calc_nonbonded_qw_forces_host_v2() {
   using namespace CudaNonbondedQWForce;
 
-  int mem_size_X = n_atoms_solute * sizeof(coord_t);
-  int mem_size_W = 3 * n_waters * sizeof(coord_t);
   int mem_size_DV_X = n_atoms_solute * sizeof(dvel_t);
   int mem_size_DV_W = 3 * n_waters * sizeof(dvel_t);
-  int mem_size_MAT = 3 * n_waters * n_qatoms * sizeof(calc_qw_t);
 
   int n_blocks_q = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
   int n_blocks_w = (n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE;
-  // TODO make Evdw & Ecoul work for # of states > 2
-  int mem_size_QW_Evdw = min(n_lambdas, 2) * n_blocks_q * n_blocks_w * sizeof(double);
-  int mem_size_QW_Ecoul = min(n_lambdas, 2) * n_blocks_q * n_blocks_w * sizeof(double);
 
   CudaContext& ctx = CudaContext::instance();
   auto X = ctx.d_coords;
   auto DV_X = ctx.d_dvelocities;
-  auto D_qcatypes = ctx.d_q_catypes;
-  auto D_qatypes = ctx.d_q_atypes;
-  auto D_qcharges = ctx.d_q_charges;
+  auto D_qcatypes = ctx.d_q_aggregate_catypes;
+  auto D_qcharges = ctx.d_q_aggregate_charges;
   auto D_qatoms = ctx.d_q_atoms;
-  auto D_lambdas = ctx.d_lambdas;
 
   dim3 threads, grid;
   threads = dim3(BLOCK_SIZE, BLOCK_SIZE);
   grid = dim3((n_waters + BLOCK_SIZE - 1) / threads.x, (n_qatoms + BLOCK_SIZE - 1) / threads.y);
 
-  double evdw, ecoul;
   calc_qw_dvel_matrix<<<grid, threads>>>(n_qatoms, n_waters, n_lambdas, crg_ow, crg_hw, A_O, B_O, X, X + n_atoms_solute, D_QW_Evdw, D_QW_Ecoul,
-      QW_MAT, D_qcatypes, D_qatypes, D_qcharges, D_qatoms, D_lambdas, topo);
+      QW_MAT, D_qcatypes, D_qcharges, D_qatoms, topo);
   calc_qw_dvel_vector_column<<<((n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_qatoms, n_waters, DV_X, DV_X + n_atoms_solute, QW_MAT);
   calc_qw_dvel_vector_row<<<((n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_qatoms, n_waters, DV_X, DV_X + n_atoms_solute, QW_MAT, D_qatoms);
@@ -331,17 +315,15 @@ void calc_nonbonded_qw_forces_host_v2() {
   cudaMemcpy(dvelocities, DV_X, mem_size_DV_X, cudaMemcpyDeviceToHost);
   cudaMemcpy(&dvelocities[n_atoms_solute], DV_X + n_atoms_solute, mem_size_DV_W, cudaMemcpyDeviceToHost);
 
-  // TODO make Evdw & Ecoul work for # of states > 2
-  for (int state = 0; state < min(2, n_lambdas); state++) {
-    calc_energy_sum<<<1, threads>>>(n_blocks_q, n_blocks_w, D_QW_evdw_TOT, D_QW_ecoul_TOT, &D_QW_Evdw[state * n_blocks_w * n_blocks_q],
-        &D_QW_Ecoul[state * n_blocks_w * n_blocks_q], false);
+  calc_energy_sum<<<1, threads>>>(n_blocks_q, n_blocks_w, D_QW_evdw_TOT, D_QW_ecoul_TOT, D_QW_Evdw,
+      D_QW_Ecoul, false);
 
-    cudaMemcpy(&QW_evdw_TOT, D_QW_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost);
-    cudaMemcpy(&QW_ecoul_TOT, D_QW_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost);
+  cudaMemcpy(&QW_evdw_TOT, D_QW_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost);
+  cudaMemcpy(&QW_ecoul_TOT, D_QW_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost);
 
-    EQ_nonbond_qw[state].Uvdw += QW_evdw_TOT;
-    EQ_nonbond_qw[state].Ucoul += QW_ecoul_TOT;
-  }
+  E_nonbond_qx.Uvdw += QW_evdw_TOT;
+  E_nonbond_qx.Ucoul += QW_ecoul_TOT;
+  printf("QW Energy: Evdw = %f Ecoul = %f\n", QW_evdw_TOT, QW_ecoul_TOT);
 }
 
 void init_nonbonded_qw_force_kernel_data() {
@@ -358,8 +340,8 @@
   int n_blocks_w = (n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE;
   int mem_size_MAT = 3 * n_waters * n_qatoms * sizeof(calc_qw_t);
-  int mem_size_QW_Evdw = min(n_lambdas, 2) * n_blocks_q * n_blocks_w * sizeof(double);
-  int mem_size_QW_Ecoul = min(n_lambdas, 2) * n_blocks_q * n_blocks_w * sizeof(double);
+  int mem_size_QW_Evdw = n_blocks_q * n_blocks_w * sizeof(double);
+  int mem_size_QW_Ecoul = n_blocks_q * n_blocks_w * sizeof(double);
 
 #ifdef DEBUG
   printf("Allocating QW_MAT\n");
 #endif
diff --git a/src/core/system.cu b/src/core/system.cu
index afb999cf..ee50ee61 100644
--- a/src/core/system.cu
+++ b/src/core/system.cu
@@ -1,3 +1,4 @@
+#include
 #include "system.h"
 #include "utils.h"
 #include "parse.h"
@@ -1094,7 +1095,6 @@ void calc_integration_step(int iteration) {
   calc_bonded_forces();
   clock_t end_bonded = clock();
 
-  clock_t start_pp, end_pp, start_qp, end_qp;
   if (run_gpu) {
     start_qp = clock();
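Why dropping the per-state loop in the QP/QW kernels is consistent with the aggregation: each Coulomb and vdW term is linear in the Q-atom's own charge and LJ parameters, so the lambda-weighted sum of per-state terms equals one evaluation with the pre-mixed parameters uploaded by init_data_for_q(). The standalone check below demonstrates that identity with toy numbers (the Coulomb constant 332.0 and all other inputs are illustrative, not project data); the single result is the lambda-weighted mix, which is what E_nonbond_qx now accumulates instead of one energy per state.

// Sketch of the linearity argument behind the aggregation: summing per-state
// Coulomb/vdW terms weighted by lambda gives the same number as evaluating the
// term once with lambda-weighted Q-atom parameters. Toy values, not project data.
#include <cassert>
#include <cmath>
#include <cstdio>

int main() {
  const double k = 332.0;                   // Coulomb constant (illustrative value)
  const double lambdas[2] = {0.3, 0.7};
  const double q_state[2] = {-0.5, 0.2};    // Q-atom charge in each state
  const double A_state[2] = {900.0, 800.0}; // Q-atom LJ A parameter in each state
  const double q_j = 0.4, A_j = 550.0, r = 3.1;
  const double r6 = std::pow(r, 6);

  // Per-state evaluation, lambda-weighted (old kernel structure).
  double coul_sum = 0, vdwA_sum = 0;
  for (int s = 0; s < 2; s++) {
    coul_sum += lambdas[s] * (k * q_state[s] * q_j / r);
    vdwA_sum += lambdas[s] * (A_state[s] * A_j / (r6 * r6));
  }

  // Single evaluation with aggregated parameters (new kernel structure).
  double q_eff = lambdas[0] * q_state[0] + lambdas[1] * q_state[1];
  double A_eff = lambdas[0] * A_state[0] + lambdas[1] * A_state[1];
  double coul_agg = k * q_eff * q_j / r;
  double vdwA_agg = A_eff * A_j / (r6 * r6);

  assert(std::fabs(coul_sum - coul_agg) < 1e-9);
  assert(std::fabs(vdwA_sum - vdwA_agg) < 1e-9);
  printf("coul: %.6f == %.6f, vdw A-term: %.6f == %.6f\n",
         coul_sum, coul_agg, vdwA_sum, vdwA_agg);
  return 0;
}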