diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c6f9a44 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.vscode/settings.json diff --git a/PSEv1/Brownian.cu b/PSEv1/Brownian.cu index 907f20d..f267c33 100644 --- a/PSEv1/Brownian.cu +++ b/PSEv1/Brownian.cu @@ -75,7 +75,7 @@ using namespace hoomd; \brief Defines functions for PSE calculation of the Brownian Displacements // Uses LAPACKE to perform the final square root of the tridiagonal matrix - resulting from the Lanczos Method + resulting from the Lanczos Method */ //! Shared memory array for partial sum of dot product kernel @@ -83,842 +83,842 @@ extern __shared__ Scalar partial_sum[]; extern __shared__ Scalar4 shared_Fpos[]; /*! - Generate random numbers on particles - - \param d_psi random vector + Generate random numbers on particles + + \param d_psi random vector \param group_size number of particles - \param d_group_members index to particle arrays - \param timestep current time step - \param seed seed for random number generation + \param d_group_members index to particle arrays + \param timestep current time step + \param seed seed for random number generation - Thread-per-particle operations to generate random numbers - for the real space part of the Brownian calculation. Grid - and blocks are 1-D. + Thread-per-particle operations to generate random numbers + for the real space part of the Brownian calculation. Grid + and blocks are 1-D. */ __global__ void gpu_stokes_BrownianGenerate_kernel( - Scalar4 *d_psi, - unsigned int group_size, - unsigned int *d_group_members, - const unsigned int timestep, - const unsigned int seed - ){ + Scalar4 *d_psi, + unsigned int group_size, + unsigned int *d_group_members, + const unsigned int timestep, + const unsigned int seed + ){ - // Thread ID - int group_idx = blockDim.x * blockIdx.x + threadIdx.x; + // Thread ID + int group_idx = blockDim.x * blockIdx.x + threadIdx.x; - // Make sure that thread is in bounds - if (group_idx < group_size) { + // Make sure that thread is in bounds + if (group_idx < group_size) { - // Global particle index - unsigned int idx = d_group_members[group_idx]; + // Global particle index + unsigned int idx = d_group_members[group_idx]; - // Initialize random number generator - detail::Saru s(idx, timestep + seed); + // Initialize random number generator + detail::Saru s(idx, timestep + seed); - // Draw numbers from a Uniform distribution (-sqrt(3),sqrt(3), - // so that variance = 1/3 - Scalar sqrt3 = 1.73205080757; - Scalar randomx = s.f( -sqrt3, sqrt3 ); - Scalar randomy = s.f( -sqrt3, sqrt3 ); - Scalar randomz = s.f( -sqrt3, sqrt3 ); + // Draw numbers from a Uniform distribution (-sqrt(3),sqrt(3), + // so that variance = 1/3 + Scalar sqrt3 = 1.73205080757; + Scalar randomx = s.f( -sqrt3, sqrt3 ); + Scalar randomy = s.f( -sqrt3, sqrt3 ); + Scalar randomz = s.f( -sqrt3, sqrt3 ); - // Write to global memory, leaving the 4th element unchanged - d_psi[idx] = make_scalar4(randomx, randomy, randomz, d_psi[idx].w); + // Write to global memory, leaving the 4th element unchanged + d_psi[idx] = make_scalar4(randomx, randomy, randomz, d_psi[idx].w); - } + } } /*! - Generate random numbers for wave space Brownian motion ( random numbers on grid ) - - scale forces as they're generated and add directly to the existing grid. 
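(Note, not part of the patch: a uniform variate on (-a, a) has variance (2a)^2/12 = a^2/3. The per-particle kernel above therefore draws components of unit variance from (-sqrt(3), sqrt(3)), while the grid kernel documented here draws from (-sqrt(3/2), sqrt(3/2)) so that the real and imaginary parts each carry variance 1/2 and sum to unit variance, as the comments in the kernel body state.)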
- - \param d_gridX x-component of vectors on grid - \param d_gridY y-component of vectors on grid - \param d_gridZ z-component of vectors on grid - \param d_gridk reciprocal lattice vectors for each grid point - \param NxNyNz total number of grid points - \param Nx number of grid points in x-direction - \param Ny number of grid points in y-direction - \param Nz number of grid points in z-direction - \param timestep current simulation time step - \param seed seed for random number generation - \param T simulation temperature - \param dt simulation time step size - \param quadW quadrature weight for spectral Ewald integration - - Thread per grid node. 1-D grid of blocks, 1-D block of threads. + Generate random numbers for wave space Brownian motion ( random numbers on grid ) + - scale forces as they're generated and add directly to the existing grid. + + \param d_gridX x-component of vectors on grid + \param d_gridY y-component of vectors on grid + \param d_gridZ z-component of vectors on grid + \param d_gridk reciprocal lattice vectors for each grid point + \param NxNyNz total number of grid points + \param Nx number of grid points in x-direction + \param Ny number of grid points in y-direction + \param Nz number of grid points in z-direction + \param timestep current simulation time step + \param seed seed for random number generation + \param T simulation temperature + \param dt simulation time step size + \param quadW quadrature weight for spectral Ewald integration + + Thread per grid node. 1-D grid of blocks, 1-D block of threads. */ __global__ void gpu_stokes_BrownianGridGenerate_kernel( - CUFFTCOMPLEX *gridX, - CUFFTCOMPLEX *gridY, - CUFFTCOMPLEX *gridZ, - Scalar4 *gridk, - unsigned int NxNyNz, - int Nx, - int Ny, - int Nz, - const unsigned int timestep, - const unsigned int seed, - Scalar T, - Scalar dt, - Scalar quadW - ){ - - // Current thread index - int idx = blockDim.x * blockIdx.x + threadIdx.x; - - // Check if threads are in bounds - if ( idx < NxNyNz ) { + CUFFTCOMPLEX *gridX, + CUFFTCOMPLEX *gridY, + CUFFTCOMPLEX *gridZ, + Scalar4 *gridk, + unsigned int NxNyNz, + int Nx, + int Ny, + int Nz, + const unsigned int timestep, + const unsigned int seed, + Scalar T, + Scalar dt, + Scalar quadW + ){ + + // Current thread index + int idx = blockDim.x * blockIdx.x + threadIdx.x; + + // Check if threads are in bounds + if ( idx < NxNyNz ) { - // Random number generator - detail::Saru s(idx, timestep + seed); - - // Square root of 3.0 / 2.0 - Scalar sqrt3d2 = 1.2247448713915889; - - // Get random numbers from uniform distribution - // on (-sqrt(3/2),sqrt(3/2)) so that variance - // of ( reX + reY ) = 1.0 - Scalar reX = s.f( -sqrt3d2, sqrt3d2 ); - Scalar reY = s.f( -sqrt3d2, sqrt3d2 ); - Scalar reZ = s.f( -sqrt3d2, sqrt3d2 ); - Scalar imX = s.f( -sqrt3d2, sqrt3d2 ); - Scalar imY = s.f( -sqrt3d2, sqrt3d2 ); - Scalar imZ = s.f( -sqrt3d2, sqrt3d2 ); - - // Modulo arithmetic for indices for current grid point - int kk = idx % Nz; - int jj = ( ( idx - kk ) / Nz ) % Ny; - int ii = ( ( idx - kk ) / Nz - jj ) / Ny; - - // Scaling factor for covariance - Scalar fac = sqrtf(2.0*T/dt/quadW); - - // Variables required to place values on the grid - Scalar2 fX, fY, fZ; // forces for thread's point - Scalar2 fX_conj, fY_conj, fZ_conj; // forces for thread's conjugate point - Scalar2 kdF, kdF_conj; // dot(k,F) for thread and conjugate point - Scalar B12, B12_conj; // Scaling factors for thread and conjugate point - - // Only do work on half the grid points because we are simultaneously assigning values 
- // to each grid point and its conjugate. The following check makes sure we pick all of - // the points without conjugates (zeros and nyquist points) as well as all the points - // in the upper half of the grid. Also, ignore the origin in the wave space sum. (Sum - // is over all k!= 0) - if ( - !( 2 * kk >= Nz + 1 ) && - !( ( kk == 0 ) && ( 2 * jj >= Ny + 1 ) ) && - !( ( kk == 0 ) && ( jj == 0 ) && ( 2 * ii >= Nx + 1 ) ) && - !( ( kk == 0 ) && ( jj == 0 ) && ( ii == 0 ) ) - ){ - - // Is current grid point a nyquist point - bool ii_nyquist = ( ( ii == Nx/2 ) && ( Nx/2 == (Nx+1)/2 ) ); - bool jj_nyquist = ( ( jj == Ny/2 ) && ( Ny/2 == (Ny+1)/2 ) ); - bool kk_nyquist = ( ( kk == Nz/2 ) && ( Nz/2 == (Nz+1)/2 ) ); - - // Index of conjugate point - int ii_conj, jj_conj, kk_conj; - if ( ii == 0 ){ - ii_conj = ii; - } - else { - ii_conj = Nx - ii; - } - if ( jj == 0 ){ - jj_conj = jj; - } - else { - jj_conj = Ny - jj; - } - if ( kk == 0 ){ - kk_conj = kk; - } - else { - kk_conj = Nz - kk; - } - - // Global index of conjugate grid point - int conj_idx = ii_conj * Ny*Nz + jj_conj * Nz + kk_conj; - - // Current wave-space vector, conjugate wave space vector, and their - // magnitudes - Scalar4 tk = gridk[idx]; - Scalar4 tk_conj = gridk[conj_idx]; - - Scalar ksq = tk.x*tk.x + tk.y*tk.y + tk.z*tk.z; - Scalar ksq_conj = tk_conj.x*tk_conj.x + tk_conj.y*tk_conj.y + tk_conj.z*tk_conj.z; - - // Assign fluctuating values to the Nyquist points (no conjugate points) - if ( ( ii == 0 && jj_nyquist && kk == 0 ) || - ( ii_nyquist && jj == 0 && kk == 0 ) || - ( ii_nyquist && jj_nyquist && kk == 0 ) || - ( ii == 0 && jj == 0 && kk_nyquist ) || - ( ii == 0 && jj_nyquist && kk_nyquist ) || - ( ii_nyquist && jj == 0 && kk_nyquist ) || - ( ii_nyquist && jj_nyquist && kk_nyquist ) ){ - - // At the nyquist point, the random quantity only has a real component. 
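// Illustrative host-side sketch of the conjugate-index bookkeeping used in this
// kernel, isolated so the half-grid selection can be checked on the CPU.
// The struct and function names below are hypothetical; this is not part of the patch.
struct GridPointInfo {
    int conj_idx;        // flat index of the conjugate grid point
    bool self_conjugate; // true at the origin, zero planes, and Nyquist points
    bool do_work;        // true if this point is assigned by the kernel (half grid, k != 0)
};

GridPointInfo classify_grid_point( int idx, int Nx, int Ny, int Nz ){

    // Recover (ii,jj,kk) from the flat index, exactly as in the kernel
    int kk = idx % Nz;
    int jj = ( ( idx - kk ) / Nz ) % Ny;
    int ii = ( ( idx - kk ) / Nz - jj ) / Ny;

    // Conjugate indices (a zero index maps to itself)
    int ii_conj = ( ii == 0 ) ? 0 : Nx - ii;
    int jj_conj = ( jj == 0 ) ? 0 : Ny - jj;
    int kk_conj = ( kk == 0 ) ? 0 : Nz - kk;

    GridPointInfo info;
    info.conj_idx = ii_conj * Ny*Nz + jj_conj * Nz + kk_conj;

    // A point with conj_idx == idx has no distinct conjugate; these are the
    // zero/Nyquist combinations enumerated explicitly in the kernel above.
    info.self_conjugate = ( info.conj_idx == idx );

    // Same half-grid selection as the kernel (also skips the origin, k = 0)
    info.do_work = !( 2 * kk >= Nz + 1 ) &&
                   !( ( kk == 0 ) && ( 2 * jj >= Ny + 1 ) ) &&
                   !( ( kk == 0 ) && ( jj == 0 ) && ( 2 * ii >= Nx + 1 ) ) &&
                   !( ( kk == 0 ) && ( jj == 0 ) && ( ii == 0 ) );

    return info;
}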
Have to - // multiply by sqrt(2.0) to make sure the variance is still 1 - Scalar sqrt2 = 1.4142135623730951; - fX = make_scalar2( sqrt2*reX, 0.0 ); - fY = make_scalar2( sqrt2*reY, 0.0 ); - fZ = make_scalar2( sqrt2*reZ, 0.0 ); - - // Dot product of wave-vector with stochastic quantity - kdF = make_scalar2( ( tk.x*fX.x + tk.y*fY.x + tk.z*fZ.x ) / ksq, ( tk.x*fX.y + tk.y*fY.y + tk.z*fZ.y ) / ksq ); - - // Scaling factor - B12 = sqrtf( tk.w ); - Scalar k = sqrtf( ksq ); - B12 *= sinf( k ) / k; - - // Add random quantity to the grid AND scale by B^(1/2) simultaneously to save effort - gridX[idx].x = gridX[idx].x + fac * ( fX.x - tk.x * kdF.x ) * B12; - gridX[idx].y = gridX[idx].y + fac * ( fX.y - tk.x * kdF.y ) * B12; - - gridY[idx].x = gridY[idx].x + fac * ( fY.x - tk.y * kdF.x ) * B12; - gridY[idx].y = gridY[idx].y + fac * ( fY.y - tk.y * kdF.y ) * B12; - - gridZ[idx].x = gridZ[idx].x + fac * ( fZ.x - tk.z * kdF.x ) * B12; - gridZ[idx].y = gridZ[idx].y + fac * ( fZ.y - tk.z * kdF.y ) * B12; - - } - else { - - // Construct random force - fX = make_scalar2( reX, imX ); - fY = make_scalar2( reY, imY ); - fZ = make_scalar2( reZ, imZ ); - - // The random force at the conjugate point is the conjugate of the force at - // the current point - fX_conj = make_scalar2( reX, -imX ); - fY_conj = make_scalar2( reY, -imY ); - fZ_conj = make_scalar2( reZ, -imZ ); - - // Dot prodcut of force with wave vector at current and conjugate point - kdF = make_scalar2( ( tk.x*fX.x + tk.y*fY.x + tk.z*fZ.x ) / ksq, ( tk.x*fX.y + tk.y*fY.y + tk.z*fZ.y ) / ksq ); - kdF_conj = make_scalar2( ( tk_conj.x*fX_conj.x + tk_conj.y*fY_conj.x + tk_conj.z*fZ_conj.x ) / ksq_conj, ( tk_conj.x*fX_conj.y + tk_conj.y*fY_conj.y + tk_conj.z*fZ_conj.y ) / ksq_conj ); - - // Scaling factors at current and conjugate point - B12 = sqrtf( tk.w ); - B12_conj = sqrtf( tk_conj.w ); - - Scalar k = sqrtf( ksq ); - Scalar kconj = sqrtf( ksq_conj ); - B12 *= sinf( k ) / k; - B12_conj *= sinf( kconj ) / kconj; - - // Add random quantity to the grid AND scale by B^(1/2) simultaneously to save effort - // Current grid point - gridX[idx].x = gridX[idx].x + fac * ( fX.x - tk.x * kdF.x ) * B12; - gridX[idx].y = gridX[idx].y + fac * ( fX.y - tk.x * kdF.y ) * B12; - - gridY[idx].x = gridY[idx].x + fac * ( fY.x - tk.y * kdF.x ) * B12; - gridY[idx].y = gridY[idx].y + fac * ( fY.y - tk.y * kdF.y ) * B12; - - gridZ[idx].x = gridZ[idx].x + fac * ( fZ.x - tk.z * kdF.x ) * B12; - gridZ[idx].y = gridZ[idx].y + fac * ( fZ.y - tk.z * kdF.y ) * B12; - - // Add random quantity to the grid AND scale by B^(1/2) simultaneously to save effort - // Conjugate grid point - gridX[conj_idx].x = gridX[conj_idx].x + fac * ( fX_conj.x - tk_conj.x * kdF_conj.x ) * B12_conj; - gridX[conj_idx].y = gridX[conj_idx].y + fac * ( fX_conj.y - tk_conj.x * kdF_conj.y ) * B12_conj; - - gridY[conj_idx].x = gridY[conj_idx].x + fac * ( fY_conj.x - tk_conj.y * kdF_conj.x ) * B12_conj; - gridY[conj_idx].y = gridY[conj_idx].y + fac * ( fY_conj.y - tk_conj.y * kdF_conj.y ) * B12_conj; - - gridZ[conj_idx].x = gridZ[conj_idx].x + fac * ( fZ_conj.x - tk_conj.z * kdF_conj.x ) * B12_conj; - gridZ[conj_idx].y = gridZ[conj_idx].y + fac * ( fZ_conj.y - tk_conj.z * kdF_conj.y ) * B12_conj; - - } - - - - } + // Random number generator + detail::Saru s(idx, timestep + seed); + + // Square root of 3.0 / 2.0 + Scalar sqrt3d2 = 1.2247448713915889; + + // Get random numbers from uniform distribution + // on (-sqrt(3/2),sqrt(3/2)) so that variance + // of ( reX + reY ) = 1.0 + Scalar reX = s.f( -sqrt3d2, sqrt3d2 ); + 
Scalar reY = s.f( -sqrt3d2, sqrt3d2 ); + Scalar reZ = s.f( -sqrt3d2, sqrt3d2 ); + Scalar imX = s.f( -sqrt3d2, sqrt3d2 ); + Scalar imY = s.f( -sqrt3d2, sqrt3d2 ); + Scalar imZ = s.f( -sqrt3d2, sqrt3d2 ); + + // Modulo arithmetic for indices for current grid point + int kk = idx % Nz; + int jj = ( ( idx - kk ) / Nz ) % Ny; + int ii = ( ( idx - kk ) / Nz - jj ) / Ny; + + // Scaling factor for covariance + Scalar fac = sqrtf(2.0*T/dt/quadW); + + // Variables required to place values on the grid + Scalar2 fX, fY, fZ; // forces for thread's point + Scalar2 fX_conj, fY_conj, fZ_conj; // forces for thread's conjugate point + Scalar2 kdF, kdF_conj; // dot(k,F) for thread and conjugate point + Scalar B12, B12_conj; // Scaling factors for thread and conjugate point + + // Only do work on half the grid points because we are simultaneously assigning values + // to each grid point and its conjugate. The following check makes sure we pick all of + // the points without conjugates (zeros and nyquist points) as well as all the points + // in the upper half of the grid. Also, ignore the origin in the wave space sum. (Sum + // is over all k!= 0) + if ( + !( 2 * kk >= Nz + 1 ) && + !( ( kk == 0 ) && ( 2 * jj >= Ny + 1 ) ) && + !( ( kk == 0 ) && ( jj == 0 ) && ( 2 * ii >= Nx + 1 ) ) && + !( ( kk == 0 ) && ( jj == 0 ) && ( ii == 0 ) ) + ){ + + // Is current grid point a nyquist point + bool ii_nyquist = ( ( ii == Nx/2 ) && ( Nx/2 == (Nx+1)/2 ) ); + bool jj_nyquist = ( ( jj == Ny/2 ) && ( Ny/2 == (Ny+1)/2 ) ); + bool kk_nyquist = ( ( kk == Nz/2 ) && ( Nz/2 == (Nz+1)/2 ) ); + + // Index of conjugate point + int ii_conj, jj_conj, kk_conj; + if ( ii == 0 ){ + ii_conj = ii; + } + else { + ii_conj = Nx - ii; + } + if ( jj == 0 ){ + jj_conj = jj; + } + else { + jj_conj = Ny - jj; + } + if ( kk == 0 ){ + kk_conj = kk; + } + else { + kk_conj = Nz - kk; + } + + // Global index of conjugate grid point + int conj_idx = ii_conj * Ny*Nz + jj_conj * Nz + kk_conj; + + // Current wave-space vector, conjugate wave space vector, and their + // magnitudes + Scalar4 tk = gridk[idx]; + Scalar4 tk_conj = gridk[conj_idx]; + + Scalar ksq = tk.x*tk.x + tk.y*tk.y + tk.z*tk.z; + Scalar ksq_conj = tk_conj.x*tk_conj.x + tk_conj.y*tk_conj.y + tk_conj.z*tk_conj.z; + + // Assign fluctuating values to the Nyquist points (no conjugate points) + if ( ( ii == 0 && jj_nyquist && kk == 0 ) || + ( ii_nyquist && jj == 0 && kk == 0 ) || + ( ii_nyquist && jj_nyquist && kk == 0 ) || + ( ii == 0 && jj == 0 && kk_nyquist ) || + ( ii == 0 && jj_nyquist && kk_nyquist ) || + ( ii_nyquist && jj == 0 && kk_nyquist ) || + ( ii_nyquist && jj_nyquist && kk_nyquist ) ){ + + // At the nyquist point, the random quantity only has a real component. 
Have to + // multiply by sqrt(2.0) to make sure the variance is still 1 + Scalar sqrt2 = 1.4142135623730951; + fX = make_scalar2( sqrt2*reX, 0.0 ); + fY = make_scalar2( sqrt2*reY, 0.0 ); + fZ = make_scalar2( sqrt2*reZ, 0.0 ); + + // Dot product of wave-vector with stochastic quantity + kdF = make_scalar2( ( tk.x*fX.x + tk.y*fY.x + tk.z*fZ.x ) / ksq, ( tk.x*fX.y + tk.y*fY.y + tk.z*fZ.y ) / ksq ); + + // Scaling factor + B12 = sqrtf( tk.w ); + Scalar k = sqrtf( ksq ); + B12 *= sinf( k ) / k; + + // Add random quantity to the grid AND scale by B^(1/2) simultaneously to save effort + gridX[idx].x = gridX[idx].x + fac * ( fX.x - tk.x * kdF.x ) * B12; + gridX[idx].y = gridX[idx].y + fac * ( fX.y - tk.x * kdF.y ) * B12; + + gridY[idx].x = gridY[idx].x + fac * ( fY.x - tk.y * kdF.x ) * B12; + gridY[idx].y = gridY[idx].y + fac * ( fY.y - tk.y * kdF.y ) * B12; + + gridZ[idx].x = gridZ[idx].x + fac * ( fZ.x - tk.z * kdF.x ) * B12; + gridZ[idx].y = gridZ[idx].y + fac * ( fZ.y - tk.z * kdF.y ) * B12; + + } + else { + + // Construct random force + fX = make_scalar2( reX, imX ); + fY = make_scalar2( reY, imY ); + fZ = make_scalar2( reZ, imZ ); + + // The random force at the conjugate point is the conjugate of the force at + // the current point + fX_conj = make_scalar2( reX, -imX ); + fY_conj = make_scalar2( reY, -imY ); + fZ_conj = make_scalar2( reZ, -imZ ); + + // Dot prodcut of force with wave vector at current and conjugate point + kdF = make_scalar2( ( tk.x*fX.x + tk.y*fY.x + tk.z*fZ.x ) / ksq, ( tk.x*fX.y + tk.y*fY.y + tk.z*fZ.y ) / ksq ); + kdF_conj = make_scalar2( ( tk_conj.x*fX_conj.x + tk_conj.y*fY_conj.x + tk_conj.z*fZ_conj.x ) / ksq_conj, ( tk_conj.x*fX_conj.y + tk_conj.y*fY_conj.y + tk_conj.z*fZ_conj.y ) / ksq_conj ); + + // Scaling factors at current and conjugate point + B12 = sqrtf( tk.w ); + B12_conj = sqrtf( tk_conj.w ); + + Scalar k = sqrtf( ksq ); + Scalar kconj = sqrtf( ksq_conj ); + B12 *= sinf( k ) / k; + B12_conj *= sinf( kconj ) / kconj; + + // Add random quantity to the grid AND scale by B^(1/2) simultaneously to save effort + // Current grid point + gridX[idx].x = gridX[idx].x + fac * ( fX.x - tk.x * kdF.x ) * B12; + gridX[idx].y = gridX[idx].y + fac * ( fX.y - tk.x * kdF.y ) * B12; + + gridY[idx].x = gridY[idx].x + fac * ( fY.x - tk.y * kdF.x ) * B12; + gridY[idx].y = gridY[idx].y + fac * ( fY.y - tk.y * kdF.y ) * B12; + + gridZ[idx].x = gridZ[idx].x + fac * ( fZ.x - tk.z * kdF.x ) * B12; + gridZ[idx].y = gridZ[idx].y + fac * ( fZ.y - tk.z * kdF.y ) * B12; + + // Add random quantity to the grid AND scale by B^(1/2) simultaneously to save effort + // Conjugate grid point + gridX[conj_idx].x = gridX[conj_idx].x + fac * ( fX_conj.x - tk_conj.x * kdF_conj.x ) * B12_conj; + gridX[conj_idx].y = gridX[conj_idx].y + fac * ( fX_conj.y - tk_conj.x * kdF_conj.y ) * B12_conj; + + gridY[conj_idx].x = gridY[conj_idx].x + fac * ( fY_conj.x - tk_conj.y * kdF_conj.x ) * B12_conj; + gridY[conj_idx].y = gridY[conj_idx].y + fac * ( fY_conj.y - tk_conj.y * kdF_conj.y ) * B12_conj; + + gridZ[conj_idx].x = gridZ[conj_idx].x + fac * ( fZ_conj.x - tk_conj.z * kdF_conj.x ) * B12_conj; + gridZ[conj_idx].y = gridZ[conj_idx].y + fac * ( fZ_conj.y - tk_conj.z * kdF_conj.y ) * B12_conj; + + } + + + + } - } + } } /*! - Use Lanczos method to compute Mreal^0.5 * psi + Use Lanczos method to compute Mreal^0.5 * psi - This method is detailed in: - "Preconditioned Krylov Subspace Methods for Sampling Multivariate Gaussian Distributions" - Edmond Chow and Yousef Saad, IAM J. Sci. 
Comput., 36(2), A588–A608 + This method is detailed in: + "Preconditioned Krylov Subspace Methods for Sampling Multivariate Gaussian Distributions" + Edmond Chow and Yousef Saad, IAM J. Sci. Comput., 36(2), A588–A608 */ -void gpu_stokes_BrealLanczos_wrap( - Scalar4 *d_psi, - Scalar4 *d_pos, - unsigned int *d_group_members, - unsigned int group_size, - const BoxDim& box, - Scalar dt, - Scalar4 *d_vel, - const Scalar T, - const unsigned int timestep, - const unsigned int seed, - Scalar xi, - Scalar ewald_cut, - Scalar ewald_dr, - int ewald_n, - Scalar4 *d_ewaldC1, - const unsigned int *d_n_neigh, - const unsigned int *d_nlist, - const unsigned int *d_headlist, - int& m, - Scalar cheb_error, - dim3 grid, - dim3 threads, - int gridBlockSize, - int gridNBlock, - Scalar3 gridh, - Scalar self - ){ - - // Dot product kernel specifications - unsigned int thread_for_dot = 512; // Must be 2^n - unsigned int grid_for_dot = (group_size/thread_for_dot) + 1; - - // Temp var for dot product. - Scalar *dot_sum; - cudaMalloc( (void**)&dot_sum, grid_for_dot*sizeof(Scalar) ); - - // Allocate storage - // - int m_in = m; - int m_max = 100; +void gpu_stokes_BrealLanczos_wrap( + Scalar4 *d_psi, + Scalar4 *d_pos, + unsigned int *d_group_members, + unsigned int group_size, + const BoxDim& box, + Scalar dt, + Scalar4 *d_vel, + const Scalar T, + const unsigned int timestep, + const unsigned int seed, + Scalar xi, + Scalar ewald_cut, + Scalar ewald_dr, + int ewald_n, + Scalar4 *d_ewaldC1, + const unsigned int *d_n_neigh, + const unsigned int *d_nlist, + const unsigned int *d_headlist, + int& m, + Scalar cheb_error, + dim3 grid, + dim3 threads, + int gridBlockSize, + int gridNBlock, + Scalar3 gridh, + Scalar self + ){ + + // Dot product kernel specifications + unsigned int thread_for_dot = 512; // Must be 2^n + unsigned int grid_for_dot = (group_size/thread_for_dot) + 1; + + // Temp var for dot product. 
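// Note: grid_for_dot = (group_size / thread_for_dot) + 1 rounds the launch up so
// that every particle is covered even when group_size is not a multiple of the
// block size; out-of-range threads contribute zero to the partial sums in
// gpu_stokes_DotStepOne_kernel, so an extra (empty) block is harmless.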
+ Scalar *dot_sum; + cudaMalloc( (void**)&dot_sum, grid_for_dot*sizeof(Scalar) ); + + // Allocate storage + // + int m_in = m; + int m_max = 100; // Storage vectors for tridiagonal factorization - float *alpha, *beta, *alpha_save, *beta_save; + float *alpha, *beta, *alpha_save, *beta_save; alpha = (float *)malloc( (m_max)*sizeof(float) ); alpha_save = (float *)malloc( (m_max)*sizeof(float) ); beta = (float *)malloc( (m_max+1)*sizeof(float) ); beta_save = (float *)malloc( (m_max+1)*sizeof(float) ); - // Vectors for Lapacke and square root - float *W; - W = (float *)malloc( (m_max*m_max)*sizeof(float) ); - float *W1; // W1 = Lambda^(1/2) * ( W^T * e1 ) - W1 = (float *)malloc( (m_max)*sizeof(float) ); - float *Tm; - Tm = (float *)malloc( m_max*sizeof(float) ); - Scalar *d_Tm; - cudaMalloc( (void**)&d_Tm, m_max * sizeof(Scalar) ); - - // Vectors for Lanczos iterations - Scalar4 *d_v, *d_vj, *d_vjm1; - cudaMalloc( (void**)&d_v, group_size*sizeof(Scalar4) ); - cudaMalloc( (void**)&d_vj, group_size*sizeof(Scalar4) ); - cudaMalloc( (void**)&d_vjm1, group_size*sizeof(Scalar4) ); - - // Storage vector for M*vj - Scalar4 *d_Mvj; - cudaMalloc( (void**)&d_Mvj, group_size*sizeof(Scalar4) ); - - // Storage array for V - Scalar4 *d_V; - cudaMalloc( (void**)&d_V, m_max*group_size * sizeof(Scalar4) ); - - // Step-norm things - Scalar4 *d_vel_old, *d_Mpsi; - cudaMalloc( (void**)&d_vel_old, group_size*sizeof(Scalar4) ); - cudaMalloc( (void**)&d_Mpsi, group_size*sizeof(Scalar4) ); - Scalar psiMpsi; - - // Temporary pointer - Scalar4 *d_temp; - - // Copy random vector to v0 - cudaMemcpy( d_vj, d_psi, group_size*sizeof(Scalar4), cudaMemcpyDeviceToDevice ); - - // Compute the norm of the d_psi (also the norm of basis vector v0) + // Vectors for Lapacke and square root + float *W; + W = (float *)malloc( (m_max*m_max)*sizeof(float) ); + float *W1; // W1 = Lambda^(1/2) * ( W^T * e1 ) + W1 = (float *)malloc( (m_max)*sizeof(float) ); + float *Tm; + Tm = (float *)malloc( m_max*sizeof(float) ); + Scalar *d_Tm; + cudaMalloc( (void**)&d_Tm, m_max * sizeof(Scalar) ); + + // Vectors for Lanczos iterations + Scalar4 *d_v, *d_vj, *d_vjm1; + cudaMalloc( (void**)&d_v, group_size*sizeof(Scalar4) ); + cudaMalloc( (void**)&d_vj, group_size*sizeof(Scalar4) ); + cudaMalloc( (void**)&d_vjm1, group_size*sizeof(Scalar4) ); + + // Storage vector for M*vj + Scalar4 *d_Mvj; + cudaMalloc( (void**)&d_Mvj, group_size*sizeof(Scalar4) ); + + // Storage array for V + Scalar4 *d_V; + cudaMalloc( (void**)&d_V, m_max*group_size * sizeof(Scalar4) ); + + // Step-norm things + Scalar4 *d_vel_old, *d_Mpsi; + cudaMalloc( (void**)&d_vel_old, group_size*sizeof(Scalar4) ); + cudaMalloc( (void**)&d_Mpsi, group_size*sizeof(Scalar4) ); + Scalar psiMpsi; + + // Temporary pointer + Scalar4 *d_temp; + + // Copy random vector to v0 + cudaMemcpy( d_vj, d_psi, group_size*sizeof(Scalar4), cudaMemcpyDeviceToDevice ); + + // Compute the norm of the d_psi (also the norm of basis vector v0) Scalar vnorm; - gpu_stokes_DotStepOne_kernel<<< grid_for_dot, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(d_vj, d_vj, dot_sum, group_size, d_group_members); - gpu_stokes_DotStepTwo_kernel<<< 1, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(dot_sum, grid_for_dot); - cudaMemcpy(&vnorm, dot_sum, sizeof(Scalar), cudaMemcpyDeviceToHost); - vnorm = sqrtf( vnorm ); + gpu_stokes_DotStepOne_kernel<<< grid_for_dot, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(d_vj, d_vj, dot_sum, group_size, d_group_members); + gpu_stokes_DotStepTwo_kernel<<< 1, thread_for_dot, 
thread_for_dot*sizeof(Scalar) >>>(dot_sum, grid_for_dot); + cudaMemcpy(&vnorm, dot_sum, sizeof(Scalar), cudaMemcpyDeviceToHost); + vnorm = sqrtf( vnorm ); - Scalar psinorm = vnorm; + Scalar psinorm = vnorm; - // Compute psi * M * psi ( for step norm ) - gpu_stokes_Mreal_kernel<<>>(d_pos, d_Mpsi, d_psi, group_size, xi, d_ewaldC1, self, ewald_cut, ewald_n, ewald_dr, d_group_members, box, d_n_neigh, d_nlist, d_headlist ); - gpu_stokes_DotStepOne_kernel<<< grid_for_dot, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(d_psi, d_Mpsi, dot_sum, group_size, d_group_members); - gpu_stokes_DotStepTwo_kernel<<< 1, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(dot_sum, grid_for_dot); - cudaMemcpy(&psiMpsi, dot_sum, sizeof(Scalar), cudaMemcpyDeviceToHost); + // Compute psi * M * psi ( for step norm ) + gpu_stokes_Mreal_kernel<<>>(d_pos, d_Mpsi, d_psi, group_size, xi, d_ewaldC1, self, ewald_cut, ewald_n, ewald_dr, d_group_members, box, d_n_neigh, d_nlist, d_headlist ); + gpu_stokes_DotStepOne_kernel<<< grid_for_dot, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(d_psi, d_Mpsi, dot_sum, group_size, d_group_members); + gpu_stokes_DotStepTwo_kernel<<< 1, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(dot_sum, grid_for_dot); + cudaMemcpy(&psiMpsi, dot_sum, sizeof(Scalar), cudaMemcpyDeviceToHost); - psiMpsi = psiMpsi / ( psinorm * psinorm ); + psiMpsi = psiMpsi / ( psinorm * psinorm ); // First iteration, vjm1 = 0, vj = psi / norm( psi ) - gpu_stokes_LinearCombination_kernel<<>>(d_vj, d_vj, d_vjm1, 0.0, 0.0, group_size, d_group_members); - gpu_stokes_LinearCombination_kernel<<>>(d_vj, d_vj, d_vj, 1.0/vnorm, 0.0, group_size, d_group_members); - - // Start by computing (m-1) iterations, so that the stepnorm for the given - // number of iterations can be compute - m = m_in - 1; - m = m < 1 ? 1 : m; - - // Values for current alpha and beta in the iteration - Scalar tempalpha; - Scalar tempbeta = 0.0; - - // Apply the Lanczos method - for ( int jj = 0; jj < m; ++jj ){ - - // Store current basis vector - cudaMemcpy( &d_V[jj*group_size], d_vj, group_size*sizeof(Scalar4), cudaMemcpyDeviceToDevice ); - - // Store beta - beta[jj] = tempbeta; - - // v = M*vj - betaj*vjm1 - gpu_stokes_Mreal_kernel<<>>(d_pos, d_Mvj, d_vj, group_size, xi, d_ewaldC1, self, ewald_cut, ewald_n, ewald_dr, d_group_members, box, d_n_neigh, d_nlist, d_headlist ); - gpu_stokes_LinearCombination_kernel<<>>(d_Mvj, d_vjm1, d_v, 1.0, -1.0*tempbeta, group_size, d_group_members); - - // vj dot v - gpu_stokes_DotStepOne_kernel<<< grid_for_dot, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(d_vj, d_v, dot_sum, group_size, d_group_members); - gpu_stokes_DotStepTwo_kernel<<< 1, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(dot_sum, grid_for_dot); - cudaMemcpy(&tempalpha, dot_sum, sizeof(Scalar), cudaMemcpyDeviceToHost); - - // Store updated alpha - alpha[jj] = tempalpha; - - // v = v - alphaj*vj - gpu_stokes_LinearCombination_kernel<<>>(d_v, d_vj, d_v, 1.0, -1.0*tempalpha, group_size, d_group_members); - - // v dot v - gpu_stokes_DotStepOne_kernel<<< grid_for_dot, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(d_v, d_v, dot_sum, group_size, d_group_members); - gpu_stokes_DotStepTwo_kernel<<< 1, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(dot_sum, grid_for_dot); - cudaMemcpy(&vnorm, dot_sum, sizeof(Scalar), cudaMemcpyDeviceToHost); - vnorm = sqrtf( vnorm ); - - // betajp1 = norm( v ) - tempbeta = vnorm; - - // Check that the basis vector is not too small. 
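// For reference, each pass of this loop is one step of the standard Lanczos
// three-term recurrence applied to the real-space mobility M (cf. Chow & Saad):
//     w         = M v_j - beta_j v_(j-1)
//     alpha_j   = v_j . w
//     w         = w - alpha_j v_j
//     beta_(j+1) = || w ||,   v_(j+1) = w / beta_(j+1)
// The alpha_j and beta_j assembled here are the diagonal and sub-diagonal of the
// tridiagonal matrix T_m whose square root is taken below.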
If so, end the iteration - // (If norm is too small, will have numerical trouble) - if ( vnorm < 1E-8 ){ - m = jj; - break; - } - - // vjp1 = v / betajp1 - gpu_stokes_LinearCombination_kernel<<>>(d_v, d_v, d_v, 1.0/tempbeta, 0.0, group_size, d_group_members); - - // Swap pointers - d_temp = d_vjm1; - d_vjm1 = d_vj; - d_vj = d_v; - d_v = d_temp; - - } - - // Save alpha, beta vectors (will be overwritten by lapack) - for ( int ii = 0; ii < m; ++ii ){ - alpha_save[ii] = alpha[ii]; - beta_save[ii] = beta[ii]; - } - beta_save[m] = beta[m]; - - // Now that we have alpha, beta, have to compute the square root of the tridiagonal - // matrix Tm. Do this using eigen-decomposition. - // - // Compute eigen-decomposition of tridiagonal matrix - // alpha (input) - vector of entries on main diagonal - // alpha (output) - eigenvalues sorted in descending order - // beta (input) - vector of entries of sub-diagonal - // beta (output) - overwritten (zeros?) - // W - (output) - matrix of eigenvectors. ith column corresponds to ith eigenvalue - // INFO (output) = 0 if operation was succesful - int INFO = LAPACKE_spteqr( LAPACK_ROW_MAJOR, 'I', m, alpha, &beta[1], W, m ); - - // Check whether the eigen-decomposition failed, and throw error on failure - if ( INFO != 0 ){ - printf("Eigenvalue decomposition #1 failed \n"); - printf("INFO = %i \n", INFO); - - printf("\n alpha: \n"); - for( int ii = 0; ii < m; ++ii ){ - printf("%f \n", alpha_save[ii]); - } - printf("\n beta: \n"); - for( int ii = 0; ii < m; ++ii ){ - printf("%f \n", beta_save[ii]); - } - printf("%f \n", beta_save[m]); - - printf("Note to User: restart simulation and proceed. \n"); - - exit(EXIT_FAILURE); - } - - // Now, we have to compute Tm^(1/2) * e1 - // Tm^(1/2) = W * Lambda^(1/2) * W^T * e1 - // = W * Lambda^(1/2) * ( W^T * e1 ) - // The quantity in parentheses is the first row of W - // Lambda^(1/2) only has diagonal entries, so it's product with the first row of W - // is easy to compute. 
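// In equations (using the eigen-decomposition T_m = W Lambda W^T computed by
// LAPACKE_spteqr above):
//     T_m^(1/2) e_1 = W Lambda^(1/2) ( W^T e_1 ),
// where ( W^T e_1 ) is simply the first row of W in this row-major storage. The
// Brownian velocity sample is then reconstructed as
//     M^(1/2) psi  ~=  || psi || * V_m * T_m^(1/2) e_1,
// with the thermal prefactor sqrt( 2 T / dt ) applied once at the end of this routine.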
- for ( int ii = 0; ii < m; ++ii ){ - W1[ii] = sqrtf( alpha[ii] ) * W[ii]; - } - - // Tm = W * W1 = W * Lambda^(1/2) * W^T * e1 - float tempsum; - for ( int ii = 0; ii < m; ++ii ){ - tempsum = 0.0; - for ( int jj = 0; jj < m; ++jj ){ - int idx = m*ii + jj; - - tempsum += W[idx] * W1[jj]; - } - Tm[ii] = tempsum; - } - - // Copy matrix to GPU - cudaMemcpy( d_Tm, Tm, m*sizeof(Scalar), cudaMemcpyHostToDevice ); - - // Multiply basis vectors by Tm, [ V0, V1, ..., Vm-1 ] * Tm - gpu_stokes_MatVecMultiply_kernel<<>>(d_V, d_Tm, d_vel, group_size, m); - - // Copy velocity - cudaMemcpy( d_vel_old, d_vel, group_size*sizeof(Scalar4), cudaMemcpyDeviceToDevice ); - - // Restore alpha, beta - for ( int ii = 0; ii < m; ++ii ){ - alpha[ii] = alpha_save[ii]; - beta[ii] = beta_save[ii]; - } - beta[m] = beta_save[m]; - - - // - // Keep adding to basis vectors until the step norm is small enough - // - Scalar stepnorm = 1.0; - int jj; - while( stepnorm > cheb_error && m < m_max ){ - m++; - jj = m - 1; - - // - // Do another Lanczos iteration - // - - // Store the current basis vector - cudaMemcpy( &d_V[jj*group_size], d_vj, group_size*sizeof(Scalar4), cudaMemcpyDeviceToDevice ); - - // Store beta - beta[jj] = tempbeta; - - // v = M*vj - betaj*vjm1 - gpu_stokes_Mreal_kernel<<>>(d_pos, d_Mvj, d_vj, group_size, xi, d_ewaldC1, self, ewald_cut, ewald_n, ewald_dr, d_group_members, box, d_n_neigh, d_nlist, d_headlist ); - gpu_stokes_LinearCombination_kernel<<>>(d_Mvj, d_vjm1, d_v, 1.0, -1.0*tempbeta, group_size, d_group_members); - - // vj dot v - gpu_stokes_DotStepOne_kernel<<< grid_for_dot, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(d_vj, d_v, dot_sum, group_size, d_group_members); - gpu_stokes_DotStepTwo_kernel<<< 1, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(dot_sum, grid_for_dot); - cudaMemcpy(&tempalpha, dot_sum, sizeof(Scalar), cudaMemcpyDeviceToHost); - - // Store updated alpha - alpha[jj] = tempalpha; - - // v = v - alphaj*vj - gpu_stokes_LinearCombination_kernel<<>>(d_v, d_vj, d_v, 1.0, -1.0*tempalpha, group_size, d_group_members); - - // v dot v - gpu_stokes_DotStepOne_kernel<<< grid_for_dot, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(d_v, d_v, dot_sum, group_size, d_group_members); - gpu_stokes_DotStepTwo_kernel<<< 1, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(dot_sum, grid_for_dot); - cudaMemcpy(&vnorm, dot_sum, sizeof(Scalar), cudaMemcpyDeviceToHost); - vnorm = sqrtf( vnorm ); - - // betajp1 = norm( v ) - tempbeta = vnorm; - - // Check if the norm of the basis vector is too small. If - // so, end the iteration. 
- if ( vnorm < 1E-8 ){ - m = jj; - break; - } - - // vjp1 = v / betajp1 - gpu_stokes_LinearCombination_kernel<<>>(d_v, d_v, d_v, 1.0/tempbeta, 0.0, group_size, d_group_members); - - // Swap pointers - d_temp = d_vjm1; - d_vjm1 = d_vj; - d_vj = d_v; - d_v = d_temp; - - // Save alpha, beta vectors (will be overwritten by lapack) - for ( int ii = 0; ii < m; ++ii ){ - alpha_save[ii] = alpha[ii]; - beta_save[ii] = beta[ii]; - } - beta_save[m] = beta[m]; - - // - // Square root calculation with addition of latest Lanczos iteration - // (see first implementation above more description) - // - - // Compute eigen-decomposition of tridiagonal matrix - int INFO = LAPACKE_spteqr( LAPACK_ROW_MAJOR, 'I', m, alpha, &beta[1], W, m ); - - // Check whether the eigen-decomposition failed, and throw error on failure - if ( INFO != 0 ){ - printf("Eigenvalue decomposition #2 failed \n"); - printf("INFO = %i \n", INFO); - - printf("\n alpha: \n"); - for( int ii = 0; ii < m; ++ii ){ - printf("%f \n", alpha_save[ii]); - } - printf("\n beta: \n"); - for( int ii = 0; ii < m; ++ii ){ - printf("%f \n", beta_save[ii]); - } - printf("%f \n", beta_save[m]); - - printf("Note to User: restart simulation and proceed. \n"); - - exit(EXIT_FAILURE); - } - - // Now, we have to compute Tm^(1/2) * e1 - for ( int ii = 0; ii < m; ++ii ){ - W1[ii] = sqrtf( alpha[ii] ) * W[ii]; - } - - // Tm = W * W1 = W * Lambda^(1/2) * W^T * e1 - float tempsum; - for ( int ii = 0; ii < m; ++ii ){ - tempsum = 0.0; - for ( int jj = 0; jj < m; ++jj ){ - int idx = m*ii + jj; - - tempsum += W[idx] * W1[jj]; - } - Tm[ii] = tempsum; - } - - // Copy matrix to GPU - cudaMemcpy( d_Tm, Tm, m*sizeof(Scalar), cudaMemcpyHostToDevice ); - - // Multiply basis vectors by Tm -- velocity = Vm * Tm - gpu_stokes_MatVecMultiply_kernel<<>>(d_V, d_Tm, d_vel, group_size, m); - - // Compute step norm error - gpu_stokes_LinearCombination_kernel<<>>(d_vel, d_vel_old, d_vel_old, 1.0, -1.0, group_size, d_group_members); - gpu_stokes_DotStepOne_kernel<<< grid_for_dot, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(d_vel_old, d_vel_old, dot_sum, group_size, d_group_members); - gpu_stokes_DotStepTwo_kernel<<< 1, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(dot_sum, grid_for_dot); - cudaMemcpy(&stepnorm, dot_sum, sizeof(Scalar), cudaMemcpyDeviceToHost); - - stepnorm = sqrtf( stepnorm / psiMpsi ); - - // Copy velocity - cudaMemcpy( d_vel_old, d_vel, group_size*sizeof(Scalar4), cudaMemcpyDeviceToDevice ); - - // Restore alpha, beta - for ( int ii = 0; ii < m; ++ii ){ - alpha[ii] = alpha_save[ii]; - beta[ii] = beta_save[ii]; - } - beta[m] = beta_save[m]; - - } - - // Rescale by original norm of Psi and include thermal variance - gpu_stokes_LinearCombination_kernel<<>>(d_vel, d_vel, d_vel, psinorm * sqrtf(2.0*T/dt), 0.0, group_size, d_group_members); - - // - // Clean up - // - cudaFree(dot_sum); - cudaFree(d_Mvj); - cudaFree(d_v); - cudaFree(d_vj); - cudaFree(d_vjm1); - cudaFree(d_V); - cudaFree(d_Tm); - cudaFree(d_vel_old); - cudaFree(d_Mpsi); - - d_temp = NULL; - - free(alpha); - free(beta); - free(alpha_save); - free(beta_save); - - free(W); - free(W1); - free(Tm); - + gpu_stokes_LinearCombination_kernel<<>>(d_vj, d_vj, d_vjm1, 0.0, 0.0, group_size, d_group_members); + gpu_stokes_LinearCombination_kernel<<>>(d_vj, d_vj, d_vj, 1.0/vnorm, 0.0, group_size, d_group_members); + + // Start by computing (m-1) iterations, so that the stepnorm for the given + // number of iterations can be compute + m = m_in - 1; + m = m < 1 ? 
1 : m; + + // Values for current alpha and beta in the iteration + Scalar tempalpha; + Scalar tempbeta = 0.0; + + // Apply the Lanczos method + for ( int jj = 0; jj < m; ++jj ){ + + // Store current basis vector + cudaMemcpy( &d_V[jj*group_size], d_vj, group_size*sizeof(Scalar4), cudaMemcpyDeviceToDevice ); + + // Store beta + beta[jj] = tempbeta; + + // v = M*vj - betaj*vjm1 + gpu_stokes_Mreal_kernel<<>>(d_pos, d_Mvj, d_vj, group_size, xi, d_ewaldC1, self, ewald_cut, ewald_n, ewald_dr, d_group_members, box, d_n_neigh, d_nlist, d_headlist ); + gpu_stokes_LinearCombination_kernel<<>>(d_Mvj, d_vjm1, d_v, 1.0, -1.0*tempbeta, group_size, d_group_members); + + // vj dot v + gpu_stokes_DotStepOne_kernel<<< grid_for_dot, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(d_vj, d_v, dot_sum, group_size, d_group_members); + gpu_stokes_DotStepTwo_kernel<<< 1, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(dot_sum, grid_for_dot); + cudaMemcpy(&tempalpha, dot_sum, sizeof(Scalar), cudaMemcpyDeviceToHost); + + // Store updated alpha + alpha[jj] = tempalpha; + + // v = v - alphaj*vj + gpu_stokes_LinearCombination_kernel<<>>(d_v, d_vj, d_v, 1.0, -1.0*tempalpha, group_size, d_group_members); + + // v dot v + gpu_stokes_DotStepOne_kernel<<< grid_for_dot, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(d_v, d_v, dot_sum, group_size, d_group_members); + gpu_stokes_DotStepTwo_kernel<<< 1, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(dot_sum, grid_for_dot); + cudaMemcpy(&vnorm, dot_sum, sizeof(Scalar), cudaMemcpyDeviceToHost); + vnorm = sqrtf( vnorm ); + + // betajp1 = norm( v ) + tempbeta = vnorm; + + // Check that the basis vector is not too small. If so, end the iteration + // (If norm is too small, will have numerical trouble) + if ( vnorm < 1E-8 ){ + m = jj; + break; + } + + // vjp1 = v / betajp1 + gpu_stokes_LinearCombination_kernel<<>>(d_v, d_v, d_v, 1.0/tempbeta, 0.0, group_size, d_group_members); + + // Swap pointers + d_temp = d_vjm1; + d_vjm1 = d_vj; + d_vj = d_v; + d_v = d_temp; + + } + + // Save alpha, beta vectors (will be overwritten by lapack) + for ( int ii = 0; ii < m; ++ii ){ + alpha_save[ii] = alpha[ii]; + beta_save[ii] = beta[ii]; + } + beta_save[m] = beta[m]; + + // Now that we have alpha, beta, have to compute the square root of the tridiagonal + // matrix Tm. Do this using eigen-decomposition. + // + // Compute eigen-decomposition of tridiagonal matrix + // alpha (input) - vector of entries on main diagonal + // alpha (output) - eigenvalues sorted in descending order + // beta (input) - vector of entries of sub-diagonal + // beta (output) - overwritten (zeros?) + // W - (output) - matrix of eigenvectors. ith column corresponds to ith eigenvalue + // INFO (output) = 0 if operation was succesful + int INFO = LAPACKE_spteqr( LAPACK_ROW_MAJOR, 'I', m, alpha, &beta[1], W, m ); + + // Check whether the eigen-decomposition failed, and throw error on failure + if ( INFO != 0 ){ + printf("Eigenvalue decomposition #1 failed \n"); + printf("INFO = %i \n", INFO); + + printf("\n alpha: \n"); + for( int ii = 0; ii < m; ++ii ){ + printf("%f \n", alpha_save[ii]); + } + printf("\n beta: \n"); + for( int ii = 0; ii < m; ++ii ){ + printf("%f \n", beta_save[ii]); + } + printf("%f \n", beta_save[m]); + + printf("Note to User: restart simulation and proceed. 
\n"); + + exit(EXIT_FAILURE); + } + + // Now, we have to compute Tm^(1/2) * e1 + // Tm^(1/2) = W * Lambda^(1/2) * W^T * e1 + // = W * Lambda^(1/2) * ( W^T * e1 ) + // The quantity in parentheses is the first row of W + // Lambda^(1/2) only has diagonal entries, so it's product with the first row of W + // is easy to compute. + for ( int ii = 0; ii < m; ++ii ){ + W1[ii] = sqrtf( alpha[ii] ) * W[ii]; + } + + // Tm = W * W1 = W * Lambda^(1/2) * W^T * e1 + float tempsum; + for ( int ii = 0; ii < m; ++ii ){ + tempsum = 0.0; + for ( int jj = 0; jj < m; ++jj ){ + int idx = m*ii + jj; + + tempsum += W[idx] * W1[jj]; + } + Tm[ii] = tempsum; + } + + // Copy matrix to GPU + cudaMemcpy( d_Tm, Tm, m*sizeof(Scalar), cudaMemcpyHostToDevice ); + + // Multiply basis vectors by Tm, [ V0, V1, ..., Vm-1 ] * Tm + gpu_stokes_MatVecMultiply_kernel<<>>(d_V, d_Tm, d_vel, group_size, m); + + // Copy velocity + cudaMemcpy( d_vel_old, d_vel, group_size*sizeof(Scalar4), cudaMemcpyDeviceToDevice ); + + // Restore alpha, beta + for ( int ii = 0; ii < m; ++ii ){ + alpha[ii] = alpha_save[ii]; + beta[ii] = beta_save[ii]; + } + beta[m] = beta_save[m]; + + + // + // Keep adding to basis vectors until the step norm is small enough + // + Scalar stepnorm = 1.0; + int jj; + while( stepnorm > cheb_error && m < m_max ){ + m++; + jj = m - 1; + + // + // Do another Lanczos iteration + // + + // Store the current basis vector + cudaMemcpy( &d_V[jj*group_size], d_vj, group_size*sizeof(Scalar4), cudaMemcpyDeviceToDevice ); + + // Store beta + beta[jj] = tempbeta; + + // v = M*vj - betaj*vjm1 + gpu_stokes_Mreal_kernel<<>>(d_pos, d_Mvj, d_vj, group_size, xi, d_ewaldC1, self, ewald_cut, ewald_n, ewald_dr, d_group_members, box, d_n_neigh, d_nlist, d_headlist ); + gpu_stokes_LinearCombination_kernel<<>>(d_Mvj, d_vjm1, d_v, 1.0, -1.0*tempbeta, group_size, d_group_members); + + // vj dot v + gpu_stokes_DotStepOne_kernel<<< grid_for_dot, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(d_vj, d_v, dot_sum, group_size, d_group_members); + gpu_stokes_DotStepTwo_kernel<<< 1, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(dot_sum, grid_for_dot); + cudaMemcpy(&tempalpha, dot_sum, sizeof(Scalar), cudaMemcpyDeviceToHost); + + // Store updated alpha + alpha[jj] = tempalpha; + + // v = v - alphaj*vj + gpu_stokes_LinearCombination_kernel<<>>(d_v, d_vj, d_v, 1.0, -1.0*tempalpha, group_size, d_group_members); + + // v dot v + gpu_stokes_DotStepOne_kernel<<< grid_for_dot, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(d_v, d_v, dot_sum, group_size, d_group_members); + gpu_stokes_DotStepTwo_kernel<<< 1, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(dot_sum, grid_for_dot); + cudaMemcpy(&vnorm, dot_sum, sizeof(Scalar), cudaMemcpyDeviceToHost); + vnorm = sqrtf( vnorm ); + + // betajp1 = norm( v ) + tempbeta = vnorm; + + // Check if the norm of the basis vector is too small. If + // so, end the iteration. 
+ if ( vnorm < 1E-8 ){ + m = jj; + break; + } + + // vjp1 = v / betajp1 + gpu_stokes_LinearCombination_kernel<<>>(d_v, d_v, d_v, 1.0/tempbeta, 0.0, group_size, d_group_members); + + // Swap pointers + d_temp = d_vjm1; + d_vjm1 = d_vj; + d_vj = d_v; + d_v = d_temp; + + // Save alpha, beta vectors (will be overwritten by lapack) + for ( int ii = 0; ii < m; ++ii ){ + alpha_save[ii] = alpha[ii]; + beta_save[ii] = beta[ii]; + } + beta_save[m] = beta[m]; + + // + // Square root calculation with addition of latest Lanczos iteration + // (see first implementation above more description) + // + + // Compute eigen-decomposition of tridiagonal matrix + int INFO = LAPACKE_spteqr( LAPACK_ROW_MAJOR, 'I', m, alpha, &beta[1], W, m ); + + // Check whether the eigen-decomposition failed, and throw error on failure + if ( INFO != 0 ){ + printf("Eigenvalue decomposition #2 failed \n"); + printf("INFO = %i \n", INFO); + + printf("\n alpha: \n"); + for( int ii = 0; ii < m; ++ii ){ + printf("%f \n", alpha_save[ii]); + } + printf("\n beta: \n"); + for( int ii = 0; ii < m; ++ii ){ + printf("%f \n", beta_save[ii]); + } + printf("%f \n", beta_save[m]); + + printf("Note to User: restart simulation and proceed. \n"); + + exit(EXIT_FAILURE); + } + + // Now, we have to compute Tm^(1/2) * e1 + for ( int ii = 0; ii < m; ++ii ){ + W1[ii] = sqrtf( alpha[ii] ) * W[ii]; + } + + // Tm = W * W1 = W * Lambda^(1/2) * W^T * e1 + float tempsum; + for ( int ii = 0; ii < m; ++ii ){ + tempsum = 0.0; + for ( int jj = 0; jj < m; ++jj ){ + int idx = m*ii + jj; + + tempsum += W[idx] * W1[jj]; + } + Tm[ii] = tempsum; + } + + // Copy matrix to GPU + cudaMemcpy( d_Tm, Tm, m*sizeof(Scalar), cudaMemcpyHostToDevice ); + + // Multiply basis vectors by Tm -- velocity = Vm * Tm + gpu_stokes_MatVecMultiply_kernel<<>>(d_V, d_Tm, d_vel, group_size, m); + + // Compute step norm error + gpu_stokes_LinearCombination_kernel<<>>(d_vel, d_vel_old, d_vel_old, 1.0, -1.0, group_size, d_group_members); + gpu_stokes_DotStepOne_kernel<<< grid_for_dot, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(d_vel_old, d_vel_old, dot_sum, group_size, d_group_members); + gpu_stokes_DotStepTwo_kernel<<< 1, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(dot_sum, grid_for_dot); + cudaMemcpy(&stepnorm, dot_sum, sizeof(Scalar), cudaMemcpyDeviceToHost); + + stepnorm = sqrtf( stepnorm / psiMpsi ); + + // Copy velocity + cudaMemcpy( d_vel_old, d_vel, group_size*sizeof(Scalar4), cudaMemcpyDeviceToDevice ); + + // Restore alpha, beta + for ( int ii = 0; ii < m; ++ii ){ + alpha[ii] = alpha_save[ii]; + beta[ii] = beta_save[ii]; + } + beta[m] = beta_save[m]; + + } + + // Rescale by original norm of Psi and include thermal variance + gpu_stokes_LinearCombination_kernel<<>>(d_vel, d_vel, d_vel, psinorm * sqrtf(2.0*T/dt), 0.0, group_size, d_group_members); + + // + // Clean up + // + cudaFree(dot_sum); + cudaFree(d_Mvj); + cudaFree(d_v); + cudaFree(d_vj); + cudaFree(d_vjm1); + cudaFree(d_V); + cudaFree(d_Tm); + cudaFree(d_vel_old); + cudaFree(d_Mpsi); + + d_temp = NULL; + + free(alpha); + free(beta); + free(alpha_save); + free(beta_save); + + free(W); + free(W1); + free(Tm); + } // Wrap up everything to compute mobility AND brownian if necessary -// - Combine Fourier components of Deterministic and Brownian calculation -// in order to save extra FFTs and contraction operations +// - Combine Fourier components of Deterministic and Brownian calculation +// in order to save extra FFTs and contraction operations // - Add deterministic and stochastic real space contributions void 
gpu_stokes_CombinedMobilityBrownian_wrap( - Scalar4 *d_pos, - Scalar4 *d_net_force, + Scalar4 *d_pos, + Scalar4 *d_net_force, unsigned int *d_group_members, unsigned int group_size, const BoxDim& box, Scalar dt, - Scalar4 *d_vel, - const Scalar T, - const unsigned int timestep, - const unsigned int seed, - Scalar xi, - Scalar eta, - Scalar P, - Scalar ewald_cut, - Scalar ewald_dr, - int ewald_n, - Scalar4 *d_ewaldC1, - Scalar4 *d_gridk, - CUFFTCOMPLEX *d_gridX, - CUFFTCOMPLEX *d_gridY, - CUFFTCOMPLEX *d_gridZ, - cufftHandle plan, - const int Nx, - const int Ny, - const int Nz, - const unsigned int *d_n_neigh, + Scalar4 *d_vel, + const Scalar T, + const unsigned int timestep, + const unsigned int seed, + Scalar xi, + Scalar eta, + Scalar P, + Scalar ewald_cut, + Scalar ewald_dr, + int ewald_n, + Scalar4 *d_ewaldC1, + Scalar4 *d_gridk, + CUFFTCOMPLEX *d_gridX, + CUFFTCOMPLEX *d_gridY, + CUFFTCOMPLEX *d_gridZ, + cufftHandle plan, + const int Nx, + const int Ny, + const int Nz, + const unsigned int *d_n_neigh, const unsigned int *d_nlist, const unsigned int *d_headlist, - int& m_Lanczos, - const unsigned int N_total, - unsigned int NxNyNz, - dim3 grid, - dim3 threads, - int gridBlockSize, - int gridNBlock, - Scalar3 gridh, - Scalar cheb_error, - Scalar self ){ - - // Real space velocity to add - Scalar4 *d_vel2; - cudaMalloc( (void**)&d_vel2, group_size*sizeof(Scalar4) ); - - // Generate uniform distribution (-1,1) on d_psi - Scalar4 *d_psi; - cudaMalloc( (void**)&d_psi, group_size*sizeof(Scalar4) ); - gpu_stokes_BrownianGenerate_kernel<<>>( d_psi, group_size, d_group_members, timestep, seed ); - - // Spreading and contraction grid information and parameters - dim3 Cgrid( group_size, 1, 1); - int B = ( P < 10 ) ? P : 10; - dim3 Cthreads(B, B, B); - - Scalar quadW = gridh.x * gridh.y * gridh.z; - Scalar xisq = xi * xi; - Scalar prefac = ( 2.0 * xisq / 3.1415926536 / eta ) * sqrtf( 2.0 * xisq / 3.1415926536 / eta ); - Scalar expfac = 2.0 * xisq / eta; - - // ******************************************** - // Wave Space Part of Deterministic Calculation - // ******************************************** - - // Reset the grid (remove any previously distributed forces) - gpu_stokes_ZeroGrid_kernel<<>>(d_gridX,NxNyNz); - gpu_stokes_ZeroGrid_kernel<<>>(d_gridY,NxNyNz); - gpu_stokes_ZeroGrid_kernel<<>>(d_gridZ,NxNyNz); - - // Spread forces onto grid - gpu_stokes_Spread_kernel<<>>( d_pos, d_net_force, d_gridX, d_gridY, d_gridZ, group_size, Nx, Ny, Nz, d_group_members, box, P, gridh, xi, eta, prefac, expfac ); - - // Perform FFT on gridded forces - cufftExecC2C(plan, d_gridX, d_gridX, CUFFT_FORWARD); - cufftExecC2C(plan, d_gridY, d_gridY, CUFFT_FORWARD); - cufftExecC2C(plan, d_gridZ, d_gridZ, CUFFT_FORWARD); - - // Apply wave space scaling to FFT'd forces - gpu_stokes_Green_kernel<<>>( d_gridX, d_gridY, d_gridZ, d_gridk, NxNyNz); - - - // *************************************** - // Wave Space Part of Brownian Calculation - // *************************************** - if ( T > 0.0 ){ - - // Apply random fluctuations to wave space grid - gpu_stokes_BrownianGridGenerate_kernel<<>>( d_gridX, d_gridY, d_gridZ, d_gridk, NxNyNz, Nx, Ny, Nz, timestep, seed, T, dt, quadW ); - - } - - // ************************************ - // Finish the Wave Space Calculation - // ************************************ - - // Return rescaled forces to real space - cufftExecC2C(plan, d_gridX, d_gridX, CUFFT_INVERSE); - cufftExecC2C(plan, d_gridY, d_gridY, CUFFT_INVERSE); - cufftExecC2C(plan, d_gridZ, d_gridZ, CUFFT_INVERSE); - - 
// Evaluate contribution of grid velocities at particle centers - gpu_stokes_Contract_kernel<<>>( d_pos, d_vel, d_gridX, d_gridY, d_gridZ, group_size, Nx, Ny, Nz, xi, eta, d_group_members, box, P, gridh, quadW*prefac, expfac ); - - // *************************************** - // Real Space Part of Both Calculations - // *************************************** - - // Deterministic part - gpu_stokes_Mreal_kernel<<>>(d_pos, d_vel2, d_net_force, group_size, xi, d_ewaldC1, self, ewald_cut, ewald_n, ewald_dr, d_group_members, box, d_n_neigh, d_nlist, d_headlist ); - - // Add to velocity - gpu_stokes_LinearCombination_kernel<<>>(d_vel2, d_vel, d_vel, 1.0, 1.0, group_size, d_group_members); - - // Stochastic - if ( T > 0.0 ){ - - gpu_stokes_BrealLanczos_wrap( d_psi, - d_pos, - d_group_members, - group_size, - box, - dt, - d_vel2, - T, - timestep, - seed, - xi, - ewald_cut, - ewald_dr, - ewald_n, - d_ewaldC1, - d_n_neigh, - d_nlist, - d_headlist, - m_Lanczos, - cheb_error, - grid, - threads, - gridBlockSize, - gridNBlock, - gridh, - self ); - - // Add to velocity - gpu_stokes_LinearCombination_kernel<<>>(d_vel2, d_vel, d_vel, 1.0, 1.0, group_size, d_group_members); - - } - - // Free Memory - cudaFree( d_vel2 ); - cudaFree( d_psi ); + int& m_Lanczos, + const unsigned int N_total, + unsigned int NxNyNz, + dim3 grid, + dim3 threads, + int gridBlockSize, + int gridNBlock, + Scalar3 gridh, + Scalar cheb_error, + Scalar self ){ + + // Real space velocity to add + Scalar4 *d_vel2; + cudaMalloc( (void**)&d_vel2, group_size*sizeof(Scalar4) ); + + // Generate uniform distribution (-1,1) on d_psi + Scalar4 *d_psi; + cudaMalloc( (void**)&d_psi, group_size*sizeof(Scalar4) ); + gpu_stokes_BrownianGenerate_kernel<<>>( d_psi, group_size, d_group_members, timestep, seed ); + + // Spreading and contraction grid information and parameters + dim3 Cgrid( group_size, 1, 1); + int B = ( P < 10 ) ? 
P : 10; + dim3 Cthreads(B, B, B); + + Scalar quadW = gridh.x * gridh.y * gridh.z; + Scalar xisq = xi * xi; + Scalar prefac = ( 2.0 * xisq / 3.1415926536 / eta ) * sqrtf( 2.0 * xisq / 3.1415926536 / eta ); + Scalar expfac = 2.0 * xisq / eta; + + // ******************************************** + // Wave Space Part of Deterministic Calculation + // ******************************************** + + // Reset the grid (remove any previously distributed forces) + gpu_stokes_ZeroGrid_kernel<<>>(d_gridX,NxNyNz); + gpu_stokes_ZeroGrid_kernel<<>>(d_gridY,NxNyNz); + gpu_stokes_ZeroGrid_kernel<<>>(d_gridZ,NxNyNz); + + // Spread forces onto grid + gpu_stokes_Spread_kernel<<>>( d_pos, d_net_force, d_gridX, d_gridY, d_gridZ, group_size, Nx, Ny, Nz, d_group_members, box, P, gridh, xi, eta, prefac, expfac ); + + // Perform FFT on gridded forces + cufftExecC2C(plan, d_gridX, d_gridX, CUFFT_FORWARD); + cufftExecC2C(plan, d_gridY, d_gridY, CUFFT_FORWARD); + cufftExecC2C(plan, d_gridZ, d_gridZ, CUFFT_FORWARD); + + // Apply wave space scaling to FFT'd forces + gpu_stokes_Green_kernel<<>>( d_gridX, d_gridY, d_gridZ, d_gridk, NxNyNz); + + + // *************************************** + // Wave Space Part of Brownian Calculation + // *************************************** + if ( T > 0.0 ){ + + // Apply random fluctuations to wave space grid + gpu_stokes_BrownianGridGenerate_kernel<<>>( d_gridX, d_gridY, d_gridZ, d_gridk, NxNyNz, Nx, Ny, Nz, timestep, seed, T, dt, quadW ); + + } + + // ************************************ + // Finish the Wave Space Calculation + // ************************************ + + // Return rescaled forces to real space + cufftExecC2C(plan, d_gridX, d_gridX, CUFFT_INVERSE); + cufftExecC2C(plan, d_gridY, d_gridY, CUFFT_INVERSE); + cufftExecC2C(plan, d_gridZ, d_gridZ, CUFFT_INVERSE); + + // Evaluate contribution of grid velocities at particle centers + gpu_stokes_Contract_kernel<<>>( d_pos, d_vel, d_gridX, d_gridY, d_gridZ, group_size, Nx, Ny, Nz, xi, eta, d_group_members, box, P, gridh, quadW*prefac, expfac ); + + // *************************************** + // Real Space Part of Both Calculations + // *************************************** + + // Deterministic part + gpu_stokes_Mreal_kernel<<>>(d_pos, d_vel2, d_net_force, group_size, xi, d_ewaldC1, self, ewald_cut, ewald_n, ewald_dr, d_group_members, box, d_n_neigh, d_nlist, d_headlist ); + + // Add to velocity + gpu_stokes_LinearCombination_kernel<<>>(d_vel2, d_vel, d_vel, 1.0, 1.0, group_size, d_group_members); + + // Stochastic + if ( T > 0.0 ){ + + gpu_stokes_BrealLanczos_wrap( d_psi, + d_pos, + d_group_members, + group_size, + box, + dt, + d_vel2, + T, + timestep, + seed, + xi, + ewald_cut, + ewald_dr, + ewald_n, + d_ewaldC1, + d_n_neigh, + d_nlist, + d_headlist, + m_Lanczos, + cheb_error, + grid, + threads, + gridBlockSize, + gridNBlock, + gridh, + self ); + + // Add to velocity + gpu_stokes_LinearCombination_kernel<<>>(d_vel2, d_vel, d_vel, 1.0, 1.0, group_size, d_group_members); + + } + + // Free Memory + cudaFree( d_vel2 ); + cudaFree( d_psi ); } diff --git a/PSEv1/Brownian.cuh b/PSEv1/Brownian.cuh index 5332e43..8bb2dbc 100644 --- a/PSEv1/Brownian.cuh +++ b/PSEv1/Brownian.cuh @@ -71,68 +71,68 @@ ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif __global__ void gpu_stokes_BrownianGenerate_kernel( - Scalar4 *d_psi, - unsigned int group_size, - unsigned int *d_group_members, - const unsigned int timestep, - const unsigned int seed - ); + Scalar4 *d_psi, + unsigned int group_size, + unsigned int *d_group_members, + const unsigned int timestep, + const unsigned int seed + ); __global__ void gpu_stokes_BrownianGridGenerate_kernel( - CUFFTCOMPLEX *gridX, - CUFFTCOMPLEX *gridY, - CUFFTCOMPLEX *gridZ, - Scalar4 *gridk, - unsigned int NxNyNz, - int Nx, - int Ny, - int Nz, - const unsigned int timestep, - const unsigned int seed, - Scalar T, - Scalar dt, - Scalar quadW - ); + CUFFTCOMPLEX *gridX, + CUFFTCOMPLEX *gridY, + CUFFTCOMPLEX *gridZ, + Scalar4 *gridk, + unsigned int NxNyNz, + int Nx, + int Ny, + int Nz, + const unsigned int timestep, + const unsigned int seed, + Scalar T, + Scalar dt, + Scalar quadW + ); void gpu_stokes_CombinedMobilityBrownian_wrap( - Scalar4 *d_pos, - Scalar4 *d_net_force, + Scalar4 *d_pos, + Scalar4 *d_net_force, unsigned int *d_group_members, unsigned int group_size, const BoxDim& box, Scalar dt, - Scalar4 *d_vel, - const Scalar T, - const unsigned int timestep, - const unsigned int seed, - Scalar xi, - Scalar eta, - Scalar P, - Scalar ewald_cut, - Scalar ewald_dr, - int ewald_n, - Scalar4 *d_ewaldC1, - Scalar4 *d_gridk, - CUFFTCOMPLEX *d_gridX, - CUFFTCOMPLEX *d_gridY, - CUFFTCOMPLEX *d_gridZ, - cufftHandle plan, - const int Nx, - const int Ny, - const int Nz, - const unsigned int *d_n_neigh, + Scalar4 *d_vel, + const Scalar T, + const unsigned int timestep, + const unsigned int seed, + Scalar xi, + Scalar eta, + Scalar P, + Scalar ewald_cut, + Scalar ewald_dr, + int ewald_n, + Scalar4 *d_ewaldC1, + Scalar4 *d_gridk, + CUFFTCOMPLEX *d_gridX, + CUFFTCOMPLEX *d_gridY, + CUFFTCOMPLEX *d_gridZ, + cufftHandle plan, + const int Nx, + const int Ny, + const int Nz, + const unsigned int *d_n_neigh, const unsigned int *d_nlist, const unsigned int *d_headlist, - int& m_Lanczos, - const unsigned int N_total, - unsigned int NxNyNz, - dim3 grid, - dim3 threads, - int gridBlockSize, - int gridNBlock, - Scalar3 gridh, - Scalar cheb_error, - Scalar self - ); + int& m_Lanczos, + const unsigned int N_total, + unsigned int NxNyNz, + dim3 grid, + dim3 threads, + int gridBlockSize, + int gridNBlock, + Scalar3 gridh, + Scalar cheb_error, + Scalar self + ); #endif diff --git a/PSEv1/Helper.cu b/PSEv1/Helper.cu index dfc582e..c9f7ac4 100644 --- a/PSEv1/Helper.cu +++ b/PSEv1/Helper.cu @@ -73,7 +73,7 @@ ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /*! \file Helper.cu - \brief Helper functions to perform additions, dot products, etc., for Mobility and Brownian + \brief Helper functions to perform additions, dot products, etc., for Mobility and Brownian */ //! Shared memory array for partial sum of dot product kernel @@ -81,201 +81,202 @@ extern __shared__ Scalar partial_sum[]; //! Zero out the force grid /*! - \param grid the grid going to be zero out - \param NxNyNz dimension of the grid + \param grid the grid going to be zero out + \param NxNyNz dimension of the grid */ __global__ void gpu_stokes_ZeroGrid_kernel(CUFFTCOMPLEX *grid, unsigned int NxNyNz) { - unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x; - - if ( tid < NxNyNz ) { - - grid[tid] = make_scalar2( 0.0, 0.0 ); - - } + unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x; + + if ( tid < NxNyNz ) { + + grid[tid].x = 0; + grid[tid].y = 0; + + } } /*! 
- Linear combination helper function - C = a*A + b*B - C can be A or B, so that A or B will be overwritten - The fourth element of Scalar4 is not changed! - - \param d_a input vector, A - \param d_b input vector, B - \param d_c output vector, C - \param coeff_a scaling factor for A, a - \param coeff_b scaling factor for B, b - \param group_size length of vectors - \param d_group_members index into vectors + Linear combination helper function + C = a*A + b*B + C can be A or B, so that A or B will be overwritten + The fourth element of Scalar4 is not changed! + + \param d_a input vector, A + \param d_b input vector, B + \param d_c output vector, C + \param coeff_a scaling factor for A, a + \param coeff_b scaling factor for B, b + \param group_size length of vectors + \param d_group_members index into vectors */ __global__ void gpu_stokes_LinearCombination_kernel( - Scalar4 *d_a, - Scalar4 *d_b, - Scalar4 *d_c, - Scalar coeff_a, - Scalar coeff_b, - unsigned int group_size, - unsigned int *d_group_members - ){ - - int group_idx = blockDim.x * blockIdx.x + threadIdx.x; - if (group_idx < group_size) { - unsigned int idx = d_group_members[group_idx]; - Scalar4 A4 = d_a[idx]; - Scalar4 B4 = d_b[idx]; - Scalar3 A = make_scalar3(A4.x, A4.y, A4.z); - Scalar3 B = make_scalar3(B4.x, B4.y, B4.z); - A = coeff_a * A + coeff_b * B; - d_c[idx] = make_scalar4(A.x, A.y, A.z, d_c[idx].w); - } + Scalar4 *d_a, + Scalar4 *d_b, + Scalar4 *d_c, + Scalar coeff_a, + Scalar coeff_b, + unsigned int group_size, + unsigned int *d_group_members + ){ + + int group_idx = blockDim.x * blockIdx.x + threadIdx.x; + if (group_idx < group_size) { + unsigned int idx = d_group_members[group_idx]; + Scalar4 A4 = d_a[idx]; + Scalar4 B4 = d_b[idx]; + Scalar3 A = make_scalar3(A4.x, A4.y, A4.z); + Scalar3 B = make_scalar3(B4.x, B4.y, B4.z); + A = coeff_a * A + coeff_b * B; + d_c[idx] = make_scalar4(A.x, A.y, A.z, d_c[idx].w); + } } /*! - Dot product helper function: First step - d_a .* d_b -> d_c -> Partial sum - BlockDim of this kernel should be 2^n, which is 512. (Based on HOOMD ComputeThermoGPU class) - - \param d_a first vector in dot product - \param d_b second vector in dot product - \param dot_sum partial dot product sum - \param group_size length of vectors a and b + Dot product helper function: First step + d_a .* d_b -> d_c -> Partial sum + BlockDim of this kernel should be 2^n, which is 512. 
(Based on HOOMD ComputeThermoGPU class) + + \param d_a first vector in dot product + \param d_b second vector in dot product + \param dot_sum partial dot product sum + \param group_size length of vectors a and b \param d_group_members index into vectors */ __global__ void gpu_stokes_DotStepOne_kernel( - Scalar4 *d_a, - Scalar4 *d_b, - Scalar *dot_sum, - unsigned int group_size, - unsigned int *d_group_members - ){ + Scalar4 *d_a, + Scalar4 *d_b, + Scalar *dot_sum, + unsigned int group_size, + unsigned int *d_group_members + ){ - int group_idx = blockDim.x * blockIdx.x + threadIdx.x; - Scalar temp; + int group_idx = blockDim.x * blockIdx.x + threadIdx.x; + Scalar temp; - if (group_idx < group_size) { + if (group_idx < group_size) { - unsigned int idx = d_group_members[group_idx]; - Scalar4 a4 = d_a[idx]; - Scalar4 b4 = d_b[idx]; - Scalar3 a = make_scalar3(a4.x, a4.y, a4.z); - Scalar3 b = make_scalar3(b4.x, b4.y, b4.z); + unsigned int idx = d_group_members[group_idx]; + Scalar4 a4 = d_a[idx]; + Scalar4 b4 = d_b[idx]; + Scalar3 a = make_scalar3(a4.x, a4.y, a4.z); + Scalar3 b = make_scalar3(b4.x, b4.y, b4.z); - temp = dot(a,b); // Partial sum, each thread, shared memory + temp = dot(a,b); // Partial sum, each thread, shared memory - } - else { - temp = 0; - } + } + else { + temp = 0; + } - partial_sum[threadIdx.x] = temp; + partial_sum[threadIdx.x] = temp; - __syncthreads(); + __syncthreads(); - int offs = blockDim.x >> 1; + int offs = blockDim.x >> 1; - while (offs > 0) + while (offs > 0) { - if (threadIdx.x < offs) - { - partial_sum[threadIdx.x] += partial_sum[threadIdx.x + offs]; - } - offs >>= 1; - __syncthreads(); + if (threadIdx.x < offs) + { + partial_sum[threadIdx.x] += partial_sum[threadIdx.x + offs]; + } + offs >>= 1; + __syncthreads(); } - if (threadIdx.x == 0){ - dot_sum[blockIdx.x] = partial_sum[0]; - } + if (threadIdx.x == 0){ + dot_sum[blockIdx.x] = partial_sum[0]; + } } /*! 
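// gpu_stokes_DotStepOne_kernel above leaves one partial sum per block in
// dot_sum; its dynamic shared memory must hold one Scalar per thread, and the
// block size is taken as a power of two (512 per the comment) so the tree
// reduction halves cleanly. A launch sketch plus an illustrative host-side
// equivalent of the final combination that the second-step kernel below
// performs on the device (the array names d_a, d_b, d_dot_sum are assumed):
//
//   int threads = 512;
//   int blocks  = (group_size + threads - 1) / threads;
//   gpu_stokes_DotStepOne_kernel<<< blocks, threads, threads*sizeof(Scalar) >>>(
//           d_a, d_b, d_dot_sum, group_size, d_group_members );
//
// Scalar is HOOMD's float/double typedef (HOOMDMath.h).
static Scalar host_combine_partial_sums(const Scalar* partial, int num_blocks)
{
    Scalar total = 0;
    for (int b = 0; b < num_blocks; ++b)   // same accumulation the second step performs
        total += partial[b];
    return total;
}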
- Dot product helper function: Second step - Partial sum -> Final sum - Only one block will be launched for this step + Dot product helper function: Second step + Partial sum -> Final sum + Only one block will be launched for this step - \param dot_sum partial sum from first dot product kernel - \param num_partial_sums length of dot_sum array + \param dot_sum partial sum from first dot product kernel + \param num_partial_sums length of dot_sum array */ __global__ void gpu_stokes_DotStepTwo_kernel( - Scalar *dot_sum, - unsigned int num_partial_sums - ){ - - partial_sum[threadIdx.x] = 0.0; - __syncthreads(); - for (unsigned int start = 0; start < num_partial_sums; start += blockDim.x) - { - if (start + threadIdx.x < num_partial_sums) - { - partial_sum[threadIdx.x] += dot_sum[start + threadIdx.x]; - } - } - - int offs = blockDim.x >> 1; - while (offs > 0) - { - __syncthreads(); - if (threadIdx.x < offs) + Scalar *dot_sum, + unsigned int num_partial_sums + ){ + + partial_sum[threadIdx.x] = 0.0; + __syncthreads(); + for (unsigned int start = 0; start < num_partial_sums; start += blockDim.x) + { + if (start + threadIdx.x < num_partial_sums) + { + partial_sum[threadIdx.x] += dot_sum[start + threadIdx.x]; + } + } + + int offs = blockDim.x >> 1; + while (offs > 0) + { + __syncthreads(); + if (threadIdx.x < offs) { - partial_sum[threadIdx.x] += partial_sum[threadIdx.x + offs]; + partial_sum[threadIdx.x] += partial_sum[threadIdx.x + offs]; } - offs >>= 1; - + offs >>= 1; + } - __syncthreads(); + __syncthreads(); if (threadIdx.x == 0) - { - dot_sum[0] = partial_sum[0]; // Save the dot product to the first element of dot_sum array - } + { + dot_sum[0] = partial_sum[0]; // Save the dot product to the first element of dot_sum array + } } /*! - Perform matrix-vector multiply needed for the Lanczos contribution to the Brownian velocity + Perform matrix-vector multiply needed for the Lanczos contribution to the Brownian velocity - \param d_A matrix, N x m - \param d_x multiplying vector, m x 1 - \param d_b result vector, A*x, m x 1 - \param group_size number of particles - \param m number of iterations ( number of columns of A, length of x ) + \param d_A matrix, N x m + \param d_x multiplying vector, m x 1 + \param d_b result vector, A*x, m x 1 + \param group_size number of particles + \param m number of iterations ( number of columns of A, length of x ) */ __global__ void gpu_stokes_MatVecMultiply_kernel( - Scalar4 *d_A, - Scalar *d_x, - Scalar4 *d_b, - unsigned int group_size, - int m - ){ + Scalar4 *d_A, + Scalar *d_x, + Scalar4 *d_b, + unsigned int group_size, + int m + ){ - int idx = blockDim.x * blockIdx.x + threadIdx.x; - if (idx < group_size) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + if (idx < group_size) { - Scalar3 tempprod = make_scalar3( 0.0, 0.0, 0.0 ); + Scalar3 tempprod = make_scalar3( 0.0, 0.0, 0.0 ); - for ( int ii = 0; ii < m; ++ii ){ + for ( int ii = 0; ii < m; ++ii ){ - Scalar4 matidx = d_A[ idx + ii*group_size ]; + Scalar4 matidx = d_A[ idx + ii*group_size ]; - Scalar xcurr = d_x[ii]; + Scalar xcurr = d_x[ii]; - tempprod.x = tempprod.x + matidx.x * xcurr; - tempprod.y = tempprod.y + matidx.y * xcurr; - tempprod.z = tempprod.z + matidx.z * xcurr; + tempprod.x = tempprod.x + matidx.x * xcurr; + tempprod.y = tempprod.y + matidx.y * xcurr; + tempprod.z = tempprod.z + matidx.z * xcurr; - } + } - d_b[idx] = make_scalar4( tempprod.x, tempprod.y, tempprod.z, d_A[idx].w ); + d_b[idx] = make_scalar4( tempprod.x, tempprod.y, tempprod.z, d_A[idx].w ); - } + } } /*! 
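// gpu_stokes_MatVecMultiply_kernel above assumes column-major storage of the
// group_size x m Lanczos basis: column ii starts at offset ii*group_size, so
// element (row idx, column ii) is d_A[ idx + ii*group_size ], and the result
// holds one Scalar4 per particle. A host-side reference of the product for a
// single row (illustrative only; the function name is not from the plugin):
static void matvec_row_reference(const Scalar4* A, const Scalar* x,
                                 unsigned int idx, unsigned int group_size, int m,
                                 Scalar3& out)
{
    out = make_scalar3(0.0, 0.0, 0.0);
    for (int ii = 0; ii < m; ++ii) {
        const Scalar4 col = A[idx + ii * group_size];  // column-major access
        out.x += col.x * x[ii];
        out.y += col.y * x[ii];
        out.z += col.z * x[ii];
    }
}
// Scalar, Scalar3, Scalar4 and make_scalar3 are HOOMD's math types (HOOMDMath.h).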
@@ -283,15 +284,15 @@ __global__ void gpu_stokes_MatVecMultiply_kernel( */ __global__ void gpu_stokes_SetGridk_kernel( - Scalar4 *gridk, + Scalar4 *gridk, int Nx, int Ny, int Nz, unsigned int NxNyNz, BoxDim box, Scalar xi, - Scalar eta - ){ + Scalar eta + ){ int tid = blockDim.x * blockIdx.x + threadIdx.x; @@ -310,21 +311,21 @@ void gpu_stokes_SetGridk_kernel( gridk_value.x = gridk_value.x / L.x; gridk_value.z = ((k < (Nz+1) / 2) ? k : k - Nz) / L.z; - gridk_value.x *= 2.0*3.1416926536; - gridk_value.y *= 2.0*3.1416926536; - gridk_value.z *= 2.0*3.1416926536; + gridk_value.x *= 2.0*3.1416926536; + gridk_value.y *= 2.0*3.1416926536; + gridk_value.z *= 2.0*3.1416926536; Scalar k2 = gridk_value.x*gridk_value.x + gridk_value.y*gridk_value.y + gridk_value.z*gridk_value.z; - Scalar xisq = xi * xi; - - // Scaling factor used in wave space sum - if (i == 0 && j == 0 && k == 0){ - gridk_value.w = 0.0; - } - else{ - // Have to divide by Nx*Ny*Nz to normalize the FFTs - gridk_value.w = 6.0*3.1415926536 * (1.0 + k2/4.0/xisq) * expf( -(1-eta) * k2/4.0/xisq ) / ( k2 ) / Scalar( Nx*Ny*Nz ); - } + Scalar xisq = xi * xi; + + // Scaling factor used in wave space sum + if (i == 0 && j == 0 && k == 0){ + gridk_value.w = 0.0; + } + else{ + // Have to divide by Nx*Ny*Nz to normalize the FFTs + gridk_value.w = 6.0*3.1415926536 * (1.0 + k2/4.0/xisq) * expf( -(1-eta) * k2/4.0/xisq ) / ( k2 ) / Scalar( Nx*Ny*Nz ); + } gridk[tid] = gridk_value; diff --git a/PSEv1/Mobility.cu b/PSEv1/Mobility.cu index 2bcb3ff..0afad7f 100644 --- a/PSEv1/Mobility.cu +++ b/PSEv1/Mobility.cu @@ -104,156 +104,156 @@ scalar4_tex_t pos_tex; \param gridh space between grid nodes in each dimension \param xi Ewald splitting parameter \param eta Spectral splitting parameter - \param prefac Spreading function prefactor - \param expfac Spreading function exponential factor + \param prefac Spreading function prefactor + \param expfac Spreading function exponential factor One 3-D block of threads is launched per particle (block dimension = PxPxP). Max dimension is 10x10x10. If P > 10, each thread will do more than one grid point worth of work. */ -__global__ void gpu_stokes_Spread_kernel( - Scalar4 *d_pos, - Scalar4 *d_net_force, - CUFFTCOMPLEX *gridX, - CUFFTCOMPLEX *gridY, - CUFFTCOMPLEX *gridZ, - int group_size, - int Nx, - int Ny, - int Nz, - unsigned int *d_group_members, - BoxDim box, - const int P, - Scalar3 gridh, - Scalar xi, - Scalar eta, - Scalar prefac, - Scalar expfac - ){ - - // Shared memory for particle force and position, so that each block - // only has to read once - __shared__ Scalar3 shared[2]; // 16 kb max - - Scalar3 *force_shared = shared; - Scalar3 *pos_shared = &shared[1]; - - // Offset for the block (i.e. particle ID within group) - int group_idx = blockIdx.x; - - // Offset for the thread (i.e. 
grid point ID within particle's support) - int thread_offset = threadIdx.z + threadIdx.y * blockDim.z + threadIdx.x * blockDim.z*blockDim.y; - - // Global particle ID - unsigned int idx = d_group_members[group_idx]; - - // Initialize shared memory and get particle position - if ( thread_offset == 0 ){ - Scalar4 tpos = texFetchScalar4(d_pos, pos_tex, idx); - pos_shared[0].x = tpos.x; - pos_shared[0].y = tpos.y; - pos_shared[0].z = tpos.z; - - Scalar4 tforce = d_net_force[idx]; - force_shared[0].x = tforce.x; - force_shared[0].y = tforce.y; - force_shared[0].z = tforce.z; - } - __syncthreads(); - - // Box dimension - Scalar3 L = box.getL(); - Scalar3 Ld2 = L / 2.0; - - // Retrieve position from shared memory - Scalar3 pos = pos_shared[0]; - Scalar3 force = force_shared[0]; - - // Fractional position within box - Scalar3 pos_frac = box.makeFraction(pos); - - pos_frac.x *= (Scalar)Nx; - pos_frac.y *= (Scalar)Ny; - pos_frac.z *= (Scalar)Nz; - - // Grid index of floor of fractional position - int x = int( pos_frac.x ); - int y = int( pos_frac.y ); - int z = int( pos_frac.z ); - - // Amount of work needed for each thread to cover support - // (Required in case support size is larger than grid dimension, - // but in most cases, should have n.x = n.y = n.z = 1 ) - int3 n, t; +__global__ void gpu_stokes_Spread_kernel( + Scalar4 *d_pos, + Scalar4 *d_net_force, + CUFFTCOMPLEX *gridX, + CUFFTCOMPLEX *gridY, + CUFFTCOMPLEX *gridZ, + int group_size, + int Nx, + int Ny, + int Nz, + unsigned int *d_group_members, + BoxDim box, + const int P, + Scalar3 gridh, + Scalar xi, + Scalar eta, + Scalar prefac, + Scalar expfac + ){ + + // Shared memory for particle force and position, so that each block + // only has to read once + __shared__ Scalar3 shared[2]; // 16 kb max + + Scalar3 *force_shared = shared; + Scalar3 *pos_shared = &shared[1]; + + // Offset for the block (i.e. particle ID within group) + int group_idx = blockIdx.x; + + // Offset for the thread (i.e. 
grid point ID within particle's support) + int thread_offset = threadIdx.z + threadIdx.y * blockDim.z + threadIdx.x * blockDim.z*blockDim.y; + + // Global particle ID + unsigned int idx = d_group_members[group_idx]; + + // Initialize shared memory and get particle position + if ( thread_offset == 0 ){ + Scalar4 tpos = texFetchScalar4(d_pos, pos_tex, idx); + pos_shared[0].x = tpos.x; + pos_shared[0].y = tpos.y; + pos_shared[0].z = tpos.z; + + Scalar4 tforce = d_net_force[idx]; + force_shared[0].x = tforce.x; + force_shared[0].y = tforce.y; + force_shared[0].z = tforce.z; + } + __syncthreads(); + + // Box dimension + Scalar3 L = box.getL(); + Scalar3 Ld2 = L / 2.0; + + // Retrieve position from shared memory + Scalar3 pos = pos_shared[0]; + Scalar3 force = force_shared[0]; + + // Fractional position within box + Scalar3 pos_frac = box.makeFraction(pos); + + pos_frac.x *= (Scalar)Nx; + pos_frac.y *= (Scalar)Ny; + pos_frac.z *= (Scalar)Nz; + + // Grid index of floor of fractional position + int x = int( pos_frac.x ); + int y = int( pos_frac.y ); + int z = int( pos_frac.z ); + + // Amount of work needed for each thread to cover support + // (Required in case support size is larger than grid dimension, + // but in most cases, should have n.x = n.y = n.z = 1 ) + int3 n, t; n.x = ( P + blockDim.x - 1 ) / blockDim.x; // ceiling n.y = ( P + blockDim.y - 1 ) / blockDim.y; n.z = ( P + blockDim.z - 1 ) / blockDim.z; - // Grid point associated with current thread - int Pd2 = P/2; // integer division does floor - - for ( int ii = 0; ii < n.x; ++ii ){ - - t.x = threadIdx.x + ii*blockDim.x; - - for ( int jj = 0; jj < n.y; ++jj ){ - - t.y = threadIdx.y + jj*blockDim.y; - - for ( int kk = 0; kk < n.z; ++kk ){ - - t.z = threadIdx.z + kk*blockDim.z; - - if ( ( t.x < P ) && ( t.y < P ) && ( t.z < P ) ){ - - // x,y,z indices for current thread - // - // Arithmetic with P makes sure distribution is centered on the particle - int x_inp = x + t.x - Pd2 + 1 - (P % 2) * ( pos_frac.x - Scalar( x ) < 0.5 ); - int y_inp = y + t.y - Pd2 + 1 - (P % 2) * ( pos_frac.y - Scalar( y ) < 0.5 ); - int z_inp = z + t.z - Pd2 + 1 - (P % 2) * ( pos_frac.z - Scalar( z ) < 0.5 ); - - // Periodic wrapping of grid point - x_inp = (x_inp<0) ? x_inp+Nx : ( (x_inp>Nx-1) ? x_inp-Nx : x_inp ); - y_inp = (y_inp<0) ? y_inp+Ny : ( (y_inp>Ny-1) ? y_inp-Ny : y_inp ); - z_inp = (z_inp<0) ? z_inp+Nz : ( (z_inp>Nz-1) ? z_inp-Nz : z_inp ); - - // x,y,z coordinates for current thread - Scalar3 pos_grid; - pos_grid.x = gridh.x*x_inp - Ld2.x; - pos_grid.y = gridh.y*y_inp - Ld2.y; - pos_grid.z = gridh.z*z_inp - Ld2.z; - - // Shear the grid position - // !!! 
This only works for linear shear where the shear gradient is along y - // and the shear direction is along x - pos_grid.x = pos_grid.x + box.getTiltFactorXY() * pos_grid.y; - - // Global index for current grid point - int grid_idx = x_inp * Ny * Nz + y_inp * Nz + z_inp; - - // Distance from particle to grid node - Scalar3 r = pos_grid - pos; - r = box.minImage(r); - Scalar rsq = r.x*r.x + r.y*r.y + r.z*r.z; - - // Magnitude of the force contribution to the current grid node - Scalar3 force_inp = prefac * expf( -expfac * rsq ) * force; - - // Add force to the grid - atomicAdd( &(gridX[grid_idx].x), force_inp.x); - atomicAdd( &(gridY[grid_idx].x), force_inp.y); - atomicAdd( &(gridZ[grid_idx].x), force_inp.z); - }// check thread is within support - }// kk - }// jj - }// ii + // Grid point associated with current thread + int Pd2 = P/2; // integer division does floor + + for ( int ii = 0; ii < n.x; ++ii ){ + + t.x = threadIdx.x + ii*blockDim.x; + + for ( int jj = 0; jj < n.y; ++jj ){ + + t.y = threadIdx.y + jj*blockDim.y; + + for ( int kk = 0; kk < n.z; ++kk ){ + + t.z = threadIdx.z + kk*blockDim.z; + + if ( ( t.x < P ) && ( t.y < P ) && ( t.z < P ) ){ + + // x,y,z indices for current thread + // + // Arithmetic with P makes sure distribution is centered on the particle + int x_inp = x + t.x - Pd2 + 1 - (P % 2) * ( pos_frac.x - Scalar( x ) < 0.5 ); + int y_inp = y + t.y - Pd2 + 1 - (P % 2) * ( pos_frac.y - Scalar( y ) < 0.5 ); + int z_inp = z + t.z - Pd2 + 1 - (P % 2) * ( pos_frac.z - Scalar( z ) < 0.5 ); + + // Periodic wrapping of grid point + x_inp = (x_inp<0) ? x_inp+Nx : ( (x_inp>Nx-1) ? x_inp-Nx : x_inp ); + y_inp = (y_inp<0) ? y_inp+Ny : ( (y_inp>Ny-1) ? y_inp-Ny : y_inp ); + z_inp = (z_inp<0) ? z_inp+Nz : ( (z_inp>Nz-1) ? z_inp-Nz : z_inp ); + + // x,y,z coordinates for current thread + Scalar3 pos_grid; + pos_grid.x = gridh.x*x_inp - Ld2.x; + pos_grid.y = gridh.y*y_inp - Ld2.y; + pos_grid.z = gridh.z*z_inp - Ld2.z; + + // Shear the grid position + // !!! This only works for linear shear where the shear gradient is along y + // and the shear direction is along x + pos_grid.x = pos_grid.x + box.getTiltFactorXY() * pos_grid.y; + + // Global index for current grid point + int grid_idx = x_inp * Ny * Nz + y_inp * Nz + z_inp; + + // Distance from particle to grid node + Scalar3 r = pos_grid - pos; + r = box.minImage(r); + Scalar rsq = r.x*r.x + r.y*r.y + r.z*r.z; + + // Magnitude of the force contribution to the current grid node + Scalar3 force_inp = prefac * expf( -expfac * rsq ) * force; + + // Add force to the grid + atomicAdd( &(gridX[grid_idx].x), force_inp.x); + atomicAdd( &(gridY[grid_idx].x), force_inp.y); + atomicAdd( &(gridZ[grid_idx].x), force_inp.z); + }// check thread is within support + }// kk + }// jj + }// ii } //! Compute the velocity from the force moments on the grid (Same Size Particles) // -// This is the operator "B" from the paper +// This is the operator "B" from the paper // /*! 
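// Each thread of gpu_stokes_Spread_kernel above maps its offset (t.x,t.y,t.z)
// within the P^3 support to an absolute, periodically wrapped grid node. A
// host-side sketch of the same index arithmetic for one axis (the function
// name is illustrative; the formula is the one used in the kernel):
static int support_node_index(int base,    // floor of the scaled fractional position
                              int t,       // offset within the support, 0..P-1
                              int P,       // support size (gauss_P)
                              float frac,  // scaled fractional position along this axis
                              int N)       // number of grid points along this axis
{
    // Center the P-node support on the particle; for odd P the extra shift
    // depends on which half of the grid cell the particle sits in.
    int idx = base + t - P/2 + 1 - (P % 2) * ((frac - (float)base) < 0.5f ? 1 : 0);
    // Periodic wrap into [0, N)
    if (idx < 0)        idx += N;
    else if (idx > N-1) idx -= N;
    return idx;
}
// Example: P = 4, base = 10, frac = 10.3 yields nodes 9, 10, 11 and 12, i.e.
// two nodes on either side of the particle.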
\param gridX x-component of force moments projected onto grid \param gridY y-component of force moments projected onto grid @@ -262,40 +262,49 @@ __global__ void gpu_stokes_Spread_kernel( \param NxNyNz total number of grid nodes */ __global__ void gpu_stokes_Green_kernel( - CUFFTCOMPLEX *gridX, - CUFFTCOMPLEX *gridY, - CUFFTCOMPLEX *gridZ, - Scalar4 *gridk, - unsigned int NxNyNz - ) { - - int tid = blockDim.x * blockIdx.x + threadIdx.x; - - if ( tid < NxNyNz ) { - - // Read the FFT force from global memory - Scalar2 fX = gridX[tid]; - Scalar2 fY = gridY[tid]; - Scalar2 fZ = gridZ[tid]; - - // Current wave-space vector - Scalar4 tk = gridk[tid]; - Scalar ksq = tk.x*tk.x + tk.y*tk.y + tk.z*tk.z; - Scalar k = sqrtf( ksq ); - - // Dot product of the wave-vector with the force - Scalar2 kdF = (tid==0) ? make_scalar2(0.0,0.0) : make_scalar2( ( tk.x*fX.x + tk.y*fY.x + tk.z*fZ.x ) / ksq, ( tk.x*fX.y + tk.y*fY.y + tk.z*fZ.y ) / ksq ); - - // Scaling factor - Scalar B = (tid==0) ? 0.0 : tk.w * ( sinf( k ) / k ) * ( sinf( k ) / k ); - - // Write the velocity to global memory - gridX[tid] = make_scalar2( ( fX.x - tk.x * kdF.x ) * B, ( fX.y - tk.x * kdF.y ) * B ); - gridY[tid] = make_scalar2( ( fY.x - tk.y * kdF.x ) * B, ( fY.y - tk.y * kdF.y ) * B ); - gridZ[tid] = make_scalar2( ( fZ.x - tk.z * kdF.x ) * B, ( fZ.y - tk.z * kdF.y ) * B ); - - - } + CUFFTCOMPLEX *gridX, + CUFFTCOMPLEX *gridY, + CUFFTCOMPLEX *gridZ, + Scalar4 *gridk, + unsigned int NxNyNz + ) { + + int tid = blockDim.x * blockIdx.x + threadIdx.x; + + if ( tid < NxNyNz ) { + + // Read the FFT force from global memory + Scalar2 fX, fY, fZ; + fX.x = gridX[tid].x; + fX.y = gridX[tid].y; + fY.x = gridY[tid].x; + fY.y = gridY[tid].y; + fZ.x = gridZ[tid].x; + fZ.y = gridZ[tid].y; + + // Current wave-space vector + Scalar4 tk = gridk[tid]; + Scalar ksq = tk.x*tk.x + tk.y*tk.y + tk.z*tk.z; + Scalar k = sqrtf( ksq ); + + // Dot product of the wave-vector with the force + Scalar2 kdF = (tid==0) ? make_scalar2(0.0,0.0) : make_scalar2( ( tk.x*fX.x + tk.y*fY.x + tk.z*fZ.x ) / ksq, ( tk.x*fX.y + tk.y*fY.y + tk.z*fZ.y ) / ksq ); + + // Scaling factor + Scalar B = (tid==0) ? 0.0 : tk.w * ( sinf( k ) / k ) * ( sinf( k ) / k ); + + // Write the velocity to global memory + Scalar2 gX, gY, gZ; + gX = make_scalar2( ( fX.x - tk.x * kdF.x ) * B, ( fX.y - tk.x * kdF.y ) * B ); + gridX[tid].x = gX.x; + gridX[tid].y = gX.y; + gY = make_scalar2( ( fY.x - tk.y * kdF.x ) * B, ( fY.y - tk.y * kdF.y ) * B ); + gridY[tid].x = gY.x; + gridY[tid].y = gY.y; + gZ = make_scalar2( ( fZ.x - tk.z * kdF.x ) * B, ( fZ.y - tk.z * kdF.y ) * B ); + gridZ[tid].x = gZ.x; + gridZ[tid].y = gZ.y; + } } //! Add velocity from grid to particles ( Same Size Particles, Block Per Particle (support) ) @@ -315,170 +324,170 @@ __global__ void gpu_stokes_Green_kernel( \param box array containing box dimensions \param P number of grid nodes in support of spreading Gaussians \param gridh space between grid nodes in each dimension - \param prefac Spreading function prefactor - \param expfac Spreading function exponential factor + \param prefac Spreading function prefactor + \param expfac Spreading function exponential factor One 3-D block of threads is launched per particle (block dimension = PxPxP). Max dimension is 10x10x10 because of shared memory limitations. If P > 10, each thread will do more than one grid point worth of work. 
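// gpu_stokes_Green_kernel above applies, mode by mode,
//     u_hat(k) = B(k) * ( f_hat(k) - k_hat * ( k_hat . f_hat(k) ) ),
// i.e. it removes the compressible part of the spread force and scales by
//     B(k) = gridk.w * ( sin(k)/k )^2,   k = |k|,
// with the k = 0 mode set to zero. gridk.w carries the remaining wave-space
// factor, including the 1/(Nx*Ny*Nz) FFT normalization, and is filled in when
// the grid vectors are built. A scalar sketch for the real part of one mode
// (illustrative; the function name is not from the plugin):
#include <cmath>
static void scale_mode_real(float kx, float ky, float kz, float w,
                            float& fx, float& fy, float& fz)
{
    float ksq = kx*kx + ky*ky + kz*kz;
    if (ksq == 0.0f) { fx = fy = fz = 0.0f; return; }    // k = 0 term excluded
    float k   = std::sqrt(ksq);
    float B   = w * (std::sin(k)/k) * (std::sin(k)/k);
    float kdF = (kx*fx + ky*fy + kz*fz) / ksq;            // (k . f) / |k|^2
    fx = (fx - kx*kdF) * B;
    fy = (fy - ky*kdF) * B;
    fz = (fz - kz*kdF) * B;
}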
*/ -__global__ void gpu_stokes_Contract_kernel( - Scalar4 *d_pos, - Scalar4 *d_vel, - CUFFTCOMPLEX *gridX, - CUFFTCOMPLEX *gridY, - CUFFTCOMPLEX *gridZ, - int group_size, - int Nx, - int Ny, - int Nz, - Scalar xi, - Scalar eta, - unsigned int *d_group_members, - BoxDim box, - const int P, - Scalar3 gridh, - Scalar prefac, - Scalar expfac - ){ - - // Shared memory for particle velocity and position, so that each block - // only has to read one - extern __shared__ Scalar3 shared[]; - - Scalar3 *velocity = shared; - Scalar3 *pos_shared = &shared[blockDim.x*blockDim.y*blockDim.z]; - - // Particle index within each group (block per particle) - int group_idx = blockIdx.x; - - // Thread index within the block (grid point index) - int thread_offset = threadIdx.z + threadIdx.y * blockDim.z + threadIdx.x * blockDim.z*blockDim.y; - - // Total number of threads within the block - int block_size = blockDim.x * blockDim.y * blockDim.z; - - // Global particle ID - unsigned int idx = d_group_members[group_idx]; - - // Initialize shared memory and get particle position - velocity[thread_offset] = make_scalar3(0.0,0.0,0.0); - if ( thread_offset == 0 ){ - Scalar4 tpos = texFetchScalar4(d_pos, pos_tex, idx); - pos_shared[0] = make_scalar3( tpos.x, tpos.y, tpos.z ); - } - __syncthreads(); - - // Box dimension - Scalar3 L = box.getL(); - Scalar3 Ld2 = L / 2.0; - - // Retrieve position from shared memory - Scalar3 pos = pos_shared[0]; - - // Fractional position within box - Scalar3 pos_frac = box.makeFraction(pos); - - pos_frac.x *= (Scalar)Nx; - pos_frac.y *= (Scalar)Ny; - pos_frac.z *= (Scalar)Nz; - - int x = int( pos_frac.x ); - int y = int( pos_frac.y ); - int z = int( pos_frac.z ); - - // Amount of work needed for each thread to cover support - // (Required in case support size is larger than grid dimension, - // but in most cases, should have n.x = n.y = n.z = 1 ) - int3 n, t; +__global__ void gpu_stokes_Contract_kernel( + Scalar4 *d_pos, + Scalar4 *d_vel, + CUFFTCOMPLEX *gridX, + CUFFTCOMPLEX *gridY, + CUFFTCOMPLEX *gridZ, + int group_size, + int Nx, + int Ny, + int Nz, + Scalar xi, + Scalar eta, + unsigned int *d_group_members, + BoxDim box, + const int P, + Scalar3 gridh, + Scalar prefac, + Scalar expfac + ){ + + // Shared memory for particle velocity and position, so that each block + // only has to read one + extern __shared__ Scalar3 shared[]; + + Scalar3 *velocity = shared; + Scalar3 *pos_shared = &shared[blockDim.x*blockDim.y*blockDim.z]; + + // Particle index within each group (block per particle) + int group_idx = blockIdx.x; + + // Thread index within the block (grid point index) + int thread_offset = threadIdx.z + threadIdx.y * blockDim.z + threadIdx.x * blockDim.z*blockDim.y; + + // Total number of threads within the block + int block_size = blockDim.x * blockDim.y * blockDim.z; + + // Global particle ID + unsigned int idx = d_group_members[group_idx]; + + // Initialize shared memory and get particle position + velocity[thread_offset] = make_scalar3(0.0,0.0,0.0); + if ( thread_offset == 0 ){ + Scalar4 tpos = texFetchScalar4(d_pos, pos_tex, idx); + pos_shared[0] = make_scalar3( tpos.x, tpos.y, tpos.z ); + } + __syncthreads(); + + // Box dimension + Scalar3 L = box.getL(); + Scalar3 Ld2 = L / 2.0; + + // Retrieve position from shared memory + Scalar3 pos = pos_shared[0]; + + // Fractional position within box + Scalar3 pos_frac = box.makeFraction(pos); + + pos_frac.x *= (Scalar)Nx; + pos_frac.y *= (Scalar)Ny; + pos_frac.z *= (Scalar)Nz; + + int x = int( pos_frac.x ); + int y = int( pos_frac.y ); 
+ int z = int( pos_frac.z ); + + // Amount of work needed for each thread to cover support + // (Required in case support size is larger than grid dimension, + // but in most cases, should have n.x = n.y = n.z = 1 ) + int3 n, t; n.x = ( P + blockDim.x - 1 ) / blockDim.x; // ceiling n.y = ( P + blockDim.y - 1 ) / blockDim.y; n.z = ( P + blockDim.z - 1 ) / blockDim.z; - // Grid point associated with current thread - int Pd2 = P / 2; // integer division does floor - - for ( int ii = 0; ii < n.x; ++ii ){ - - t.x = threadIdx.x + ii*blockDim.x; - - for ( int jj = 0; jj < n.y; ++jj ){ - - t.y = threadIdx.y + jj*blockDim.y; - - for ( int kk = 0; kk < n.z; ++kk ){ - - t.z = threadIdx.z + kk*blockDim.z; - - if( ( t.x < P ) && ( t.y < P ) && ( t.z < P ) ){ - - // x,y,z indices for current thread - // - // Arithmetic with P makes sure distribution is centered on the particle - int x_inp = x + t.x - Pd2 + 1 - (P % 2) * ( pos_frac.x - Scalar( x ) < 0.5 ); - int y_inp = y + t.y - Pd2 + 1 - (P % 2) * ( pos_frac.y - Scalar( y ) < 0.5 ); - int z_inp = z + t.z - Pd2 + 1 - (P % 2) * ( pos_frac.z - Scalar( z ) < 0.5 ); - - // Periodic wrapping of grid point - x_inp = (x_inp<0) ? x_inp+Nx : ( (x_inp>Nx-1) ? x_inp-Nx : x_inp ); - y_inp = (y_inp<0) ? y_inp+Ny : ( (y_inp>Ny-1) ? y_inp-Ny : y_inp ); - z_inp = (z_inp<0) ? z_inp+Nz : ( (z_inp>Nz-1) ? z_inp-Nz : z_inp ); - - // x,y,z coordinates for current thread - Scalar3 pos_grid; - pos_grid.x = gridh.x*x_inp - Ld2.x; - pos_grid.y = gridh.y*y_inp - Ld2.y; - pos_grid.z = gridh.z*z_inp - Ld2.z; - - // Shear the grid position - // !!! This only works for linear shear where the shear gradient is along y - // and the shear direction is along x - pos_grid.x = pos_grid.x + box.getTiltFactorXY() * pos_grid.y; - - // Global index for current grid point - int grid_idx = x_inp * Ny * Nz + y_inp * Nz + z_inp; - - // Distance from particle to grid node - Scalar3 r = pos_grid - pos; - r = box.minImage(r); - Scalar rsq = r.x*r.x + r.y*r.y + r.z*r.z; - - // Spreading Factor - Scalar Cfac = prefac * expf( -expfac * rsq ); - - // Get velocity from reduction (THIS IS THE SLOW STEP): - velocity[thread_offset] += Cfac * make_scalar3( gridX[grid_idx].x, gridY[grid_idx].x, gridZ[grid_idx].x ); - } - }//kk - }//jj - }//ii - - // Intra-block reduction for the total particle velocity - // (add contributions from all grid points) - int offs = block_size; - int offs_prev; - while (offs > 1) - { - offs_prev = offs; - offs = ( offs + 1 ) / 2; - __syncthreads(); - if (thread_offset + offs < offs_prev) - { - velocity[thread_offset] += velocity[thread_offset + offs]; - } - - } - - // Write out to global memory - if (thread_offset == 0){ - d_vel[idx] = make_scalar4(velocity[0].x, velocity[0].y, velocity[0].z, d_vel[idx].w); - } - + // Grid point associated with current thread + int Pd2 = P / 2; // integer division does floor + + for ( int ii = 0; ii < n.x; ++ii ){ + + t.x = threadIdx.x + ii*blockDim.x; + + for ( int jj = 0; jj < n.y; ++jj ){ + + t.y = threadIdx.y + jj*blockDim.y; + + for ( int kk = 0; kk < n.z; ++kk ){ + + t.z = threadIdx.z + kk*blockDim.z; + + if( ( t.x < P ) && ( t.y < P ) && ( t.z < P ) ){ + + // x,y,z indices for current thread + // + // Arithmetic with P makes sure distribution is centered on the particle + int x_inp = x + t.x - Pd2 + 1 - (P % 2) * ( pos_frac.x - Scalar( x ) < 0.5 ); + int y_inp = y + t.y - Pd2 + 1 - (P % 2) * ( pos_frac.y - Scalar( y ) < 0.5 ); + int z_inp = z + t.z - Pd2 + 1 - (P % 2) * ( pos_frac.z - Scalar( z ) < 0.5 ); + + // Periodic wrapping of grid 
point + x_inp = (x_inp<0) ? x_inp+Nx : ( (x_inp>Nx-1) ? x_inp-Nx : x_inp ); + y_inp = (y_inp<0) ? y_inp+Ny : ( (y_inp>Ny-1) ? y_inp-Ny : y_inp ); + z_inp = (z_inp<0) ? z_inp+Nz : ( (z_inp>Nz-1) ? z_inp-Nz : z_inp ); + + // x,y,z coordinates for current thread + Scalar3 pos_grid; + pos_grid.x = gridh.x*x_inp - Ld2.x; + pos_grid.y = gridh.y*y_inp - Ld2.y; + pos_grid.z = gridh.z*z_inp - Ld2.z; + + // Shear the grid position + // !!! This only works for linear shear where the shear gradient is along y + // and the shear direction is along x + pos_grid.x = pos_grid.x + box.getTiltFactorXY() * pos_grid.y; + + // Global index for current grid point + int grid_idx = x_inp * Ny * Nz + y_inp * Nz + z_inp; + + // Distance from particle to grid node + Scalar3 r = pos_grid - pos; + r = box.minImage(r); + Scalar rsq = r.x*r.x + r.y*r.y + r.z*r.z; + + // Spreading Factor + Scalar Cfac = prefac * expf( -expfac * rsq ); + + // Get velocity from reduction (THIS IS THE SLOW STEP): + velocity[thread_offset] += Cfac * make_scalar3( gridX[grid_idx].x, gridY[grid_idx].x, gridZ[grid_idx].x ); + } + }//kk + }//jj + }//ii + + // Intra-block reduction for the total particle velocity + // (add contributions from all grid points) + int offs = block_size; + int offs_prev; + while (offs > 1) + { + offs_prev = offs; + offs = ( offs + 1 ) / 2; + __syncthreads(); + if (thread_offset + offs < offs_prev) + { + velocity[thread_offset] += velocity[thread_offset + offs]; + } + + } + + // Write out to global memory + if (thread_offset == 0){ + d_vel[idx] = make_scalar4(velocity[0].x, velocity[0].y, velocity[0].z, d_vel[idx].w); + } + } /*! - Wrapper to drive all the kernel functions used to compute - the wave space part of Mobility ( Same Size Particles ) + Wrapper to drive all the kernel functions used to compute + the wave space part of Mobility ( Same Size Particles ) */ /*! \param d_pos positions of the particles, actually they are fetched on texture memory @@ -513,64 +522,64 @@ __global__ void gpu_stokes_Contract_kernel( \param gridh distance between grid nodes */ void gpu_stokes_Mwave_wrap( - Scalar4 *d_pos, - Scalar4 *d_vel, - Scalar4 *d_net_force, - unsigned int *d_group_members, - unsigned int group_size, - const BoxDim& box, - Scalar xi, - Scalar eta, - Scalar4 *d_gridk, - CUFFTCOMPLEX *d_gridX, - CUFFTCOMPLEX *d_gridY, - CUFFTCOMPLEX *d_gridZ, - cufftHandle plan, - const int Nx, - const int Ny, - const int Nz, - unsigned int NxNyNz, - dim3 grid, - dim3 threads, - int gridBlockSize, - int gridNBlock, - const int P, - Scalar3 gridh - ){ + Scalar4 *d_pos, + Scalar4 *d_vel, + Scalar4 *d_net_force, + unsigned int *d_group_members, + unsigned int group_size, + const BoxDim& box, + Scalar xi, + Scalar eta, + Scalar4 *d_gridk, + CUFFTCOMPLEX *d_gridX, + CUFFTCOMPLEX *d_gridY, + CUFFTCOMPLEX *d_gridZ, + cufftHandle plan, + const int Nx, + const int Ny, + const int Nz, + unsigned int NxNyNz, + dim3 grid, + dim3 threads, + int gridBlockSize, + int gridNBlock, + const int P, + Scalar3 gridh + ){ - // Spreading and contraction stuff - dim3 Cgrid( group_size, 1, 1); - int B = ( P < 10 ) ? 
P : 10; - dim3 Cthreads(B, B, B); - - Scalar quadW = gridh.x * gridh.y * gridh.z; - Scalar xisq = xi * xi; - Scalar prefac = ( 2.0 * xisq / 3.1415926536 / eta ) * sqrtf( 2.0 * xisq / 3.1415926536 / eta ); - Scalar expfac = 2.0 * xisq / eta; - - // Reset the grid ( remove any previously distributed forces ) - gpu_stokes_ZeroGrid_kernel<<>>(d_gridX,NxNyNz); - gpu_stokes_ZeroGrid_kernel<<>>(d_gridY,NxNyNz); - gpu_stokes_ZeroGrid_kernel<<>>(d_gridZ,NxNyNz); - - // Spread forces onto grid - gpu_stokes_Spread_kernel<<>>( d_pos, d_net_force, d_gridX, d_gridY, d_gridZ, group_size, Nx, Ny, Nz, d_group_members, box, P, gridh, xi, eta, prefac, expfac ); - - // Perform FFT on gridded forces - cufftExecC2C(plan, d_gridX, d_gridX, CUFFT_FORWARD); - cufftExecC2C(plan, d_gridY, d_gridY, CUFFT_FORWARD); - cufftExecC2C(plan, d_gridZ, d_gridZ, CUFFT_FORWARD); - - // Apply wave space scaling to FFT'd forces - gpu_stokes_Green_kernel<<>>( d_gridX, d_gridY, d_gridZ, d_gridk, NxNyNz); - - // Return rescaled forces to real space - cufftExecC2C(plan, d_gridX, d_gridX, CUFFT_INVERSE); - cufftExecC2C(plan, d_gridY, d_gridY, CUFFT_INVERSE); - cufftExecC2C(plan, d_gridZ, d_gridZ, CUFFT_INVERSE); - - // Evaluate contribution of grid velocities at particle centers - gpu_stokes_Contract_kernel<<>>( d_pos, d_vel, d_gridX, d_gridY, d_gridZ, group_size, Nx, Ny, Nz, xi, eta, d_group_members, box, P, gridh, quadW*prefac, expfac ); + // Spreading and contraction stuff + dim3 Cgrid( group_size, 1, 1); + int B = ( P < 10 ) ? P : 10; + dim3 Cthreads(B, B, B); + + Scalar quadW = gridh.x * gridh.y * gridh.z; + Scalar xisq = xi * xi; + Scalar prefac = ( 2.0 * xisq / 3.1415926536 / eta ) * sqrtf( 2.0 * xisq / 3.1415926536 / eta ); + Scalar expfac = 2.0 * xisq / eta; + + // Reset the grid ( remove any previously distributed forces ) + gpu_stokes_ZeroGrid_kernel<<>>(d_gridX,NxNyNz); + gpu_stokes_ZeroGrid_kernel<<>>(d_gridY,NxNyNz); + gpu_stokes_ZeroGrid_kernel<<>>(d_gridZ,NxNyNz); + + // Spread forces onto grid + gpu_stokes_Spread_kernel<<>>( d_pos, d_net_force, d_gridX, d_gridY, d_gridZ, group_size, Nx, Ny, Nz, d_group_members, box, P, gridh, xi, eta, prefac, expfac ); + + // Perform FFT on gridded forces + cufftExecC2C(plan, d_gridX, d_gridX, CUFFT_FORWARD); + cufftExecC2C(plan, d_gridY, d_gridY, CUFFT_FORWARD); + cufftExecC2C(plan, d_gridZ, d_gridZ, CUFFT_FORWARD); + + // Apply wave space scaling to FFT'd forces + gpu_stokes_Green_kernel<<>>( d_gridX, d_gridY, d_gridZ, d_gridk, NxNyNz); + + // Return rescaled forces to real space + cufftExecC2C(plan, d_gridX, d_gridX, CUFFT_INVERSE); + cufftExecC2C(plan, d_gridY, d_gridY, CUFFT_INVERSE); + cufftExecC2C(plan, d_gridZ, d_gridZ, CUFFT_INVERSE); + + // Evaluate contribution of grid velocities at particle centers + gpu_stokes_Contract_kernel<<>>( d_pos, d_vel, d_gridX, d_gridY, d_gridZ, group_size, Nx, Ny, Nz, xi, eta, d_group_members, box, P, gridh, quadW*prefac, expfac ); } @@ -591,108 +600,108 @@ void gpu_stokes_Mwave_wrap( \param d_nlist list containing neighbors of all particles \param d_headlist list of particle offsets into d_nlist */ -__global__ void gpu_stokes_Mreal_kernel( - Scalar4 *d_pos, - Scalar4 *d_vel, - Scalar4 *d_net_force, - int group_size, - Scalar xi, - Scalar4 *d_ewaldC1, - Scalar self, - Scalar ewald_cut, - int ewald_n, - Scalar ewald_dr, - unsigned int *d_group_members, - BoxDim box, - const unsigned int *d_n_neigh, - const unsigned int *d_nlist, - const unsigned int *d_headlist - ){ +__global__ void gpu_stokes_Mreal_kernel( + Scalar4 *d_pos, + Scalar4 *d_vel, + 
Scalar4 *d_net_force, + int group_size, + Scalar xi, + Scalar4 *d_ewaldC1, + Scalar self, + Scalar ewald_cut, + int ewald_n, + Scalar ewald_dr, + unsigned int *d_group_members, + BoxDim box, + const unsigned int *d_n_neigh, + const unsigned int *d_nlist, + const unsigned int *d_headlist + ){ - // Index for current thread - int group_idx = blockDim.x * blockIdx.x + threadIdx.x; - - // Initialize contribution to velocity - Scalar4 u = make_scalar4( 0.0, 0.0, 0.0, 0.0 ); - - if (group_idx < group_size) { - - // Particle for this thread - unsigned int idx = d_group_members[group_idx]; - - // Number of neighbors for current particle - unsigned int n_neigh = d_n_neigh[idx]; - unsigned int head_idx = d_headlist[idx]; - - // Particle position and table ID - Scalar4 posi = texFetchScalar4(d_pos, pos_tex, idx); - - // Self contribution - Scalar4 F = d_net_force[idx]; - u = make_scalar4( self * F.x, self * F.y, self * F.z, 0.0 ); - - // Minimum and maximum distance for pair calculation - Scalar mindistSq = ewald_dr * ewald_dr; - Scalar maxdistSq = ewald_cut * ewald_cut; - - for (int neigh_idx = 0; neigh_idx < n_neigh; neigh_idx++) { - - // Get index for current neightbor - unsigned int cur_j = d_nlist[ head_idx + neigh_idx ]; - - // Position and size of neighbor particle - Scalar4 posj = texFetchScalar4(d_pos, pos_tex, cur_j); - - // Distance vector between current particle and neighbor - Scalar3 r = make_scalar3( posi.x - posj.x, posi.y - posj.y, posi.z - posj.z ); - r = box.minImage(r); - Scalar distSqr = dot(r,r); - - // Add neighbor contribution if it is within the real space cutoff radius - if ( ( distSqr < maxdistSq ) && ( distSqr >= mindistSq ) ) { - - // Need distance - Scalar dist = sqrtf( distSqr ); - - // Force on neighbor particle - Scalar4 Fj = d_net_force[cur_j]; - - // Fetch relevant elements from textured table for real space interaction - int r_ind = __scalar2int_rd( ewald_n * ( dist - ewald_dr ) / ( ewald_cut - ewald_dr ) ); - int offset = r_ind; - - Scalar4 tewaldC1 = texFetchScalar4(d_ewaldC1, tables1_tex, offset); - - // Linear interpolation of table - Scalar fac = dist / ewald_dr - r_ind - Scalar(1.0); - - Scalar Imrr = tewaldC1.x + ( tewaldC1.z - tewaldC1.x ) * fac; - Scalar rr = tewaldC1.y + ( tewaldC1.w - tewaldC1.y ) * fac; - - // Update velocity - Scalar rdotf = ( r.x*Fj.x + r.y*Fj.y + r.z*Fj.z ) / distSqr; - - u.x += Imrr * Fj.x + ( rr - Imrr ) * rdotf * r.x; - u.y += Imrr * Fj.y + ( rr - Imrr ) * rdotf * r.y; - u.z += Imrr * Fj.z + ( rr - Imrr ) * rdotf * r.z; - - } - - } - - // Write to output - d_vel[idx] = u; - - } + // Index for current thread + int group_idx = blockDim.x * blockIdx.x + threadIdx.x; + + // Initialize contribution to velocity + Scalar4 u = make_scalar4( 0.0, 0.0, 0.0, 0.0 ); + + if (group_idx < group_size) { + + // Particle for this thread + unsigned int idx = d_group_members[group_idx]; + + // Number of neighbors for current particle + unsigned int n_neigh = d_n_neigh[idx]; + unsigned int head_idx = d_headlist[idx]; + + // Particle position and table ID + Scalar4 posi = texFetchScalar4(d_pos, pos_tex, idx); + + // Self contribution + Scalar4 F = d_net_force[idx]; + u = make_scalar4( self * F.x, self * F.y, self * F.z, 0.0 ); + + // Minimum and maximum distance for pair calculation + Scalar mindistSq = ewald_dr * ewald_dr; + Scalar maxdistSq = ewald_cut * ewald_cut; + + for (int neigh_idx = 0; neigh_idx < n_neigh; neigh_idx++) { + + // Get index for current neightbor + unsigned int cur_j = d_nlist[ head_idx + neigh_idx ]; + + // Position and size of 
neighbor particle + Scalar4 posj = texFetchScalar4(d_pos, pos_tex, cur_j); + + // Distance vector between current particle and neighbor + Scalar3 r = make_scalar3( posi.x - posj.x, posi.y - posj.y, posi.z - posj.z ); + r = box.minImage(r); + Scalar distSqr = dot(r,r); + + // Add neighbor contribution if it is within the real space cutoff radius + if ( ( distSqr < maxdistSq ) && ( distSqr >= mindistSq ) ) { + + // Need distance + Scalar dist = sqrtf( distSqr ); + + // Force on neighbor particle + Scalar4 Fj = d_net_force[cur_j]; + + // Fetch relevant elements from textured table for real space interaction + int r_ind = __scalar2int_rd( ewald_n * ( dist - ewald_dr ) / ( ewald_cut - ewald_dr ) ); + int offset = r_ind; + + Scalar4 tewaldC1 = texFetchScalar4(d_ewaldC1, tables1_tex, offset); + + // Linear interpolation of table + Scalar fac = dist / ewald_dr - r_ind - Scalar(1.0); + + Scalar Imrr = tewaldC1.x + ( tewaldC1.z - tewaldC1.x ) * fac; + Scalar rr = tewaldC1.y + ( tewaldC1.w - tewaldC1.y ) * fac; + + // Update velocity + Scalar rdotf = ( r.x*Fj.x + r.y*Fj.y + r.z*Fj.z ) / distSqr; + + u.x += Imrr * Fj.x + ( rr - Imrr ) * rdotf * r.x; + u.y += Imrr * Fj.y + ( rr - Imrr ) * rdotf * r.y; + u.z += Imrr * Fj.z + ( rr - Imrr ) * rdotf * r.z; + + } + + } + + // Write to output + d_vel[idx] = u; + + } } /*! - Wrap all the functions to compute U = M * F ( SAME SIZE PARTICLES ) - Drive GPU kernel functions + Wrap all the functions to compute U = M * F ( SAME SIZE PARTICLES ) + Drive GPU kernel functions - d_vel = M * d_net_force + d_vel = M * d_net_force */ /*! \param d_pos positions of the particles, actually they are fetched on texture memory @@ -727,57 +736,57 @@ __global__ void gpu_stokes_Mreal_kernel( \param gridh distance between grid nodes */ void gpu_stokes_Mobility_wrap( - Scalar4 *d_pos, - Scalar4 *d_vel, - Scalar4 *d_net_force, - unsigned int *d_group_members, - unsigned int group_size, - const BoxDim& box, - Scalar xi, - Scalar eta, - Scalar ewald_cut, - Scalar ewald_dr, - int ewald_n, - Scalar4 *d_ewaldC1, - Scalar self, - Scalar4 *d_gridk, - CUFFTCOMPLEX *d_gridX, - CUFFTCOMPLEX *d_gridY, - CUFFTCOMPLEX *d_gridZ, - cufftHandle plan, - const int Nx, - const int Ny, - const int Nz, - const unsigned int *d_n_neigh, - const unsigned int *d_nlist, - const unsigned int *d_headlist, - unsigned int NxNyNz, - dim3 grid, - dim3 threads, - int gridBlockSize, - int gridNBlock, - const int P, - Scalar3 gridh ){ - - // Real and wave space velocity - Scalar4 *d_vel1, *d_vel2; - cudaMalloc( &d_vel1, group_size*sizeof(Scalar4) ); - cudaMalloc( &d_vel2, group_size*sizeof(Scalar4) ); - - // Add the wave space contribution to the velocity - gpu_stokes_Mwave_wrap( d_pos, d_vel1, d_net_force, d_group_members, group_size, box, xi, eta, d_gridk, d_gridX, d_gridY, d_gridZ, plan, Nx, Ny, Nz, NxNyNz, grid, threads, gridBlockSize, gridNBlock, P, gridh ); - - // Add the real space contribution to the velocity - // - // Real space calculation takes care of self contributions - gpu_stokes_Mreal_kernel<<>>(d_pos, d_vel2, d_net_force, group_size, xi, d_ewaldC1, self, ewald_cut, ewald_n, ewald_dr, d_group_members, box, d_n_neigh, d_nlist, d_headlist ); - - // Add real and wave space parts together - gpu_stokes_LinearCombination_kernel<<>>(d_vel1, d_vel2, d_vel, 1.0, 1.0, group_size, d_group_members); - - // Free memory - cudaFree(d_vel1); - cudaFree(d_vel2); + Scalar4 *d_pos, + Scalar4 *d_vel, + Scalar4 *d_net_force, + unsigned int *d_group_members, + unsigned int group_size, + const BoxDim& box, + Scalar xi, + 
Scalar eta, + Scalar ewald_cut, + Scalar ewald_dr, + int ewald_n, + Scalar4 *d_ewaldC1, + Scalar self, + Scalar4 *d_gridk, + CUFFTCOMPLEX *d_gridX, + CUFFTCOMPLEX *d_gridY, + CUFFTCOMPLEX *d_gridZ, + cufftHandle plan, + const int Nx, + const int Ny, + const int Nz, + const unsigned int *d_n_neigh, + const unsigned int *d_nlist, + const unsigned int *d_headlist, + unsigned int NxNyNz, + dim3 grid, + dim3 threads, + int gridBlockSize, + int gridNBlock, + const int P, + Scalar3 gridh ){ + + // Real and wave space velocity + Scalar4 *d_vel1, *d_vel2; + cudaMalloc( &d_vel1, group_size*sizeof(Scalar4) ); + cudaMalloc( &d_vel2, group_size*sizeof(Scalar4) ); + + // Add the wave space contribution to the velocity + gpu_stokes_Mwave_wrap( d_pos, d_vel1, d_net_force, d_group_members, group_size, box, xi, eta, d_gridk, d_gridX, d_gridY, d_gridZ, plan, Nx, Ny, Nz, NxNyNz, grid, threads, gridBlockSize, gridNBlock, P, gridh ); + + // Add the real space contribution to the velocity + // + // Real space calculation takes care of self contributions + gpu_stokes_Mreal_kernel<<>>(d_pos, d_vel2, d_net_force, group_size, xi, d_ewaldC1, self, ewald_cut, ewald_n, ewald_dr, d_group_members, box, d_n_neigh, d_nlist, d_headlist ); + + // Add real and wave space parts together + gpu_stokes_LinearCombination_kernel<<>>(d_vel1, d_vel2, d_vel, 1.0, 1.0, group_size, d_group_members); + + // Free memory + cudaFree(d_vel1); + cudaFree(d_vel2); } diff --git a/PSEv1/Mobility.cuh b/PSEv1/Mobility.cuh index a5a7963..b62f8e6 100644 --- a/PSEv1/Mobility.cuh +++ b/PSEv1/Mobility.cuh @@ -71,90 +71,90 @@ ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. void gpu_stokes_Mobility_wrap( Scalar4 *d_pos, - Scalar4 *d_vel, - Scalar4 *d_net_force, - unsigned int *d_group_members, - unsigned int group_size, - const BoxDim& box, - Scalar xi, - Scalar eta, - Scalar ewald_cut, - Scalar ewald_dr, - int ewald_n, - Scalar4 *d_ewaldC1, - Scalar self, - Scalar4 *d_gridk, - CUFFTCOMPLEX *d_gridX, - CUFFTCOMPLEX *d_gridY, - CUFFTCOMPLEX *d_gridZ, - cufftHandle plan, - const int Nx, - const int Ny, - const int Nz, - const unsigned int *d_n_neigh, - const unsigned int *d_nlist, - const unsigned int *d_headlist, - unsigned int NxNyNz, - dim3 grid, - dim3 threads, - int gridBlockSize, - int gridNBlock, - const int P, - Scalar3 gridh ); + Scalar4 *d_vel, + Scalar4 *d_net_force, + unsigned int *d_group_members, + unsigned int group_size, + const BoxDim& box, + Scalar xi, + Scalar eta, + Scalar ewald_cut, + Scalar ewald_dr, + int ewald_n, + Scalar4 *d_ewaldC1, + Scalar self, + Scalar4 *d_gridk, + CUFFTCOMPLEX *d_gridX, + CUFFTCOMPLEX *d_gridY, + CUFFTCOMPLEX *d_gridZ, + cufftHandle plan, + const int Nx, + const int Ny, + const int Nz, + const unsigned int *d_n_neigh, + const unsigned int *d_nlist, + const unsigned int *d_headlist, + unsigned int NxNyNz, + dim3 grid, + dim3 threads, + int gridBlockSize, + int gridNBlock, + const int P, + Scalar3 gridh ); __global__ -void gpu_stokes_Mreal_kernel( Scalar4 *d_pos, - Scalar4 *d_vel, - Scalar4 *d_net_force, - int group_size, - Scalar xi, - Scalar4 *d_ewaldC1, - Scalar self, - Scalar ewald_cut, - int ewald_n, - Scalar ewald_dr, - unsigned int *d_group_members, - BoxDim box, - const unsigned int *d_n_neigh, - const unsigned int *d_nlist, - const unsigned int *d_headlist ); - -__global__ void gpu_stokes_Spread_kernel( Scalar4 *d_pos, - Scalar4 *d_net_force, - CUFFTCOMPLEX *gridX, - CUFFTCOMPLEX *gridY, - CUFFTCOMPLEX *gridZ, - int group_size, - int Nx, - int Ny, - int Nz, - unsigned int *d_group_members, - BoxDim box, - 
const int P, - Scalar3 gridh, - Scalar xi, - Scalar eta, - Scalar prefac, - Scalar expfac ); +void gpu_stokes_Mreal_kernel( Scalar4 *d_pos, + Scalar4 *d_vel, + Scalar4 *d_net_force, + int group_size, + Scalar xi, + Scalar4 *d_ewaldC1, + Scalar self, + Scalar ewald_cut, + int ewald_n, + Scalar ewald_dr, + unsigned int *d_group_members, + BoxDim box, + const unsigned int *d_n_neigh, + const unsigned int *d_nlist, + const unsigned int *d_headlist ); + +__global__ void gpu_stokes_Spread_kernel( Scalar4 *d_pos, + Scalar4 *d_net_force, + CUFFTCOMPLEX *gridX, + CUFFTCOMPLEX *gridY, + CUFFTCOMPLEX *gridZ, + int group_size, + int Nx, + int Ny, + int Nz, + unsigned int *d_group_members, + BoxDim box, + const int P, + Scalar3 gridh, + Scalar xi, + Scalar eta, + Scalar prefac, + Scalar expfac ); __global__ void gpu_stokes_Green_kernel(CUFFTCOMPLEX *gridX, CUFFTCOMPLEX *gridY, CUFFTCOMPLEX *gridZ, Scalar4 *gridk, unsigned int NxNyNz); -__global__ void gpu_stokes_Contract_kernel( Scalar4 *d_pos, - Scalar4 *d_vel, - CUFFTCOMPLEX *gridX, - CUFFTCOMPLEX *gridY, - CUFFTCOMPLEX *gridZ, - int group_size, - int Nx, - int Ny, - int Nz, - Scalar xi, - Scalar eta, - unsigned int *d_group_members, - BoxDim box, - const int P, - Scalar3 gridh, - Scalar prefac, - Scalar expfac ); +__global__ void gpu_stokes_Contract_kernel( Scalar4 *d_pos, + Scalar4 *d_vel, + CUFFTCOMPLEX *gridX, + CUFFTCOMPLEX *gridY, + CUFFTCOMPLEX *gridZ, + int group_size, + int Nx, + int Ny, + int Nz, + Scalar xi, + Scalar eta, + unsigned int *d_group_members, + BoxDim box, + const int P, + Scalar3 gridh, + Scalar prefac, + Scalar expfac ); #endif diff --git a/PSEv1/Stokes.cc b/PSEv1/Stokes.cc index db54b2a..4ca1939 100644 --- a/PSEv1/Stokes.cc +++ b/PSEv1/Stokes.cc @@ -73,40 +73,40 @@ using namespace std; */ /*! - \param sysdef SystemDefinition this method will act on. Must not be NULL. - \param group The group of particles this integration method is to work on - \param T temperature - \param seed seed for random number generator - \param nlist neighbor list - \param xi Ewald parameter - \param m_error Tolerance for all calculations + \param sysdef SystemDefinition this method will act on. Must not be NULL. 
+ \param group The group of particles this integration method is to work on + \param T temperature + \param seed seed for random number generator + \param nlist neighbor list + \param xi Ewald parameter + \param m_error Tolerance for all calculations */ -Stokes::Stokes( std::shared_ptr sysdef, +Stokes::Stokes( std::shared_ptr sysdef, std::shared_ptr group, - std::shared_ptr T, - unsigned int seed, - std::shared_ptr nlist, - Scalar xi, - Scalar error ) - : IntegrationMethodTwoStep(sysdef, group), - m_T(T), - m_seed(seed), - m_nlist(nlist), - m_xi(xi), - m_error(error) + std::shared_ptr T, + unsigned int seed, + std::shared_ptr nlist, + Scalar xi, + Scalar error ) + : IntegrationMethodTwoStep(sysdef, group), + m_T(T), + m_seed(seed), + m_nlist(nlist), + m_xi(xi), + m_error(error) { m_exec_conf->msg->notice(5) << "Constructing Stokes" << endl; - // Hash the User's Seed to make it less likely to be a low positive integer - m_seed = m_seed * 0x12345677 + 0x12345; m_seed ^= (m_seed >> 16); m_seed *= 0x45679; + // Hash the User's Seed to make it less likely to be a low positive integer + m_seed = m_seed * 0x12345677 + 0x12345; m_seed ^= (m_seed >> 16); m_seed *= 0x45679; - // only one GPU is supported - if (!m_exec_conf->isCUDAEnabled()) - { - m_exec_conf->msg->error() << "Creating a Stokes when CUDA is disabled" << endl; - throw std::runtime_error("Error initializing Stokes"); - } + // only one GPU is supported + if (!m_exec_conf->isCUDAEnabled()) + { + m_exec_conf->msg->error() << "Creating a Stokes when CUDA is disabled" << endl; + throw std::runtime_error("Error initializing Stokes"); + } } @@ -114,240 +114,240 @@ Stokes::Stokes( std::shared_ptr sysdef, Stokes::~Stokes() { m_exec_conf->msg->notice(5) << "Destroying Stokes" << endl; - cufftDestroy(plan); + cufftDestroy(plan); } /*! - Set the parameters for Hydrodynamic Calculation. Do once at the beginning - of the simulation and then reuse computed values + Set the parameters for Hydrodynamic Calculation. Do once at the beginning + of the simulation and then reuse computed values - - Pre-tabulate real space interaction functions (f and g) - - Set wave space vectors + - Pre-tabulate real space interaction functions (f and g) + - Set wave space vectors */ void Stokes::setParams() { - // Try two Lanczos iterations to start (number of iterations will adapt as needed) - m_m_Lanczos = 2; - - // Real space cutoff - m_ewald_cut = sqrtf( - logf( m_error ) ) / m_xi; - - // Number of grid points - int kmax = int( 2.0 * sqrtf( - logf( m_error ) ) * m_xi ) + 1; - - const BoxDim& box = m_pdata->getBox(); // Only for box not changing with time. 
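// The two expressions above tie both halves of the Ewald split to the single
// error tolerance: rcut = sqrt(-ln(error))/xi for the real-space cutoff and
// kmax ~ 2*sqrt(-ln(error))*xi for the wave-space extent. A small worked
// example (the error and xi values here are chosen for illustration only):
#include <cmath>
static void ewald_split_example()
{
    double error = 1e-3, xi = 0.5;                                   // example inputs
    double rcut  = std::sqrt(-std::log(error)) / xi;                 // ~= 5.26 particle radii
    int    kmax  = int(2.0 * std::sqrt(-std::log(error)) * xi) + 1;  // = 3
    (void)rcut; (void)kmax;
}
// Raising xi shrinks rcut (cheaper neighbor sums) at the cost of a larger kmax,
// i.e. more Fourier grid points; the summary printed later reports the final choice.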
- Scalar3 L = box.getL(); - - m_Nx = int( kmax * L.x / (2.0 * 3.1415926536 ) * 2.0 ) + 1; - m_Ny = int( kmax * L.y / (2.0 * 3.1415926536 ) * 2.0 ) + 1; - m_Nz = int( kmax * L.z / (2.0 * 3.1415926536 ) * 2.0 ) + 1; - - // Get list of int values between 8 and 4096 that can be written as - // (2^a)*(3^b)*(5^c) - // Then sort list from low to high - // - // Go to such large values so as to be able simulate boxes with large - // aspect ratios - std::vector Mlist; - for ( int ii = 0; ii < 13; ++ii ){ - int pow2 = 1; - for ( int i = 0; i < ii; ++i ){ - pow2 *= 2; - } - for ( int jj = 0; jj < 8; ++jj ){ - int pow3 = 1; - for ( int j = 0; j < jj; ++j ){ - pow3 *= 3; - } - for ( int kk = 0; kk < 6; ++kk ){ - int pow5 = 1; - for ( int k = 0; k < kk; ++k ){ - pow5 *= 5; - } - int Mcurr = pow2 * pow3 * pow5; - if ( Mcurr >= 8 && Mcurr <= 4096 ){ - Mlist.push_back(Mcurr); - } - } - } - } - std::sort(Mlist.begin(), Mlist.end()); - const int nmult = Mlist.size(); - - // Compute the number of grid points in each direction - // - // Number of grid points should be a power of 2,3,5 for most efficient FFTs - for ( int ii = 0; ii < nmult; ++ii ){ - if (m_Nx <= Mlist[ii]){ - m_Nx = Mlist[ii]; - break; - } - } - for ( int ii = 0; ii < nmult; ++ii ){ - if (m_Ny <= Mlist[ii]){ - m_Ny = Mlist[ii]; - break; - } - } - for ( int ii = 0; ii < nmult; ++ii ){ - if (m_Nz <= Mlist[ii]){ - m_Nz = Mlist[ii]; - break; - } - } - - // Check that we haven't asked for too many grid points - // Max allowable by cuFFT is 512^3 - if ( m_Nx * m_Ny * m_Nz > 512*512*512 ){ - - printf("Requested Number of Fourier Nodes Exceeds Max Dimension of 512^3\n"); - printf("Mx = %i \n", m_Nx); - printf("My = %i \n", m_Ny); - printf("Mz = %i \n", m_Nz); - printf("Mx*My*Mz = %i \n", m_Nx * m_Ny * m_Nz); - printf("\n"); - printf("Note to User: Fix is to reduce xi and try again. \n"); - - exit(EXIT_FAILURE); - } - - // Maximum eigenvalue of A'*A to scale P - Scalar gamma = m_max_strain; - Scalar gamma2 = gamma*gamma; - Scalar lambda = 1.0 + gamma2/2.0 + gamma*sqrtf(1.0 + gamma2/4.0); - - // Grid spacing - m_gridh = L / make_scalar3(m_Nx,m_Ny,m_Nz); - - // Parameters for the Spectral Ewald Method (Lindbo and Tornberg, J. Comp. 
Phys., 2011) - m_gaussm = 1.0; - while ( erfcf( m_gaussm / sqrtf(2.0*lambda) ) > m_error ){ - m_gaussm = m_gaussm + 0.01; - } - m_gaussP = int( m_gaussm*m_gaussm / 3.1415926536 ) + 1; - - if (m_gaussP > m_Nx) m_gaussP = m_Nx; // Can't be supported beyond grid - if (m_gaussP > m_Ny) m_gaussP = m_Ny; - if (m_gaussP > m_Nz) m_gaussP = m_Nz; - Scalar w = m_gaussP*m_gridh.x / 2.0; // Gaussian width in simulation units - Scalar xisq = m_xi * m_xi; - m_eta = (2.0*w/m_gaussm)*(2.0*w/m_gaussm) * ( xisq ); // Gaussian splitting parameter - - // Print summary to command line output - printf("\n"); - printf("\n"); - m_exec_conf->msg->notice(2) << "--- NUFFT Hydrodynamics Statistics ---" << endl; - m_exec_conf->msg->notice(2) << "Mx: " << m_Nx << endl; - m_exec_conf->msg->notice(2) << "My: " << m_Ny << endl; - m_exec_conf->msg->notice(2) << "Mz: " << m_Nz << endl; - m_exec_conf->msg->notice(2) << "rcut: " << m_ewald_cut << endl; - m_exec_conf->msg->notice(2) << "Points per radius (x,y,z): " << m_Nx / L.x << ", " << m_Ny / L.y << ", " << m_Nz / L.z << endl; - m_exec_conf->msg->notice(2) << "--- Gaussian Spreading Parameters ---" << endl; - m_exec_conf->msg->notice(2) << "gauss_m: " << m_gaussm << endl; + // Try two Lanczos iterations to start (number of iterations will adapt as needed) + m_m_Lanczos = 2; + + // Real space cutoff + m_ewald_cut = sqrtf( - logf( m_error ) ) / m_xi; + + // Number of grid points + int kmax = int( 2.0 * sqrtf( - logf( m_error ) ) * m_xi ) + 1; + + const BoxDim& box = m_pdata->getBox(); // Only for box not changing with time. + Scalar3 L = box.getL(); + + m_Nx = int( kmax * L.x / (2.0 * 3.1415926536 ) * 2.0 ) + 1; + m_Ny = int( kmax * L.y / (2.0 * 3.1415926536 ) * 2.0 ) + 1; + m_Nz = int( kmax * L.z / (2.0 * 3.1415926536 ) * 2.0 ) + 1; + + // Get list of int values between 8 and 4096 that can be written as + // (2^a)*(3^b)*(5^c) + // Then sort list from low to high + // + // Go to such large values so as to be able simulate boxes with large + // aspect ratios + std::vector Mlist; + for ( int ii = 0; ii < 13; ++ii ){ + int pow2 = 1; + for ( int i = 0; i < ii; ++i ){ + pow2 *= 2; + } + for ( int jj = 0; jj < 8; ++jj ){ + int pow3 = 1; + for ( int j = 0; j < jj; ++j ){ + pow3 *= 3; + } + for ( int kk = 0; kk < 6; ++kk ){ + int pow5 = 1; + for ( int k = 0; k < kk; ++k ){ + pow5 *= 5; + } + int Mcurr = pow2 * pow3 * pow5; + if ( Mcurr >= 8 && Mcurr <= 4096 ){ + Mlist.push_back(Mcurr); + } + } + } + } + std::sort(Mlist.begin(), Mlist.end()); + const int nmult = Mlist.size(); + + // Compute the number of grid points in each direction + // + // Number of grid points should be a power of 2,3,5 for most efficient FFTs + for ( int ii = 0; ii < nmult; ++ii ){ + if (m_Nx <= Mlist[ii]){ + m_Nx = Mlist[ii]; + break; + } + } + for ( int ii = 0; ii < nmult; ++ii ){ + if (m_Ny <= Mlist[ii]){ + m_Ny = Mlist[ii]; + break; + } + } + for ( int ii = 0; ii < nmult; ++ii ){ + if (m_Nz <= Mlist[ii]){ + m_Nz = Mlist[ii]; + break; + } + } + + // Check that we haven't asked for too many grid points + // Max allowable by cuFFT is 512^3 + if ( m_Nx * m_Ny * m_Nz > 512*512*512 ){ + + printf("Requested Number of Fourier Nodes Exceeds Max Dimension of 512^3\n"); + printf("Mx = %i \n", m_Nx); + printf("My = %i \n", m_Ny); + printf("Mz = %i \n", m_Nz); + printf("Mx*My*Mz = %i \n", m_Nx * m_Ny * m_Nz); + printf("\n"); + printf("Note to User: Fix is to reduce xi and try again. 
\n"); + + exit(EXIT_FAILURE); + } + + // Maximum eigenvalue of A'*A to scale P + Scalar gamma = m_max_strain; + Scalar gamma2 = gamma*gamma; + Scalar lambda = 1.0 + gamma2/2.0 + gamma*sqrtf(1.0 + gamma2/4.0); + + // Grid spacing + m_gridh = L / make_scalar3(m_Nx,m_Ny,m_Nz); + + // Parameters for the Spectral Ewald Method (Lindbo and Tornberg, J. Comp. Phys., 2011) + m_gaussm = 1.0; + while ( erfcf( m_gaussm / sqrtf(2.0*lambda) ) > m_error ){ + m_gaussm = m_gaussm + 0.01; + } + m_gaussP = int( m_gaussm*m_gaussm / 3.1415926536 ) + 1; + + if (m_gaussP > m_Nx) m_gaussP = m_Nx; // Can't be supported beyond grid + if (m_gaussP > m_Ny) m_gaussP = m_Ny; + if (m_gaussP > m_Nz) m_gaussP = m_Nz; + Scalar w = m_gaussP*m_gridh.x / 2.0; // Gaussian width in simulation units + Scalar xisq = m_xi * m_xi; + m_eta = (2.0*w/m_gaussm)*(2.0*w/m_gaussm) * ( xisq ); // Gaussian splitting parameter + + // Print summary to command line output + printf("\n"); + printf("\n"); + m_exec_conf->msg->notice(2) << "--- NUFFT Hydrodynamics Statistics ---" << endl; + m_exec_conf->msg->notice(2) << "Mx: " << m_Nx << endl; + m_exec_conf->msg->notice(2) << "My: " << m_Ny << endl; + m_exec_conf->msg->notice(2) << "Mz: " << m_Nz << endl; + m_exec_conf->msg->notice(2) << "rcut: " << m_ewald_cut << endl; + m_exec_conf->msg->notice(2) << "Points per radius (x,y,z): " << m_Nx / L.x << ", " << m_Ny / L.y << ", " << m_Nz / L.z << endl; + m_exec_conf->msg->notice(2) << "--- Gaussian Spreading Parameters ---" << endl; + m_exec_conf->msg->notice(2) << "gauss_m: " << m_gaussm << endl; m_exec_conf->msg->notice(2) << "gauss_P: " << m_gaussP << endl; - m_exec_conf->msg->notice(2) << "gauss_eta: " << m_eta << endl; - m_exec_conf->msg->notice(2) << "gauss_w: " << w << endl; - m_exec_conf->msg->notice(2) << "gauss_gridh (x,y,z): " << L.x/m_Nx << ", " << L.y/m_Ny << ", " << L.z/m_Nz << endl; - printf("\n"); - printf("\n"); - - // Create plan for CUFFT on the GPU - cufftPlan3d(&plan, m_Nx, m_Ny, m_Nz, CUFFT_C2C); - - // Prepare GPUArrays for grid vectors and gridded forces - GPUArray n_gridk(m_Nx*m_Ny*m_Nz, m_exec_conf); - m_gridk.swap(n_gridk); - GPUArray n_gridX(m_Nx*m_Ny*m_Nz, m_exec_conf); - m_gridX.swap(n_gridX); - GPUArray n_gridY(m_Nx*m_Ny*m_Nz, m_exec_conf); - m_gridY.swap(n_gridY); - GPUArray n_gridZ(m_Nx*m_Ny*m_Nz, m_exec_conf); - m_gridZ.swap(n_gridZ); - - // Get list of reciprocal space vectors, and scaling factor for the wave space calculation at each grid point - ArrayHandle h_gridk(m_gridk, access_location::host, access_mode::readwrite); - for (int i = 0; i < m_Nx; i++) { - for (int j = 0; j < m_Ny; j++) { - for (int k = 0; k < m_Nz; k++) { - - // Index into grid vector storage array - int idx = i * m_Ny*m_Nz + j * m_Nz + k; - - // k goes from -N/2 to N/2 - h_gridk.data[idx].x = 2.0*3.1415926536 * ((i < ( m_Nx + 1 ) / 2) ? i : i - m_Nx) / L.x; - h_gridk.data[idx].y = 2.0*3.1415926536 * ((j < ( m_Ny + 1 ) / 2) ? j : j - m_Ny) / L.y; - h_gridk.data[idx].z = 2.0*3.1415926536 * ((k < ( m_Nz + 1 ) / 2) ? 
k : k - m_Nz) / L.z; - - // k dot k - Scalar k2 = h_gridk.data[idx].x*h_gridk.data[idx].x + h_gridk.data[idx].y*h_gridk.data[idx].y + h_gridk.data[idx].z*h_gridk.data[idx].z; - - // Scaling factor used in wave space sum - // - // Can't include k=0 term in the Ewald sum - if (i == 0 && j == 0 && k == 0){ - h_gridk.data[idx].w = 0; - } - else{ - // Have to divide by Nx*Ny*Nz to normalize the FFTs - h_gridk.data[idx].w = 6.0*3.1415926536 * (1.0 + k2/4.0/xisq) * expf( -(1-m_eta) * k2/4.0/xisq ) / ( k2 ) / Scalar( m_Nx*m_Ny*m_Nz ); - } - - } - } - } - - // Store the coefficients for the real space part of Ewald summation - // - // Will precompute scaling factors for real space component of summation for a given - // discretization to speed up GPU calculations - // - // Do calculation in double precision, then truncate and tabulate, because the - // expressions don't behave very well numerically, and double precision ensures - // it works. - m_ewald_dr = 0.001; // Distance resolution - m_ewald_n = m_ewald_cut / m_ewald_dr - 1; // Number of entries in tabulation - - double dr = 0.0010000000000000; - - // Assume all particles have radius of 1.0 + m_exec_conf->msg->notice(2) << "gauss_eta: " << m_eta << endl; + m_exec_conf->msg->notice(2) << "gauss_w: " << w << endl; + m_exec_conf->msg->notice(2) << "gauss_gridh (x,y,z): " << L.x/m_Nx << ", " << L.y/m_Ny << ", " << L.z/m_Nz << endl; + printf("\n"); + printf("\n"); + + // Create plan for CUFFT on the GPU + cufftPlan3d(&plan, m_Nx, m_Ny, m_Nz, CUFFT_C2C); + + // Prepare GPUArrays for grid vectors and gridded forces + GPUArray n_gridk(m_Nx*m_Ny*m_Nz, m_exec_conf); + m_gridk.swap(n_gridk); + GPUArray n_gridX(m_Nx*m_Ny*m_Nz, m_exec_conf); + m_gridX.swap(n_gridX); + GPUArray n_gridY(m_Nx*m_Ny*m_Nz, m_exec_conf); + m_gridY.swap(n_gridY); + GPUArray n_gridZ(m_Nx*m_Ny*m_Nz, m_exec_conf); + m_gridZ.swap(n_gridZ); + + // Get list of reciprocal space vectors, and scaling factor for the wave space calculation at each grid point + ArrayHandle h_gridk(m_gridk, access_location::host, access_mode::readwrite); + for (int i = 0; i < m_Nx; i++) { + for (int j = 0; j < m_Ny; j++) { + for (int k = 0; k < m_Nz; k++) { + + // Index into grid vector storage array + int idx = i * m_Ny*m_Nz + j * m_Nz + k; + + // k goes from -N/2 to N/2 + h_gridk.data[idx].x = 2.0*3.1415926536 * ((i < ( m_Nx + 1 ) / 2) ? i : i - m_Nx) / L.x; + h_gridk.data[idx].y = 2.0*3.1415926536 * ((j < ( m_Ny + 1 ) / 2) ? j : j - m_Ny) / L.y; + h_gridk.data[idx].z = 2.0*3.1415926536 * ((k < ( m_Nz + 1 ) / 2) ? k : k - m_Nz) / L.z; + + // k dot k + Scalar k2 = h_gridk.data[idx].x*h_gridk.data[idx].x + h_gridk.data[idx].y*h_gridk.data[idx].y + h_gridk.data[idx].z*h_gridk.data[idx].z; + + // Scaling factor used in wave space sum + // + // Can't include k=0 term in the Ewald sum + if (i == 0 && j == 0 && k == 0){ + h_gridk.data[idx].w = 0; + } + else{ + // Have to divide by Nx*Ny*Nz to normalize the FFTs + h_gridk.data[idx].w = 6.0*3.1415926536 * (1.0 + k2/4.0/xisq) * expf( -(1-m_eta) * k2/4.0/xisq ) / ( k2 ) / Scalar( m_Nx*m_Ny*m_Nz ); + } + + } + } + } + + // Store the coefficients for the real space part of Ewald summation + // + // Will precompute scaling factors for real space component of summation for a given + // discretization to speed up GPU calculations + // + // Do calculation in double precision, then truncate and tabulate, because the + // expressions don't behave very well numerically, and double precision ensures + // it works. 
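The tabulation described in the comment above amounts to sampling the two real-space mobility scalars (UF1 and UF2) on a uniform grid of separations r = dr, 2*dr, ..., out to the real-space cutoff, and linearly interpolating between neighboring entries at run time; each Scalar4 entry also carries the values at r + dr so that a single fetch returns both endpoints of the interpolation. A minimal host-side Python sketch of that idea follows (illustrative only: uf1_exact and uf2_exact are placeholders for the double-precision expressions evaluated below, and the plugin itself stores the table as Scalar4 data on the GPU):

    DR = 0.001  # tabulation resolution, matching m_ewald_dr

    def build_table(uf1_exact, uf2_exact, r_cut, dr=DR):
        # one entry per separation r = dr, 2*dr, ..., out to the cutoff
        n = int(r_cut / dr)
        return [(uf1_exact((k + 1) * dr), uf2_exact((k + 1) * dr)) for k in range(n)]

    def lookup(table, r, dr=DR):
        # entry just below r, then linear interpolation toward the next entry
        k = max(0, min(int(r / dr) - 1, len(table) - 2))
        frac = r / dr - (k + 1)
        uf1 = (1.0 - frac) * table[k][0] + frac * table[k + 1][0]
        uf2 = (1.0 - frac) * table[k][1] + frac * table[k + 1][1]
        return uf1, uf2
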
+ m_ewald_dr = 0.001; // Distance resolution + m_ewald_n = m_ewald_cut / m_ewald_dr - 1; // Number of entries in tabulation + + double dr = 0.0010000000000000; + + // Assume all particles have radius of 1.0 Scalar pi12 = 1.77245385091; Scalar aa = 1.0; - Scalar axi = aa * m_xi; - Scalar axi2 = axi * axi; + Scalar axi = aa * m_xi; + Scalar axi2 = axi * axi; m_self = (1. + 4.*pi12*axi*erfc(2.*axi) - exp(-4.*axi2))/(4.*pi12*axi*aa); - // Allocate storage for real space Ewald table - int nR = m_ewald_n + 1; // number of entries in ewald table - GPUArray n_ewaldC1( nR, m_exec_conf); - m_ewaldC1.swap(n_ewaldC1); - ArrayHandle h_ewaldC1(m_ewaldC1, access_location::host, access_mode::readwrite); + // Allocate storage for real space Ewald table + int nR = m_ewald_n + 1; // number of entries in ewald table + GPUArray n_ewaldC1( nR, m_exec_conf); + m_ewaldC1.swap(n_ewaldC1); + ArrayHandle h_ewaldC1(m_ewaldC1, access_location::host, access_mode::readwrite); - // Functions are complicated so calculation should be done in double precision, then truncated to single precision - // in order to ensure accurate evaluation - double xi = m_xi; - double Pi = 3.141592653589793; - double a = aa; + // Functions are complicated so calculation should be done in double precision, then truncated to single precision + // in order to ensure accurate evaluation + double xi = m_xi; + double Pi = 3.141592653589793; + double a = aa; - // Fill tables - for ( int kk = 0; kk < nR; kk++ ) - { + // Fill tables + for ( int kk = 0; kk < nR; kk++ ) + { - // Initialize entries - h_ewaldC1.data[ kk ].x = 0.0; // UF1 at r - h_ewaldC1.data[ kk ].y = 0.0; // UF2 at r - h_ewaldC1.data[ kk ].z = 0.0; // UF1 at r + dr - h_ewaldC1.data[ kk ].w = 0.0; // UF2 at r + dr + // Initialize entries + h_ewaldC1.data[ kk ].x = 0.0; // UF1 at r + h_ewaldC1.data[ kk ].y = 0.0; // UF2 at r + h_ewaldC1.data[ kk ].z = 0.0; // UF1 at r + dr + h_ewaldC1.data[ kk ].w = 0.0; // UF2 at r + dr - // Distance for current entry - double r = double( kk ) * dr + dr; - double Imrr = 0, rr = 0; + // Distance for current entry + double r = double( kk ) * dr + dr; + double Imrr = 0, rr = 0; - // Expression have been simplified assuming no overlap, touching, and overlap - if ( r > 2.0*a ){ + // Expression have been simplified assuming no overlap, touching, and overlap + if ( r > 2.0*a ){ - Imrr = -pow(a,-1) + (pow(a,2)*pow(r,-3))/2. + (3*pow(r,-1))/4. + (3*erfc(r*xi)*pow(a,-2)*pow(r,-3)*(-12*pow(r,4) + pow(xi,-4)))/128. + + Imrr = -pow(a,-1) + (pow(a,2)*pow(r,-3))/2. + (3*pow(r,-1))/4. + (3*erfc(r*xi)*pow(a,-2)*pow(r,-3)*(-12*pow(r,4) + pow(xi,-4)))/128. + pow(a,-2)*((9*r)/32. - (3*pow(r,-3)*pow(xi,-4))/128.) + (erfc((2*a + r)*xi)*(128*pow(a,-1) + 64*pow(a,2)*pow(r,-3) + 96*pow(r,-1) + pow(a,-2)*(36*r - 3*pow(r,-3)*pow(xi,-4))))/256. + (erfc(2*a*xi - r*xi)*(128*pow(a,-1) - 64*pow(a,2)*pow(r,-3) - 96*pow(r,-1) + pow(a,-2)*(-36*r + 3*pow(r,-3)*pow(xi,-4))))/ @@ -357,7 +357,7 @@ void Stokes::setParams() (exp(-(pow(-2*a + r,2)*pow(xi,2)))*pow(a,-2)*pow(Pi,-0.5)*pow(r,-3)*pow(xi,-3)* (8*r*pow(a,2)*pow(xi,2) + 16*pow(a,3)*pow(xi,2) + a*(-2 + 28*pow(r,2)*pow(xi,2)) - 3*(r + 6*pow(r,3)*pow(xi,2))))/128.; - rr = -pow(a,-1) - pow(a,2)*pow(r,-3) + (3*pow(r,-1))/2. + (3*pow(a,-2)*pow(r,-3)*(4*pow(r,4) + pow(xi,-4)))/64. + + rr = -pow(a,-1) - pow(a,2)*pow(r,-3) + (3*pow(r,-1))/2. + (3*pow(a,-2)*pow(r,-3)*(4*pow(r,4) + pow(xi,-4)))/64. + (erfc(2*a*xi - r*xi)*(64*pow(a,-1) + 64*pow(a,2)*pow(r,-3) - 96*pow(r,-1) + pow(a,-2)*(-12*r - 3*pow(r,-3)*pow(xi,-4))))/128. 
+ (erfc((2*a + r)*xi)*(64*pow(a,-1) - 64*pow(a,2)*pow(r,-3) + 96*pow(r,-1) + pow(a,-2)*(12*r + 3*pow(r,-3)*pow(xi,-4))))/128. + (3*exp(-(pow(r,2)*pow(xi,2)))*pow(a,-2)*pow(Pi,-0.5)*pow(r,-2)*pow(xi,-3)*(-1 + 2*pow(r,2)*pow(xi,2)))/32. - @@ -367,23 +367,23 @@ void Stokes::setParams() (-1 + 8*a*r*pow(xi,2) + 8*pow(a,2)*pow(xi,2) + 2*pow(r,2)*pow(xi,2)))/64. - (3*erfc(r*xi)*pow(a,-2)*pow(r,-3)*pow(xi,-4)*(1 + 4*pow(r,4)*pow(xi,4)))/64.; - } - else if ( r == 2.0*a ){ + } + else if ( r == 2.0*a ){ - Imrr = -(pow(a,-5)*(3 + 16*a*xi*pow(Pi,-0.5))*pow(xi,-4))/2048. + (3*erfc(2*a*xi)*pow(a,-5)*(-192*pow(a,4) + pow(xi,-4)))/1024. + + Imrr = -(pow(a,-5)*(3 + 16*a*xi*pow(Pi,-0.5))*pow(xi,-4))/2048. + (3*erfc(2*a*xi)*pow(a,-5)*(-192*pow(a,4) + pow(xi,-4)))/1024. + erfc(4*a*xi)*(pow(a,-1) - (3*pow(a,-5)*pow(xi,-4))/2048.) + (exp(-16*pow(a,2)*pow(xi,2))*pow(a,-4)*pow(Pi,-0.5)*pow(xi,-3)*(-1 - 64*pow(a,2)*pow(xi,2)))/256. + (3*exp(-4*pow(a,2)*pow(xi,2))*pow(a,-4)*pow(Pi,-0.5)*pow(xi,-3)*(1 + 24*pow(a,2)*pow(xi,2)))/256.; - rr = (pow(a,-5)*(3 + 16*a*xi*pow(Pi,-0.5))*pow(xi,-4))/1024. + erfc(2*a*xi)*((-3*pow(a,-1))/8. - (3*pow(a,-5)*pow(xi,-4))/512.) + + rr = (pow(a,-5)*(3 + 16*a*xi*pow(Pi,-0.5))*pow(xi,-4))/1024. + erfc(2*a*xi)*((-3*pow(a,-1))/8. - (3*pow(a,-5)*pow(xi,-4))/512.) + erfc(4*a*xi)*(pow(a,-1) + (3*pow(a,-5)*pow(xi,-4))/1024.) + (exp(-16*pow(a,2)*pow(xi,2))*pow(a,-4)*pow(Pi,-0.5)*pow(xi,-3)*(1 - 32*pow(a,2)*pow(xi,2)))/128. + (3*exp(-4*pow(a,2)*pow(xi,2))*pow(a,-4)*pow(Pi,-0.5)*pow(xi,-3)*(-1 + 8*pow(a,2)*pow(xi,2)))/128.; - } - else if ( r < 2*a){ + } + else if ( r < 2*a){ - Imrr = (-9*r*pow(a,-2))/32. + pow(a,-1) - (pow(a,2)*pow(r,-3))/2. - (3*pow(r,-1))/4. + + Imrr = (-9*r*pow(a,-2))/32. + pow(a,-1) - (pow(a,2)*pow(r,-3))/2. - (3*pow(r,-1))/4. + (3*erfc(r*xi)*pow(a,-2)*pow(r,-3)*(-12*pow(r,4) + pow(xi,-4)))/128. + (erfc((-2*a + r)*xi)*(-128*pow(a,-1) + 64*pow(a,2)*pow(r,-3) + 96*pow(r,-1) + pow(a,-2)*(36*r - 3*pow(r,-3)*pow(xi,-4))))/ 256. + (erfc((2*a + r)*xi)*(128*pow(a,-1) + 64*pow(a,2)*pow(r,-3) + 96*pow(r,-1) + pow(a,-2)*(36*r - 3*pow(r,-3)*pow(xi,-4))))/ @@ -393,7 +393,7 @@ void Stokes::setParams() (exp(-(pow(-2*a + r,2)*pow(xi,2)))*pow(a,-2)*pow(Pi,-0.5)*pow(r,-3)*pow(xi,-3)* (8*r*pow(a,2)*pow(xi,2) + 16*pow(a,3)*pow(xi,2) + a*(-2 + 28*pow(r,2)*pow(xi,2)) - 3*(r + 6*pow(r,3)*pow(xi,2))))/128.; - rr = ((2*a + 3*r)*pow(a,-2)*pow(2*a - r,3)*pow(r,-3))/16. + + rr = ((2*a + 3*r)*pow(a,-2)*pow(2*a - r,3)*pow(r,-3))/16. + (erfc((-2*a + r)*xi)*(-64*pow(a,-1) - 64*pow(a,2)*pow(r,-3) + 96*pow(r,-1) + pow(a,-2)*(12*r + 3*pow(r,-3)*pow(xi,-4))))/128. + (erfc((2*a + r)*xi)*(64*pow(a,-1) - 64*pow(a,2)*pow(r,-3) + 96*pow(r,-1) + pow(a,-2)*(12*r + 3*pow(r,-3)*pow(xi,-4))))/128. + (3*exp(-(pow(r,2)*pow(xi,2)))*pow(a,-2)*pow(Pi,-0.5)*pow(r,-2)*pow(xi,-3)*(-1 + 2*pow(r,2)*pow(xi,2)))/32. - @@ -403,23 +403,23 @@ void Stokes::setParams() (-1 + 8*a*r*pow(xi,2) + 8*pow(a,2)*pow(xi,2) + 2*pow(r,2)*pow(xi,2)))/64. 
- (3*erfc(r*xi)*pow(a,-2)*pow(r,-3)*pow(xi,-4)*(1 + 4*pow(r,4)*pow(xi,4)))/64.; - } + } - // Save values to table - h_ewaldC1.data[ kk ].x = Scalar( Imrr ); // UF1 - h_ewaldC1.data[ kk ].y = Scalar( rr ); // UF2 + // Save values to table + h_ewaldC1.data[ kk ].x = Scalar( Imrr ); // UF1 + h_ewaldC1.data[ kk ].y = Scalar( rr ); // UF2 - } // kk loop over distances + } // kk loop over distances - // Both pieces of UF data for faster interpolation (r and r+dr stored in same Scalar4) - for ( int kk = 0; kk < (nR-1); kk++ ){ + // Both pieces of UF data for faster interpolation (r and r+dr stored in same Scalar4) + for ( int kk = 0; kk < (nR-1); kk++ ){ - int offset1 = kk; - int offset2 = (kk+1); + int offset1 = kk; + int offset2 = (kk+1); - h_ewaldC1.data[ offset1 ].z = h_ewaldC1.data[ offset2 ].x; - h_ewaldC1.data[ offset1 ].w = h_ewaldC1.data[ offset2 ].y; - } + h_ewaldC1.data[ offset1 ].z = h_ewaldC1.data[ offset2 ].x; + h_ewaldC1.data[ offset1 ].w = h_ewaldC1.data[ offset2 ].y; + } } @@ -429,96 +429,96 @@ void Stokes::setParams() void Stokes::integrateStepOne(unsigned int timestep) { - // Recompute neighborlist ( if needed ) - m_nlist->compute(timestep); + // Recompute neighborlist ( if needed ) + m_nlist->compute(timestep); - // access the neighbor list - ArrayHandle d_n_neigh(m_nlist->getNNeighArray(), access_location::device, access_mode::read); - ArrayHandle d_nlist(m_nlist->getNListArray(), access_location::device, access_mode::read); - ArrayHandle d_headlist(m_nlist->getHeadList(), access_location::device, access_mode::read); + // access the neighbor list + ArrayHandle d_n_neigh(m_nlist->getNNeighArray(), access_location::device, access_mode::read); + ArrayHandle d_nlist(m_nlist->getNListArray(), access_location::device, access_mode::read); + ArrayHandle d_headlist(m_nlist->getHeadList(), access_location::device, access_mode::read); - // Consistency check - unsigned int group_size = m_group->getNumMembers(); - assert(group_size <= m_pdata->getN()); - if (group_size == 0) - return; + // Consistency check + unsigned int group_size = m_group->getNumMembers(); + assert(group_size <= m_pdata->getN()); + if (group_size == 0) + return; - // Get particle forces - const GPUArray< Scalar4 >& net_force = m_pdata->getNetForce(); + // Get particle forces + const GPUArray< Scalar4 >& net_force = m_pdata->getNetForce(); - // profile this step - if (m_prof) - m_prof->push(m_exec_conf, "Stokes step 1 (no step 2)"); + // profile this step + if (m_prof) + m_prof->push(m_exec_conf, "Stokes step 1 (no step 2)"); - // Access all the needed data for the calculation - ArrayHandle d_pos(m_pdata->getPositions(), access_location::device, access_mode::readwrite); - ArrayHandle d_vel(m_pdata->getVelocities(), access_location::device, access_mode::readwrite); - ArrayHandle d_accel(m_pdata->getAccelerations(), access_location::device, access_mode::readwrite); - ArrayHandle d_net_force(net_force, access_location::device, access_mode::read); - ArrayHandle d_image(m_pdata->getImages(), access_location::device, access_mode::readwrite); + // Access all the needed data for the calculation + ArrayHandle d_pos(m_pdata->getPositions(), access_location::device, access_mode::readwrite); + ArrayHandle d_vel(m_pdata->getVelocities(), access_location::device, access_mode::readwrite); + ArrayHandle d_accel(m_pdata->getAccelerations(), access_location::device, access_mode::readwrite); + ArrayHandle d_net_force(net_force, access_location::device, access_mode::read); + ArrayHandle d_image(m_pdata->getImages(), 
access_location::device, access_mode::readwrite); - BoxDim box = m_pdata->getBox(); - ArrayHandle< unsigned int > d_index_array(m_group->getIndexArray(), access_location::device, access_mode::read); + BoxDim box = m_pdata->getBox(); + ArrayHandle< unsigned int > d_index_array(m_group->getIndexArray(), access_location::device, access_mode::read); - // Grid vectors - ArrayHandle d_gridk(m_gridk, access_location::device, access_mode::readwrite); - ArrayHandle d_gridX(m_gridX, access_location::device, access_mode::readwrite); - ArrayHandle d_gridY(m_gridY, access_location::device, access_mode::readwrite); - ArrayHandle d_gridZ(m_gridZ, access_location::device, access_mode::readwrite); + // Grid vectors + ArrayHandle d_gridk(m_gridk, access_location::device, access_mode::readwrite); + ArrayHandle d_gridX(m_gridX, access_location::device, access_mode::readwrite); + ArrayHandle d_gridY(m_gridY, access_location::device, access_mode::readwrite); + ArrayHandle d_gridZ(m_gridZ, access_location::device, access_mode::readwrite); - // Real space interaction tabulation - ArrayHandle d_ewaldC1(m_ewaldC1, access_location::device, access_mode::read); + // Real space interaction tabulation + ArrayHandle d_ewaldC1(m_ewaldC1, access_location::device, access_mode::read); // Calculate the shear rate of the current timestep Scalar current_shear_rate = m_shear_func -> getShearRate(timestep); - // perform the update on the GPU - gpu_stokes_step_one( - d_pos.data, - d_vel.data, - d_accel.data, - d_image.data, - d_index_array.data, - group_size, - box, - m_deltaT, - 256, - d_net_force.data, - m_T->getValue(timestep), - timestep, - m_seed, - m_xi, - m_eta, - m_ewald_cut, - m_ewald_dr, - m_ewald_n, - d_ewaldC1.data, - m_self, - d_gridk.data, - d_gridX.data, - d_gridY.data, - d_gridZ.data, - plan, - m_Nx, - m_Ny, - m_Nz, - d_n_neigh.data, - d_nlist.data, - d_headlist.data, - m_m_Lanczos, - m_pdata->getN(), - m_gaussP, - m_gridh, - m_error, - current_shear_rate - ); - - if (m_exec_conf->isCUDAErrorCheckingEnabled()) - CHECK_CUDA_ERROR(); - - // done profiling - if (m_prof) - m_prof->pop(m_exec_conf); + // perform the update on the GPU + gpu_stokes_step_one( + d_pos.data, + d_vel.data, + d_accel.data, + d_image.data, + d_index_array.data, + group_size, + box, + m_deltaT, + 256, + d_net_force.data, + m_T->getValue(timestep), + timestep, + m_seed, + m_xi, + m_eta, + m_ewald_cut, + m_ewald_dr, + m_ewald_n, + d_ewaldC1.data, + m_self, + d_gridk.data, + d_gridX.data, + d_gridY.data, + d_gridZ.data, + plan, + m_Nx, + m_Ny, + m_Nz, + d_n_neigh.data, + d_nlist.data, + d_headlist.data, + m_m_Lanczos, + m_pdata->getN(), + m_gaussP, + m_gridh, + m_error, + current_shear_rate + ); + + if (m_exec_conf->isCUDAErrorCheckingEnabled()) + CHECK_CUDA_ERROR(); + + // done profiling + if (m_prof) + m_prof->pop(m_exec_conf); } @@ -532,9 +532,9 @@ void Stokes::integrateStepTwo(unsigned int timestep) void export_Stokes(pybind11::module& m) { pybind11::class_ > (m, "Stokes", pybind11::base()) - .def(pybind11::init< std::shared_ptr, std::shared_ptr, std::shared_ptr, unsigned int, std::shared_ptr, Scalar, Scalar >()) - .def("setT", &Stokes::setT) - .def("setParams", &Stokes::setParams) + .def(pybind11::init< std::shared_ptr, std::shared_ptr, std::shared_ptr, unsigned int, std::shared_ptr, Scalar, Scalar >()) + .def("setT", &Stokes::setT) + .def("setParams", &Stokes::setParams) .def("setShear", &Stokes::setShear) ; } diff --git a/PSEv1/Stokes.cu b/PSEv1/Stokes.cu index d86d767..dca1665 100644 --- a/PSEv1/Stokes.cu +++ b/PSEv1/Stokes.cu @@ -136,17 
+136,17 @@ scalar4_tex_t pos_tex; */ extern "C" __global__ void gpu_stokes_step_one_kernel( - Scalar4 *d_pos, - Scalar4 *d_vel, - Scalar3 *d_accel, - int3 *d_image, - unsigned int *d_group_members, - unsigned int group_size, - BoxDim box, - Scalar deltaT, - Scalar4 *d_net_force, - Scalar shear_rate - ){ + Scalar4 *d_pos, + Scalar4 *d_vel, + Scalar3 *d_accel, + int3 *d_image, + unsigned int *d_group_members, + unsigned int group_size, + BoxDim box, + Scalar deltaT, + Scalar4 *d_net_force, + Scalar shear_rate + ){ // determine which particle this thread works on (MEM TRANSFER: 4 bytes) int group_idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -161,13 +161,13 @@ void gpu_stokes_step_one_kernel( // read the particle's velocity and acceleration (MEM TRANSFER: 32 bytes) Scalar4 velmass = d_vel[idx]; - Scalar mass = velmass.w; + Scalar mass = velmass.w; Scalar3 vel = make_scalar3(velmass.x, velmass.y, velmass.z); - // Add the shear + // Add the shear vel.x += shear_rate * pos.y; - Scalar4 net_force = d_net_force[idx]; + Scalar4 net_force = d_net_force[idx]; Scalar3 accel = make_scalar3(net_force.x, net_force.y, net_force.z); // update the position @@ -176,7 +176,7 @@ void gpu_stokes_step_one_kernel( // FLOPS: 3 pos += dx; - accel = accel/mass; + accel = accel/mass; // read in the particle's image (MEM TRANSFER: 16 bytes) int3 image = d_image[idx]; @@ -185,7 +185,7 @@ void gpu_stokes_step_one_kernel( box.wrap(pos, image); // write out the results (MEM_TRANSFER: 48 bytes) - d_accel[idx] = accel; + d_accel[idx] = accel; d_pos[idx] = make_scalar4(pos.x, pos.y, pos.z, postype.w); d_image[idx] = image; } @@ -216,7 +216,7 @@ void gpu_stokes_step_one_kernel( \param d_gridY y-component of force moment projection onto the grid \param d_gridZ z-component of force moment projection onto the grid \param plan cudaFFT plan - \param Nx number of grid nodes in the x-direction + \param Nx number of grid nodes in the x-direction \param Ny number of grid nodes in the y-direction \param Nz number of grid nodes in the z-direction \param d_n_neigh Number of neighbors for every particle @@ -232,134 +232,134 @@ void gpu_stokes_step_one_kernel( \param cheb_error error tolerance in chebyshev approximation */ cudaError_t gpu_stokes_step_one( - Scalar4 *d_pos, - Scalar4 *d_vel, - Scalar3 *d_accel, - int3 *d_image, - unsigned int *d_group_members, - unsigned int group_size, - const BoxDim& box, - Scalar dt, - unsigned int block_size, - Scalar4 *d_net_force, - const Scalar T, - const unsigned int timestep, - const unsigned int seed, - Scalar xi, - Scalar eta, - Scalar ewald_cut, - Scalar ewald_dr, - int ewald_n, - Scalar4 *d_ewaldC1, - Scalar self, - Scalar4 *d_gridk, - CUFFTCOMPLEX *d_gridX, - CUFFTCOMPLEX *d_gridY, - CUFFTCOMPLEX *d_gridZ, - cufftHandle plan, - const int Nx, - const int Ny, - const int Nz, - const unsigned int *d_n_neigh, - const unsigned int *d_nlist, - const unsigned int *d_headlist, - int& m_Lanczos, - const unsigned int N_total, - const int P, - Scalar3 gridh, - Scalar cheb_error, - Scalar shear_rate - ){ - - // Total number of grid points - unsigned int NxNyNz = Nx*Ny*Nz; - - // setup the grid to run the kernel - // block for particle calculation - dim3 grid( (group_size/block_size) + 1, 1, 1); - dim3 threads(block_size, 1, 1); - - // block for grid calculation - int gridBlockSize = ( NxNyNz > block_size ) ? 
block_size : NxNyNz; - int gridNBlock = ( NxNyNz + gridBlockSize - 1 ) / gridBlockSize ; - - // Get the textured tables for real space Ewald sum tabulation - tables1_tex.normalized = false; // Not normalized - tables1_tex.filterMode = cudaFilterModeLinear; // Filter mode: floor of the index - // One dimension, Read mode: ElementType(Get what we write) - cudaBindTexture(0, tables1_tex, d_ewaldC1, sizeof(Scalar4) * (ewald_n+1)); // This was a bug in former versions! - - // Same for the positions and forces - pos_tex.normalized = false; // Not normalized - pos_tex.filterMode = cudaFilterModePoint; // Filter mode: floor of the index - cudaBindTexture(0, pos_tex, d_pos, sizeof(Scalar4) * N_total); - - // Get sheared grid vectors - gpu_stokes_SetGridk_kernel<<>>(d_gridk,Nx,Ny,Nz,NxNyNz,box,xi,eta); - - // Do Mobility and Brownian Calculations (compute the velocity from the forces) - gpu_stokes_CombinedMobilityBrownian_wrap( - d_pos, - d_net_force, - d_group_members, - group_size, - box, - dt, - d_vel, // output - T, - timestep, - seed, - xi, - eta, - P, - ewald_cut, - ewald_dr, - ewald_n, - d_ewaldC1, - d_gridk, - d_gridX, - d_gridY, - d_gridZ, - plan, - Nx, - Ny, - Nz, - d_n_neigh, - d_nlist, - d_headlist, - m_Lanczos, - N_total, - NxNyNz, - grid, - threads, - gridBlockSize, - gridNBlock, - gridh, - cheb_error, - self ); - - - // Use forward Euler integration to move the particles according the velocity - // computed from the Mobility and Brownian calculations - gpu_stokes_step_one_kernel<<< grid, threads >>>( - d_pos, - d_vel, - d_accel, - d_image, - d_group_members, - group_size, - box, - dt, - d_net_force, - shear_rate - ); - - // Quick error check - gpuErrchk(cudaPeekAtLastError()); - - // Cleanup - cudaUnbindTexture(tables1_tex); - cudaUnbindTexture(pos_tex); - - return cudaSuccess; + Scalar4 *d_pos, + Scalar4 *d_vel, + Scalar3 *d_accel, + int3 *d_image, + unsigned int *d_group_members, + unsigned int group_size, + const BoxDim& box, + Scalar dt, + unsigned int block_size, + Scalar4 *d_net_force, + const Scalar T, + const unsigned int timestep, + const unsigned int seed, + Scalar xi, + Scalar eta, + Scalar ewald_cut, + Scalar ewald_dr, + int ewald_n, + Scalar4 *d_ewaldC1, + Scalar self, + Scalar4 *d_gridk, + CUFFTCOMPLEX *d_gridX, + CUFFTCOMPLEX *d_gridY, + CUFFTCOMPLEX *d_gridZ, + cufftHandle plan, + const int Nx, + const int Ny, + const int Nz, + const unsigned int *d_n_neigh, + const unsigned int *d_nlist, + const unsigned int *d_headlist, + int& m_Lanczos, + const unsigned int N_total, + const int P, + Scalar3 gridh, + Scalar cheb_error, + Scalar shear_rate + ){ + + // Total number of grid points + unsigned int NxNyNz = Nx*Ny*Nz; + + // setup the grid to run the kernel + // block for particle calculation + dim3 grid( (group_size/block_size) + 1, 1, 1); + dim3 threads(block_size, 1, 1); + + // block for grid calculation + int gridBlockSize = ( NxNyNz > block_size ) ? block_size : NxNyNz; + int gridNBlock = ( NxNyNz + gridBlockSize - 1 ) / gridBlockSize ; + + // Get the textured tables for real space Ewald sum tabulation + tables1_tex.normalized = false; // Not normalized + tables1_tex.filterMode = cudaFilterModeLinear; // Filter mode: floor of the index + // One dimension, Read mode: ElementType(Get what we write) + cudaBindTexture(0, tables1_tex, d_ewaldC1, sizeof(Scalar4) * (ewald_n+1)); // This was a bug in former versions! 
+ + // Same for the positions and forces + pos_tex.normalized = false; // Not normalized + pos_tex.filterMode = cudaFilterModePoint; // Filter mode: floor of the index + cudaBindTexture(0, pos_tex, d_pos, sizeof(Scalar4) * N_total); + + // Get sheared grid vectors + gpu_stokes_SetGridk_kernel<<>>(d_gridk,Nx,Ny,Nz,NxNyNz,box,xi,eta); + + // Do Mobility and Brownian Calculations (compute the velocity from the forces) + gpu_stokes_CombinedMobilityBrownian_wrap( + d_pos, + d_net_force, + d_group_members, + group_size, + box, + dt, + d_vel, // output + T, + timestep, + seed, + xi, + eta, + P, + ewald_cut, + ewald_dr, + ewald_n, + d_ewaldC1, + d_gridk, + d_gridX, + d_gridY, + d_gridZ, + plan, + Nx, + Ny, + Nz, + d_n_neigh, + d_nlist, + d_headlist, + m_Lanczos, + N_total, + NxNyNz, + grid, + threads, + gridBlockSize, + gridNBlock, + gridh, + cheb_error, + self ); + + + // Use forward Euler integration to move the particles according the velocity + // computed from the Mobility and Brownian calculations + gpu_stokes_step_one_kernel<<< grid, threads >>>( + d_pos, + d_vel, + d_accel, + d_image, + d_group_members, + group_size, + box, + dt, + d_net_force, + shear_rate + ); + + // Quick error check + gpuErrchk(cudaPeekAtLastError()); + + // Cleanup + cudaUnbindTexture(tables1_tex); + cudaUnbindTexture(pos_tex); + + return cudaSuccess; } diff --git a/PSEv1/Stokes.cuh b/PSEv1/Stokes.cuh index db2d304..73f1713 100644 --- a/PSEv1/Stokes.cuh +++ b/PSEv1/Stokes.cuh @@ -81,34 +81,34 @@ cudaError_t gpu_stokes_step_one(Scalar4 *d_pos, const BoxDim& box, Scalar deltaT, unsigned int block_size, - Scalar4 *d_net_force, - const Scalar T, - const unsigned int timestep, - const unsigned int seed, - Scalar xi, - Scalar eta, - Scalar ewald_cut, - Scalar ewald_dr, - int ewald_n, - Scalar4 *d_ewald1, - Scalar self, - Scalar4 *d_gridk, - CUFFTCOMPLEX *d_gridX, - CUFFTCOMPLEX *d_gridY, - CUFFTCOMPLEX *d_gridZ, - cufftHandle plan, - const int Nx, - const int Ny, - const int Nz, - const unsigned int *d_n_neigh, + Scalar4 *d_net_force, + const Scalar T, + const unsigned int timestep, + const unsigned int seed, + Scalar xi, + Scalar eta, + Scalar ewald_cut, + Scalar ewald_dr, + int ewald_n, + Scalar4 *d_ewald1, + Scalar self, + Scalar4 *d_gridk, + CUFFTCOMPLEX *d_gridX, + CUFFTCOMPLEX *d_gridY, + CUFFTCOMPLEX *d_gridZ, + cufftHandle plan, + const int Nx, + const int Ny, + const int Nz, + const unsigned int *d_n_neigh, const unsigned int *d_nlist, const unsigned int *d_headlist, - int& m_Lanczos, - const unsigned int N_total, - const int P, - Scalar3 gridh, - Scalar cheb_error, - Scalar current_shear_rate); + int& m_Lanczos, + const unsigned int N_total, + const int P, + Scalar3 gridh, + Scalar cheb_error, + Scalar current_shear_rate); #endif diff --git a/PSEv1/integrate.py b/PSEv1/integrate.py index 7a959bb..e5a173c 100644 --- a/PSEv1/integrate.py +++ b/PSEv1/integrate.py @@ -20,9 +20,9 @@ class PSEv1(hoomd.md.integrate._integration_method): # \param seed Random seed to use for the run. Simulations that are identical, except for the seed, will follow # different trajectories. # \param xi Ewald splitting parameter - # \param error Relative error for all calculations - # \param function_form Functional form for shear - # \param max_strain Maximum box deformation for shear + # \param error Relative error for all calculations + # \param function_form Functional form for shear + # \param max_strain Maximum box deformation for shear # # # T can be a variant type, allowing for temperature ramps in simulation runs. 
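Combined with the constructor signature shown in the next hunk, a typical script fragment selecting this integration method could look like the following (hypothetical usage sketch: the import path and the mode_standard call are assumptions based on standard HOOMD 2.x conventions and are not part of this diff):

    import hoomd
    import hoomd.md
    from PSEv1 import integrate

    hoomd.context.initialize()
    # ... build or read in a system here ...
    all_particles = hoomd.group.all()

    hoomd.md.integrate.mode_standard(dt=1e-3)
    pse = integrate.PSEv1(group=all_particles, T=1.0, seed=1,
                          xi=0.5, error=1e-3, max_strain=0.5,
                          nlist_type="cell")
    hoomd.run(1000)
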
@@ -31,7 +31,7 @@ class PSEv1(hoomd.md.integrate._integration_method): def __init__(self, group, T, seed=0, xi = 0.5, error = 0.001, function_form = None, max_strain = 0.5, nlist_type = "cell" ): - # Print the status of the initialization + # Print the status of the initialization hoomd.util.print_status_line(); # initialize base class @@ -52,9 +52,9 @@ def __init__(self, group, T, seed=0, xi = 0.5, error = 0.001, function_form = No hoomd.context.msg.error("Sorry, we have not written CPU code for PSE RPY simulation. \n"); raise RuntimeError('Error creating Stokes'); else: - - # Create a neighborlist exclusively for real space interactions. Use cell lists by - # default, but also allow the user to specify + + # Create a neighborlist exclusively for real space interactions. Use cell lists by + # default, but also allow the user to specify if ( nlist_type.upper() == "CELL" ): cl_stokes = _hoomd.CellListGPU(hoomd.context.current.system_definition); diff --git a/PSEv1/module.cc b/PSEv1/module.cc index 32a4940..775d32b 100644 --- a/PSEv1/module.cc +++ b/PSEv1/module.cc @@ -13,7 +13,7 @@ PYBIND11_MODULE(_PSEv1, m) { #ifdef ENABLE_CUDA - export_Stokes(m); + export_Stokes(m); #endif export_ShearFunction(m); export_ShearFunctionWrap(m); diff --git a/PSEv1/variant.py b/PSEv1/variant.py index a73541e..5dafa38 100644 --- a/PSEv1/variant.py +++ b/PSEv1/variant.py @@ -23,7 +23,7 @@ def __init__(self, function_form, total_timestep, max_strain = 0.5): # initialize the base class _variant.__init__(self) - # check total_timestep is positive + # check total_timestep is positive if total_timestep <= 0: hoomd.context.msg.error("Cannot create a shear_variant with 0 or negative points\n") raise RuntimeError('Error creating variant') diff --git a/examples/run.py b/examples/run.py index d04bd1f..b8b80ed 100644 --- a/examples/run.py +++ b/examples/run.py @@ -8,7 +8,7 @@ # Time stepping information dt = 1e-3 # time step -tf = 1e0 # the final time of the simulation (in units of bare particle diffusion time) +tf = 10e0 # the final time of the simulation (in units of bare particle diffusion time) nrun = tf / dt # number of steps # Particle size @@ -25,8 +25,8 @@ os.mkdir( loc ) # Simple cubic crystal of 1000 particles -N = 1000; -L = 64 +N = 6400 +L = 40 n = math.ceil(N ** (1.0/3.0)) # number of particles along 1D a = L / n # spacing between particles
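
With the values set above (N = 6400 particles in a box of edge L = 40), the lattice arithmetic gives n = ceil(6400**(1/3)) = 19 particles along each edge and a spacing a = 40/19, roughly 2.1 in units of the unit particle radius assumed elsewhere in the plugin, i.e. a fairly dense simple cubic arrangement at roughly 42% volume fraction. A short Python sketch of the arithmetic and one possible lattice construction (illustrative only; the remainder of examples/run.py is not shown in this diff):

    import math

    N, L, a_p = 6400, 40.0, 1.0          # particle count, box edge, assumed particle radius
    n = math.ceil(N ** (1.0 / 3.0))      # 19 particles along each edge
    a = L / n                            # lattice spacing, about 2.105
    phi = N * (4.0 / 3.0) * math.pi * a_p ** 3 / L ** 3   # volume fraction, about 0.42

    # Simple cubic positions centered in the box, truncated to N particles
    positions = [(-L / 2 + (i + 0.5) * a,
                  -L / 2 + (j + 0.5) * a,
                  -L / 2 + (k + 0.5) * a)
                 for i in range(n)
                 for j in range(n)
                 for k in range(n)][:N]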