diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c6f9a44 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.vscode/settings.json diff --git a/PSEv1/Brownian.cu b/PSEv1/Brownian.cu index 907f20d..f267c33 100644 --- a/PSEv1/Brownian.cu +++ b/PSEv1/Brownian.cu @@ -75,7 +75,7 @@ using namespace hoomd; \brief Defines functions for PSE calculation of the Brownian Displacements // Uses LAPACKE to perform the final square root of the tridiagonal matrix - resulting from the Lanczos Method + resulting from the Lanczos Method */ //! Shared memory array for partial sum of dot product kernel @@ -83,842 +83,842 @@ extern __shared__ Scalar partial_sum[]; extern __shared__ Scalar4 shared_Fpos[]; /*! - Generate random numbers on particles - - \param d_psi random vector + Generate random numbers on particles + + \param d_psi random vector \param group_size number of particles - \param d_group_members index to particle arrays - \param timestep current time step - \param seed seed for random number generation + \param d_group_members index to particle arrays + \param timestep current time step + \param seed seed for random number generation - Thread-per-particle operations to generate random numbers - for the real space part of the Brownian calculation. Grid - and blocks are 1-D. + Thread-per-particle operations to generate random numbers + for the real space part of the Brownian calculation. Grid + and blocks are 1-D. */ __global__ void gpu_stokes_BrownianGenerate_kernel( - Scalar4 *d_psi, - unsigned int group_size, - unsigned int *d_group_members, - const unsigned int timestep, - const unsigned int seed - ){ + Scalar4 *d_psi, + unsigned int group_size, + unsigned int *d_group_members, + const unsigned int timestep, + const unsigned int seed + ){ - // Thread ID - int group_idx = blockDim.x * blockIdx.x + threadIdx.x; + // Thread ID + int group_idx = blockDim.x * blockIdx.x + threadIdx.x; - // Make sure that thread is in bounds - if (group_idx < group_size) { + // Make sure that thread is in bounds + if (group_idx < group_size) { - // Global particle index - unsigned int idx = d_group_members[group_idx]; + // Global particle index + unsigned int idx = d_group_members[group_idx]; - // Initialize random number generator - detail::Saru s(idx, timestep + seed); + // Initialize random number generator + detail::Saru s(idx, timestep + seed); - // Draw numbers from a Uniform distribution (-sqrt(3),sqrt(3), - // so that variance = 1/3 - Scalar sqrt3 = 1.73205080757; - Scalar randomx = s.f( -sqrt3, sqrt3 ); - Scalar randomy = s.f( -sqrt3, sqrt3 ); - Scalar randomz = s.f( -sqrt3, sqrt3 ); + // Draw numbers from a Uniform distribution (-sqrt(3),sqrt(3), + // so that variance = 1/3 + Scalar sqrt3 = 1.73205080757; + Scalar randomx = s.f( -sqrt3, sqrt3 ); + Scalar randomy = s.f( -sqrt3, sqrt3 ); + Scalar randomz = s.f( -sqrt3, sqrt3 ); - // Write to global memory, leaving the 4th element unchanged - d_psi[idx] = make_scalar4(randomx, randomy, randomz, d_psi[idx].w); + // Write to global memory, leaving the 4th element unchanged + d_psi[idx] = make_scalar4(randomx, randomy, randomz, d_psi[idx].w); - } + } } /*! - Generate random numbers for wave space Brownian motion ( random numbers on grid ) - - scale forces as they're generated and add directly to the existing grid. 
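(Note, not part of the patch: a uniform variate on (-a, a) has variance (2a)^2/12 = a^2/3. The per-particle kernel above therefore draws components of unit variance from (-sqrt(3), sqrt(3)), while the grid kernel documented here draws from (-sqrt(3/2), sqrt(3/2)) so that the real and imaginary parts each carry variance 1/2 and sum to unit variance, as the comments in the kernel body state.)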
- - \param d_gridX x-component of vectors on grid - \param d_gridY y-component of vectors on grid - \param d_gridZ z-component of vectors on grid - \param d_gridk reciprocal lattice vectors for each grid point - \param NxNyNz total number of grid points - \param Nx number of grid points in x-direction - \param Ny number of grid points in y-direction - \param Nz number of grid points in z-direction - \param timestep current simulation time step - \param seed seed for random number generation - \param T simulation temperature - \param dt simulation time step size - \param quadW quadrature weight for spectral Ewald integration - - Thread per grid node. 1-D grid of blocks, 1-D block of threads. + Generate random numbers for wave space Brownian motion ( random numbers on grid ) + - scale forces as they're generated and add directly to the existing grid. + + \param d_gridX x-component of vectors on grid + \param d_gridY y-component of vectors on grid + \param d_gridZ z-component of vectors on grid + \param d_gridk reciprocal lattice vectors for each grid point + \param NxNyNz total number of grid points + \param Nx number of grid points in x-direction + \param Ny number of grid points in y-direction + \param Nz number of grid points in z-direction + \param timestep current simulation time step + \param seed seed for random number generation + \param T simulation temperature + \param dt simulation time step size + \param quadW quadrature weight for spectral Ewald integration + + Thread per grid node. 1-D grid of blocks, 1-D block of threads. */ __global__ void gpu_stokes_BrownianGridGenerate_kernel( - CUFFTCOMPLEX *gridX, - CUFFTCOMPLEX *gridY, - CUFFTCOMPLEX *gridZ, - Scalar4 *gridk, - unsigned int NxNyNz, - int Nx, - int Ny, - int Nz, - const unsigned int timestep, - const unsigned int seed, - Scalar T, - Scalar dt, - Scalar quadW - ){ - - // Current thread index - int idx = blockDim.x * blockIdx.x + threadIdx.x; - - // Check if threads are in bounds - if ( idx < NxNyNz ) { + CUFFTCOMPLEX *gridX, + CUFFTCOMPLEX *gridY, + CUFFTCOMPLEX *gridZ, + Scalar4 *gridk, + unsigned int NxNyNz, + int Nx, + int Ny, + int Nz, + const unsigned int timestep, + const unsigned int seed, + Scalar T, + Scalar dt, + Scalar quadW + ){ + + // Current thread index + int idx = blockDim.x * blockIdx.x + threadIdx.x; + + // Check if threads are in bounds + if ( idx < NxNyNz ) { - // Random number generator - detail::Saru s(idx, timestep + seed); - - // Square root of 3.0 / 2.0 - Scalar sqrt3d2 = 1.2247448713915889; - - // Get random numbers from uniform distribution - // on (-sqrt(3/2),sqrt(3/2)) so that variance - // of ( reX + reY ) = 1.0 - Scalar reX = s.f( -sqrt3d2, sqrt3d2 ); - Scalar reY = s.f( -sqrt3d2, sqrt3d2 ); - Scalar reZ = s.f( -sqrt3d2, sqrt3d2 ); - Scalar imX = s.f( -sqrt3d2, sqrt3d2 ); - Scalar imY = s.f( -sqrt3d2, sqrt3d2 ); - Scalar imZ = s.f( -sqrt3d2, sqrt3d2 ); - - // Modulo arithmetic for indices for current grid point - int kk = idx % Nz; - int jj = ( ( idx - kk ) / Nz ) % Ny; - int ii = ( ( idx - kk ) / Nz - jj ) / Ny; - - // Scaling factor for covariance - Scalar fac = sqrtf(2.0*T/dt/quadW); - - // Variables required to place values on the grid - Scalar2 fX, fY, fZ; // forces for thread's point - Scalar2 fX_conj, fY_conj, fZ_conj; // forces for thread's conjugate point - Scalar2 kdF, kdF_conj; // dot(k,F) for thread and conjugate point - Scalar B12, B12_conj; // Scaling factors for thread and conjugate point - - // Only do work on half the grid points because we are simultaneously assigning values 
- // to each grid point and its conjugate. The following check makes sure we pick all of - // the points without conjugates (zeros and nyquist points) as well as all the points - // in the upper half of the grid. Also, ignore the origin in the wave space sum. (Sum - // is over all k!= 0) - if ( - !( 2 * kk >= Nz + 1 ) && - !( ( kk == 0 ) && ( 2 * jj >= Ny + 1 ) ) && - !( ( kk == 0 ) && ( jj == 0 ) && ( 2 * ii >= Nx + 1 ) ) && - !( ( kk == 0 ) && ( jj == 0 ) && ( ii == 0 ) ) - ){ - - // Is current grid point a nyquist point - bool ii_nyquist = ( ( ii == Nx/2 ) && ( Nx/2 == (Nx+1)/2 ) ); - bool jj_nyquist = ( ( jj == Ny/2 ) && ( Ny/2 == (Ny+1)/2 ) ); - bool kk_nyquist = ( ( kk == Nz/2 ) && ( Nz/2 == (Nz+1)/2 ) ); - - // Index of conjugate point - int ii_conj, jj_conj, kk_conj; - if ( ii == 0 ){ - ii_conj = ii; - } - else { - ii_conj = Nx - ii; - } - if ( jj == 0 ){ - jj_conj = jj; - } - else { - jj_conj = Ny - jj; - } - if ( kk == 0 ){ - kk_conj = kk; - } - else { - kk_conj = Nz - kk; - } - - // Global index of conjugate grid point - int conj_idx = ii_conj * Ny*Nz + jj_conj * Nz + kk_conj; - - // Current wave-space vector, conjugate wave space vector, and their - // magnitudes - Scalar4 tk = gridk[idx]; - Scalar4 tk_conj = gridk[conj_idx]; - - Scalar ksq = tk.x*tk.x + tk.y*tk.y + tk.z*tk.z; - Scalar ksq_conj = tk_conj.x*tk_conj.x + tk_conj.y*tk_conj.y + tk_conj.z*tk_conj.z; - - // Assign fluctuating values to the Nyquist points (no conjugate points) - if ( ( ii == 0 && jj_nyquist && kk == 0 ) || - ( ii_nyquist && jj == 0 && kk == 0 ) || - ( ii_nyquist && jj_nyquist && kk == 0 ) || - ( ii == 0 && jj == 0 && kk_nyquist ) || - ( ii == 0 && jj_nyquist && kk_nyquist ) || - ( ii_nyquist && jj == 0 && kk_nyquist ) || - ( ii_nyquist && jj_nyquist && kk_nyquist ) ){ - - // At the nyquist point, the random quantity only has a real component. 
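// Illustrative host-side sketch of the conjugate-index bookkeeping used in this
// kernel, isolated so the half-grid selection can be checked on the CPU.
// The struct and function names below are hypothetical; this is not part of the patch.
struct GridPointInfo {
    int conj_idx;        // flat index of the conjugate grid point
    bool self_conjugate; // true at the origin, zero planes, and Nyquist points
    bool do_work;        // true if this point is assigned by the kernel (half grid, k != 0)
};

GridPointInfo classify_grid_point( int idx, int Nx, int Ny, int Nz ){

    // Recover (ii,jj,kk) from the flat index, exactly as in the kernel
    int kk = idx % Nz;
    int jj = ( ( idx - kk ) / Nz ) % Ny;
    int ii = ( ( idx - kk ) / Nz - jj ) / Ny;

    // Conjugate indices (a zero index maps to itself)
    int ii_conj = ( ii == 0 ) ? 0 : Nx - ii;
    int jj_conj = ( jj == 0 ) ? 0 : Ny - jj;
    int kk_conj = ( kk == 0 ) ? 0 : Nz - kk;

    GridPointInfo info;
    info.conj_idx = ii_conj * Ny*Nz + jj_conj * Nz + kk_conj;

    // A point with conj_idx == idx has no distinct conjugate; these are the
    // zero/Nyquist combinations enumerated explicitly in the kernel above.
    info.self_conjugate = ( info.conj_idx == idx );

    // Same half-grid selection as the kernel (also skips the origin, k = 0)
    info.do_work = !( 2 * kk >= Nz + 1 ) &&
                   !( ( kk == 0 ) && ( 2 * jj >= Ny + 1 ) ) &&
                   !( ( kk == 0 ) && ( jj == 0 ) && ( 2 * ii >= Nx + 1 ) ) &&
                   !( ( kk == 0 ) && ( jj == 0 ) && ( ii == 0 ) );

    return info;
}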
Have to - // multiply by sqrt(2.0) to make sure the variance is still 1 - Scalar sqrt2 = 1.4142135623730951; - fX = make_scalar2( sqrt2*reX, 0.0 ); - fY = make_scalar2( sqrt2*reY, 0.0 ); - fZ = make_scalar2( sqrt2*reZ, 0.0 ); - - // Dot product of wave-vector with stochastic quantity - kdF = make_scalar2( ( tk.x*fX.x + tk.y*fY.x + tk.z*fZ.x ) / ksq, ( tk.x*fX.y + tk.y*fY.y + tk.z*fZ.y ) / ksq ); - - // Scaling factor - B12 = sqrtf( tk.w ); - Scalar k = sqrtf( ksq ); - B12 *= sinf( k ) / k; - - // Add random quantity to the grid AND scale by B^(1/2) simultaneously to save effort - gridX[idx].x = gridX[idx].x + fac * ( fX.x - tk.x * kdF.x ) * B12; - gridX[idx].y = gridX[idx].y + fac * ( fX.y - tk.x * kdF.y ) * B12; - - gridY[idx].x = gridY[idx].x + fac * ( fY.x - tk.y * kdF.x ) * B12; - gridY[idx].y = gridY[idx].y + fac * ( fY.y - tk.y * kdF.y ) * B12; - - gridZ[idx].x = gridZ[idx].x + fac * ( fZ.x - tk.z * kdF.x ) * B12; - gridZ[idx].y = gridZ[idx].y + fac * ( fZ.y - tk.z * kdF.y ) * B12; - - } - else { - - // Construct random force - fX = make_scalar2( reX, imX ); - fY = make_scalar2( reY, imY ); - fZ = make_scalar2( reZ, imZ ); - - // The random force at the conjugate point is the conjugate of the force at - // the current point - fX_conj = make_scalar2( reX, -imX ); - fY_conj = make_scalar2( reY, -imY ); - fZ_conj = make_scalar2( reZ, -imZ ); - - // Dot prodcut of force with wave vector at current and conjugate point - kdF = make_scalar2( ( tk.x*fX.x + tk.y*fY.x + tk.z*fZ.x ) / ksq, ( tk.x*fX.y + tk.y*fY.y + tk.z*fZ.y ) / ksq ); - kdF_conj = make_scalar2( ( tk_conj.x*fX_conj.x + tk_conj.y*fY_conj.x + tk_conj.z*fZ_conj.x ) / ksq_conj, ( tk_conj.x*fX_conj.y + tk_conj.y*fY_conj.y + tk_conj.z*fZ_conj.y ) / ksq_conj ); - - // Scaling factors at current and conjugate point - B12 = sqrtf( tk.w ); - B12_conj = sqrtf( tk_conj.w ); - - Scalar k = sqrtf( ksq ); - Scalar kconj = sqrtf( ksq_conj ); - B12 *= sinf( k ) / k; - B12_conj *= sinf( kconj ) / kconj; - - // Add random quantity to the grid AND scale by B^(1/2) simultaneously to save effort - // Current grid point - gridX[idx].x = gridX[idx].x + fac * ( fX.x - tk.x * kdF.x ) * B12; - gridX[idx].y = gridX[idx].y + fac * ( fX.y - tk.x * kdF.y ) * B12; - - gridY[idx].x = gridY[idx].x + fac * ( fY.x - tk.y * kdF.x ) * B12; - gridY[idx].y = gridY[idx].y + fac * ( fY.y - tk.y * kdF.y ) * B12; - - gridZ[idx].x = gridZ[idx].x + fac * ( fZ.x - tk.z * kdF.x ) * B12; - gridZ[idx].y = gridZ[idx].y + fac * ( fZ.y - tk.z * kdF.y ) * B12; - - // Add random quantity to the grid AND scale by B^(1/2) simultaneously to save effort - // Conjugate grid point - gridX[conj_idx].x = gridX[conj_idx].x + fac * ( fX_conj.x - tk_conj.x * kdF_conj.x ) * B12_conj; - gridX[conj_idx].y = gridX[conj_idx].y + fac * ( fX_conj.y - tk_conj.x * kdF_conj.y ) * B12_conj; - - gridY[conj_idx].x = gridY[conj_idx].x + fac * ( fY_conj.x - tk_conj.y * kdF_conj.x ) * B12_conj; - gridY[conj_idx].y = gridY[conj_idx].y + fac * ( fY_conj.y - tk_conj.y * kdF_conj.y ) * B12_conj; - - gridZ[conj_idx].x = gridZ[conj_idx].x + fac * ( fZ_conj.x - tk_conj.z * kdF_conj.x ) * B12_conj; - gridZ[conj_idx].y = gridZ[conj_idx].y + fac * ( fZ_conj.y - tk_conj.z * kdF_conj.y ) * B12_conj; - - } - - - - } + // Random number generator + detail::Saru s(idx, timestep + seed); + + // Square root of 3.0 / 2.0 + Scalar sqrt3d2 = 1.2247448713915889; + + // Get random numbers from uniform distribution + // on (-sqrt(3/2),sqrt(3/2)) so that variance + // of ( reX + reY ) = 1.0 + Scalar reX = s.f( -sqrt3d2, sqrt3d2 ); + 
Scalar reY = s.f( -sqrt3d2, sqrt3d2 ); + Scalar reZ = s.f( -sqrt3d2, sqrt3d2 ); + Scalar imX = s.f( -sqrt3d2, sqrt3d2 ); + Scalar imY = s.f( -sqrt3d2, sqrt3d2 ); + Scalar imZ = s.f( -sqrt3d2, sqrt3d2 ); + + // Modulo arithmetic for indices for current grid point + int kk = idx % Nz; + int jj = ( ( idx - kk ) / Nz ) % Ny; + int ii = ( ( idx - kk ) / Nz - jj ) / Ny; + + // Scaling factor for covariance + Scalar fac = sqrtf(2.0*T/dt/quadW); + + // Variables required to place values on the grid + Scalar2 fX, fY, fZ; // forces for thread's point + Scalar2 fX_conj, fY_conj, fZ_conj; // forces for thread's conjugate point + Scalar2 kdF, kdF_conj; // dot(k,F) for thread and conjugate point + Scalar B12, B12_conj; // Scaling factors for thread and conjugate point + + // Only do work on half the grid points because we are simultaneously assigning values + // to each grid point and its conjugate. The following check makes sure we pick all of + // the points without conjugates (zeros and nyquist points) as well as all the points + // in the upper half of the grid. Also, ignore the origin in the wave space sum. (Sum + // is over all k!= 0) + if ( + !( 2 * kk >= Nz + 1 ) && + !( ( kk == 0 ) && ( 2 * jj >= Ny + 1 ) ) && + !( ( kk == 0 ) && ( jj == 0 ) && ( 2 * ii >= Nx + 1 ) ) && + !( ( kk == 0 ) && ( jj == 0 ) && ( ii == 0 ) ) + ){ + + // Is current grid point a nyquist point + bool ii_nyquist = ( ( ii == Nx/2 ) && ( Nx/2 == (Nx+1)/2 ) ); + bool jj_nyquist = ( ( jj == Ny/2 ) && ( Ny/2 == (Ny+1)/2 ) ); + bool kk_nyquist = ( ( kk == Nz/2 ) && ( Nz/2 == (Nz+1)/2 ) ); + + // Index of conjugate point + int ii_conj, jj_conj, kk_conj; + if ( ii == 0 ){ + ii_conj = ii; + } + else { + ii_conj = Nx - ii; + } + if ( jj == 0 ){ + jj_conj = jj; + } + else { + jj_conj = Ny - jj; + } + if ( kk == 0 ){ + kk_conj = kk; + } + else { + kk_conj = Nz - kk; + } + + // Global index of conjugate grid point + int conj_idx = ii_conj * Ny*Nz + jj_conj * Nz + kk_conj; + + // Current wave-space vector, conjugate wave space vector, and their + // magnitudes + Scalar4 tk = gridk[idx]; + Scalar4 tk_conj = gridk[conj_idx]; + + Scalar ksq = tk.x*tk.x + tk.y*tk.y + tk.z*tk.z; + Scalar ksq_conj = tk_conj.x*tk_conj.x + tk_conj.y*tk_conj.y + tk_conj.z*tk_conj.z; + + // Assign fluctuating values to the Nyquist points (no conjugate points) + if ( ( ii == 0 && jj_nyquist && kk == 0 ) || + ( ii_nyquist && jj == 0 && kk == 0 ) || + ( ii_nyquist && jj_nyquist && kk == 0 ) || + ( ii == 0 && jj == 0 && kk_nyquist ) || + ( ii == 0 && jj_nyquist && kk_nyquist ) || + ( ii_nyquist && jj == 0 && kk_nyquist ) || + ( ii_nyquist && jj_nyquist && kk_nyquist ) ){ + + // At the nyquist point, the random quantity only has a real component. 
Have to + // multiply by sqrt(2.0) to make sure the variance is still 1 + Scalar sqrt2 = 1.4142135623730951; + fX = make_scalar2( sqrt2*reX, 0.0 ); + fY = make_scalar2( sqrt2*reY, 0.0 ); + fZ = make_scalar2( sqrt2*reZ, 0.0 ); + + // Dot product of wave-vector with stochastic quantity + kdF = make_scalar2( ( tk.x*fX.x + tk.y*fY.x + tk.z*fZ.x ) / ksq, ( tk.x*fX.y + tk.y*fY.y + tk.z*fZ.y ) / ksq ); + + // Scaling factor + B12 = sqrtf( tk.w ); + Scalar k = sqrtf( ksq ); + B12 *= sinf( k ) / k; + + // Add random quantity to the grid AND scale by B^(1/2) simultaneously to save effort + gridX[idx].x = gridX[idx].x + fac * ( fX.x - tk.x * kdF.x ) * B12; + gridX[idx].y = gridX[idx].y + fac * ( fX.y - tk.x * kdF.y ) * B12; + + gridY[idx].x = gridY[idx].x + fac * ( fY.x - tk.y * kdF.x ) * B12; + gridY[idx].y = gridY[idx].y + fac * ( fY.y - tk.y * kdF.y ) * B12; + + gridZ[idx].x = gridZ[idx].x + fac * ( fZ.x - tk.z * kdF.x ) * B12; + gridZ[idx].y = gridZ[idx].y + fac * ( fZ.y - tk.z * kdF.y ) * B12; + + } + else { + + // Construct random force + fX = make_scalar2( reX, imX ); + fY = make_scalar2( reY, imY ); + fZ = make_scalar2( reZ, imZ ); + + // The random force at the conjugate point is the conjugate of the force at + // the current point + fX_conj = make_scalar2( reX, -imX ); + fY_conj = make_scalar2( reY, -imY ); + fZ_conj = make_scalar2( reZ, -imZ ); + + // Dot prodcut of force with wave vector at current and conjugate point + kdF = make_scalar2( ( tk.x*fX.x + tk.y*fY.x + tk.z*fZ.x ) / ksq, ( tk.x*fX.y + tk.y*fY.y + tk.z*fZ.y ) / ksq ); + kdF_conj = make_scalar2( ( tk_conj.x*fX_conj.x + tk_conj.y*fY_conj.x + tk_conj.z*fZ_conj.x ) / ksq_conj, ( tk_conj.x*fX_conj.y + tk_conj.y*fY_conj.y + tk_conj.z*fZ_conj.y ) / ksq_conj ); + + // Scaling factors at current and conjugate point + B12 = sqrtf( tk.w ); + B12_conj = sqrtf( tk_conj.w ); + + Scalar k = sqrtf( ksq ); + Scalar kconj = sqrtf( ksq_conj ); + B12 *= sinf( k ) / k; + B12_conj *= sinf( kconj ) / kconj; + + // Add random quantity to the grid AND scale by B^(1/2) simultaneously to save effort + // Current grid point + gridX[idx].x = gridX[idx].x + fac * ( fX.x - tk.x * kdF.x ) * B12; + gridX[idx].y = gridX[idx].y + fac * ( fX.y - tk.x * kdF.y ) * B12; + + gridY[idx].x = gridY[idx].x + fac * ( fY.x - tk.y * kdF.x ) * B12; + gridY[idx].y = gridY[idx].y + fac * ( fY.y - tk.y * kdF.y ) * B12; + + gridZ[idx].x = gridZ[idx].x + fac * ( fZ.x - tk.z * kdF.x ) * B12; + gridZ[idx].y = gridZ[idx].y + fac * ( fZ.y - tk.z * kdF.y ) * B12; + + // Add random quantity to the grid AND scale by B^(1/2) simultaneously to save effort + // Conjugate grid point + gridX[conj_idx].x = gridX[conj_idx].x + fac * ( fX_conj.x - tk_conj.x * kdF_conj.x ) * B12_conj; + gridX[conj_idx].y = gridX[conj_idx].y + fac * ( fX_conj.y - tk_conj.x * kdF_conj.y ) * B12_conj; + + gridY[conj_idx].x = gridY[conj_idx].x + fac * ( fY_conj.x - tk_conj.y * kdF_conj.x ) * B12_conj; + gridY[conj_idx].y = gridY[conj_idx].y + fac * ( fY_conj.y - tk_conj.y * kdF_conj.y ) * B12_conj; + + gridZ[conj_idx].x = gridZ[conj_idx].x + fac * ( fZ_conj.x - tk_conj.z * kdF_conj.x ) * B12_conj; + gridZ[conj_idx].y = gridZ[conj_idx].y + fac * ( fZ_conj.y - tk_conj.z * kdF_conj.y ) * B12_conj; + + } + + + + } - } + } } /*! - Use Lanczos method to compute Mreal^0.5 * psi + Use Lanczos method to compute Mreal^0.5 * psi - This method is detailed in: - "Preconditioned Krylov Subspace Methods for Sampling Multivariate Gaussian Distributions" - Edmond Chow and Yousef Saad, IAM J. Sci. 
Comput., 36(2), A588–A608 + This method is detailed in: + "Preconditioned Krylov Subspace Methods for Sampling Multivariate Gaussian Distributions" + Edmond Chow and Yousef Saad, IAM J. Sci. Comput., 36(2), A588–A608 */ -void gpu_stokes_BrealLanczos_wrap( - Scalar4 *d_psi, - Scalar4 *d_pos, - unsigned int *d_group_members, - unsigned int group_size, - const BoxDim& box, - Scalar dt, - Scalar4 *d_vel, - const Scalar T, - const unsigned int timestep, - const unsigned int seed, - Scalar xi, - Scalar ewald_cut, - Scalar ewald_dr, - int ewald_n, - Scalar4 *d_ewaldC1, - const unsigned int *d_n_neigh, - const unsigned int *d_nlist, - const unsigned int *d_headlist, - int& m, - Scalar cheb_error, - dim3 grid, - dim3 threads, - int gridBlockSize, - int gridNBlock, - Scalar3 gridh, - Scalar self - ){ - - // Dot product kernel specifications - unsigned int thread_for_dot = 512; // Must be 2^n - unsigned int grid_for_dot = (group_size/thread_for_dot) + 1; - - // Temp var for dot product. - Scalar *dot_sum; - cudaMalloc( (void**)&dot_sum, grid_for_dot*sizeof(Scalar) ); - - // Allocate storage - // - int m_in = m; - int m_max = 100; +void gpu_stokes_BrealLanczos_wrap( + Scalar4 *d_psi, + Scalar4 *d_pos, + unsigned int *d_group_members, + unsigned int group_size, + const BoxDim& box, + Scalar dt, + Scalar4 *d_vel, + const Scalar T, + const unsigned int timestep, + const unsigned int seed, + Scalar xi, + Scalar ewald_cut, + Scalar ewald_dr, + int ewald_n, + Scalar4 *d_ewaldC1, + const unsigned int *d_n_neigh, + const unsigned int *d_nlist, + const unsigned int *d_headlist, + int& m, + Scalar cheb_error, + dim3 grid, + dim3 threads, + int gridBlockSize, + int gridNBlock, + Scalar3 gridh, + Scalar self + ){ + + // Dot product kernel specifications + unsigned int thread_for_dot = 512; // Must be 2^n + unsigned int grid_for_dot = (group_size/thread_for_dot) + 1; + + // Temp var for dot product. 
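// Note: grid_for_dot = (group_size / thread_for_dot) + 1 rounds the launch up so
// that every particle is covered even when group_size is not a multiple of the
// block size; out-of-range threads contribute zero to the partial sums in
// gpu_stokes_DotStepOne_kernel, so an extra (empty) block is harmless.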
+ Scalar *dot_sum; + cudaMalloc( (void**)&dot_sum, grid_for_dot*sizeof(Scalar) ); + + // Allocate storage + // + int m_in = m; + int m_max = 100; // Storage vectors for tridiagonal factorization - float *alpha, *beta, *alpha_save, *beta_save; + float *alpha, *beta, *alpha_save, *beta_save; alpha = (float *)malloc( (m_max)*sizeof(float) ); alpha_save = (float *)malloc( (m_max)*sizeof(float) ); beta = (float *)malloc( (m_max+1)*sizeof(float) ); beta_save = (float *)malloc( (m_max+1)*sizeof(float) ); - // Vectors for Lapacke and square root - float *W; - W = (float *)malloc( (m_max*m_max)*sizeof(float) ); - float *W1; // W1 = Lambda^(1/2) * ( W^T * e1 ) - W1 = (float *)malloc( (m_max)*sizeof(float) ); - float *Tm; - Tm = (float *)malloc( m_max*sizeof(float) ); - Scalar *d_Tm; - cudaMalloc( (void**)&d_Tm, m_max * sizeof(Scalar) ); - - // Vectors for Lanczos iterations - Scalar4 *d_v, *d_vj, *d_vjm1; - cudaMalloc( (void**)&d_v, group_size*sizeof(Scalar4) ); - cudaMalloc( (void**)&d_vj, group_size*sizeof(Scalar4) ); - cudaMalloc( (void**)&d_vjm1, group_size*sizeof(Scalar4) ); - - // Storage vector for M*vj - Scalar4 *d_Mvj; - cudaMalloc( (void**)&d_Mvj, group_size*sizeof(Scalar4) ); - - // Storage array for V - Scalar4 *d_V; - cudaMalloc( (void**)&d_V, m_max*group_size * sizeof(Scalar4) ); - - // Step-norm things - Scalar4 *d_vel_old, *d_Mpsi; - cudaMalloc( (void**)&d_vel_old, group_size*sizeof(Scalar4) ); - cudaMalloc( (void**)&d_Mpsi, group_size*sizeof(Scalar4) ); - Scalar psiMpsi; - - // Temporary pointer - Scalar4 *d_temp; - - // Copy random vector to v0 - cudaMemcpy( d_vj, d_psi, group_size*sizeof(Scalar4), cudaMemcpyDeviceToDevice ); - - // Compute the norm of the d_psi (also the norm of basis vector v0) + // Vectors for Lapacke and square root + float *W; + W = (float *)malloc( (m_max*m_max)*sizeof(float) ); + float *W1; // W1 = Lambda^(1/2) * ( W^T * e1 ) + W1 = (float *)malloc( (m_max)*sizeof(float) ); + float *Tm; + Tm = (float *)malloc( m_max*sizeof(float) ); + Scalar *d_Tm; + cudaMalloc( (void**)&d_Tm, m_max * sizeof(Scalar) ); + + // Vectors for Lanczos iterations + Scalar4 *d_v, *d_vj, *d_vjm1; + cudaMalloc( (void**)&d_v, group_size*sizeof(Scalar4) ); + cudaMalloc( (void**)&d_vj, group_size*sizeof(Scalar4) ); + cudaMalloc( (void**)&d_vjm1, group_size*sizeof(Scalar4) ); + + // Storage vector for M*vj + Scalar4 *d_Mvj; + cudaMalloc( (void**)&d_Mvj, group_size*sizeof(Scalar4) ); + + // Storage array for V + Scalar4 *d_V; + cudaMalloc( (void**)&d_V, m_max*group_size * sizeof(Scalar4) ); + + // Step-norm things + Scalar4 *d_vel_old, *d_Mpsi; + cudaMalloc( (void**)&d_vel_old, group_size*sizeof(Scalar4) ); + cudaMalloc( (void**)&d_Mpsi, group_size*sizeof(Scalar4) ); + Scalar psiMpsi; + + // Temporary pointer + Scalar4 *d_temp; + + // Copy random vector to v0 + cudaMemcpy( d_vj, d_psi, group_size*sizeof(Scalar4), cudaMemcpyDeviceToDevice ); + + // Compute the norm of the d_psi (also the norm of basis vector v0) Scalar vnorm; - gpu_stokes_DotStepOne_kernel<<< grid_for_dot, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(d_vj, d_vj, dot_sum, group_size, d_group_members); - gpu_stokes_DotStepTwo_kernel<<< 1, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(dot_sum, grid_for_dot); - cudaMemcpy(&vnorm, dot_sum, sizeof(Scalar), cudaMemcpyDeviceToHost); - vnorm = sqrtf( vnorm ); + gpu_stokes_DotStepOne_kernel<<< grid_for_dot, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(d_vj, d_vj, dot_sum, group_size, d_group_members); + gpu_stokes_DotStepTwo_kernel<<< 1, thread_for_dot, 
thread_for_dot*sizeof(Scalar) >>>(dot_sum, grid_for_dot); + cudaMemcpy(&vnorm, dot_sum, sizeof(Scalar), cudaMemcpyDeviceToHost); + vnorm = sqrtf( vnorm ); - Scalar psinorm = vnorm; + Scalar psinorm = vnorm; - // Compute psi * M * psi ( for step norm ) - gpu_stokes_Mreal_kernel<<>>(d_pos, d_Mpsi, d_psi, group_size, xi, d_ewaldC1, self, ewald_cut, ewald_n, ewald_dr, d_group_members, box, d_n_neigh, d_nlist, d_headlist ); - gpu_stokes_DotStepOne_kernel<<< grid_for_dot, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(d_psi, d_Mpsi, dot_sum, group_size, d_group_members); - gpu_stokes_DotStepTwo_kernel<<< 1, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(dot_sum, grid_for_dot); - cudaMemcpy(&psiMpsi, dot_sum, sizeof(Scalar), cudaMemcpyDeviceToHost); + // Compute psi * M * psi ( for step norm ) + gpu_stokes_Mreal_kernel<<>>(d_pos, d_Mpsi, d_psi, group_size, xi, d_ewaldC1, self, ewald_cut, ewald_n, ewald_dr, d_group_members, box, d_n_neigh, d_nlist, d_headlist ); + gpu_stokes_DotStepOne_kernel<<< grid_for_dot, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(d_psi, d_Mpsi, dot_sum, group_size, d_group_members); + gpu_stokes_DotStepTwo_kernel<<< 1, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(dot_sum, grid_for_dot); + cudaMemcpy(&psiMpsi, dot_sum, sizeof(Scalar), cudaMemcpyDeviceToHost); - psiMpsi = psiMpsi / ( psinorm * psinorm ); + psiMpsi = psiMpsi / ( psinorm * psinorm ); // First iteration, vjm1 = 0, vj = psi / norm( psi ) - gpu_stokes_LinearCombination_kernel<<>>(d_vj, d_vj, d_vjm1, 0.0, 0.0, group_size, d_group_members); - gpu_stokes_LinearCombination_kernel<<>>(d_vj, d_vj, d_vj, 1.0/vnorm, 0.0, group_size, d_group_members); - - // Start by computing (m-1) iterations, so that the stepnorm for the given - // number of iterations can be compute - m = m_in - 1; - m = m < 1 ? 1 : m; - - // Values for current alpha and beta in the iteration - Scalar tempalpha; - Scalar tempbeta = 0.0; - - // Apply the Lanczos method - for ( int jj = 0; jj < m; ++jj ){ - - // Store current basis vector - cudaMemcpy( &d_V[jj*group_size], d_vj, group_size*sizeof(Scalar4), cudaMemcpyDeviceToDevice ); - - // Store beta - beta[jj] = tempbeta; - - // v = M*vj - betaj*vjm1 - gpu_stokes_Mreal_kernel<<>>(d_pos, d_Mvj, d_vj, group_size, xi, d_ewaldC1, self, ewald_cut, ewald_n, ewald_dr, d_group_members, box, d_n_neigh, d_nlist, d_headlist ); - gpu_stokes_LinearCombination_kernel<<>>(d_Mvj, d_vjm1, d_v, 1.0, -1.0*tempbeta, group_size, d_group_members); - - // vj dot v - gpu_stokes_DotStepOne_kernel<<< grid_for_dot, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(d_vj, d_v, dot_sum, group_size, d_group_members); - gpu_stokes_DotStepTwo_kernel<<< 1, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(dot_sum, grid_for_dot); - cudaMemcpy(&tempalpha, dot_sum, sizeof(Scalar), cudaMemcpyDeviceToHost); - - // Store updated alpha - alpha[jj] = tempalpha; - - // v = v - alphaj*vj - gpu_stokes_LinearCombination_kernel<<>>(d_v, d_vj, d_v, 1.0, -1.0*tempalpha, group_size, d_group_members); - - // v dot v - gpu_stokes_DotStepOne_kernel<<< grid_for_dot, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(d_v, d_v, dot_sum, group_size, d_group_members); - gpu_stokes_DotStepTwo_kernel<<< 1, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(dot_sum, grid_for_dot); - cudaMemcpy(&vnorm, dot_sum, sizeof(Scalar), cudaMemcpyDeviceToHost); - vnorm = sqrtf( vnorm ); - - // betajp1 = norm( v ) - tempbeta = vnorm; - - // Check that the basis vector is not too small. 
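// For reference, each pass of this loop is one step of the standard Lanczos
// three-term recurrence applied to the real-space mobility M (cf. Chow & Saad):
//     w         = M v_j - beta_j v_(j-1)
//     alpha_j   = v_j . w
//     w         = w - alpha_j v_j
//     beta_(j+1) = || w ||,   v_(j+1) = w / beta_(j+1)
// The alpha_j and beta_j assembled here are the diagonal and sub-diagonal of the
// tridiagonal matrix T_m whose square root is taken below.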
If so, end the iteration - // (If norm is too small, will have numerical trouble) - if ( vnorm < 1E-8 ){ - m = jj; - break; - } - - // vjp1 = v / betajp1 - gpu_stokes_LinearCombination_kernel<<>>(d_v, d_v, d_v, 1.0/tempbeta, 0.0, group_size, d_group_members); - - // Swap pointers - d_temp = d_vjm1; - d_vjm1 = d_vj; - d_vj = d_v; - d_v = d_temp; - - } - - // Save alpha, beta vectors (will be overwritten by lapack) - for ( int ii = 0; ii < m; ++ii ){ - alpha_save[ii] = alpha[ii]; - beta_save[ii] = beta[ii]; - } - beta_save[m] = beta[m]; - - // Now that we have alpha, beta, have to compute the square root of the tridiagonal - // matrix Tm. Do this using eigen-decomposition. - // - // Compute eigen-decomposition of tridiagonal matrix - // alpha (input) - vector of entries on main diagonal - // alpha (output) - eigenvalues sorted in descending order - // beta (input) - vector of entries of sub-diagonal - // beta (output) - overwritten (zeros?) - // W - (output) - matrix of eigenvectors. ith column corresponds to ith eigenvalue - // INFO (output) = 0 if operation was succesful - int INFO = LAPACKE_spteqr( LAPACK_ROW_MAJOR, 'I', m, alpha, &beta[1], W, m ); - - // Check whether the eigen-decomposition failed, and throw error on failure - if ( INFO != 0 ){ - printf("Eigenvalue decomposition #1 failed \n"); - printf("INFO = %i \n", INFO); - - printf("\n alpha: \n"); - for( int ii = 0; ii < m; ++ii ){ - printf("%f \n", alpha_save[ii]); - } - printf("\n beta: \n"); - for( int ii = 0; ii < m; ++ii ){ - printf("%f \n", beta_save[ii]); - } - printf("%f \n", beta_save[m]); - - printf("Note to User: restart simulation and proceed. \n"); - - exit(EXIT_FAILURE); - } - - // Now, we have to compute Tm^(1/2) * e1 - // Tm^(1/2) = W * Lambda^(1/2) * W^T * e1 - // = W * Lambda^(1/2) * ( W^T * e1 ) - // The quantity in parentheses is the first row of W - // Lambda^(1/2) only has diagonal entries, so it's product with the first row of W - // is easy to compute. 
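// In equations (using the eigen-decomposition T_m = W Lambda W^T computed by
// LAPACKE_spteqr above):
//     T_m^(1/2) e_1 = W Lambda^(1/2) ( W^T e_1 ),
// where ( W^T e_1 ) is simply the first row of W in this row-major storage. The
// Brownian velocity sample is then reconstructed as
//     M^(1/2) psi  ~=  || psi || * V_m * T_m^(1/2) e_1,
// with the thermal prefactor sqrt( 2 T / dt ) applied once at the end of this routine.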
- for ( int ii = 0; ii < m; ++ii ){ - W1[ii] = sqrtf( alpha[ii] ) * W[ii]; - } - - // Tm = W * W1 = W * Lambda^(1/2) * W^T * e1 - float tempsum; - for ( int ii = 0; ii < m; ++ii ){ - tempsum = 0.0; - for ( int jj = 0; jj < m; ++jj ){ - int idx = m*ii + jj; - - tempsum += W[idx] * W1[jj]; - } - Tm[ii] = tempsum; - } - - // Copy matrix to GPU - cudaMemcpy( d_Tm, Tm, m*sizeof(Scalar), cudaMemcpyHostToDevice ); - - // Multiply basis vectors by Tm, [ V0, V1, ..., Vm-1 ] * Tm - gpu_stokes_MatVecMultiply_kernel<<>>(d_V, d_Tm, d_vel, group_size, m); - - // Copy velocity - cudaMemcpy( d_vel_old, d_vel, group_size*sizeof(Scalar4), cudaMemcpyDeviceToDevice ); - - // Restore alpha, beta - for ( int ii = 0; ii < m; ++ii ){ - alpha[ii] = alpha_save[ii]; - beta[ii] = beta_save[ii]; - } - beta[m] = beta_save[m]; - - - // - // Keep adding to basis vectors until the step norm is small enough - // - Scalar stepnorm = 1.0; - int jj; - while( stepnorm > cheb_error && m < m_max ){ - m++; - jj = m - 1; - - // - // Do another Lanczos iteration - // - - // Store the current basis vector - cudaMemcpy( &d_V[jj*group_size], d_vj, group_size*sizeof(Scalar4), cudaMemcpyDeviceToDevice ); - - // Store beta - beta[jj] = tempbeta; - - // v = M*vj - betaj*vjm1 - gpu_stokes_Mreal_kernel<<>>(d_pos, d_Mvj, d_vj, group_size, xi, d_ewaldC1, self, ewald_cut, ewald_n, ewald_dr, d_group_members, box, d_n_neigh, d_nlist, d_headlist ); - gpu_stokes_LinearCombination_kernel<<>>(d_Mvj, d_vjm1, d_v, 1.0, -1.0*tempbeta, group_size, d_group_members); - - // vj dot v - gpu_stokes_DotStepOne_kernel<<< grid_for_dot, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(d_vj, d_v, dot_sum, group_size, d_group_members); - gpu_stokes_DotStepTwo_kernel<<< 1, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(dot_sum, grid_for_dot); - cudaMemcpy(&tempalpha, dot_sum, sizeof(Scalar), cudaMemcpyDeviceToHost); - - // Store updated alpha - alpha[jj] = tempalpha; - - // v = v - alphaj*vj - gpu_stokes_LinearCombination_kernel<<>>(d_v, d_vj, d_v, 1.0, -1.0*tempalpha, group_size, d_group_members); - - // v dot v - gpu_stokes_DotStepOne_kernel<<< grid_for_dot, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(d_v, d_v, dot_sum, group_size, d_group_members); - gpu_stokes_DotStepTwo_kernel<<< 1, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(dot_sum, grid_for_dot); - cudaMemcpy(&vnorm, dot_sum, sizeof(Scalar), cudaMemcpyDeviceToHost); - vnorm = sqrtf( vnorm ); - - // betajp1 = norm( v ) - tempbeta = vnorm; - - // Check if the norm of the basis vector is too small. If - // so, end the iteration. 
- if ( vnorm < 1E-8 ){ - m = jj; - break; - } - - // vjp1 = v / betajp1 - gpu_stokes_LinearCombination_kernel<<>>(d_v, d_v, d_v, 1.0/tempbeta, 0.0, group_size, d_group_members); - - // Swap pointers - d_temp = d_vjm1; - d_vjm1 = d_vj; - d_vj = d_v; - d_v = d_temp; - - // Save alpha, beta vectors (will be overwritten by lapack) - for ( int ii = 0; ii < m; ++ii ){ - alpha_save[ii] = alpha[ii]; - beta_save[ii] = beta[ii]; - } - beta_save[m] = beta[m]; - - // - // Square root calculation with addition of latest Lanczos iteration - // (see first implementation above more description) - // - - // Compute eigen-decomposition of tridiagonal matrix - int INFO = LAPACKE_spteqr( LAPACK_ROW_MAJOR, 'I', m, alpha, &beta[1], W, m ); - - // Check whether the eigen-decomposition failed, and throw error on failure - if ( INFO != 0 ){ - printf("Eigenvalue decomposition #2 failed \n"); - printf("INFO = %i \n", INFO); - - printf("\n alpha: \n"); - for( int ii = 0; ii < m; ++ii ){ - printf("%f \n", alpha_save[ii]); - } - printf("\n beta: \n"); - for( int ii = 0; ii < m; ++ii ){ - printf("%f \n", beta_save[ii]); - } - printf("%f \n", beta_save[m]); - - printf("Note to User: restart simulation and proceed. \n"); - - exit(EXIT_FAILURE); - } - - // Now, we have to compute Tm^(1/2) * e1 - for ( int ii = 0; ii < m; ++ii ){ - W1[ii] = sqrtf( alpha[ii] ) * W[ii]; - } - - // Tm = W * W1 = W * Lambda^(1/2) * W^T * e1 - float tempsum; - for ( int ii = 0; ii < m; ++ii ){ - tempsum = 0.0; - for ( int jj = 0; jj < m; ++jj ){ - int idx = m*ii + jj; - - tempsum += W[idx] * W1[jj]; - } - Tm[ii] = tempsum; - } - - // Copy matrix to GPU - cudaMemcpy( d_Tm, Tm, m*sizeof(Scalar), cudaMemcpyHostToDevice ); - - // Multiply basis vectors by Tm -- velocity = Vm * Tm - gpu_stokes_MatVecMultiply_kernel<<>>(d_V, d_Tm, d_vel, group_size, m); - - // Compute step norm error - gpu_stokes_LinearCombination_kernel<<>>(d_vel, d_vel_old, d_vel_old, 1.0, -1.0, group_size, d_group_members); - gpu_stokes_DotStepOne_kernel<<< grid_for_dot, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(d_vel_old, d_vel_old, dot_sum, group_size, d_group_members); - gpu_stokes_DotStepTwo_kernel<<< 1, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(dot_sum, grid_for_dot); - cudaMemcpy(&stepnorm, dot_sum, sizeof(Scalar), cudaMemcpyDeviceToHost); - - stepnorm = sqrtf( stepnorm / psiMpsi ); - - // Copy velocity - cudaMemcpy( d_vel_old, d_vel, group_size*sizeof(Scalar4), cudaMemcpyDeviceToDevice ); - - // Restore alpha, beta - for ( int ii = 0; ii < m; ++ii ){ - alpha[ii] = alpha_save[ii]; - beta[ii] = beta_save[ii]; - } - beta[m] = beta_save[m]; - - } - - // Rescale by original norm of Psi and include thermal variance - gpu_stokes_LinearCombination_kernel<<>>(d_vel, d_vel, d_vel, psinorm * sqrtf(2.0*T/dt), 0.0, group_size, d_group_members); - - // - // Clean up - // - cudaFree(dot_sum); - cudaFree(d_Mvj); - cudaFree(d_v); - cudaFree(d_vj); - cudaFree(d_vjm1); - cudaFree(d_V); - cudaFree(d_Tm); - cudaFree(d_vel_old); - cudaFree(d_Mpsi); - - d_temp = NULL; - - free(alpha); - free(beta); - free(alpha_save); - free(beta_save); - - free(W); - free(W1); - free(Tm); - + gpu_stokes_LinearCombination_kernel<<>>(d_vj, d_vj, d_vjm1, 0.0, 0.0, group_size, d_group_members); + gpu_stokes_LinearCombination_kernel<<>>(d_vj, d_vj, d_vj, 1.0/vnorm, 0.0, group_size, d_group_members); + + // Start by computing (m-1) iterations, so that the stepnorm for the given + // number of iterations can be compute + m = m_in - 1; + m = m < 1 ? 
1 : m; + + // Values for current alpha and beta in the iteration + Scalar tempalpha; + Scalar tempbeta = 0.0; + + // Apply the Lanczos method + for ( int jj = 0; jj < m; ++jj ){ + + // Store current basis vector + cudaMemcpy( &d_V[jj*group_size], d_vj, group_size*sizeof(Scalar4), cudaMemcpyDeviceToDevice ); + + // Store beta + beta[jj] = tempbeta; + + // v = M*vj - betaj*vjm1 + gpu_stokes_Mreal_kernel<<>>(d_pos, d_Mvj, d_vj, group_size, xi, d_ewaldC1, self, ewald_cut, ewald_n, ewald_dr, d_group_members, box, d_n_neigh, d_nlist, d_headlist ); + gpu_stokes_LinearCombination_kernel<<>>(d_Mvj, d_vjm1, d_v, 1.0, -1.0*tempbeta, group_size, d_group_members); + + // vj dot v + gpu_stokes_DotStepOne_kernel<<< grid_for_dot, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(d_vj, d_v, dot_sum, group_size, d_group_members); + gpu_stokes_DotStepTwo_kernel<<< 1, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(dot_sum, grid_for_dot); + cudaMemcpy(&tempalpha, dot_sum, sizeof(Scalar), cudaMemcpyDeviceToHost); + + // Store updated alpha + alpha[jj] = tempalpha; + + // v = v - alphaj*vj + gpu_stokes_LinearCombination_kernel<<>>(d_v, d_vj, d_v, 1.0, -1.0*tempalpha, group_size, d_group_members); + + // v dot v + gpu_stokes_DotStepOne_kernel<<< grid_for_dot, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(d_v, d_v, dot_sum, group_size, d_group_members); + gpu_stokes_DotStepTwo_kernel<<< 1, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(dot_sum, grid_for_dot); + cudaMemcpy(&vnorm, dot_sum, sizeof(Scalar), cudaMemcpyDeviceToHost); + vnorm = sqrtf( vnorm ); + + // betajp1 = norm( v ) + tempbeta = vnorm; + + // Check that the basis vector is not too small. If so, end the iteration + // (If norm is too small, will have numerical trouble) + if ( vnorm < 1E-8 ){ + m = jj; + break; + } + + // vjp1 = v / betajp1 + gpu_stokes_LinearCombination_kernel<<>>(d_v, d_v, d_v, 1.0/tempbeta, 0.0, group_size, d_group_members); + + // Swap pointers + d_temp = d_vjm1; + d_vjm1 = d_vj; + d_vj = d_v; + d_v = d_temp; + + } + + // Save alpha, beta vectors (will be overwritten by lapack) + for ( int ii = 0; ii < m; ++ii ){ + alpha_save[ii] = alpha[ii]; + beta_save[ii] = beta[ii]; + } + beta_save[m] = beta[m]; + + // Now that we have alpha, beta, have to compute the square root of the tridiagonal + // matrix Tm. Do this using eigen-decomposition. + // + // Compute eigen-decomposition of tridiagonal matrix + // alpha (input) - vector of entries on main diagonal + // alpha (output) - eigenvalues sorted in descending order + // beta (input) - vector of entries of sub-diagonal + // beta (output) - overwritten (zeros?) + // W - (output) - matrix of eigenvectors. ith column corresponds to ith eigenvalue + // INFO (output) = 0 if operation was succesful + int INFO = LAPACKE_spteqr( LAPACK_ROW_MAJOR, 'I', m, alpha, &beta[1], W, m ); + + // Check whether the eigen-decomposition failed, and throw error on failure + if ( INFO != 0 ){ + printf("Eigenvalue decomposition #1 failed \n"); + printf("INFO = %i \n", INFO); + + printf("\n alpha: \n"); + for( int ii = 0; ii < m; ++ii ){ + printf("%f \n", alpha_save[ii]); + } + printf("\n beta: \n"); + for( int ii = 0; ii < m; ++ii ){ + printf("%f \n", beta_save[ii]); + } + printf("%f \n", beta_save[m]); + + printf("Note to User: restart simulation and proceed. 
\n"); + + exit(EXIT_FAILURE); + } + + // Now, we have to compute Tm^(1/2) * e1 + // Tm^(1/2) = W * Lambda^(1/2) * W^T * e1 + // = W * Lambda^(1/2) * ( W^T * e1 ) + // The quantity in parentheses is the first row of W + // Lambda^(1/2) only has diagonal entries, so it's product with the first row of W + // is easy to compute. + for ( int ii = 0; ii < m; ++ii ){ + W1[ii] = sqrtf( alpha[ii] ) * W[ii]; + } + + // Tm = W * W1 = W * Lambda^(1/2) * W^T * e1 + float tempsum; + for ( int ii = 0; ii < m; ++ii ){ + tempsum = 0.0; + for ( int jj = 0; jj < m; ++jj ){ + int idx = m*ii + jj; + + tempsum += W[idx] * W1[jj]; + } + Tm[ii] = tempsum; + } + + // Copy matrix to GPU + cudaMemcpy( d_Tm, Tm, m*sizeof(Scalar), cudaMemcpyHostToDevice ); + + // Multiply basis vectors by Tm, [ V0, V1, ..., Vm-1 ] * Tm + gpu_stokes_MatVecMultiply_kernel<<>>(d_V, d_Tm, d_vel, group_size, m); + + // Copy velocity + cudaMemcpy( d_vel_old, d_vel, group_size*sizeof(Scalar4), cudaMemcpyDeviceToDevice ); + + // Restore alpha, beta + for ( int ii = 0; ii < m; ++ii ){ + alpha[ii] = alpha_save[ii]; + beta[ii] = beta_save[ii]; + } + beta[m] = beta_save[m]; + + + // + // Keep adding to basis vectors until the step norm is small enough + // + Scalar stepnorm = 1.0; + int jj; + while( stepnorm > cheb_error && m < m_max ){ + m++; + jj = m - 1; + + // + // Do another Lanczos iteration + // + + // Store the current basis vector + cudaMemcpy( &d_V[jj*group_size], d_vj, group_size*sizeof(Scalar4), cudaMemcpyDeviceToDevice ); + + // Store beta + beta[jj] = tempbeta; + + // v = M*vj - betaj*vjm1 + gpu_stokes_Mreal_kernel<<>>(d_pos, d_Mvj, d_vj, group_size, xi, d_ewaldC1, self, ewald_cut, ewald_n, ewald_dr, d_group_members, box, d_n_neigh, d_nlist, d_headlist ); + gpu_stokes_LinearCombination_kernel<<>>(d_Mvj, d_vjm1, d_v, 1.0, -1.0*tempbeta, group_size, d_group_members); + + // vj dot v + gpu_stokes_DotStepOne_kernel<<< grid_for_dot, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(d_vj, d_v, dot_sum, group_size, d_group_members); + gpu_stokes_DotStepTwo_kernel<<< 1, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(dot_sum, grid_for_dot); + cudaMemcpy(&tempalpha, dot_sum, sizeof(Scalar), cudaMemcpyDeviceToHost); + + // Store updated alpha + alpha[jj] = tempalpha; + + // v = v - alphaj*vj + gpu_stokes_LinearCombination_kernel<<>>(d_v, d_vj, d_v, 1.0, -1.0*tempalpha, group_size, d_group_members); + + // v dot v + gpu_stokes_DotStepOne_kernel<<< grid_for_dot, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(d_v, d_v, dot_sum, group_size, d_group_members); + gpu_stokes_DotStepTwo_kernel<<< 1, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(dot_sum, grid_for_dot); + cudaMemcpy(&vnorm, dot_sum, sizeof(Scalar), cudaMemcpyDeviceToHost); + vnorm = sqrtf( vnorm ); + + // betajp1 = norm( v ) + tempbeta = vnorm; + + // Check if the norm of the basis vector is too small. If + // so, end the iteration. 
+ if ( vnorm < 1E-8 ){ + m = jj; + break; + } + + // vjp1 = v / betajp1 + gpu_stokes_LinearCombination_kernel<<>>(d_v, d_v, d_v, 1.0/tempbeta, 0.0, group_size, d_group_members); + + // Swap pointers + d_temp = d_vjm1; + d_vjm1 = d_vj; + d_vj = d_v; + d_v = d_temp; + + // Save alpha, beta vectors (will be overwritten by lapack) + for ( int ii = 0; ii < m; ++ii ){ + alpha_save[ii] = alpha[ii]; + beta_save[ii] = beta[ii]; + } + beta_save[m] = beta[m]; + + // + // Square root calculation with addition of latest Lanczos iteration + // (see first implementation above more description) + // + + // Compute eigen-decomposition of tridiagonal matrix + int INFO = LAPACKE_spteqr( LAPACK_ROW_MAJOR, 'I', m, alpha, &beta[1], W, m ); + + // Check whether the eigen-decomposition failed, and throw error on failure + if ( INFO != 0 ){ + printf("Eigenvalue decomposition #2 failed \n"); + printf("INFO = %i \n", INFO); + + printf("\n alpha: \n"); + for( int ii = 0; ii < m; ++ii ){ + printf("%f \n", alpha_save[ii]); + } + printf("\n beta: \n"); + for( int ii = 0; ii < m; ++ii ){ + printf("%f \n", beta_save[ii]); + } + printf("%f \n", beta_save[m]); + + printf("Note to User: restart simulation and proceed. \n"); + + exit(EXIT_FAILURE); + } + + // Now, we have to compute Tm^(1/2) * e1 + for ( int ii = 0; ii < m; ++ii ){ + W1[ii] = sqrtf( alpha[ii] ) * W[ii]; + } + + // Tm = W * W1 = W * Lambda^(1/2) * W^T * e1 + float tempsum; + for ( int ii = 0; ii < m; ++ii ){ + tempsum = 0.0; + for ( int jj = 0; jj < m; ++jj ){ + int idx = m*ii + jj; + + tempsum += W[idx] * W1[jj]; + } + Tm[ii] = tempsum; + } + + // Copy matrix to GPU + cudaMemcpy( d_Tm, Tm, m*sizeof(Scalar), cudaMemcpyHostToDevice ); + + // Multiply basis vectors by Tm -- velocity = Vm * Tm + gpu_stokes_MatVecMultiply_kernel<<>>(d_V, d_Tm, d_vel, group_size, m); + + // Compute step norm error + gpu_stokes_LinearCombination_kernel<<>>(d_vel, d_vel_old, d_vel_old, 1.0, -1.0, group_size, d_group_members); + gpu_stokes_DotStepOne_kernel<<< grid_for_dot, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(d_vel_old, d_vel_old, dot_sum, group_size, d_group_members); + gpu_stokes_DotStepTwo_kernel<<< 1, thread_for_dot, thread_for_dot*sizeof(Scalar) >>>(dot_sum, grid_for_dot); + cudaMemcpy(&stepnorm, dot_sum, sizeof(Scalar), cudaMemcpyDeviceToHost); + + stepnorm = sqrtf( stepnorm / psiMpsi ); + + // Copy velocity + cudaMemcpy( d_vel_old, d_vel, group_size*sizeof(Scalar4), cudaMemcpyDeviceToDevice ); + + // Restore alpha, beta + for ( int ii = 0; ii < m; ++ii ){ + alpha[ii] = alpha_save[ii]; + beta[ii] = beta_save[ii]; + } + beta[m] = beta_save[m]; + + } + + // Rescale by original norm of Psi and include thermal variance + gpu_stokes_LinearCombination_kernel<<>>(d_vel, d_vel, d_vel, psinorm * sqrtf(2.0*T/dt), 0.0, group_size, d_group_members); + + // + // Clean up + // + cudaFree(dot_sum); + cudaFree(d_Mvj); + cudaFree(d_v); + cudaFree(d_vj); + cudaFree(d_vjm1); + cudaFree(d_V); + cudaFree(d_Tm); + cudaFree(d_vel_old); + cudaFree(d_Mpsi); + + d_temp = NULL; + + free(alpha); + free(beta); + free(alpha_save); + free(beta_save); + + free(W); + free(W1); + free(Tm); + } // Wrap up everything to compute mobility AND brownian if necessary -// - Combine Fourier components of Deterministic and Brownian calculation -// in order to save extra FFTs and contraction operations +// - Combine Fourier components of Deterministic and Brownian calculation +// in order to save extra FFTs and contraction operations // - Add deterministic and stochastic real space contributions void 
gpu_stokes_CombinedMobilityBrownian_wrap( - Scalar4 *d_pos, - Scalar4 *d_net_force, + Scalar4 *d_pos, + Scalar4 *d_net_force, unsigned int *d_group_members, unsigned int group_size, const BoxDim& box, Scalar dt, - Scalar4 *d_vel, - const Scalar T, - const unsigned int timestep, - const unsigned int seed, - Scalar xi, - Scalar eta, - Scalar P, - Scalar ewald_cut, - Scalar ewald_dr, - int ewald_n, - Scalar4 *d_ewaldC1, - Scalar4 *d_gridk, - CUFFTCOMPLEX *d_gridX, - CUFFTCOMPLEX *d_gridY, - CUFFTCOMPLEX *d_gridZ, - cufftHandle plan, - const int Nx, - const int Ny, - const int Nz, - const unsigned int *d_n_neigh, + Scalar4 *d_vel, + const Scalar T, + const unsigned int timestep, + const unsigned int seed, + Scalar xi, + Scalar eta, + Scalar P, + Scalar ewald_cut, + Scalar ewald_dr, + int ewald_n, + Scalar4 *d_ewaldC1, + Scalar4 *d_gridk, + CUFFTCOMPLEX *d_gridX, + CUFFTCOMPLEX *d_gridY, + CUFFTCOMPLEX *d_gridZ, + cufftHandle plan, + const int Nx, + const int Ny, + const int Nz, + const unsigned int *d_n_neigh, const unsigned int *d_nlist, const unsigned int *d_headlist, - int& m_Lanczos, - const unsigned int N_total, - unsigned int NxNyNz, - dim3 grid, - dim3 threads, - int gridBlockSize, - int gridNBlock, - Scalar3 gridh, - Scalar cheb_error, - Scalar self ){ - - // Real space velocity to add - Scalar4 *d_vel2; - cudaMalloc( (void**)&d_vel2, group_size*sizeof(Scalar4) ); - - // Generate uniform distribution (-1,1) on d_psi - Scalar4 *d_psi; - cudaMalloc( (void**)&d_psi, group_size*sizeof(Scalar4) ); - gpu_stokes_BrownianGenerate_kernel<<>>( d_psi, group_size, d_group_members, timestep, seed ); - - // Spreading and contraction grid information and parameters - dim3 Cgrid( group_size, 1, 1); - int B = ( P < 10 ) ? P : 10; - dim3 Cthreads(B, B, B); - - Scalar quadW = gridh.x * gridh.y * gridh.z; - Scalar xisq = xi * xi; - Scalar prefac = ( 2.0 * xisq / 3.1415926536 / eta ) * sqrtf( 2.0 * xisq / 3.1415926536 / eta ); - Scalar expfac = 2.0 * xisq / eta; - - // ******************************************** - // Wave Space Part of Deterministic Calculation - // ******************************************** - - // Reset the grid (remove any previously distributed forces) - gpu_stokes_ZeroGrid_kernel<<>>(d_gridX,NxNyNz); - gpu_stokes_ZeroGrid_kernel<<>>(d_gridY,NxNyNz); - gpu_stokes_ZeroGrid_kernel<<>>(d_gridZ,NxNyNz); - - // Spread forces onto grid - gpu_stokes_Spread_kernel<<>>( d_pos, d_net_force, d_gridX, d_gridY, d_gridZ, group_size, Nx, Ny, Nz, d_group_members, box, P, gridh, xi, eta, prefac, expfac ); - - // Perform FFT on gridded forces - cufftExecC2C(plan, d_gridX, d_gridX, CUFFT_FORWARD); - cufftExecC2C(plan, d_gridY, d_gridY, CUFFT_FORWARD); - cufftExecC2C(plan, d_gridZ, d_gridZ, CUFFT_FORWARD); - - // Apply wave space scaling to FFT'd forces - gpu_stokes_Green_kernel<<>>( d_gridX, d_gridY, d_gridZ, d_gridk, NxNyNz); - - - // *************************************** - // Wave Space Part of Brownian Calculation - // *************************************** - if ( T > 0.0 ){ - - // Apply random fluctuations to wave space grid - gpu_stokes_BrownianGridGenerate_kernel<<>>( d_gridX, d_gridY, d_gridZ, d_gridk, NxNyNz, Nx, Ny, Nz, timestep, seed, T, dt, quadW ); - - } - - // ************************************ - // Finish the Wave Space Calculation - // ************************************ - - // Return rescaled forces to real space - cufftExecC2C(plan, d_gridX, d_gridX, CUFFT_INVERSE); - cufftExecC2C(plan, d_gridY, d_gridY, CUFFT_INVERSE); - cufftExecC2C(plan, d_gridZ, d_gridZ, CUFFT_INVERSE); - - 
// Evaluate contribution of grid velocities at particle centers - gpu_stokes_Contract_kernel<<>>( d_pos, d_vel, d_gridX, d_gridY, d_gridZ, group_size, Nx, Ny, Nz, xi, eta, d_group_members, box, P, gridh, quadW*prefac, expfac ); - - // *************************************** - // Real Space Part of Both Calculations - // *************************************** - - // Deterministic part - gpu_stokes_Mreal_kernel<<>>(d_pos, d_vel2, d_net_force, group_size, xi, d_ewaldC1, self, ewald_cut, ewald_n, ewald_dr, d_group_members, box, d_n_neigh, d_nlist, d_headlist ); - - // Add to velocity - gpu_stokes_LinearCombination_kernel<<>>(d_vel2, d_vel, d_vel, 1.0, 1.0, group_size, d_group_members); - - // Stochastic - if ( T > 0.0 ){ - - gpu_stokes_BrealLanczos_wrap( d_psi, - d_pos, - d_group_members, - group_size, - box, - dt, - d_vel2, - T, - timestep, - seed, - xi, - ewald_cut, - ewald_dr, - ewald_n, - d_ewaldC1, - d_n_neigh, - d_nlist, - d_headlist, - m_Lanczos, - cheb_error, - grid, - threads, - gridBlockSize, - gridNBlock, - gridh, - self ); - - // Add to velocity - gpu_stokes_LinearCombination_kernel<<>>(d_vel2, d_vel, d_vel, 1.0, 1.0, group_size, d_group_members); - - } - - // Free Memory - cudaFree( d_vel2 ); - cudaFree( d_psi ); + int& m_Lanczos, + const unsigned int N_total, + unsigned int NxNyNz, + dim3 grid, + dim3 threads, + int gridBlockSize, + int gridNBlock, + Scalar3 gridh, + Scalar cheb_error, + Scalar self ){ + + // Real space velocity to add + Scalar4 *d_vel2; + cudaMalloc( (void**)&d_vel2, group_size*sizeof(Scalar4) ); + + // Generate uniform distribution (-1,1) on d_psi + Scalar4 *d_psi; + cudaMalloc( (void**)&d_psi, group_size*sizeof(Scalar4) ); + gpu_stokes_BrownianGenerate_kernel<<>>( d_psi, group_size, d_group_members, timestep, seed ); + + // Spreading and contraction grid information and parameters + dim3 Cgrid( group_size, 1, 1); + int B = ( P < 10 ) ? 
P : 10; + dim3 Cthreads(B, B, B); + + Scalar quadW = gridh.x * gridh.y * gridh.z; + Scalar xisq = xi * xi; + Scalar prefac = ( 2.0 * xisq / 3.1415926536 / eta ) * sqrtf( 2.0 * xisq / 3.1415926536 / eta ); + Scalar expfac = 2.0 * xisq / eta; + + // ******************************************** + // Wave Space Part of Deterministic Calculation + // ******************************************** + + // Reset the grid (remove any previously distributed forces) + gpu_stokes_ZeroGrid_kernel<<>>(d_gridX,NxNyNz); + gpu_stokes_ZeroGrid_kernel<<>>(d_gridY,NxNyNz); + gpu_stokes_ZeroGrid_kernel<<>>(d_gridZ,NxNyNz); + + // Spread forces onto grid + gpu_stokes_Spread_kernel<<>>( d_pos, d_net_force, d_gridX, d_gridY, d_gridZ, group_size, Nx, Ny, Nz, d_group_members, box, P, gridh, xi, eta, prefac, expfac ); + + // Perform FFT on gridded forces + cufftExecC2C(plan, d_gridX, d_gridX, CUFFT_FORWARD); + cufftExecC2C(plan, d_gridY, d_gridY, CUFFT_FORWARD); + cufftExecC2C(plan, d_gridZ, d_gridZ, CUFFT_FORWARD); + + // Apply wave space scaling to FFT'd forces + gpu_stokes_Green_kernel<<>>( d_gridX, d_gridY, d_gridZ, d_gridk, NxNyNz); + + + // *************************************** + // Wave Space Part of Brownian Calculation + // *************************************** + if ( T > 0.0 ){ + + // Apply random fluctuations to wave space grid + gpu_stokes_BrownianGridGenerate_kernel<<>>( d_gridX, d_gridY, d_gridZ, d_gridk, NxNyNz, Nx, Ny, Nz, timestep, seed, T, dt, quadW ); + + } + + // ************************************ + // Finish the Wave Space Calculation + // ************************************ + + // Return rescaled forces to real space + cufftExecC2C(plan, d_gridX, d_gridX, CUFFT_INVERSE); + cufftExecC2C(plan, d_gridY, d_gridY, CUFFT_INVERSE); + cufftExecC2C(plan, d_gridZ, d_gridZ, CUFFT_INVERSE); + + // Evaluate contribution of grid velocities at particle centers + gpu_stokes_Contract_kernel<<>>( d_pos, d_vel, d_gridX, d_gridY, d_gridZ, group_size, Nx, Ny, Nz, xi, eta, d_group_members, box, P, gridh, quadW*prefac, expfac ); + + // *************************************** + // Real Space Part of Both Calculations + // *************************************** + + // Deterministic part + gpu_stokes_Mreal_kernel<<>>(d_pos, d_vel2, d_net_force, group_size, xi, d_ewaldC1, self, ewald_cut, ewald_n, ewald_dr, d_group_members, box, d_n_neigh, d_nlist, d_headlist ); + + // Add to velocity + gpu_stokes_LinearCombination_kernel<<>>(d_vel2, d_vel, d_vel, 1.0, 1.0, group_size, d_group_members); + + // Stochastic + if ( T > 0.0 ){ + + gpu_stokes_BrealLanczos_wrap( d_psi, + d_pos, + d_group_members, + group_size, + box, + dt, + d_vel2, + T, + timestep, + seed, + xi, + ewald_cut, + ewald_dr, + ewald_n, + d_ewaldC1, + d_n_neigh, + d_nlist, + d_headlist, + m_Lanczos, + cheb_error, + grid, + threads, + gridBlockSize, + gridNBlock, + gridh, + self ); + + // Add to velocity + gpu_stokes_LinearCombination_kernel<<>>(d_vel2, d_vel, d_vel, 1.0, 1.0, group_size, d_group_members); + + } + + // Free Memory + cudaFree( d_vel2 ); + cudaFree( d_psi ); } diff --git a/PSEv1/Brownian.cuh b/PSEv1/Brownian.cuh index 5332e43..8bb2dbc 100644 --- a/PSEv1/Brownian.cuh +++ b/PSEv1/Brownian.cuh @@ -71,68 +71,68 @@ ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif __global__ void gpu_stokes_BrownianGenerate_kernel( - Scalar4 *d_psi, - unsigned int group_size, - unsigned int *d_group_members, - const unsigned int timestep, - const unsigned int seed - ); + Scalar4 *d_psi, + unsigned int group_size, + unsigned int *d_group_members, + const unsigned int timestep, + const unsigned int seed + ); __global__ void gpu_stokes_BrownianGridGenerate_kernel( - CUFFTCOMPLEX *gridX, - CUFFTCOMPLEX *gridY, - CUFFTCOMPLEX *gridZ, - Scalar4 *gridk, - unsigned int NxNyNz, - int Nx, - int Ny, - int Nz, - const unsigned int timestep, - const unsigned int seed, - Scalar T, - Scalar dt, - Scalar quadW - ); + CUFFTCOMPLEX *gridX, + CUFFTCOMPLEX *gridY, + CUFFTCOMPLEX *gridZ, + Scalar4 *gridk, + unsigned int NxNyNz, + int Nx, + int Ny, + int Nz, + const unsigned int timestep, + const unsigned int seed, + Scalar T, + Scalar dt, + Scalar quadW + ); void gpu_stokes_CombinedMobilityBrownian_wrap( - Scalar4 *d_pos, - Scalar4 *d_net_force, + Scalar4 *d_pos, + Scalar4 *d_net_force, unsigned int *d_group_members, unsigned int group_size, const BoxDim& box, Scalar dt, - Scalar4 *d_vel, - const Scalar T, - const unsigned int timestep, - const unsigned int seed, - Scalar xi, - Scalar eta, - Scalar P, - Scalar ewald_cut, - Scalar ewald_dr, - int ewald_n, - Scalar4 *d_ewaldC1, - Scalar4 *d_gridk, - CUFFTCOMPLEX *d_gridX, - CUFFTCOMPLEX *d_gridY, - CUFFTCOMPLEX *d_gridZ, - cufftHandle plan, - const int Nx, - const int Ny, - const int Nz, - const unsigned int *d_n_neigh, + Scalar4 *d_vel, + const Scalar T, + const unsigned int timestep, + const unsigned int seed, + Scalar xi, + Scalar eta, + Scalar P, + Scalar ewald_cut, + Scalar ewald_dr, + int ewald_n, + Scalar4 *d_ewaldC1, + Scalar4 *d_gridk, + CUFFTCOMPLEX *d_gridX, + CUFFTCOMPLEX *d_gridY, + CUFFTCOMPLEX *d_gridZ, + cufftHandle plan, + const int Nx, + const int Ny, + const int Nz, + const unsigned int *d_n_neigh, const unsigned int *d_nlist, const unsigned int *d_headlist, - int& m_Lanczos, - const unsigned int N_total, - unsigned int NxNyNz, - dim3 grid, - dim3 threads, - int gridBlockSize, - int gridNBlock, - Scalar3 gridh, - Scalar cheb_error, - Scalar self - ); + int& m_Lanczos, + const unsigned int N_total, + unsigned int NxNyNz, + dim3 grid, + dim3 threads, + int gridBlockSize, + int gridNBlock, + Scalar3 gridh, + Scalar cheb_error, + Scalar self + ); #endif diff --git a/PSEv1/Helper.cu b/PSEv1/Helper.cu index dfc582e..c9f7ac4 100644 --- a/PSEv1/Helper.cu +++ b/PSEv1/Helper.cu @@ -73,7 +73,7 @@ ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /*! \file Helper.cu - \brief Helper functions to perform additions, dot products, etc., for Mobility and Brownian + \brief Helper functions to perform additions, dot products, etc., for Mobility and Brownian */ //! Shared memory array for partial sum of dot product kernel @@ -81,201 +81,202 @@ extern __shared__ Scalar partial_sum[]; //! Zero out the force grid /*! - \param grid the grid going to be zero out - \param NxNyNz dimension of the grid + \param grid the grid going to be zero out + \param NxNyNz dimension of the grid */ __global__ void gpu_stokes_ZeroGrid_kernel(CUFFTCOMPLEX *grid, unsigned int NxNyNz) { - unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x; - - if ( tid < NxNyNz ) { - - grid[tid] = make_scalar2( 0.0, 0.0 ); - - } + unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x; + + if ( tid < NxNyNz ) { + + grid[tid].x = 0; + grid[tid].y = 0; + + } } /*! 
- Linear combination helper function - C = a*A + b*B - C can be A or B, so that A or B will be overwritten - The fourth element of Scalar4 is not changed! - - \param d_a input vector, A - \param d_b input vector, B - \param d_c output vector, C - \param coeff_a scaling factor for A, a - \param coeff_b scaling factor for B, b - \param group_size length of vectors - \param d_group_members index into vectors + Linear combination helper function + C = a*A + b*B + C can be A or B, so that A or B will be overwritten + The fourth element of Scalar4 is not changed! + + \param d_a input vector, A + \param d_b input vector, B + \param d_c output vector, C + \param coeff_a scaling factor for A, a + \param coeff_b scaling factor for B, b + \param group_size length of vectors + \param d_group_members index into vectors */ __global__ void gpu_stokes_LinearCombination_kernel( - Scalar4 *d_a, - Scalar4 *d_b, - Scalar4 *d_c, - Scalar coeff_a, - Scalar coeff_b, - unsigned int group_size, - unsigned int *d_group_members - ){ - - int group_idx = blockDim.x * blockIdx.x + threadIdx.x; - if (group_idx < group_size) { - unsigned int idx = d_group_members[group_idx]; - Scalar4 A4 = d_a[idx]; - Scalar4 B4 = d_b[idx]; - Scalar3 A = make_scalar3(A4.x, A4.y, A4.z); - Scalar3 B = make_scalar3(B4.x, B4.y, B4.z); - A = coeff_a * A + coeff_b * B; - d_c[idx] = make_scalar4(A.x, A.y, A.z, d_c[idx].w); - } + Scalar4 *d_a, + Scalar4 *d_b, + Scalar4 *d_c, + Scalar coeff_a, + Scalar coeff_b, + unsigned int group_size, + unsigned int *d_group_members + ){ + + int group_idx = blockDim.x * blockIdx.x + threadIdx.x; + if (group_idx < group_size) { + unsigned int idx = d_group_members[group_idx]; + Scalar4 A4 = d_a[idx]; + Scalar4 B4 = d_b[idx]; + Scalar3 A = make_scalar3(A4.x, A4.y, A4.z); + Scalar3 B = make_scalar3(B4.x, B4.y, B4.z); + A = coeff_a * A + coeff_b * B; + d_c[idx] = make_scalar4(A.x, A.y, A.z, d_c[idx].w); + } } /*! - Dot product helper function: First step - d_a .* d_b -> d_c -> Partial sum - BlockDim of this kernel should be 2^n, which is 512. (Based on HOOMD ComputeThermoGPU class) - - \param d_a first vector in dot product - \param d_b second vector in dot product - \param dot_sum partial dot product sum - \param group_size length of vectors a and b + Dot product helper function: First step + d_a .* d_b -> d_c -> Partial sum + BlockDim of this kernel should be 2^n, which is 512. 
(Based on HOOMD ComputeThermoGPU class) + + \param d_a first vector in dot product + \param d_b second vector in dot product + \param dot_sum partial dot product sum + \param group_size length of vectors a and b \param d_group_members index into vectors */ __global__ void gpu_stokes_DotStepOne_kernel( - Scalar4 *d_a, - Scalar4 *d_b, - Scalar *dot_sum, - unsigned int group_size, - unsigned int *d_group_members - ){ + Scalar4 *d_a, + Scalar4 *d_b, + Scalar *dot_sum, + unsigned int group_size, + unsigned int *d_group_members + ){ - int group_idx = blockDim.x * blockIdx.x + threadIdx.x; - Scalar temp; + int group_idx = blockDim.x * blockIdx.x + threadIdx.x; + Scalar temp; - if (group_idx < group_size) { + if (group_idx < group_size) { - unsigned int idx = d_group_members[group_idx]; - Scalar4 a4 = d_a[idx]; - Scalar4 b4 = d_b[idx]; - Scalar3 a = make_scalar3(a4.x, a4.y, a4.z); - Scalar3 b = make_scalar3(b4.x, b4.y, b4.z); + unsigned int idx = d_group_members[group_idx]; + Scalar4 a4 = d_a[idx]; + Scalar4 b4 = d_b[idx]; + Scalar3 a = make_scalar3(a4.x, a4.y, a4.z); + Scalar3 b = make_scalar3(b4.x, b4.y, b4.z); - temp = dot(a,b); // Partial sum, each thread, shared memory + temp = dot(a,b); // Partial sum, each thread, shared memory - } - else { - temp = 0; - } + } + else { + temp = 0; + } - partial_sum[threadIdx.x] = temp; + partial_sum[threadIdx.x] = temp; - __syncthreads(); + __syncthreads(); - int offs = blockDim.x >> 1; + int offs = blockDim.x >> 1; - while (offs > 0) + while (offs > 0) { - if (threadIdx.x < offs) - { - partial_sum[threadIdx.x] += partial_sum[threadIdx.x + offs]; - } - offs >>= 1; - __syncthreads(); + if (threadIdx.x < offs) + { + partial_sum[threadIdx.x] += partial_sum[threadIdx.x + offs]; + } + offs >>= 1; + __syncthreads(); } - if (threadIdx.x == 0){ - dot_sum[blockIdx.x] = partial_sum[0]; - } + if (threadIdx.x == 0){ + dot_sum[blockIdx.x] = partial_sum[0]; + } } /*! 
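// gpu_stokes_DotStepOne_kernel above leaves one partial sum per block in
// dot_sum; its dynamic shared memory must hold one Scalar per thread, and the
// block size is taken as a power of two (512 per the comment) so the tree
// reduction halves cleanly. A launch sketch plus an illustrative host-side
// equivalent of the final combination that the second-step kernel below
// performs on the device (the array names d_a, d_b, d_dot_sum are assumed):
//
//   int threads = 512;
//   int blocks  = (group_size + threads - 1) / threads;
//   gpu_stokes_DotStepOne_kernel<<< blocks, threads, threads*sizeof(Scalar) >>>(
//           d_a, d_b, d_dot_sum, group_size, d_group_members );
//
// Scalar is HOOMD's float/double typedef (HOOMDMath.h).
static Scalar host_combine_partial_sums(const Scalar* partial, int num_blocks)
{
    Scalar total = 0;
    for (int b = 0; b < num_blocks; ++b)   // same accumulation the second step performs
        total += partial[b];
    return total;
}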
- Dot product helper function: Second step - Partial sum -> Final sum - Only one block will be launched for this step + Dot product helper function: Second step + Partial sum -> Final sum + Only one block will be launched for this step - \param dot_sum partial sum from first dot product kernel - \param num_partial_sums length of dot_sum array + \param dot_sum partial sum from first dot product kernel + \param num_partial_sums length of dot_sum array */ __global__ void gpu_stokes_DotStepTwo_kernel( - Scalar *dot_sum, - unsigned int num_partial_sums - ){ - - partial_sum[threadIdx.x] = 0.0; - __syncthreads(); - for (unsigned int start = 0; start < num_partial_sums; start += blockDim.x) - { - if (start + threadIdx.x < num_partial_sums) - { - partial_sum[threadIdx.x] += dot_sum[start + threadIdx.x]; - } - } - - int offs = blockDim.x >> 1; - while (offs > 0) - { - __syncthreads(); - if (threadIdx.x < offs) + Scalar *dot_sum, + unsigned int num_partial_sums + ){ + + partial_sum[threadIdx.x] = 0.0; + __syncthreads(); + for (unsigned int start = 0; start < num_partial_sums; start += blockDim.x) + { + if (start + threadIdx.x < num_partial_sums) + { + partial_sum[threadIdx.x] += dot_sum[start + threadIdx.x]; + } + } + + int offs = blockDim.x >> 1; + while (offs > 0) + { + __syncthreads(); + if (threadIdx.x < offs) { - partial_sum[threadIdx.x] += partial_sum[threadIdx.x + offs]; + partial_sum[threadIdx.x] += partial_sum[threadIdx.x + offs]; } - offs >>= 1; - + offs >>= 1; + } - __syncthreads(); + __syncthreads(); if (threadIdx.x == 0) - { - dot_sum[0] = partial_sum[0]; // Save the dot product to the first element of dot_sum array - } + { + dot_sum[0] = partial_sum[0]; // Save the dot product to the first element of dot_sum array + } } /*! - Perform matrix-vector multiply needed for the Lanczos contribution to the Brownian velocity + Perform matrix-vector multiply needed for the Lanczos contribution to the Brownian velocity - \param d_A matrix, N x m - \param d_x multiplying vector, m x 1 - \param d_b result vector, A*x, m x 1 - \param group_size number of particles - \param m number of iterations ( number of columns of A, length of x ) + \param d_A matrix, N x m + \param d_x multiplying vector, m x 1 + \param d_b result vector, A*x, m x 1 + \param group_size number of particles + \param m number of iterations ( number of columns of A, length of x ) */ __global__ void gpu_stokes_MatVecMultiply_kernel( - Scalar4 *d_A, - Scalar *d_x, - Scalar4 *d_b, - unsigned int group_size, - int m - ){ + Scalar4 *d_A, + Scalar *d_x, + Scalar4 *d_b, + unsigned int group_size, + int m + ){ - int idx = blockDim.x * blockIdx.x + threadIdx.x; - if (idx < group_size) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + if (idx < group_size) { - Scalar3 tempprod = make_scalar3( 0.0, 0.0, 0.0 ); + Scalar3 tempprod = make_scalar3( 0.0, 0.0, 0.0 ); - for ( int ii = 0; ii < m; ++ii ){ + for ( int ii = 0; ii < m; ++ii ){ - Scalar4 matidx = d_A[ idx + ii*group_size ]; + Scalar4 matidx = d_A[ idx + ii*group_size ]; - Scalar xcurr = d_x[ii]; + Scalar xcurr = d_x[ii]; - tempprod.x = tempprod.x + matidx.x * xcurr; - tempprod.y = tempprod.y + matidx.y * xcurr; - tempprod.z = tempprod.z + matidx.z * xcurr; + tempprod.x = tempprod.x + matidx.x * xcurr; + tempprod.y = tempprod.y + matidx.y * xcurr; + tempprod.z = tempprod.z + matidx.z * xcurr; - } + } - d_b[idx] = make_scalar4( tempprod.x, tempprod.y, tempprod.z, d_A[idx].w ); + d_b[idx] = make_scalar4( tempprod.x, tempprod.y, tempprod.z, d_A[idx].w ); - } + } } /*! 
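// gpu_stokes_MatVecMultiply_kernel above assumes column-major storage of the
// group_size x m Lanczos basis: column ii starts at offset ii*group_size, so
// element (row idx, column ii) is d_A[ idx + ii*group_size ], and the result
// holds one Scalar4 per particle. A host-side reference of the product for a
// single row (illustrative only; the function name is not from the plugin):
static void matvec_row_reference(const Scalar4* A, const Scalar* x,
                                 unsigned int idx, unsigned int group_size, int m,
                                 Scalar3& out)
{
    out = make_scalar3(0.0, 0.0, 0.0);
    for (int ii = 0; ii < m; ++ii) {
        const Scalar4 col = A[idx + ii * group_size];  // column-major access
        out.x += col.x * x[ii];
        out.y += col.y * x[ii];
        out.z += col.z * x[ii];
    }
}
// Scalar, Scalar3, Scalar4 and make_scalar3 are HOOMD's math types (HOOMDMath.h).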
@@ -283,15 +284,15 @@ __global__ void gpu_stokes_MatVecMultiply_kernel( */ __global__ void gpu_stokes_SetGridk_kernel( - Scalar4 *gridk, + Scalar4 *gridk, int Nx, int Ny, int Nz, unsigned int NxNyNz, BoxDim box, Scalar xi, - Scalar eta - ){ + Scalar eta + ){ int tid = blockDim.x * blockIdx.x + threadIdx.x; @@ -310,21 +311,21 @@ void gpu_stokes_SetGridk_kernel( gridk_value.x = gridk_value.x / L.x; gridk_value.z = ((k < (Nz+1) / 2) ? k : k - Nz) / L.z; - gridk_value.x *= 2.0*3.1416926536; - gridk_value.y *= 2.0*3.1416926536; - gridk_value.z *= 2.0*3.1416926536; + gridk_value.x *= 2.0*3.1416926536; + gridk_value.y *= 2.0*3.1416926536; + gridk_value.z *= 2.0*3.1416926536; Scalar k2 = gridk_value.x*gridk_value.x + gridk_value.y*gridk_value.y + gridk_value.z*gridk_value.z; - Scalar xisq = xi * xi; - - // Scaling factor used in wave space sum - if (i == 0 && j == 0 && k == 0){ - gridk_value.w = 0.0; - } - else{ - // Have to divide by Nx*Ny*Nz to normalize the FFTs - gridk_value.w = 6.0*3.1415926536 * (1.0 + k2/4.0/xisq) * expf( -(1-eta) * k2/4.0/xisq ) / ( k2 ) / Scalar( Nx*Ny*Nz ); - } + Scalar xisq = xi * xi; + + // Scaling factor used in wave space sum + if (i == 0 && j == 0 && k == 0){ + gridk_value.w = 0.0; + } + else{ + // Have to divide by Nx*Ny*Nz to normalize the FFTs + gridk_value.w = 6.0*3.1415926536 * (1.0 + k2/4.0/xisq) * expf( -(1-eta) * k2/4.0/xisq ) / ( k2 ) / Scalar( Nx*Ny*Nz ); + } gridk[tid] = gridk_value; diff --git a/PSEv1/Mobility.cu b/PSEv1/Mobility.cu index 2bcb3ff..0afad7f 100644 --- a/PSEv1/Mobility.cu +++ b/PSEv1/Mobility.cu @@ -104,156 +104,156 @@ scalar4_tex_t pos_tex; \param gridh space between grid nodes in each dimension \param xi Ewald splitting parameter \param eta Spectral splitting parameter - \param prefac Spreading function prefactor - \param expfac Spreading function exponential factor + \param prefac Spreading function prefactor + \param expfac Spreading function exponential factor One 3-D block of threads is launched per particle (block dimension = PxPxP). Max dimension is 10x10x10. If P > 10, each thread will do more than one grid point worth of work. */ -__global__ void gpu_stokes_Spread_kernel( - Scalar4 *d_pos, - Scalar4 *d_net_force, - CUFFTCOMPLEX *gridX, - CUFFTCOMPLEX *gridY, - CUFFTCOMPLEX *gridZ, - int group_size, - int Nx, - int Ny, - int Nz, - unsigned int *d_group_members, - BoxDim box, - const int P, - Scalar3 gridh, - Scalar xi, - Scalar eta, - Scalar prefac, - Scalar expfac - ){ - - // Shared memory for particle force and position, so that each block - // only has to read once - __shared__ Scalar3 shared[2]; // 16 kb max - - Scalar3 *force_shared = shared; - Scalar3 *pos_shared = &shared[1]; - - // Offset for the block (i.e. particle ID within group) - int group_idx = blockIdx.x; - - // Offset for the thread (i.e. 
grid point ID within particle's support) - int thread_offset = threadIdx.z + threadIdx.y * blockDim.z + threadIdx.x * blockDim.z*blockDim.y; - - // Global particle ID - unsigned int idx = d_group_members[group_idx]; - - // Initialize shared memory and get particle position - if ( thread_offset == 0 ){ - Scalar4 tpos = texFetchScalar4(d_pos, pos_tex, idx); - pos_shared[0].x = tpos.x; - pos_shared[0].y = tpos.y; - pos_shared[0].z = tpos.z; - - Scalar4 tforce = d_net_force[idx]; - force_shared[0].x = tforce.x; - force_shared[0].y = tforce.y; - force_shared[0].z = tforce.z; - } - __syncthreads(); - - // Box dimension - Scalar3 L = box.getL(); - Scalar3 Ld2 = L / 2.0; - - // Retrieve position from shared memory - Scalar3 pos = pos_shared[0]; - Scalar3 force = force_shared[0]; - - // Fractional position within box - Scalar3 pos_frac = box.makeFraction(pos); - - pos_frac.x *= (Scalar)Nx; - pos_frac.y *= (Scalar)Ny; - pos_frac.z *= (Scalar)Nz; - - // Grid index of floor of fractional position - int x = int( pos_frac.x ); - int y = int( pos_frac.y ); - int z = int( pos_frac.z ); - - // Amount of work needed for each thread to cover support - // (Required in case support size is larger than grid dimension, - // but in most cases, should have n.x = n.y = n.z = 1 ) - int3 n, t; +__global__ void gpu_stokes_Spread_kernel( + Scalar4 *d_pos, + Scalar4 *d_net_force, + CUFFTCOMPLEX *gridX, + CUFFTCOMPLEX *gridY, + CUFFTCOMPLEX *gridZ, + int group_size, + int Nx, + int Ny, + int Nz, + unsigned int *d_group_members, + BoxDim box, + const int P, + Scalar3 gridh, + Scalar xi, + Scalar eta, + Scalar prefac, + Scalar expfac + ){ + + // Shared memory for particle force and position, so that each block + // only has to read once + __shared__ Scalar3 shared[2]; // 16 kb max + + Scalar3 *force_shared = shared; + Scalar3 *pos_shared = &shared[1]; + + // Offset for the block (i.e. particle ID within group) + int group_idx = blockIdx.x; + + // Offset for the thread (i.e. 
grid point ID within particle's support) + int thread_offset = threadIdx.z + threadIdx.y * blockDim.z + threadIdx.x * blockDim.z*blockDim.y; + + // Global particle ID + unsigned int idx = d_group_members[group_idx]; + + // Initialize shared memory and get particle position + if ( thread_offset == 0 ){ + Scalar4 tpos = texFetchScalar4(d_pos, pos_tex, idx); + pos_shared[0].x = tpos.x; + pos_shared[0].y = tpos.y; + pos_shared[0].z = tpos.z; + + Scalar4 tforce = d_net_force[idx]; + force_shared[0].x = tforce.x; + force_shared[0].y = tforce.y; + force_shared[0].z = tforce.z; + } + __syncthreads(); + + // Box dimension + Scalar3 L = box.getL(); + Scalar3 Ld2 = L / 2.0; + + // Retrieve position from shared memory + Scalar3 pos = pos_shared[0]; + Scalar3 force = force_shared[0]; + + // Fractional position within box + Scalar3 pos_frac = box.makeFraction(pos); + + pos_frac.x *= (Scalar)Nx; + pos_frac.y *= (Scalar)Ny; + pos_frac.z *= (Scalar)Nz; + + // Grid index of floor of fractional position + int x = int( pos_frac.x ); + int y = int( pos_frac.y ); + int z = int( pos_frac.z ); + + // Amount of work needed for each thread to cover support + // (Required in case support size is larger than grid dimension, + // but in most cases, should have n.x = n.y = n.z = 1 ) + int3 n, t; n.x = ( P + blockDim.x - 1 ) / blockDim.x; // ceiling n.y = ( P + blockDim.y - 1 ) / blockDim.y; n.z = ( P + blockDim.z - 1 ) / blockDim.z; - // Grid point associated with current thread - int Pd2 = P/2; // integer division does floor - - for ( int ii = 0; ii < n.x; ++ii ){ - - t.x = threadIdx.x + ii*blockDim.x; - - for ( int jj = 0; jj < n.y; ++jj ){ - - t.y = threadIdx.y + jj*blockDim.y; - - for ( int kk = 0; kk < n.z; ++kk ){ - - t.z = threadIdx.z + kk*blockDim.z; - - if ( ( t.x < P ) && ( t.y < P ) && ( t.z < P ) ){ - - // x,y,z indices for current thread - // - // Arithmetic with P makes sure distribution is centered on the particle - int x_inp = x + t.x - Pd2 + 1 - (P % 2) * ( pos_frac.x - Scalar( x ) < 0.5 ); - int y_inp = y + t.y - Pd2 + 1 - (P % 2) * ( pos_frac.y - Scalar( y ) < 0.5 ); - int z_inp = z + t.z - Pd2 + 1 - (P % 2) * ( pos_frac.z - Scalar( z ) < 0.5 ); - - // Periodic wrapping of grid point - x_inp = (x_inp<0) ? x_inp+Nx : ( (x_inp>Nx-1) ? x_inp-Nx : x_inp ); - y_inp = (y_inp<0) ? y_inp+Ny : ( (y_inp>Ny-1) ? y_inp-Ny : y_inp ); - z_inp = (z_inp<0) ? z_inp+Nz : ( (z_inp>Nz-1) ? z_inp-Nz : z_inp ); - - // x,y,z coordinates for current thread - Scalar3 pos_grid; - pos_grid.x = gridh.x*x_inp - Ld2.x; - pos_grid.y = gridh.y*y_inp - Ld2.y; - pos_grid.z = gridh.z*z_inp - Ld2.z; - - // Shear the grid position - // !!! 
This only works for linear shear where the shear gradient is along y - // and the shear direction is along x - pos_grid.x = pos_grid.x + box.getTiltFactorXY() * pos_grid.y; - - // Global index for current grid point - int grid_idx = x_inp * Ny * Nz + y_inp * Nz + z_inp; - - // Distance from particle to grid node - Scalar3 r = pos_grid - pos; - r = box.minImage(r); - Scalar rsq = r.x*r.x + r.y*r.y + r.z*r.z; - - // Magnitude of the force contribution to the current grid node - Scalar3 force_inp = prefac * expf( -expfac * rsq ) * force; - - // Add force to the grid - atomicAdd( &(gridX[grid_idx].x), force_inp.x); - atomicAdd( &(gridY[grid_idx].x), force_inp.y); - atomicAdd( &(gridZ[grid_idx].x), force_inp.z); - }// check thread is within support - }// kk - }// jj - }// ii + // Grid point associated with current thread + int Pd2 = P/2; // integer division does floor + + for ( int ii = 0; ii < n.x; ++ii ){ + + t.x = threadIdx.x + ii*blockDim.x; + + for ( int jj = 0; jj < n.y; ++jj ){ + + t.y = threadIdx.y + jj*blockDim.y; + + for ( int kk = 0; kk < n.z; ++kk ){ + + t.z = threadIdx.z + kk*blockDim.z; + + if ( ( t.x < P ) && ( t.y < P ) && ( t.z < P ) ){ + + // x,y,z indices for current thread + // + // Arithmetic with P makes sure distribution is centered on the particle + int x_inp = x + t.x - Pd2 + 1 - (P % 2) * ( pos_frac.x - Scalar( x ) < 0.5 ); + int y_inp = y + t.y - Pd2 + 1 - (P % 2) * ( pos_frac.y - Scalar( y ) < 0.5 ); + int z_inp = z + t.z - Pd2 + 1 - (P % 2) * ( pos_frac.z - Scalar( z ) < 0.5 ); + + // Periodic wrapping of grid point + x_inp = (x_inp<0) ? x_inp+Nx : ( (x_inp>Nx-1) ? x_inp-Nx : x_inp ); + y_inp = (y_inp<0) ? y_inp+Ny : ( (y_inp>Ny-1) ? y_inp-Ny : y_inp ); + z_inp = (z_inp<0) ? z_inp+Nz : ( (z_inp>Nz-1) ? z_inp-Nz : z_inp ); + + // x,y,z coordinates for current thread + Scalar3 pos_grid; + pos_grid.x = gridh.x*x_inp - Ld2.x; + pos_grid.y = gridh.y*y_inp - Ld2.y; + pos_grid.z = gridh.z*z_inp - Ld2.z; + + // Shear the grid position + // !!! This only works for linear shear where the shear gradient is along y + // and the shear direction is along x + pos_grid.x = pos_grid.x + box.getTiltFactorXY() * pos_grid.y; + + // Global index for current grid point + int grid_idx = x_inp * Ny * Nz + y_inp * Nz + z_inp; + + // Distance from particle to grid node + Scalar3 r = pos_grid - pos; + r = box.minImage(r); + Scalar rsq = r.x*r.x + r.y*r.y + r.z*r.z; + + // Magnitude of the force contribution to the current grid node + Scalar3 force_inp = prefac * expf( -expfac * rsq ) * force; + + // Add force to the grid + atomicAdd( &(gridX[grid_idx].x), force_inp.x); + atomicAdd( &(gridY[grid_idx].x), force_inp.y); + atomicAdd( &(gridZ[grid_idx].x), force_inp.z); + }// check thread is within support + }// kk + }// jj + }// ii } //! Compute the velocity from the force moments on the grid (Same Size Particles) // -// This is the operator "B" from the paper +// This is the operator "B" from the paper // /*! 
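// Each thread of gpu_stokes_Spread_kernel above maps its offset (t.x,t.y,t.z)
// within the P^3 support to an absolute, periodically wrapped grid node. A
// host-side sketch of the same index arithmetic for one axis (the function
// name is illustrative; the formula is the one used in the kernel):
static int support_node_index(int base,    // floor of the scaled fractional position
                              int t,       // offset within the support, 0..P-1
                              int P,       // support size (gauss_P)
                              float frac,  // scaled fractional position along this axis
                              int N)       // number of grid points along this axis
{
    // Center the P-node support on the particle; for odd P the extra shift
    // depends on which half of the grid cell the particle sits in.
    int idx = base + t - P/2 + 1 - (P % 2) * ((frac - (float)base) < 0.5f ? 1 : 0);
    // Periodic wrap into [0, N)
    if (idx < 0)        idx += N;
    else if (idx > N-1) idx -= N;
    return idx;
}
// Example: P = 4, base = 10, frac = 10.3 yields nodes 9, 10, 11 and 12, i.e.
// two nodes on either side of the particle.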
\param gridX x-component of force moments projected onto grid \param gridY y-component of force moments projected onto grid @@ -262,40 +262,49 @@ __global__ void gpu_stokes_Spread_kernel( \param NxNyNz total number of grid nodes */ __global__ void gpu_stokes_Green_kernel( - CUFFTCOMPLEX *gridX, - CUFFTCOMPLEX *gridY, - CUFFTCOMPLEX *gridZ, - Scalar4 *gridk, - unsigned int NxNyNz - ) { - - int tid = blockDim.x * blockIdx.x + threadIdx.x; - - if ( tid < NxNyNz ) { - - // Read the FFT force from global memory - Scalar2 fX = gridX[tid]; - Scalar2 fY = gridY[tid]; - Scalar2 fZ = gridZ[tid]; - - // Current wave-space vector - Scalar4 tk = gridk[tid]; - Scalar ksq = tk.x*tk.x + tk.y*tk.y + tk.z*tk.z; - Scalar k = sqrtf( ksq ); - - // Dot product of the wave-vector with the force - Scalar2 kdF = (tid==0) ? make_scalar2(0.0,0.0) : make_scalar2( ( tk.x*fX.x + tk.y*fY.x + tk.z*fZ.x ) / ksq, ( tk.x*fX.y + tk.y*fY.y + tk.z*fZ.y ) / ksq ); - - // Scaling factor - Scalar B = (tid==0) ? 0.0 : tk.w * ( sinf( k ) / k ) * ( sinf( k ) / k ); - - // Write the velocity to global memory - gridX[tid] = make_scalar2( ( fX.x - tk.x * kdF.x ) * B, ( fX.y - tk.x * kdF.y ) * B ); - gridY[tid] = make_scalar2( ( fY.x - tk.y * kdF.x ) * B, ( fY.y - tk.y * kdF.y ) * B ); - gridZ[tid] = make_scalar2( ( fZ.x - tk.z * kdF.x ) * B, ( fZ.y - tk.z * kdF.y ) * B ); - - - } + CUFFTCOMPLEX *gridX, + CUFFTCOMPLEX *gridY, + CUFFTCOMPLEX *gridZ, + Scalar4 *gridk, + unsigned int NxNyNz + ) { + + int tid = blockDim.x * blockIdx.x + threadIdx.x; + + if ( tid < NxNyNz ) { + + // Read the FFT force from global memory + Scalar2 fX, fY, fZ; + fX.x = gridX[tid].x; + fX.y = gridX[tid].y; + fY.x = gridY[tid].x; + fY.y = gridY[tid].y; + fZ.x = gridZ[tid].x; + fZ.y = gridZ[tid].y; + + // Current wave-space vector + Scalar4 tk = gridk[tid]; + Scalar ksq = tk.x*tk.x + tk.y*tk.y + tk.z*tk.z; + Scalar k = sqrtf( ksq ); + + // Dot product of the wave-vector with the force + Scalar2 kdF = (tid==0) ? make_scalar2(0.0,0.0) : make_scalar2( ( tk.x*fX.x + tk.y*fY.x + tk.z*fZ.x ) / ksq, ( tk.x*fX.y + tk.y*fY.y + tk.z*fZ.y ) / ksq ); + + // Scaling factor + Scalar B = (tid==0) ? 0.0 : tk.w * ( sinf( k ) / k ) * ( sinf( k ) / k ); + + // Write the velocity to global memory + Scalar2 gX, gY, gZ; + gX = make_scalar2( ( fX.x - tk.x * kdF.x ) * B, ( fX.y - tk.x * kdF.y ) * B ); + gridX[tid].x = gX.x; + gridX[tid].y = gX.y; + gY = make_scalar2( ( fY.x - tk.y * kdF.x ) * B, ( fY.y - tk.y * kdF.y ) * B ); + gridY[tid].x = gY.x; + gridY[tid].y = gY.y; + gZ = make_scalar2( ( fZ.x - tk.z * kdF.x ) * B, ( fZ.y - tk.z * kdF.y ) * B ); + gridZ[tid].x = gZ.x; + gridZ[tid].y = gZ.y; + } } //! Add velocity from grid to particles ( Same Size Particles, Block Per Particle (support) ) @@ -315,170 +324,170 @@ __global__ void gpu_stokes_Green_kernel( \param box array containing box dimensions \param P number of grid nodes in support of spreading Gaussians \param gridh space between grid nodes in each dimension - \param prefac Spreading function prefactor - \param expfac Spreading function exponential factor + \param prefac Spreading function prefactor + \param expfac Spreading function exponential factor One 3-D block of threads is launched per particle (block dimension = PxPxP). Max dimension is 10x10x10 because of shared memory limitations. If P > 10, each thread will do more than one grid point worth of work. 
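// gpu_stokes_Green_kernel above applies, mode by mode,
//     u_hat(k) = B(k) * ( f_hat(k) - k_hat * ( k_hat . f_hat(k) ) ),
// i.e. it removes the compressible part of the spread force and scales by
//     B(k) = gridk.w * ( sin(k)/k )^2,   k = |k|,
// with the k = 0 mode set to zero. gridk.w carries the remaining wave-space
// factor, including the 1/(Nx*Ny*Nz) FFT normalization, and is filled in when
// the grid vectors are built. A scalar sketch for the real part of one mode
// (illustrative; the function name is not from the plugin):
#include <cmath>
static void scale_mode_real(float kx, float ky, float kz, float w,
                            float& fx, float& fy, float& fz)
{
    float ksq = kx*kx + ky*ky + kz*kz;
    if (ksq == 0.0f) { fx = fy = fz = 0.0f; return; }    // k = 0 term excluded
    float k   = std::sqrt(ksq);
    float B   = w * (std::sin(k)/k) * (std::sin(k)/k);
    float kdF = (kx*fx + ky*fy + kz*fz) / ksq;            // (k . f) / |k|^2
    fx = (fx - kx*kdF) * B;
    fy = (fy - ky*kdF) * B;
    fz = (fz - kz*kdF) * B;
}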
*/ -__global__ void gpu_stokes_Contract_kernel( - Scalar4 *d_pos, - Scalar4 *d_vel, - CUFFTCOMPLEX *gridX, - CUFFTCOMPLEX *gridY, - CUFFTCOMPLEX *gridZ, - int group_size, - int Nx, - int Ny, - int Nz, - Scalar xi, - Scalar eta, - unsigned int *d_group_members, - BoxDim box, - const int P, - Scalar3 gridh, - Scalar prefac, - Scalar expfac - ){ - - // Shared memory for particle velocity and position, so that each block - // only has to read one - extern __shared__ Scalar3 shared[]; - - Scalar3 *velocity = shared; - Scalar3 *pos_shared = &shared[blockDim.x*blockDim.y*blockDim.z]; - - // Particle index within each group (block per particle) - int group_idx = blockIdx.x; - - // Thread index within the block (grid point index) - int thread_offset = threadIdx.z + threadIdx.y * blockDim.z + threadIdx.x * blockDim.z*blockDim.y; - - // Total number of threads within the block - int block_size = blockDim.x * blockDim.y * blockDim.z; - - // Global particle ID - unsigned int idx = d_group_members[group_idx]; - - // Initialize shared memory and get particle position - velocity[thread_offset] = make_scalar3(0.0,0.0,0.0); - if ( thread_offset == 0 ){ - Scalar4 tpos = texFetchScalar4(d_pos, pos_tex, idx); - pos_shared[0] = make_scalar3( tpos.x, tpos.y, tpos.z ); - } - __syncthreads(); - - // Box dimension - Scalar3 L = box.getL(); - Scalar3 Ld2 = L / 2.0; - - // Retrieve position from shared memory - Scalar3 pos = pos_shared[0]; - - // Fractional position within box - Scalar3 pos_frac = box.makeFraction(pos); - - pos_frac.x *= (Scalar)Nx; - pos_frac.y *= (Scalar)Ny; - pos_frac.z *= (Scalar)Nz; - - int x = int( pos_frac.x ); - int y = int( pos_frac.y ); - int z = int( pos_frac.z ); - - // Amount of work needed for each thread to cover support - // (Required in case support size is larger than grid dimension, - // but in most cases, should have n.x = n.y = n.z = 1 ) - int3 n, t; +__global__ void gpu_stokes_Contract_kernel( + Scalar4 *d_pos, + Scalar4 *d_vel, + CUFFTCOMPLEX *gridX, + CUFFTCOMPLEX *gridY, + CUFFTCOMPLEX *gridZ, + int group_size, + int Nx, + int Ny, + int Nz, + Scalar xi, + Scalar eta, + unsigned int *d_group_members, + BoxDim box, + const int P, + Scalar3 gridh, + Scalar prefac, + Scalar expfac + ){ + + // Shared memory for particle velocity and position, so that each block + // only has to read one + extern __shared__ Scalar3 shared[]; + + Scalar3 *velocity = shared; + Scalar3 *pos_shared = &shared[blockDim.x*blockDim.y*blockDim.z]; + + // Particle index within each group (block per particle) + int group_idx = blockIdx.x; + + // Thread index within the block (grid point index) + int thread_offset = threadIdx.z + threadIdx.y * blockDim.z + threadIdx.x * blockDim.z*blockDim.y; + + // Total number of threads within the block + int block_size = blockDim.x * blockDim.y * blockDim.z; + + // Global particle ID + unsigned int idx = d_group_members[group_idx]; + + // Initialize shared memory and get particle position + velocity[thread_offset] = make_scalar3(0.0,0.0,0.0); + if ( thread_offset == 0 ){ + Scalar4 tpos = texFetchScalar4(d_pos, pos_tex, idx); + pos_shared[0] = make_scalar3( tpos.x, tpos.y, tpos.z ); + } + __syncthreads(); + + // Box dimension + Scalar3 L = box.getL(); + Scalar3 Ld2 = L / 2.0; + + // Retrieve position from shared memory + Scalar3 pos = pos_shared[0]; + + // Fractional position within box + Scalar3 pos_frac = box.makeFraction(pos); + + pos_frac.x *= (Scalar)Nx; + pos_frac.y *= (Scalar)Ny; + pos_frac.z *= (Scalar)Nz; + + int x = int( pos_frac.x ); + int y = int( pos_frac.y ); 
+ int z = int( pos_frac.z ); + + // Amount of work needed for each thread to cover support + // (Required in case support size is larger than grid dimension, + // but in most cases, should have n.x = n.y = n.z = 1 ) + int3 n, t; n.x = ( P + blockDim.x - 1 ) / blockDim.x; // ceiling n.y = ( P + blockDim.y - 1 ) / blockDim.y; n.z = ( P + blockDim.z - 1 ) / blockDim.z; - // Grid point associated with current thread - int Pd2 = P / 2; // integer division does floor - - for ( int ii = 0; ii < n.x; ++ii ){ - - t.x = threadIdx.x + ii*blockDim.x; - - for ( int jj = 0; jj < n.y; ++jj ){ - - t.y = threadIdx.y + jj*blockDim.y; - - for ( int kk = 0; kk < n.z; ++kk ){ - - t.z = threadIdx.z + kk*blockDim.z; - - if( ( t.x < P ) && ( t.y < P ) && ( t.z < P ) ){ - - // x,y,z indices for current thread - // - // Arithmetic with P makes sure distribution is centered on the particle - int x_inp = x + t.x - Pd2 + 1 - (P % 2) * ( pos_frac.x - Scalar( x ) < 0.5 ); - int y_inp = y + t.y - Pd2 + 1 - (P % 2) * ( pos_frac.y - Scalar( y ) < 0.5 ); - int z_inp = z + t.z - Pd2 + 1 - (P % 2) * ( pos_frac.z - Scalar( z ) < 0.5 ); - - // Periodic wrapping of grid point - x_inp = (x_inp<0) ? x_inp+Nx : ( (x_inp>Nx-1) ? x_inp-Nx : x_inp ); - y_inp = (y_inp<0) ? y_inp+Ny : ( (y_inp>Ny-1) ? y_inp-Ny : y_inp ); - z_inp = (z_inp<0) ? z_inp+Nz : ( (z_inp>Nz-1) ? z_inp-Nz : z_inp ); - - // x,y,z coordinates for current thread - Scalar3 pos_grid; - pos_grid.x = gridh.x*x_inp - Ld2.x; - pos_grid.y = gridh.y*y_inp - Ld2.y; - pos_grid.z = gridh.z*z_inp - Ld2.z; - - // Shear the grid position - // !!! This only works for linear shear where the shear gradient is along y - // and the shear direction is along x - pos_grid.x = pos_grid.x + box.getTiltFactorXY() * pos_grid.y; - - // Global index for current grid point - int grid_idx = x_inp * Ny * Nz + y_inp * Nz + z_inp; - - // Distance from particle to grid node - Scalar3 r = pos_grid - pos; - r = box.minImage(r); - Scalar rsq = r.x*r.x + r.y*r.y + r.z*r.z; - - // Spreading Factor - Scalar Cfac = prefac * expf( -expfac * rsq ); - - // Get velocity from reduction (THIS IS THE SLOW STEP): - velocity[thread_offset] += Cfac * make_scalar3( gridX[grid_idx].x, gridY[grid_idx].x, gridZ[grid_idx].x ); - } - }//kk - }//jj - }//ii - - // Intra-block reduction for the total particle velocity - // (add contributions from all grid points) - int offs = block_size; - int offs_prev; - while (offs > 1) - { - offs_prev = offs; - offs = ( offs + 1 ) / 2; - __syncthreads(); - if (thread_offset + offs < offs_prev) - { - velocity[thread_offset] += velocity[thread_offset + offs]; - } - - } - - // Write out to global memory - if (thread_offset == 0){ - d_vel[idx] = make_scalar4(velocity[0].x, velocity[0].y, velocity[0].z, d_vel[idx].w); - } - + // Grid point associated with current thread + int Pd2 = P / 2; // integer division does floor + + for ( int ii = 0; ii < n.x; ++ii ){ + + t.x = threadIdx.x + ii*blockDim.x; + + for ( int jj = 0; jj < n.y; ++jj ){ + + t.y = threadIdx.y + jj*blockDim.y; + + for ( int kk = 0; kk < n.z; ++kk ){ + + t.z = threadIdx.z + kk*blockDim.z; + + if( ( t.x < P ) && ( t.y < P ) && ( t.z < P ) ){ + + // x,y,z indices for current thread + // + // Arithmetic with P makes sure distribution is centered on the particle + int x_inp = x + t.x - Pd2 + 1 - (P % 2) * ( pos_frac.x - Scalar( x ) < 0.5 ); + int y_inp = y + t.y - Pd2 + 1 - (P % 2) * ( pos_frac.y - Scalar( y ) < 0.5 ); + int z_inp = z + t.z - Pd2 + 1 - (P % 2) * ( pos_frac.z - Scalar( z ) < 0.5 ); + + // Periodic wrapping of grid 
point + x_inp = (x_inp<0) ? x_inp+Nx : ( (x_inp>Nx-1) ? x_inp-Nx : x_inp ); + y_inp = (y_inp<0) ? y_inp+Ny : ( (y_inp>Ny-1) ? y_inp-Ny : y_inp ); + z_inp = (z_inp<0) ? z_inp+Nz : ( (z_inp>Nz-1) ? z_inp-Nz : z_inp ); + + // x,y,z coordinates for current thread + Scalar3 pos_grid; + pos_grid.x = gridh.x*x_inp - Ld2.x; + pos_grid.y = gridh.y*y_inp - Ld2.y; + pos_grid.z = gridh.z*z_inp - Ld2.z; + + // Shear the grid position + // !!! This only works for linear shear where the shear gradient is along y + // and the shear direction is along x + pos_grid.x = pos_grid.x + box.getTiltFactorXY() * pos_grid.y; + + // Global index for current grid point + int grid_idx = x_inp * Ny * Nz + y_inp * Nz + z_inp; + + // Distance from particle to grid node + Scalar3 r = pos_grid - pos; + r = box.minImage(r); + Scalar rsq = r.x*r.x + r.y*r.y + r.z*r.z; + + // Spreading Factor + Scalar Cfac = prefac * expf( -expfac * rsq ); + + // Get velocity from reduction (THIS IS THE SLOW STEP): + velocity[thread_offset] += Cfac * make_scalar3( gridX[grid_idx].x, gridY[grid_idx].x, gridZ[grid_idx].x ); + } + }//kk + }//jj + }//ii + + // Intra-block reduction for the total particle velocity + // (add contributions from all grid points) + int offs = block_size; + int offs_prev; + while (offs > 1) + { + offs_prev = offs; + offs = ( offs + 1 ) / 2; + __syncthreads(); + if (thread_offset + offs < offs_prev) + { + velocity[thread_offset] += velocity[thread_offset + offs]; + } + + } + + // Write out to global memory + if (thread_offset == 0){ + d_vel[idx] = make_scalar4(velocity[0].x, velocity[0].y, velocity[0].z, d_vel[idx].w); + } + } /*! - Wrapper to drive all the kernel functions used to compute - the wave space part of Mobility ( Same Size Particles ) + Wrapper to drive all the kernel functions used to compute + the wave space part of Mobility ( Same Size Particles ) */ /*! \param d_pos positions of the particles, actually they are fetched on texture memory @@ -513,64 +522,64 @@ __global__ void gpu_stokes_Contract_kernel( \param gridh distance between grid nodes */ void gpu_stokes_Mwave_wrap( - Scalar4 *d_pos, - Scalar4 *d_vel, - Scalar4 *d_net_force, - unsigned int *d_group_members, - unsigned int group_size, - const BoxDim& box, - Scalar xi, - Scalar eta, - Scalar4 *d_gridk, - CUFFTCOMPLEX *d_gridX, - CUFFTCOMPLEX *d_gridY, - CUFFTCOMPLEX *d_gridZ, - cufftHandle plan, - const int Nx, - const int Ny, - const int Nz, - unsigned int NxNyNz, - dim3 grid, - dim3 threads, - int gridBlockSize, - int gridNBlock, - const int P, - Scalar3 gridh - ){ + Scalar4 *d_pos, + Scalar4 *d_vel, + Scalar4 *d_net_force, + unsigned int *d_group_members, + unsigned int group_size, + const BoxDim& box, + Scalar xi, + Scalar eta, + Scalar4 *d_gridk, + CUFFTCOMPLEX *d_gridX, + CUFFTCOMPLEX *d_gridY, + CUFFTCOMPLEX *d_gridZ, + cufftHandle plan, + const int Nx, + const int Ny, + const int Nz, + unsigned int NxNyNz, + dim3 grid, + dim3 threads, + int gridBlockSize, + int gridNBlock, + const int P, + Scalar3 gridh + ){ - // Spreading and contraction stuff - dim3 Cgrid( group_size, 1, 1); - int B = ( P < 10 ) ? 
P : 10; - dim3 Cthreads(B, B, B); - - Scalar quadW = gridh.x * gridh.y * gridh.z; - Scalar xisq = xi * xi; - Scalar prefac = ( 2.0 * xisq / 3.1415926536 / eta ) * sqrtf( 2.0 * xisq / 3.1415926536 / eta ); - Scalar expfac = 2.0 * xisq / eta; - - // Reset the grid ( remove any previously distributed forces ) - gpu_stokes_ZeroGrid_kernel<<>>(d_gridX,NxNyNz); - gpu_stokes_ZeroGrid_kernel<<>>(d_gridY,NxNyNz); - gpu_stokes_ZeroGrid_kernel<<>>(d_gridZ,NxNyNz); - - // Spread forces onto grid - gpu_stokes_Spread_kernel<<>>( d_pos, d_net_force, d_gridX, d_gridY, d_gridZ, group_size, Nx, Ny, Nz, d_group_members, box, P, gridh, xi, eta, prefac, expfac ); - - // Perform FFT on gridded forces - cufftExecC2C(plan, d_gridX, d_gridX, CUFFT_FORWARD); - cufftExecC2C(plan, d_gridY, d_gridY, CUFFT_FORWARD); - cufftExecC2C(plan, d_gridZ, d_gridZ, CUFFT_FORWARD); - - // Apply wave space scaling to FFT'd forces - gpu_stokes_Green_kernel<<>>( d_gridX, d_gridY, d_gridZ, d_gridk, NxNyNz); - - // Return rescaled forces to real space - cufftExecC2C(plan, d_gridX, d_gridX, CUFFT_INVERSE); - cufftExecC2C(plan, d_gridY, d_gridY, CUFFT_INVERSE); - cufftExecC2C(plan, d_gridZ, d_gridZ, CUFFT_INVERSE); - - // Evaluate contribution of grid velocities at particle centers - gpu_stokes_Contract_kernel<<>>( d_pos, d_vel, d_gridX, d_gridY, d_gridZ, group_size, Nx, Ny, Nz, xi, eta, d_group_members, box, P, gridh, quadW*prefac, expfac ); + // Spreading and contraction stuff + dim3 Cgrid( group_size, 1, 1); + int B = ( P < 10 ) ? P : 10; + dim3 Cthreads(B, B, B); + + Scalar quadW = gridh.x * gridh.y * gridh.z; + Scalar xisq = xi * xi; + Scalar prefac = ( 2.0 * xisq / 3.1415926536 / eta ) * sqrtf( 2.0 * xisq / 3.1415926536 / eta ); + Scalar expfac = 2.0 * xisq / eta; + + // Reset the grid ( remove any previously distributed forces ) + gpu_stokes_ZeroGrid_kernel<<>>(d_gridX,NxNyNz); + gpu_stokes_ZeroGrid_kernel<<>>(d_gridY,NxNyNz); + gpu_stokes_ZeroGrid_kernel<<>>(d_gridZ,NxNyNz); + + // Spread forces onto grid + gpu_stokes_Spread_kernel<<>>( d_pos, d_net_force, d_gridX, d_gridY, d_gridZ, group_size, Nx, Ny, Nz, d_group_members, box, P, gridh, xi, eta, prefac, expfac ); + + // Perform FFT on gridded forces + cufftExecC2C(plan, d_gridX, d_gridX, CUFFT_FORWARD); + cufftExecC2C(plan, d_gridY, d_gridY, CUFFT_FORWARD); + cufftExecC2C(plan, d_gridZ, d_gridZ, CUFFT_FORWARD); + + // Apply wave space scaling to FFT'd forces + gpu_stokes_Green_kernel<<>>( d_gridX, d_gridY, d_gridZ, d_gridk, NxNyNz); + + // Return rescaled forces to real space + cufftExecC2C(plan, d_gridX, d_gridX, CUFFT_INVERSE); + cufftExecC2C(plan, d_gridY, d_gridY, CUFFT_INVERSE); + cufftExecC2C(plan, d_gridZ, d_gridZ, CUFFT_INVERSE); + + // Evaluate contribution of grid velocities at particle centers + gpu_stokes_Contract_kernel<<>>( d_pos, d_vel, d_gridX, d_gridY, d_gridZ, group_size, Nx, Ny, Nz, xi, eta, d_group_members, box, P, gridh, quadW*prefac, expfac ); } @@ -591,108 +600,108 @@ void gpu_stokes_Mwave_wrap( \param d_nlist list containing neighbors of all particles \param d_headlist list of particle offsets into d_nlist */ -__global__ void gpu_stokes_Mreal_kernel( - Scalar4 *d_pos, - Scalar4 *d_vel, - Scalar4 *d_net_force, - int group_size, - Scalar xi, - Scalar4 *d_ewaldC1, - Scalar self, - Scalar ewald_cut, - int ewald_n, - Scalar ewald_dr, - unsigned int *d_group_members, - BoxDim box, - const unsigned int *d_n_neigh, - const unsigned int *d_nlist, - const unsigned int *d_headlist - ){ +__global__ void gpu_stokes_Mreal_kernel( + Scalar4 *d_pos, + Scalar4 *d_vel, + 
Scalar4 *d_net_force, + int group_size, + Scalar xi, + Scalar4 *d_ewaldC1, + Scalar self, + Scalar ewald_cut, + int ewald_n, + Scalar ewald_dr, + unsigned int *d_group_members, + BoxDim box, + const unsigned int *d_n_neigh, + const unsigned int *d_nlist, + const unsigned int *d_headlist + ){ - // Index for current thread - int group_idx = blockDim.x * blockIdx.x + threadIdx.x; - - // Initialize contribution to velocity - Scalar4 u = make_scalar4( 0.0, 0.0, 0.0, 0.0 ); - - if (group_idx < group_size) { - - // Particle for this thread - unsigned int idx = d_group_members[group_idx]; - - // Number of neighbors for current particle - unsigned int n_neigh = d_n_neigh[idx]; - unsigned int head_idx = d_headlist[idx]; - - // Particle position and table ID - Scalar4 posi = texFetchScalar4(d_pos, pos_tex, idx); - - // Self contribution - Scalar4 F = d_net_force[idx]; - u = make_scalar4( self * F.x, self * F.y, self * F.z, 0.0 ); - - // Minimum and maximum distance for pair calculation - Scalar mindistSq = ewald_dr * ewald_dr; - Scalar maxdistSq = ewald_cut * ewald_cut; - - for (int neigh_idx = 0; neigh_idx < n_neigh; neigh_idx++) { - - // Get index for current neightbor - unsigned int cur_j = d_nlist[ head_idx + neigh_idx ]; - - // Position and size of neighbor particle - Scalar4 posj = texFetchScalar4(d_pos, pos_tex, cur_j); - - // Distance vector between current particle and neighbor - Scalar3 r = make_scalar3( posi.x - posj.x, posi.y - posj.y, posi.z - posj.z ); - r = box.minImage(r); - Scalar distSqr = dot(r,r); - - // Add neighbor contribution if it is within the real space cutoff radius - if ( ( distSqr < maxdistSq ) && ( distSqr >= mindistSq ) ) { - - // Need distance - Scalar dist = sqrtf( distSqr ); - - // Force on neighbor particle - Scalar4 Fj = d_net_force[cur_j]; - - // Fetch relevant elements from textured table for real space interaction - int r_ind = __scalar2int_rd( ewald_n * ( dist - ewald_dr ) / ( ewald_cut - ewald_dr ) ); - int offset = r_ind; - - Scalar4 tewaldC1 = texFetchScalar4(d_ewaldC1, tables1_tex, offset); - - // Linear interpolation of table - Scalar fac = dist / ewald_dr - r_ind - Scalar(1.0); - - Scalar Imrr = tewaldC1.x + ( tewaldC1.z - tewaldC1.x ) * fac; - Scalar rr = tewaldC1.y + ( tewaldC1.w - tewaldC1.y ) * fac; - - // Update velocity - Scalar rdotf = ( r.x*Fj.x + r.y*Fj.y + r.z*Fj.z ) / distSqr; - - u.x += Imrr * Fj.x + ( rr - Imrr ) * rdotf * r.x; - u.y += Imrr * Fj.y + ( rr - Imrr ) * rdotf * r.y; - u.z += Imrr * Fj.z + ( rr - Imrr ) * rdotf * r.z; - - } - - } - - // Write to output - d_vel[idx] = u; - - } + // Index for current thread + int group_idx = blockDim.x * blockIdx.x + threadIdx.x; + + // Initialize contribution to velocity + Scalar4 u = make_scalar4( 0.0, 0.0, 0.0, 0.0 ); + + if (group_idx < group_size) { + + // Particle for this thread + unsigned int idx = d_group_members[group_idx]; + + // Number of neighbors for current particle + unsigned int n_neigh = d_n_neigh[idx]; + unsigned int head_idx = d_headlist[idx]; + + // Particle position and table ID + Scalar4 posi = texFetchScalar4(d_pos, pos_tex, idx); + + // Self contribution + Scalar4 F = d_net_force[idx]; + u = make_scalar4( self * F.x, self * F.y, self * F.z, 0.0 ); + + // Minimum and maximum distance for pair calculation + Scalar mindistSq = ewald_dr * ewald_dr; + Scalar maxdistSq = ewald_cut * ewald_cut; + + for (int neigh_idx = 0; neigh_idx < n_neigh; neigh_idx++) { + + // Get index for current neightbor + unsigned int cur_j = d_nlist[ head_idx + neigh_idx ]; + + // Position and size of 
neighbor particle + Scalar4 posj = texFetchScalar4(d_pos, pos_tex, cur_j); + + // Distance vector between current particle and neighbor + Scalar3 r = make_scalar3( posi.x - posj.x, posi.y - posj.y, posi.z - posj.z ); + r = box.minImage(r); + Scalar distSqr = dot(r,r); + + // Add neighbor contribution if it is within the real space cutoff radius + if ( ( distSqr < maxdistSq ) && ( distSqr >= mindistSq ) ) { + + // Need distance + Scalar dist = sqrtf( distSqr ); + + // Force on neighbor particle + Scalar4 Fj = d_net_force[cur_j]; + + // Fetch relevant elements from textured table for real space interaction + int r_ind = __scalar2int_rd( ewald_n * ( dist - ewald_dr ) / ( ewald_cut - ewald_dr ) ); + int offset = r_ind; + + Scalar4 tewaldC1 = texFetchScalar4(d_ewaldC1, tables1_tex, offset); + + // Linear interpolation of table + Scalar fac = dist / ewald_dr - r_ind - Scalar(1.0); + + Scalar Imrr = tewaldC1.x + ( tewaldC1.z - tewaldC1.x ) * fac; + Scalar rr = tewaldC1.y + ( tewaldC1.w - tewaldC1.y ) * fac; + + // Update velocity + Scalar rdotf = ( r.x*Fj.x + r.y*Fj.y + r.z*Fj.z ) / distSqr; + + u.x += Imrr * Fj.x + ( rr - Imrr ) * rdotf * r.x; + u.y += Imrr * Fj.y + ( rr - Imrr ) * rdotf * r.y; + u.z += Imrr * Fj.z + ( rr - Imrr ) * rdotf * r.z; + + } + + } + + // Write to output + d_vel[idx] = u; + + } } /*! - Wrap all the functions to compute U = M * F ( SAME SIZE PARTICLES ) - Drive GPU kernel functions + Wrap all the functions to compute U = M * F ( SAME SIZE PARTICLES ) + Drive GPU kernel functions - d_vel = M * d_net_force + d_vel = M * d_net_force */ /*! \param d_pos positions of the particles, actually they are fetched on texture memory @@ -727,57 +736,57 @@ __global__ void gpu_stokes_Mreal_kernel( \param gridh distance between grid nodes */ void gpu_stokes_Mobility_wrap( - Scalar4 *d_pos, - Scalar4 *d_vel, - Scalar4 *d_net_force, - unsigned int *d_group_members, - unsigned int group_size, - const BoxDim& box, - Scalar xi, - Scalar eta, - Scalar ewald_cut, - Scalar ewald_dr, - int ewald_n, - Scalar4 *d_ewaldC1, - Scalar self, - Scalar4 *d_gridk, - CUFFTCOMPLEX *d_gridX, - CUFFTCOMPLEX *d_gridY, - CUFFTCOMPLEX *d_gridZ, - cufftHandle plan, - const int Nx, - const int Ny, - const int Nz, - const unsigned int *d_n_neigh, - const unsigned int *d_nlist, - const unsigned int *d_headlist, - unsigned int NxNyNz, - dim3 grid, - dim3 threads, - int gridBlockSize, - int gridNBlock, - const int P, - Scalar3 gridh ){ - - // Real and wave space velocity - Scalar4 *d_vel1, *d_vel2; - cudaMalloc( &d_vel1, group_size*sizeof(Scalar4) ); - cudaMalloc( &d_vel2, group_size*sizeof(Scalar4) ); - - // Add the wave space contribution to the velocity - gpu_stokes_Mwave_wrap( d_pos, d_vel1, d_net_force, d_group_members, group_size, box, xi, eta, d_gridk, d_gridX, d_gridY, d_gridZ, plan, Nx, Ny, Nz, NxNyNz, grid, threads, gridBlockSize, gridNBlock, P, gridh ); - - // Add the real space contribution to the velocity - // - // Real space calculation takes care of self contributions - gpu_stokes_Mreal_kernel<<>>(d_pos, d_vel2, d_net_force, group_size, xi, d_ewaldC1, self, ewald_cut, ewald_n, ewald_dr, d_group_members, box, d_n_neigh, d_nlist, d_headlist ); - - // Add real and wave space parts together - gpu_stokes_LinearCombination_kernel<<>>(d_vel1, d_vel2, d_vel, 1.0, 1.0, group_size, d_group_members); - - // Free memory - cudaFree(d_vel1); - cudaFree(d_vel2); + Scalar4 *d_pos, + Scalar4 *d_vel, + Scalar4 *d_net_force, + unsigned int *d_group_members, + unsigned int group_size, + const BoxDim& box, + Scalar xi, + 
Scalar eta, + Scalar ewald_cut, + Scalar ewald_dr, + int ewald_n, + Scalar4 *d_ewaldC1, + Scalar self, + Scalar4 *d_gridk, + CUFFTCOMPLEX *d_gridX, + CUFFTCOMPLEX *d_gridY, + CUFFTCOMPLEX *d_gridZ, + cufftHandle plan, + const int Nx, + const int Ny, + const int Nz, + const unsigned int *d_n_neigh, + const unsigned int *d_nlist, + const unsigned int *d_headlist, + unsigned int NxNyNz, + dim3 grid, + dim3 threads, + int gridBlockSize, + int gridNBlock, + const int P, + Scalar3 gridh ){ + + // Real and wave space velocity + Scalar4 *d_vel1, *d_vel2; + cudaMalloc( &d_vel1, group_size*sizeof(Scalar4) ); + cudaMalloc( &d_vel2, group_size*sizeof(Scalar4) ); + + // Add the wave space contribution to the velocity + gpu_stokes_Mwave_wrap( d_pos, d_vel1, d_net_force, d_group_members, group_size, box, xi, eta, d_gridk, d_gridX, d_gridY, d_gridZ, plan, Nx, Ny, Nz, NxNyNz, grid, threads, gridBlockSize, gridNBlock, P, gridh ); + + // Add the real space contribution to the velocity + // + // Real space calculation takes care of self contributions + gpu_stokes_Mreal_kernel<<>>(d_pos, d_vel2, d_net_force, group_size, xi, d_ewaldC1, self, ewald_cut, ewald_n, ewald_dr, d_group_members, box, d_n_neigh, d_nlist, d_headlist ); + + // Add real and wave space parts together + gpu_stokes_LinearCombination_kernel<<>>(d_vel1, d_vel2, d_vel, 1.0, 1.0, group_size, d_group_members); + + // Free memory + cudaFree(d_vel1); + cudaFree(d_vel2); } diff --git a/PSEv1/Mobility.cuh b/PSEv1/Mobility.cuh index a5a7963..b62f8e6 100644 --- a/PSEv1/Mobility.cuh +++ b/PSEv1/Mobility.cuh @@ -71,90 +71,90 @@ ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. void gpu_stokes_Mobility_wrap( Scalar4 *d_pos, - Scalar4 *d_vel, - Scalar4 *d_net_force, - unsigned int *d_group_members, - unsigned int group_size, - const BoxDim& box, - Scalar xi, - Scalar eta, - Scalar ewald_cut, - Scalar ewald_dr, - int ewald_n, - Scalar4 *d_ewaldC1, - Scalar self, - Scalar4 *d_gridk, - CUFFTCOMPLEX *d_gridX, - CUFFTCOMPLEX *d_gridY, - CUFFTCOMPLEX *d_gridZ, - cufftHandle plan, - const int Nx, - const int Ny, - const int Nz, - const unsigned int *d_n_neigh, - const unsigned int *d_nlist, - const unsigned int *d_headlist, - unsigned int NxNyNz, - dim3 grid, - dim3 threads, - int gridBlockSize, - int gridNBlock, - const int P, - Scalar3 gridh ); + Scalar4 *d_vel, + Scalar4 *d_net_force, + unsigned int *d_group_members, + unsigned int group_size, + const BoxDim& box, + Scalar xi, + Scalar eta, + Scalar ewald_cut, + Scalar ewald_dr, + int ewald_n, + Scalar4 *d_ewaldC1, + Scalar self, + Scalar4 *d_gridk, + CUFFTCOMPLEX *d_gridX, + CUFFTCOMPLEX *d_gridY, + CUFFTCOMPLEX *d_gridZ, + cufftHandle plan, + const int Nx, + const int Ny, + const int Nz, + const unsigned int *d_n_neigh, + const unsigned int *d_nlist, + const unsigned int *d_headlist, + unsigned int NxNyNz, + dim3 grid, + dim3 threads, + int gridBlockSize, + int gridNBlock, + const int P, + Scalar3 gridh ); __global__ -void gpu_stokes_Mreal_kernel( Scalar4 *d_pos, - Scalar4 *d_vel, - Scalar4 *d_net_force, - int group_size, - Scalar xi, - Scalar4 *d_ewaldC1, - Scalar self, - Scalar ewald_cut, - int ewald_n, - Scalar ewald_dr, - unsigned int *d_group_members, - BoxDim box, - const unsigned int *d_n_neigh, - const unsigned int *d_nlist, - const unsigned int *d_headlist ); - -__global__ void gpu_stokes_Spread_kernel( Scalar4 *d_pos, - Scalar4 *d_net_force, - CUFFTCOMPLEX *gridX, - CUFFTCOMPLEX *gridY, - CUFFTCOMPLEX *gridZ, - int group_size, - int Nx, - int Ny, - int Nz, - unsigned int *d_group_members, - BoxDim box, - 
const int P, - Scalar3 gridh, - Scalar xi, - Scalar eta, - Scalar prefac, - Scalar expfac ); +void gpu_stokes_Mreal_kernel( Scalar4 *d_pos, + Scalar4 *d_vel, + Scalar4 *d_net_force, + int group_size, + Scalar xi, + Scalar4 *d_ewaldC1, + Scalar self, + Scalar ewald_cut, + int ewald_n, + Scalar ewald_dr, + unsigned int *d_group_members, + BoxDim box, + const unsigned int *d_n_neigh, + const unsigned int *d_nlist, + const unsigned int *d_headlist ); + +__global__ void gpu_stokes_Spread_kernel( Scalar4 *d_pos, + Scalar4 *d_net_force, + CUFFTCOMPLEX *gridX, + CUFFTCOMPLEX *gridY, + CUFFTCOMPLEX *gridZ, + int group_size, + int Nx, + int Ny, + int Nz, + unsigned int *d_group_members, + BoxDim box, + const int P, + Scalar3 gridh, + Scalar xi, + Scalar eta, + Scalar prefac, + Scalar expfac ); __global__ void gpu_stokes_Green_kernel(CUFFTCOMPLEX *gridX, CUFFTCOMPLEX *gridY, CUFFTCOMPLEX *gridZ, Scalar4 *gridk, unsigned int NxNyNz); -__global__ void gpu_stokes_Contract_kernel( Scalar4 *d_pos, - Scalar4 *d_vel, - CUFFTCOMPLEX *gridX, - CUFFTCOMPLEX *gridY, - CUFFTCOMPLEX *gridZ, - int group_size, - int Nx, - int Ny, - int Nz, - Scalar xi, - Scalar eta, - unsigned int *d_group_members, - BoxDim box, - const int P, - Scalar3 gridh, - Scalar prefac, - Scalar expfac ); +__global__ void gpu_stokes_Contract_kernel( Scalar4 *d_pos, + Scalar4 *d_vel, + CUFFTCOMPLEX *gridX, + CUFFTCOMPLEX *gridY, + CUFFTCOMPLEX *gridZ, + int group_size, + int Nx, + int Ny, + int Nz, + Scalar xi, + Scalar eta, + unsigned int *d_group_members, + BoxDim box, + const int P, + Scalar3 gridh, + Scalar prefac, + Scalar expfac ); #endif diff --git a/PSEv1/Stokes.cc b/PSEv1/Stokes.cc index db54b2a..4ca1939 100644 --- a/PSEv1/Stokes.cc +++ b/PSEv1/Stokes.cc @@ -73,40 +73,40 @@ using namespace std; */ /*! - \param sysdef SystemDefinition this method will act on. Must not be NULL. - \param group The group of particles this integration method is to work on - \param T temperature - \param seed seed for random number generator - \param nlist neighbor list - \param xi Ewald parameter - \param m_error Tolerance for all calculations + \param sysdef SystemDefinition this method will act on. Must not be NULL. 
+ \param group The group of particles this integration method is to work on + \param T temperature + \param seed seed for random number generator + \param nlist neighbor list + \param xi Ewald parameter + \param m_error Tolerance for all calculations */ -Stokes::Stokes( std::shared_ptr sysdef, +Stokes::Stokes( std::shared_ptr sysdef, std::shared_ptr group, - std::shared_ptr T, - unsigned int seed, - std::shared_ptr nlist, - Scalar xi, - Scalar error ) - : IntegrationMethodTwoStep(sysdef, group), - m_T(T), - m_seed(seed), - m_nlist(nlist), - m_xi(xi), - m_error(error) + std::shared_ptr T, + unsigned int seed, + std::shared_ptr nlist, + Scalar xi, + Scalar error ) + : IntegrationMethodTwoStep(sysdef, group), + m_T(T), + m_seed(seed), + m_nlist(nlist), + m_xi(xi), + m_error(error) { m_exec_conf->msg->notice(5) << "Constructing Stokes" << endl; - // Hash the User's Seed to make it less likely to be a low positive integer - m_seed = m_seed * 0x12345677 + 0x12345; m_seed ^= (m_seed >> 16); m_seed *= 0x45679; + // Hash the User's Seed to make it less likely to be a low positive integer + m_seed = m_seed * 0x12345677 + 0x12345; m_seed ^= (m_seed >> 16); m_seed *= 0x45679; - // only one GPU is supported - if (!m_exec_conf->isCUDAEnabled()) - { - m_exec_conf->msg->error() << "Creating a Stokes when CUDA is disabled" << endl; - throw std::runtime_error("Error initializing Stokes"); - } + // only one GPU is supported + if (!m_exec_conf->isCUDAEnabled()) + { + m_exec_conf->msg->error() << "Creating a Stokes when CUDA is disabled" << endl; + throw std::runtime_error("Error initializing Stokes"); + } } @@ -114,240 +114,240 @@ Stokes::Stokes( std::shared_ptr sysdef, Stokes::~Stokes() { m_exec_conf->msg->notice(5) << "Destroying Stokes" << endl; - cufftDestroy(plan); + cufftDestroy(plan); } /*! - Set the parameters for Hydrodynamic Calculation. Do once at the beginning - of the simulation and then reuse computed values + Set the parameters for Hydrodynamic Calculation. Do once at the beginning + of the simulation and then reuse computed values - - Pre-tabulate real space interaction functions (f and g) - - Set wave space vectors + - Pre-tabulate real space interaction functions (f and g) + - Set wave space vectors */ void Stokes::setParams() { - // Try two Lanczos iterations to start (number of iterations will adapt as needed) - m_m_Lanczos = 2; - - // Real space cutoff - m_ewald_cut = sqrtf( - logf( m_error ) ) / m_xi; - - // Number of grid points - int kmax = int( 2.0 * sqrtf( - logf( m_error ) ) * m_xi ) + 1; - - const BoxDim& box = m_pdata->getBox(); // Only for box not changing with time. 
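// The two expressions above tie both halves of the Ewald split to the single
// error tolerance: rcut = sqrt(-ln(error))/xi for the real-space cutoff and
// kmax ~ 2*sqrt(-ln(error))*xi for the wave-space extent. A small worked
// example (the error and xi values here are chosen for illustration only):
#include <cmath>
static void ewald_split_example()
{
    double error = 1e-3, xi = 0.5;                                   // example inputs
    double rcut  = std::sqrt(-std::log(error)) / xi;                 // ~= 5.26 particle radii
    int    kmax  = int(2.0 * std::sqrt(-std::log(error)) * xi) + 1;  // = 3
    (void)rcut; (void)kmax;
}
// Raising xi shrinks rcut (cheaper neighbor sums) at the cost of a larger kmax,
// i.e. more Fourier grid points; the summary printed later reports the final choice.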
- Scalar3 L = box.getL(); - - m_Nx = int( kmax * L.x / (2.0 * 3.1415926536 ) * 2.0 ) + 1; - m_Ny = int( kmax * L.y / (2.0 * 3.1415926536 ) * 2.0 ) + 1; - m_Nz = int( kmax * L.z / (2.0 * 3.1415926536 ) * 2.0 ) + 1; - - // Get list of int values between 8 and 4096 that can be written as - // (2^a)*(3^b)*(5^c) - // Then sort list from low to high - // - // Go to such large values so as to be able simulate boxes with large - // aspect ratios - std::vector Mlist; - for ( int ii = 0; ii < 13; ++ii ){ - int pow2 = 1; - for ( int i = 0; i < ii; ++i ){ - pow2 *= 2; - } - for ( int jj = 0; jj < 8; ++jj ){ - int pow3 = 1; - for ( int j = 0; j < jj; ++j ){ - pow3 *= 3; - } - for ( int kk = 0; kk < 6; ++kk ){ - int pow5 = 1; - for ( int k = 0; k < kk; ++k ){ - pow5 *= 5; - } - int Mcurr = pow2 * pow3 * pow5; - if ( Mcurr >= 8 && Mcurr <= 4096 ){ - Mlist.push_back(Mcurr); - } - } - } - } - std::sort(Mlist.begin(), Mlist.end()); - const int nmult = Mlist.size(); - - // Compute the number of grid points in each direction - // - // Number of grid points should be a power of 2,3,5 for most efficient FFTs - for ( int ii = 0; ii < nmult; ++ii ){ - if (m_Nx <= Mlist[ii]){ - m_Nx = Mlist[ii]; - break; - } - } - for ( int ii = 0; ii < nmult; ++ii ){ - if (m_Ny <= Mlist[ii]){ - m_Ny = Mlist[ii]; - break; - } - } - for ( int ii = 0; ii < nmult; ++ii ){ - if (m_Nz <= Mlist[ii]){ - m_Nz = Mlist[ii]; - break; - } - } - - // Check that we haven't asked for too many grid points - // Max allowable by cuFFT is 512^3 - if ( m_Nx * m_Ny * m_Nz > 512*512*512 ){ - - printf("Requested Number of Fourier Nodes Exceeds Max Dimension of 512^3\n"); - printf("Mx = %i \n", m_Nx); - printf("My = %i \n", m_Ny); - printf("Mz = %i \n", m_Nz); - printf("Mx*My*Mz = %i \n", m_Nx * m_Ny * m_Nz); - printf("\n"); - printf("Note to User: Fix is to reduce xi and try again. \n"); - - exit(EXIT_FAILURE); - } - - // Maximum eigenvalue of A'*A to scale P - Scalar gamma = m_max_strain; - Scalar gamma2 = gamma*gamma; - Scalar lambda = 1.0 + gamma2/2.0 + gamma*sqrtf(1.0 + gamma2/4.0); - - // Grid spacing - m_gridh = L / make_scalar3(m_Nx,m_Ny,m_Nz); - - // Parameters for the Spectral Ewald Method (Lindbo and Tornberg, J. Comp. 
Phys., 2011) - m_gaussm = 1.0; - while ( erfcf( m_gaussm / sqrtf(2.0*lambda) ) > m_error ){ - m_gaussm = m_gaussm + 0.01; - } - m_gaussP = int( m_gaussm*m_gaussm / 3.1415926536 ) + 1; - - if (m_gaussP > m_Nx) m_gaussP = m_Nx; // Can't be supported beyond grid - if (m_gaussP > m_Ny) m_gaussP = m_Ny; - if (m_gaussP > m_Nz) m_gaussP = m_Nz; - Scalar w = m_gaussP*m_gridh.x / 2.0; // Gaussian width in simulation units - Scalar xisq = m_xi * m_xi; - m_eta = (2.0*w/m_gaussm)*(2.0*w/m_gaussm) * ( xisq ); // Gaussian splitting parameter - - // Print summary to command line output - printf("\n"); - printf("\n"); - m_exec_conf->msg->notice(2) << "--- NUFFT Hydrodynamics Statistics ---" << endl; - m_exec_conf->msg->notice(2) << "Mx: " << m_Nx << endl; - m_exec_conf->msg->notice(2) << "My: " << m_Ny << endl; - m_exec_conf->msg->notice(2) << "Mz: " << m_Nz << endl; - m_exec_conf->msg->notice(2) << "rcut: " << m_ewald_cut << endl; - m_exec_conf->msg->notice(2) << "Points per radius (x,y,z): " << m_Nx / L.x << ", " << m_Ny / L.y << ", " << m_Nz / L.z << endl; - m_exec_conf->msg->notice(2) << "--- Gaussian Spreading Parameters ---" << endl; - m_exec_conf->msg->notice(2) << "gauss_m: " << m_gaussm << endl; + // Try two Lanczos iterations to start (number of iterations will adapt as needed) + m_m_Lanczos = 2; + + // Real space cutoff + m_ewald_cut = sqrtf( - logf( m_error ) ) / m_xi; + + // Number of grid points + int kmax = int( 2.0 * sqrtf( - logf( m_error ) ) * m_xi ) + 1; + + const BoxDim& box = m_pdata->getBox(); // Only for box not changing with time. + Scalar3 L = box.getL(); + + m_Nx = int( kmax * L.x / (2.0 * 3.1415926536 ) * 2.0 ) + 1; + m_Ny = int( kmax * L.y / (2.0 * 3.1415926536 ) * 2.0 ) + 1; + m_Nz = int( kmax * L.z / (2.0 * 3.1415926536 ) * 2.0 ) + 1; + + // Get list of int values between 8 and 4096 that can be written as + // (2^a)*(3^b)*(5^c) + // Then sort list from low to high + // + // Go to such large values so as to be able simulate boxes with large + // aspect ratios + std::vector Mlist; + for ( int ii = 0; ii < 13; ++ii ){ + int pow2 = 1; + for ( int i = 0; i < ii; ++i ){ + pow2 *= 2; + } + for ( int jj = 0; jj < 8; ++jj ){ + int pow3 = 1; + for ( int j = 0; j < jj; ++j ){ + pow3 *= 3; + } + for ( int kk = 0; kk < 6; ++kk ){ + int pow5 = 1; + for ( int k = 0; k < kk; ++k ){ + pow5 *= 5; + } + int Mcurr = pow2 * pow3 * pow5; + if ( Mcurr >= 8 && Mcurr <= 4096 ){ + Mlist.push_back(Mcurr); + } + } + } + } + std::sort(Mlist.begin(), Mlist.end()); + const int nmult = Mlist.size(); + + // Compute the number of grid points in each direction + // + // Number of grid points should be a power of 2,3,5 for most efficient FFTs + for ( int ii = 0; ii < nmult; ++ii ){ + if (m_Nx <= Mlist[ii]){ + m_Nx = Mlist[ii]; + break; + } + } + for ( int ii = 0; ii < nmult; ++ii ){ + if (m_Ny <= Mlist[ii]){ + m_Ny = Mlist[ii]; + break; + } + } + for ( int ii = 0; ii < nmult; ++ii ){ + if (m_Nz <= Mlist[ii]){ + m_Nz = Mlist[ii]; + break; + } + } + + // Check that we haven't asked for too many grid points + // Max allowable by cuFFT is 512^3 + if ( m_Nx * m_Ny * m_Nz > 512*512*512 ){ + + printf("Requested Number of Fourier Nodes Exceeds Max Dimension of 512^3\n"); + printf("Mx = %i \n", m_Nx); + printf("My = %i \n", m_Ny); + printf("Mz = %i \n", m_Nz); + printf("Mx*My*Mz = %i \n", m_Nx * m_Ny * m_Nz); + printf("\n"); + printf("Note to User: Fix is to reduce xi and try again. 
\n"); + + exit(EXIT_FAILURE); + } + + // Maximum eigenvalue of A'*A to scale P + Scalar gamma = m_max_strain; + Scalar gamma2 = gamma*gamma; + Scalar lambda = 1.0 + gamma2/2.0 + gamma*sqrtf(1.0 + gamma2/4.0); + + // Grid spacing + m_gridh = L / make_scalar3(m_Nx,m_Ny,m_Nz); + + // Parameters for the Spectral Ewald Method (Lindbo and Tornberg, J. Comp. Phys., 2011) + m_gaussm = 1.0; + while ( erfcf( m_gaussm / sqrtf(2.0*lambda) ) > m_error ){ + m_gaussm = m_gaussm + 0.01; + } + m_gaussP = int( m_gaussm*m_gaussm / 3.1415926536 ) + 1; + + if (m_gaussP > m_Nx) m_gaussP = m_Nx; // Can't be supported beyond grid + if (m_gaussP > m_Ny) m_gaussP = m_Ny; + if (m_gaussP > m_Nz) m_gaussP = m_Nz; + Scalar w = m_gaussP*m_gridh.x / 2.0; // Gaussian width in simulation units + Scalar xisq = m_xi * m_xi; + m_eta = (2.0*w/m_gaussm)*(2.0*w/m_gaussm) * ( xisq ); // Gaussian splitting parameter + + // Print summary to command line output + printf("\n"); + printf("\n"); + m_exec_conf->msg->notice(2) << "--- NUFFT Hydrodynamics Statistics ---" << endl; + m_exec_conf->msg->notice(2) << "Mx: " << m_Nx << endl; + m_exec_conf->msg->notice(2) << "My: " << m_Ny << endl; + m_exec_conf->msg->notice(2) << "Mz: " << m_Nz << endl; + m_exec_conf->msg->notice(2) << "rcut: " << m_ewald_cut << endl; + m_exec_conf->msg->notice(2) << "Points per radius (x,y,z): " << m_Nx / L.x << ", " << m_Ny / L.y << ", " << m_Nz / L.z << endl; + m_exec_conf->msg->notice(2) << "--- Gaussian Spreading Parameters ---" << endl; + m_exec_conf->msg->notice(2) << "gauss_m: " << m_gaussm << endl; m_exec_conf->msg->notice(2) << "gauss_P: " << m_gaussP << endl; - m_exec_conf->msg->notice(2) << "gauss_eta: " << m_eta << endl; - m_exec_conf->msg->notice(2) << "gauss_w: " << w << endl; - m_exec_conf->msg->notice(2) << "gauss_gridh (x,y,z): " << L.x/m_Nx << ", " << L.y/m_Ny << ", " << L.z/m_Nz << endl; - printf("\n"); - printf("\n"); - - // Create plan for CUFFT on the GPU - cufftPlan3d(&plan, m_Nx, m_Ny, m_Nz, CUFFT_C2C); - - // Prepare GPUArrays for grid vectors and gridded forces - GPUArray n_gridk(m_Nx*m_Ny*m_Nz, m_exec_conf); - m_gridk.swap(n_gridk); - GPUArray n_gridX(m_Nx*m_Ny*m_Nz, m_exec_conf); - m_gridX.swap(n_gridX); - GPUArray n_gridY(m_Nx*m_Ny*m_Nz, m_exec_conf); - m_gridY.swap(n_gridY); - GPUArray n_gridZ(m_Nx*m_Ny*m_Nz, m_exec_conf); - m_gridZ.swap(n_gridZ); - - // Get list of reciprocal space vectors, and scaling factor for the wave space calculation at each grid point - ArrayHandle h_gridk(m_gridk, access_location::host, access_mode::readwrite); - for (int i = 0; i < m_Nx; i++) { - for (int j = 0; j < m_Ny; j++) { - for (int k = 0; k < m_Nz; k++) { - - // Index into grid vector storage array - int idx = i * m_Ny*m_Nz + j * m_Nz + k; - - // k goes from -N/2 to N/2 - h_gridk.data[idx].x = 2.0*3.1415926536 * ((i < ( m_Nx + 1 ) / 2) ? i : i - m_Nx) / L.x; - h_gridk.data[idx].y = 2.0*3.1415926536 * ((j < ( m_Ny + 1 ) / 2) ? j : j - m_Ny) / L.y; - h_gridk.data[idx].z = 2.0*3.1415926536 * ((k < ( m_Nz + 1 ) / 2) ? 
k : k - m_Nz) / L.z; - - // k dot k - Scalar k2 = h_gridk.data[idx].x*h_gridk.data[idx].x + h_gridk.data[idx].y*h_gridk.data[idx].y + h_gridk.data[idx].z*h_gridk.data[idx].z; - - // Scaling factor used in wave space sum - // - // Can't include k=0 term in the Ewald sum - if (i == 0 && j == 0 && k == 0){ - h_gridk.data[idx].w = 0; - } - else{ - // Have to divide by Nx*Ny*Nz to normalize the FFTs - h_gridk.data[idx].w = 6.0*3.1415926536 * (1.0 + k2/4.0/xisq) * expf( -(1-m_eta) * k2/4.0/xisq ) / ( k2 ) / Scalar( m_Nx*m_Ny*m_Nz ); - } - - } - } - } - - // Store the coefficients for the real space part of Ewald summation - // - // Will precompute scaling factors for real space component of summation for a given - // discretization to speed up GPU calculations - // - // Do calculation in double precision, then truncate and tabulate, because the - // expressions don't behave very well numerically, and double precision ensures - // it works. - m_ewald_dr = 0.001; // Distance resolution - m_ewald_n = m_ewald_cut / m_ewald_dr - 1; // Number of entries in tabulation - - double dr = 0.0010000000000000; - - // Assume all particles have radius of 1.0 + m_exec_conf->msg->notice(2) << "gauss_eta: " << m_eta << endl; + m_exec_conf->msg->notice(2) << "gauss_w: " << w << endl; + m_exec_conf->msg->notice(2) << "gauss_gridh (x,y,z): " << L.x/m_Nx << ", " << L.y/m_Ny << ", " << L.z/m_Nz << endl; + printf("\n"); + printf("\n"); + + // Create plan for CUFFT on the GPU + cufftPlan3d(&plan, m_Nx, m_Ny, m_Nz, CUFFT_C2C); + + // Prepare GPUArrays for grid vectors and gridded forces + GPUArray n_gridk(m_Nx*m_Ny*m_Nz, m_exec_conf); + m_gridk.swap(n_gridk); + GPUArray n_gridX(m_Nx*m_Ny*m_Nz, m_exec_conf); + m_gridX.swap(n_gridX); + GPUArray n_gridY(m_Nx*m_Ny*m_Nz, m_exec_conf); + m_gridY.swap(n_gridY); + GPUArray n_gridZ(m_Nx*m_Ny*m_Nz, m_exec_conf); + m_gridZ.swap(n_gridZ); + + // Get list of reciprocal space vectors, and scaling factor for the wave space calculation at each grid point + ArrayHandle h_gridk(m_gridk, access_location::host, access_mode::readwrite); + for (int i = 0; i < m_Nx; i++) { + for (int j = 0; j < m_Ny; j++) { + for (int k = 0; k < m_Nz; k++) { + + // Index into grid vector storage array + int idx = i * m_Ny*m_Nz + j * m_Nz + k; + + // k goes from -N/2 to N/2 + h_gridk.data[idx].x = 2.0*3.1415926536 * ((i < ( m_Nx + 1 ) / 2) ? i : i - m_Nx) / L.x; + h_gridk.data[idx].y = 2.0*3.1415926536 * ((j < ( m_Ny + 1 ) / 2) ? j : j - m_Ny) / L.y; + h_gridk.data[idx].z = 2.0*3.1415926536 * ((k < ( m_Nz + 1 ) / 2) ? k : k - m_Nz) / L.z; + + // k dot k + Scalar k2 = h_gridk.data[idx].x*h_gridk.data[idx].x + h_gridk.data[idx].y*h_gridk.data[idx].y + h_gridk.data[idx].z*h_gridk.data[idx].z; + + // Scaling factor used in wave space sum + // + // Can't include k=0 term in the Ewald sum + if (i == 0 && j == 0 && k == 0){ + h_gridk.data[idx].w = 0; + } + else{ + // Have to divide by Nx*Ny*Nz to normalize the FFTs + h_gridk.data[idx].w = 6.0*3.1415926536 * (1.0 + k2/4.0/xisq) * expf( -(1-m_eta) * k2/4.0/xisq ) / ( k2 ) / Scalar( m_Nx*m_Ny*m_Nz ); + } + + } + } + } + + // Store the coefficients for the real space part of Ewald summation + // + // Will precompute scaling factors for real space component of summation for a given + // discretization to speed up GPU calculations + // + // Do calculation in double precision, then truncate and tabulate, because the + // expressions don't behave very well numerically, and double precision ensures + // it works. 
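The tabulation described in the comment above amounts to sampling the two real-space mobility scalars (UF1 and UF2) on a uniform grid of separations r = dr, 2*dr, ..., out to the real-space cutoff, and linearly interpolating between neighboring entries at run time; each Scalar4 entry also carries the values at r + dr so that a single fetch returns both endpoints of the interpolation. A minimal host-side Python sketch of that idea follows (illustrative only: uf1_exact and uf2_exact are placeholders for the double-precision expressions evaluated below, and the plugin itself stores the table as Scalar4 data on the GPU):

    DR = 0.001  # tabulation resolution, matching m_ewald_dr

    def build_table(uf1_exact, uf2_exact, r_cut, dr=DR):
        # one entry per separation r = dr, 2*dr, ..., out to the cutoff
        n = int(r_cut / dr)
        return [(uf1_exact((k + 1) * dr), uf2_exact((k + 1) * dr)) for k in range(n)]

    def lookup(table, r, dr=DR):
        # entry just below r, then linear interpolation toward the next entry
        k = max(0, min(int(r / dr) - 1, len(table) - 2))
        frac = r / dr - (k + 1)
        uf1 = (1.0 - frac) * table[k][0] + frac * table[k + 1][0]
        uf2 = (1.0 - frac) * table[k][1] + frac * table[k + 1][1]
        return uf1, uf2
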
+ m_ewald_dr = 0.001; // Distance resolution + m_ewald_n = m_ewald_cut / m_ewald_dr - 1; // Number of entries in tabulation + + double dr = 0.0010000000000000; + + // Assume all particles have radius of 1.0 Scalar pi12 = 1.77245385091; Scalar aa = 1.0; - Scalar axi = aa * m_xi; - Scalar axi2 = axi * axi; + Scalar axi = aa * m_xi; + Scalar axi2 = axi * axi; m_self = (1. + 4.*pi12*axi*erfc(2.*axi) - exp(-4.*axi2))/(4.*pi12*axi*aa); - // Allocate storage for real space Ewald table - int nR = m_ewald_n + 1; // number of entries in ewald table - GPUArray n_ewaldC1( nR, m_exec_conf); - m_ewaldC1.swap(n_ewaldC1); - ArrayHandle h_ewaldC1(m_ewaldC1, access_location::host, access_mode::readwrite); + // Allocate storage for real space Ewald table + int nR = m_ewald_n + 1; // number of entries in ewald table + GPUArray n_ewaldC1( nR, m_exec_conf); + m_ewaldC1.swap(n_ewaldC1); + ArrayHandle h_ewaldC1(m_ewaldC1, access_location::host, access_mode::readwrite); - // Functions are complicated so calculation should be done in double precision, then truncated to single precision - // in order to ensure accurate evaluation - double xi = m_xi; - double Pi = 3.141592653589793; - double a = aa; + // Functions are complicated so calculation should be done in double precision, then truncated to single precision + // in order to ensure accurate evaluation + double xi = m_xi; + double Pi = 3.141592653589793; + double a = aa; - // Fill tables - for ( int kk = 0; kk < nR; kk++ ) - { + // Fill tables + for ( int kk = 0; kk < nR; kk++ ) + { - // Initialize entries - h_ewaldC1.data[ kk ].x = 0.0; // UF1 at r - h_ewaldC1.data[ kk ].y = 0.0; // UF2 at r - h_ewaldC1.data[ kk ].z = 0.0; // UF1 at r + dr - h_ewaldC1.data[ kk ].w = 0.0; // UF2 at r + dr + // Initialize entries + h_ewaldC1.data[ kk ].x = 0.0; // UF1 at r + h_ewaldC1.data[ kk ].y = 0.0; // UF2 at r + h_ewaldC1.data[ kk ].z = 0.0; // UF1 at r + dr + h_ewaldC1.data[ kk ].w = 0.0; // UF2 at r + dr - // Distance for current entry - double r = double( kk ) * dr + dr; - double Imrr = 0, rr = 0; + // Distance for current entry + double r = double( kk ) * dr + dr; + double Imrr = 0, rr = 0; - // Expression have been simplified assuming no overlap, touching, and overlap - if ( r > 2.0*a ){ + // Expression have been simplified assuming no overlap, touching, and overlap + if ( r > 2.0*a ){ - Imrr = -pow(a,-1) + (pow(a,2)*pow(r,-3))/2. + (3*pow(r,-1))/4. + (3*erfc(r*xi)*pow(a,-2)*pow(r,-3)*(-12*pow(r,4) + pow(xi,-4)))/128. + + Imrr = -pow(a,-1) + (pow(a,2)*pow(r,-3))/2. + (3*pow(r,-1))/4. + (3*erfc(r*xi)*pow(a,-2)*pow(r,-3)*(-12*pow(r,4) + pow(xi,-4)))/128. + pow(a,-2)*((9*r)/32. - (3*pow(r,-3)*pow(xi,-4))/128.) + (erfc((2*a + r)*xi)*(128*pow(a,-1) + 64*pow(a,2)*pow(r,-3) + 96*pow(r,-1) + pow(a,-2)*(36*r - 3*pow(r,-3)*pow(xi,-4))))/256. + (erfc(2*a*xi - r*xi)*(128*pow(a,-1) - 64*pow(a,2)*pow(r,-3) - 96*pow(r,-1) + pow(a,-2)*(-36*r + 3*pow(r,-3)*pow(xi,-4))))/ @@ -357,7 +357,7 @@ void Stokes::setParams() (exp(-(pow(-2*a + r,2)*pow(xi,2)))*pow(a,-2)*pow(Pi,-0.5)*pow(r,-3)*pow(xi,-3)* (8*r*pow(a,2)*pow(xi,2) + 16*pow(a,3)*pow(xi,2) + a*(-2 + 28*pow(r,2)*pow(xi,2)) - 3*(r + 6*pow(r,3)*pow(xi,2))))/128.; - rr = -pow(a,-1) - pow(a,2)*pow(r,-3) + (3*pow(r,-1))/2. + (3*pow(a,-2)*pow(r,-3)*(4*pow(r,4) + pow(xi,-4)))/64. + + rr = -pow(a,-1) - pow(a,2)*pow(r,-3) + (3*pow(r,-1))/2. + (3*pow(a,-2)*pow(r,-3)*(4*pow(r,4) + pow(xi,-4)))/64. + (erfc(2*a*xi - r*xi)*(64*pow(a,-1) + 64*pow(a,2)*pow(r,-3) - 96*pow(r,-1) + pow(a,-2)*(-12*r - 3*pow(r,-3)*pow(xi,-4))))/128. 
+ (erfc((2*a + r)*xi)*(64*pow(a,-1) - 64*pow(a,2)*pow(r,-3) + 96*pow(r,-1) + pow(a,-2)*(12*r + 3*pow(r,-3)*pow(xi,-4))))/128. + (3*exp(-(pow(r,2)*pow(xi,2)))*pow(a,-2)*pow(Pi,-0.5)*pow(r,-2)*pow(xi,-3)*(-1 + 2*pow(r,2)*pow(xi,2)))/32. - @@ -367,23 +367,23 @@ void Stokes::setParams() (-1 + 8*a*r*pow(xi,2) + 8*pow(a,2)*pow(xi,2) + 2*pow(r,2)*pow(xi,2)))/64. - (3*erfc(r*xi)*pow(a,-2)*pow(r,-3)*pow(xi,-4)*(1 + 4*pow(r,4)*pow(xi,4)))/64.; - } - else if ( r == 2.0*a ){ + } + else if ( r == 2.0*a ){ - Imrr = -(pow(a,-5)*(3 + 16*a*xi*pow(Pi,-0.5))*pow(xi,-4))/2048. + (3*erfc(2*a*xi)*pow(a,-5)*(-192*pow(a,4) + pow(xi,-4)))/1024. + + Imrr = -(pow(a,-5)*(3 + 16*a*xi*pow(Pi,-0.5))*pow(xi,-4))/2048. + (3*erfc(2*a*xi)*pow(a,-5)*(-192*pow(a,4) + pow(xi,-4)))/1024. + erfc(4*a*xi)*(pow(a,-1) - (3*pow(a,-5)*pow(xi,-4))/2048.) + (exp(-16*pow(a,2)*pow(xi,2))*pow(a,-4)*pow(Pi,-0.5)*pow(xi,-3)*(-1 - 64*pow(a,2)*pow(xi,2)))/256. + (3*exp(-4*pow(a,2)*pow(xi,2))*pow(a,-4)*pow(Pi,-0.5)*pow(xi,-3)*(1 + 24*pow(a,2)*pow(xi,2)))/256.; - rr = (pow(a,-5)*(3 + 16*a*xi*pow(Pi,-0.5))*pow(xi,-4))/1024. + erfc(2*a*xi)*((-3*pow(a,-1))/8. - (3*pow(a,-5)*pow(xi,-4))/512.) + + rr = (pow(a,-5)*(3 + 16*a*xi*pow(Pi,-0.5))*pow(xi,-4))/1024. + erfc(2*a*xi)*((-3*pow(a,-1))/8. - (3*pow(a,-5)*pow(xi,-4))/512.) + erfc(4*a*xi)*(pow(a,-1) + (3*pow(a,-5)*pow(xi,-4))/1024.) + (exp(-16*pow(a,2)*pow(xi,2))*pow(a,-4)*pow(Pi,-0.5)*pow(xi,-3)*(1 - 32*pow(a,2)*pow(xi,2)))/128. + (3*exp(-4*pow(a,2)*pow(xi,2))*pow(a,-4)*pow(Pi,-0.5)*pow(xi,-3)*(-1 + 8*pow(a,2)*pow(xi,2)))/128.; - } - else if ( r < 2*a){ + } + else if ( r < 2*a){ - Imrr = (-9*r*pow(a,-2))/32. + pow(a,-1) - (pow(a,2)*pow(r,-3))/2. - (3*pow(r,-1))/4. + + Imrr = (-9*r*pow(a,-2))/32. + pow(a,-1) - (pow(a,2)*pow(r,-3))/2. - (3*pow(r,-1))/4. + (3*erfc(r*xi)*pow(a,-2)*pow(r,-3)*(-12*pow(r,4) + pow(xi,-4)))/128. + (erfc((-2*a + r)*xi)*(-128*pow(a,-1) + 64*pow(a,2)*pow(r,-3) + 96*pow(r,-1) + pow(a,-2)*(36*r - 3*pow(r,-3)*pow(xi,-4))))/ 256. + (erfc((2*a + r)*xi)*(128*pow(a,-1) + 64*pow(a,2)*pow(r,-3) + 96*pow(r,-1) + pow(a,-2)*(36*r - 3*pow(r,-3)*pow(xi,-4))))/ @@ -393,7 +393,7 @@ void Stokes::setParams() (exp(-(pow(-2*a + r,2)*pow(xi,2)))*pow(a,-2)*pow(Pi,-0.5)*pow(r,-3)*pow(xi,-3)* (8*r*pow(a,2)*pow(xi,2) + 16*pow(a,3)*pow(xi,2) + a*(-2 + 28*pow(r,2)*pow(xi,2)) - 3*(r + 6*pow(r,3)*pow(xi,2))))/128.; - rr = ((2*a + 3*r)*pow(a,-2)*pow(2*a - r,3)*pow(r,-3))/16. + + rr = ((2*a + 3*r)*pow(a,-2)*pow(2*a - r,3)*pow(r,-3))/16. + (erfc((-2*a + r)*xi)*(-64*pow(a,-1) - 64*pow(a,2)*pow(r,-3) + 96*pow(r,-1) + pow(a,-2)*(12*r + 3*pow(r,-3)*pow(xi,-4))))/128. + (erfc((2*a + r)*xi)*(64*pow(a,-1) - 64*pow(a,2)*pow(r,-3) + 96*pow(r,-1) + pow(a,-2)*(12*r + 3*pow(r,-3)*pow(xi,-4))))/128. + (3*exp(-(pow(r,2)*pow(xi,2)))*pow(a,-2)*pow(Pi,-0.5)*pow(r,-2)*pow(xi,-3)*(-1 + 2*pow(r,2)*pow(xi,2)))/32. - @@ -403,23 +403,23 @@ void Stokes::setParams() (-1 + 8*a*r*pow(xi,2) + 8*pow(a,2)*pow(xi,2) + 2*pow(r,2)*pow(xi,2)))/64. 
- (3*erfc(r*xi)*pow(a,-2)*pow(r,-3)*pow(xi,-4)*(1 + 4*pow(r,4)*pow(xi,4)))/64.; - } + } - // Save values to table - h_ewaldC1.data[ kk ].x = Scalar( Imrr ); // UF1 - h_ewaldC1.data[ kk ].y = Scalar( rr ); // UF2 + // Save values to table + h_ewaldC1.data[ kk ].x = Scalar( Imrr ); // UF1 + h_ewaldC1.data[ kk ].y = Scalar( rr ); // UF2 - } // kk loop over distances + } // kk loop over distances - // Both pieces of UF data for faster interpolation (r and r+dr stored in same Scalar4) - for ( int kk = 0; kk < (nR-1); kk++ ){ + // Both pieces of UF data for faster interpolation (r and r+dr stored in same Scalar4) + for ( int kk = 0; kk < (nR-1); kk++ ){ - int offset1 = kk; - int offset2 = (kk+1); + int offset1 = kk; + int offset2 = (kk+1); - h_ewaldC1.data[ offset1 ].z = h_ewaldC1.data[ offset2 ].x; - h_ewaldC1.data[ offset1 ].w = h_ewaldC1.data[ offset2 ].y; - } + h_ewaldC1.data[ offset1 ].z = h_ewaldC1.data[ offset2 ].x; + h_ewaldC1.data[ offset1 ].w = h_ewaldC1.data[ offset2 ].y; + } } @@ -429,96 +429,96 @@ void Stokes::setParams() void Stokes::integrateStepOne(unsigned int timestep) { - // Recompute neighborlist ( if needed ) - m_nlist->compute(timestep); + // Recompute neighborlist ( if needed ) + m_nlist->compute(timestep); - // access the neighbor list - ArrayHandle d_n_neigh(m_nlist->getNNeighArray(), access_location::device, access_mode::read); - ArrayHandle d_nlist(m_nlist->getNListArray(), access_location::device, access_mode::read); - ArrayHandle d_headlist(m_nlist->getHeadList(), access_location::device, access_mode::read); + // access the neighbor list + ArrayHandle d_n_neigh(m_nlist->getNNeighArray(), access_location::device, access_mode::read); + ArrayHandle d_nlist(m_nlist->getNListArray(), access_location::device, access_mode::read); + ArrayHandle d_headlist(m_nlist->getHeadList(), access_location::device, access_mode::read); - // Consistency check - unsigned int group_size = m_group->getNumMembers(); - assert(group_size <= m_pdata->getN()); - if (group_size == 0) - return; + // Consistency check + unsigned int group_size = m_group->getNumMembers(); + assert(group_size <= m_pdata->getN()); + if (group_size == 0) + return; - // Get particle forces - const GPUArray< Scalar4 >& net_force = m_pdata->getNetForce(); + // Get particle forces + const GPUArray< Scalar4 >& net_force = m_pdata->getNetForce(); - // profile this step - if (m_prof) - m_prof->push(m_exec_conf, "Stokes step 1 (no step 2)"); + // profile this step + if (m_prof) + m_prof->push(m_exec_conf, "Stokes step 1 (no step 2)"); - // Access all the needed data for the calculation - ArrayHandle d_pos(m_pdata->getPositions(), access_location::device, access_mode::readwrite); - ArrayHandle d_vel(m_pdata->getVelocities(), access_location::device, access_mode::readwrite); - ArrayHandle d_accel(m_pdata->getAccelerations(), access_location::device, access_mode::readwrite); - ArrayHandle d_net_force(net_force, access_location::device, access_mode::read); - ArrayHandle d_image(m_pdata->getImages(), access_location::device, access_mode::readwrite); + // Access all the needed data for the calculation + ArrayHandle d_pos(m_pdata->getPositions(), access_location::device, access_mode::readwrite); + ArrayHandle d_vel(m_pdata->getVelocities(), access_location::device, access_mode::readwrite); + ArrayHandle d_accel(m_pdata->getAccelerations(), access_location::device, access_mode::readwrite); + ArrayHandle d_net_force(net_force, access_location::device, access_mode::read); + ArrayHandle d_image(m_pdata->getImages(), 
access_location::device, access_mode::readwrite); - BoxDim box = m_pdata->getBox(); - ArrayHandle< unsigned int > d_index_array(m_group->getIndexArray(), access_location::device, access_mode::read); + BoxDim box = m_pdata->getBox(); + ArrayHandle< unsigned int > d_index_array(m_group->getIndexArray(), access_location::device, access_mode::read); - // Grid vectors - ArrayHandle d_gridk(m_gridk, access_location::device, access_mode::readwrite); - ArrayHandle d_gridX(m_gridX, access_location::device, access_mode::readwrite); - ArrayHandle d_gridY(m_gridY, access_location::device, access_mode::readwrite); - ArrayHandle d_gridZ(m_gridZ, access_location::device, access_mode::readwrite); + // Grid vectors + ArrayHandle d_gridk(m_gridk, access_location::device, access_mode::readwrite); + ArrayHandle d_gridX(m_gridX, access_location::device, access_mode::readwrite); + ArrayHandle d_gridY(m_gridY, access_location::device, access_mode::readwrite); + ArrayHandle d_gridZ(m_gridZ, access_location::device, access_mode::readwrite); - // Real space interaction tabulation - ArrayHandle d_ewaldC1(m_ewaldC1, access_location::device, access_mode::read); + // Real space interaction tabulation + ArrayHandle d_ewaldC1(m_ewaldC1, access_location::device, access_mode::read); // Calculate the shear rate of the current timestep Scalar current_shear_rate = m_shear_func -> getShearRate(timestep); - // perform the update on the GPU - gpu_stokes_step_one( - d_pos.data, - d_vel.data, - d_accel.data, - d_image.data, - d_index_array.data, - group_size, - box, - m_deltaT, - 256, - d_net_force.data, - m_T->getValue(timestep), - timestep, - m_seed, - m_xi, - m_eta, - m_ewald_cut, - m_ewald_dr, - m_ewald_n, - d_ewaldC1.data, - m_self, - d_gridk.data, - d_gridX.data, - d_gridY.data, - d_gridZ.data, - plan, - m_Nx, - m_Ny, - m_Nz, - d_n_neigh.data, - d_nlist.data, - d_headlist.data, - m_m_Lanczos, - m_pdata->getN(), - m_gaussP, - m_gridh, - m_error, - current_shear_rate - ); - - if (m_exec_conf->isCUDAErrorCheckingEnabled()) - CHECK_CUDA_ERROR(); - - // done profiling - if (m_prof) - m_prof->pop(m_exec_conf); + // perform the update on the GPU + gpu_stokes_step_one( + d_pos.data, + d_vel.data, + d_accel.data, + d_image.data, + d_index_array.data, + group_size, + box, + m_deltaT, + 256, + d_net_force.data, + m_T->getValue(timestep), + timestep, + m_seed, + m_xi, + m_eta, + m_ewald_cut, + m_ewald_dr, + m_ewald_n, + d_ewaldC1.data, + m_self, + d_gridk.data, + d_gridX.data, + d_gridY.data, + d_gridZ.data, + plan, + m_Nx, + m_Ny, + m_Nz, + d_n_neigh.data, + d_nlist.data, + d_headlist.data, + m_m_Lanczos, + m_pdata->getN(), + m_gaussP, + m_gridh, + m_error, + current_shear_rate + ); + + if (m_exec_conf->isCUDAErrorCheckingEnabled()) + CHECK_CUDA_ERROR(); + + // done profiling + if (m_prof) + m_prof->pop(m_exec_conf); } @@ -532,9 +532,9 @@ void Stokes::integrateStepTwo(unsigned int timestep) void export_Stokes(pybind11::module& m) { pybind11::class_ > (m, "Stokes", pybind11::base()) - .def(pybind11::init< std::shared_ptr, std::shared_ptr, std::shared_ptr, unsigned int, std::shared_ptr, Scalar, Scalar >()) - .def("setT", &Stokes::setT) - .def("setParams", &Stokes::setParams) + .def(pybind11::init< std::shared_ptr, std::shared_ptr, std::shared_ptr, unsigned int, std::shared_ptr, Scalar, Scalar >()) + .def("setT", &Stokes::setT) + .def("setParams", &Stokes::setParams) .def("setShear", &Stokes::setShear) ; } diff --git a/PSEv1/Stokes.cu b/PSEv1/Stokes.cu index d86d767..dca1665 100644 --- a/PSEv1/Stokes.cu +++ b/PSEv1/Stokes.cu @@ -136,17 
+136,17 @@ scalar4_tex_t pos_tex; */ extern "C" __global__ void gpu_stokes_step_one_kernel( - Scalar4 *d_pos, - Scalar4 *d_vel, - Scalar3 *d_accel, - int3 *d_image, - unsigned int *d_group_members, - unsigned int group_size, - BoxDim box, - Scalar deltaT, - Scalar4 *d_net_force, - Scalar shear_rate - ){ + Scalar4 *d_pos, + Scalar4 *d_vel, + Scalar3 *d_accel, + int3 *d_image, + unsigned int *d_group_members, + unsigned int group_size, + BoxDim box, + Scalar deltaT, + Scalar4 *d_net_force, + Scalar shear_rate + ){ // determine which particle this thread works on (MEM TRANSFER: 4 bytes) int group_idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -161,13 +161,13 @@ void gpu_stokes_step_one_kernel( // read the particle's velocity and acceleration (MEM TRANSFER: 32 bytes) Scalar4 velmass = d_vel[idx]; - Scalar mass = velmass.w; + Scalar mass = velmass.w; Scalar3 vel = make_scalar3(velmass.x, velmass.y, velmass.z); - // Add the shear + // Add the shear vel.x += shear_rate * pos.y; - Scalar4 net_force = d_net_force[idx]; + Scalar4 net_force = d_net_force[idx]; Scalar3 accel = make_scalar3(net_force.x, net_force.y, net_force.z); // update the position @@ -176,7 +176,7 @@ void gpu_stokes_step_one_kernel( // FLOPS: 3 pos += dx; - accel = accel/mass; + accel = accel/mass; // read in the particle's image (MEM TRANSFER: 16 bytes) int3 image = d_image[idx]; @@ -185,7 +185,7 @@ void gpu_stokes_step_one_kernel( box.wrap(pos, image); // write out the results (MEM_TRANSFER: 48 bytes) - d_accel[idx] = accel; + d_accel[idx] = accel; d_pos[idx] = make_scalar4(pos.x, pos.y, pos.z, postype.w); d_image[idx] = image; } @@ -216,7 +216,7 @@ void gpu_stokes_step_one_kernel( \param d_gridY y-component of force moment projection onto the grid \param d_gridZ z-component of force moment projection onto the grid \param plan cudaFFT plan - \param Nx number of grid nodes in the x-direction + \param Nx number of grid nodes in the x-direction \param Ny number of grid nodes in the y-direction \param Nz number of grid nodes in the z-direction \param d_n_neigh Number of neighbors for every particle @@ -232,134 +232,134 @@ void gpu_stokes_step_one_kernel( \param cheb_error error tolerance in chebyshev approximation */ cudaError_t gpu_stokes_step_one( - Scalar4 *d_pos, - Scalar4 *d_vel, - Scalar3 *d_accel, - int3 *d_image, - unsigned int *d_group_members, - unsigned int group_size, - const BoxDim& box, - Scalar dt, - unsigned int block_size, - Scalar4 *d_net_force, - const Scalar T, - const unsigned int timestep, - const unsigned int seed, - Scalar xi, - Scalar eta, - Scalar ewald_cut, - Scalar ewald_dr, - int ewald_n, - Scalar4 *d_ewaldC1, - Scalar self, - Scalar4 *d_gridk, - CUFFTCOMPLEX *d_gridX, - CUFFTCOMPLEX *d_gridY, - CUFFTCOMPLEX *d_gridZ, - cufftHandle plan, - const int Nx, - const int Ny, - const int Nz, - const unsigned int *d_n_neigh, - const unsigned int *d_nlist, - const unsigned int *d_headlist, - int& m_Lanczos, - const unsigned int N_total, - const int P, - Scalar3 gridh, - Scalar cheb_error, - Scalar shear_rate - ){ - - // Total number of grid points - unsigned int NxNyNz = Nx*Ny*Nz; - - // setup the grid to run the kernel - // block for particle calculation - dim3 grid( (group_size/block_size) + 1, 1, 1); - dim3 threads(block_size, 1, 1); - - // block for grid calculation - int gridBlockSize = ( NxNyNz > block_size ) ? 
block_size : NxNyNz; - int gridNBlock = ( NxNyNz + gridBlockSize - 1 ) / gridBlockSize ; - - // Get the textured tables for real space Ewald sum tabulation - tables1_tex.normalized = false; // Not normalized - tables1_tex.filterMode = cudaFilterModeLinear; // Filter mode: floor of the index - // One dimension, Read mode: ElementType(Get what we write) - cudaBindTexture(0, tables1_tex, d_ewaldC1, sizeof(Scalar4) * (ewald_n+1)); // This was a bug in former versions! - - // Same for the positions and forces - pos_tex.normalized = false; // Not normalized - pos_tex.filterMode = cudaFilterModePoint; // Filter mode: floor of the index - cudaBindTexture(0, pos_tex, d_pos, sizeof(Scalar4) * N_total); - - // Get sheared grid vectors - gpu_stokes_SetGridk_kernel<<>>(d_gridk,Nx,Ny,Nz,NxNyNz,box,xi,eta); - - // Do Mobility and Brownian Calculations (compute the velocity from the forces) - gpu_stokes_CombinedMobilityBrownian_wrap( - d_pos, - d_net_force, - d_group_members, - group_size, - box, - dt, - d_vel, // output - T, - timestep, - seed, - xi, - eta, - P, - ewald_cut, - ewald_dr, - ewald_n, - d_ewaldC1, - d_gridk, - d_gridX, - d_gridY, - d_gridZ, - plan, - Nx, - Ny, - Nz, - d_n_neigh, - d_nlist, - d_headlist, - m_Lanczos, - N_total, - NxNyNz, - grid, - threads, - gridBlockSize, - gridNBlock, - gridh, - cheb_error, - self ); - - - // Use forward Euler integration to move the particles according the velocity - // computed from the Mobility and Brownian calculations - gpu_stokes_step_one_kernel<<< grid, threads >>>( - d_pos, - d_vel, - d_accel, - d_image, - d_group_members, - group_size, - box, - dt, - d_net_force, - shear_rate - ); - - // Quick error check - gpuErrchk(cudaPeekAtLastError()); - - // Cleanup - cudaUnbindTexture(tables1_tex); - cudaUnbindTexture(pos_tex); - - return cudaSuccess; + Scalar4 *d_pos, + Scalar4 *d_vel, + Scalar3 *d_accel, + int3 *d_image, + unsigned int *d_group_members, + unsigned int group_size, + const BoxDim& box, + Scalar dt, + unsigned int block_size, + Scalar4 *d_net_force, + const Scalar T, + const unsigned int timestep, + const unsigned int seed, + Scalar xi, + Scalar eta, + Scalar ewald_cut, + Scalar ewald_dr, + int ewald_n, + Scalar4 *d_ewaldC1, + Scalar self, + Scalar4 *d_gridk, + CUFFTCOMPLEX *d_gridX, + CUFFTCOMPLEX *d_gridY, + CUFFTCOMPLEX *d_gridZ, + cufftHandle plan, + const int Nx, + const int Ny, + const int Nz, + const unsigned int *d_n_neigh, + const unsigned int *d_nlist, + const unsigned int *d_headlist, + int& m_Lanczos, + const unsigned int N_total, + const int P, + Scalar3 gridh, + Scalar cheb_error, + Scalar shear_rate + ){ + + // Total number of grid points + unsigned int NxNyNz = Nx*Ny*Nz; + + // setup the grid to run the kernel + // block for particle calculation + dim3 grid( (group_size/block_size) + 1, 1, 1); + dim3 threads(block_size, 1, 1); + + // block for grid calculation + int gridBlockSize = ( NxNyNz > block_size ) ? block_size : NxNyNz; + int gridNBlock = ( NxNyNz + gridBlockSize - 1 ) / gridBlockSize ; + + // Get the textured tables for real space Ewald sum tabulation + tables1_tex.normalized = false; // Not normalized + tables1_tex.filterMode = cudaFilterModeLinear; // Filter mode: floor of the index + // One dimension, Read mode: ElementType(Get what we write) + cudaBindTexture(0, tables1_tex, d_ewaldC1, sizeof(Scalar4) * (ewald_n+1)); // This was a bug in former versions! 
+ + // Same for the positions and forces + pos_tex.normalized = false; // Not normalized + pos_tex.filterMode = cudaFilterModePoint; // Filter mode: floor of the index + cudaBindTexture(0, pos_tex, d_pos, sizeof(Scalar4) * N_total); + + // Get sheared grid vectors + gpu_stokes_SetGridk_kernel<<>>(d_gridk,Nx,Ny,Nz,NxNyNz,box,xi,eta); + + // Do Mobility and Brownian Calculations (compute the velocity from the forces) + gpu_stokes_CombinedMobilityBrownian_wrap( + d_pos, + d_net_force, + d_group_members, + group_size, + box, + dt, + d_vel, // output + T, + timestep, + seed, + xi, + eta, + P, + ewald_cut, + ewald_dr, + ewald_n, + d_ewaldC1, + d_gridk, + d_gridX, + d_gridY, + d_gridZ, + plan, + Nx, + Ny, + Nz, + d_n_neigh, + d_nlist, + d_headlist, + m_Lanczos, + N_total, + NxNyNz, + grid, + threads, + gridBlockSize, + gridNBlock, + gridh, + cheb_error, + self ); + + + // Use forward Euler integration to move the particles according the velocity + // computed from the Mobility and Brownian calculations + gpu_stokes_step_one_kernel<<< grid, threads >>>( + d_pos, + d_vel, + d_accel, + d_image, + d_group_members, + group_size, + box, + dt, + d_net_force, + shear_rate + ); + + // Quick error check + gpuErrchk(cudaPeekAtLastError()); + + // Cleanup + cudaUnbindTexture(tables1_tex); + cudaUnbindTexture(pos_tex); + + return cudaSuccess; } diff --git a/PSEv1/Stokes.cuh b/PSEv1/Stokes.cuh index db2d304..73f1713 100644 --- a/PSEv1/Stokes.cuh +++ b/PSEv1/Stokes.cuh @@ -81,34 +81,34 @@ cudaError_t gpu_stokes_step_one(Scalar4 *d_pos, const BoxDim& box, Scalar deltaT, unsigned int block_size, - Scalar4 *d_net_force, - const Scalar T, - const unsigned int timestep, - const unsigned int seed, - Scalar xi, - Scalar eta, - Scalar ewald_cut, - Scalar ewald_dr, - int ewald_n, - Scalar4 *d_ewald1, - Scalar self, - Scalar4 *d_gridk, - CUFFTCOMPLEX *d_gridX, - CUFFTCOMPLEX *d_gridY, - CUFFTCOMPLEX *d_gridZ, - cufftHandle plan, - const int Nx, - const int Ny, - const int Nz, - const unsigned int *d_n_neigh, + Scalar4 *d_net_force, + const Scalar T, + const unsigned int timestep, + const unsigned int seed, + Scalar xi, + Scalar eta, + Scalar ewald_cut, + Scalar ewald_dr, + int ewald_n, + Scalar4 *d_ewald1, + Scalar self, + Scalar4 *d_gridk, + CUFFTCOMPLEX *d_gridX, + CUFFTCOMPLEX *d_gridY, + CUFFTCOMPLEX *d_gridZ, + cufftHandle plan, + const int Nx, + const int Ny, + const int Nz, + const unsigned int *d_n_neigh, const unsigned int *d_nlist, const unsigned int *d_headlist, - int& m_Lanczos, - const unsigned int N_total, - const int P, - Scalar3 gridh, - Scalar cheb_error, - Scalar current_shear_rate); + int& m_Lanczos, + const unsigned int N_total, + const int P, + Scalar3 gridh, + Scalar cheb_error, + Scalar current_shear_rate); #endif diff --git a/PSEv1/integrate.py b/PSEv1/integrate.py index 7a959bb..e5a173c 100644 --- a/PSEv1/integrate.py +++ b/PSEv1/integrate.py @@ -20,9 +20,9 @@ class PSEv1(hoomd.md.integrate._integration_method): # \param seed Random seed to use for the run. Simulations that are identical, except for the seed, will follow # different trajectories. # \param xi Ewald splitting parameter - # \param error Relative error for all calculations - # \param function_form Functional form for shear - # \param max_strain Maximum box deformation for shear + # \param error Relative error for all calculations + # \param function_form Functional form for shear + # \param max_strain Maximum box deformation for shear # # # T can be a variant type, allowing for temperature ramps in simulation runs. 
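Combined with the constructor signature shown in the next hunk, a typical script fragment selecting this integration method could look like the following (hypothetical usage sketch: the import path and the mode_standard call are assumptions based on standard HOOMD 2.x conventions and are not part of this diff):

    import hoomd
    import hoomd.md
    from PSEv1 import integrate

    hoomd.context.initialize()
    # ... build or read in a system here ...
    all_particles = hoomd.group.all()

    hoomd.md.integrate.mode_standard(dt=1e-3)
    pse = integrate.PSEv1(group=all_particles, T=1.0, seed=1,
                          xi=0.5, error=1e-3, max_strain=0.5,
                          nlist_type="cell")
    hoomd.run(1000)
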
@@ -31,7 +31,7 @@ class PSEv1(hoomd.md.integrate._integration_method): def __init__(self, group, T, seed=0, xi = 0.5, error = 0.001, function_form = None, max_strain = 0.5, nlist_type = "cell" ): - # Print the status of the initialization + # Print the status of the initialization hoomd.util.print_status_line(); # initialize base class @@ -52,9 +52,9 @@ def __init__(self, group, T, seed=0, xi = 0.5, error = 0.001, function_form = No hoomd.context.msg.error("Sorry, we have not written CPU code for PSE RPY simulation. \n"); raise RuntimeError('Error creating Stokes'); else: - - # Create a neighborlist exclusively for real space interactions. Use cell lists by - # default, but also allow the user to specify + + # Create a neighborlist exclusively for real space interactions. Use cell lists by + # default, but also allow the user to specify if ( nlist_type.upper() == "CELL" ): cl_stokes = _hoomd.CellListGPU(hoomd.context.current.system_definition); diff --git a/PSEv1/module.cc b/PSEv1/module.cc index 32a4940..775d32b 100644 --- a/PSEv1/module.cc +++ b/PSEv1/module.cc @@ -13,7 +13,7 @@ PYBIND11_MODULE(_PSEv1, m) { #ifdef ENABLE_CUDA - export_Stokes(m); + export_Stokes(m); #endif export_ShearFunction(m); export_ShearFunctionWrap(m); diff --git a/PSEv1/variant.py b/PSEv1/variant.py index a73541e..5dafa38 100644 --- a/PSEv1/variant.py +++ b/PSEv1/variant.py @@ -23,7 +23,7 @@ def __init__(self, function_form, total_timestep, max_strain = 0.5): # initialize the base class _variant.__init__(self) - # check total_timestep is positive + # check total_timestep is positive if total_timestep <= 0: hoomd.context.msg.error("Cannot create a shear_variant with 0 or negative points\n") raise RuntimeError('Error creating variant') diff --git a/examples/run.py b/examples/run.py index d04bd1f..b8b80ed 100644 --- a/examples/run.py +++ b/examples/run.py @@ -8,7 +8,7 @@ # Time stepping information dt = 1e-3 # time step -tf = 1e0 # the final time of the simulation (in units of bare particle diffusion time) +tf = 10e0 # the final time of the simulation (in units of bare particle diffusion time) nrun = tf / dt # number of steps # Particle size @@ -25,8 +25,8 @@ os.mkdir( loc ) # Simple cubic crystal of 1000 particles -N = 1000; -L = 64 +N = 6400 +L = 40 n = math.ceil(N ** (1.0/3.0)) # number of particles along 1D a = L / n # spacing between particles
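
With the values set above (N = 6400 particles in a box of edge L = 40), the lattice arithmetic gives n = ceil(6400**(1/3)) = 19 particles along each edge and a spacing a = 40/19, roughly 2.1 in units of the unit particle radius assumed elsewhere in the plugin, i.e. a fairly dense simple cubic arrangement at roughly 42% volume fraction. A short Python sketch of the arithmetic and one possible lattice construction (illustrative only; the remainder of examples/run.py is not shown in this diff):

    import math

    N, L, a_p = 6400, 40.0, 1.0          # particle count, box edge, assumed particle radius
    n = math.ceil(N ** (1.0 / 3.0))      # 19 particles along each edge
    a = L / n                            # lattice spacing, about 2.105
    phi = N * (4.0 / 3.0) * math.pi * a_p ** 3 / L ** 3   # volume fraction, about 0.42

    # Simple cubic positions centered in the box, truncated to N particles
    positions = [(-L / 2 + (i + 0.5) * a,
                  -L / 2 + (j + 0.5) * a,
                  -L / 2 + (k + 0.5) * a)
                 for i in range(n)
                 for j in range(n)
                 for k in range(n)][:N]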