diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 588679d7..00000000 --- a/MANIFEST.in +++ /dev/null @@ -1,3 +0,0 @@ -global-include src/**/*.h src/**/*.cpp src/**/*.f src/**/*.f90 src/**/*.cmake src/**/CMakeLists.txt -include CMakeLists.txt -include README.md diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 3d498a52..36af8011 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -17,6 +17,7 @@ set(SRC ddx.f90 ddx_legacy.f90 ddx_multipolar_solutes.f90 + ddx_profiling.f90 llgnew.f cbessel.f90 ddx_cinterface.f90 diff --git a/src/ddx.f90 b/src/ddx.f90 index 4b67ef44..ba424b4d 100644 --- a/src/ddx.f90 +++ b/src/ddx.f90 @@ -390,8 +390,10 @@ subroutine ddrun(ddx_data, state, electrostatics, psi, tol, esolv, & return end if + call time_push() call setup(ddx_data % params, ddx_data % constants, & & ddx_data % workspace, state, electrostatics, psi, ddx_error) + call time_pull("setup") if (ddx_error % flag .ne. 0) then call update_error(ddx_error, "ddrun: setup returned an error, exiting") return @@ -399,24 +401,31 @@ subroutine ddrun(ddx_data, state, electrostatics, psi, tol, esolv, & ! solve the primal linear system if (do_guess) then + call time_push() call fill_guess(ddx_data % params, ddx_data % constants, & & ddx_data % workspace, state, tol, ddx_error) + call time_pull("guess") if (ddx_error % flag .ne. 0) then call update_error(ddx_error, & & "ddrun: fill_guess returned an error, exiting") return end if end if + + call time_push() call solve(ddx_data % params, ddx_data % constants, & & ddx_data % workspace, state, tol, ddx_error) + call time_pull("solve") if (ddx_error % flag .ne. 0) then call update_error(ddx_error, "ddrun: solve returned an error, exiting") return end if ! compute the energy + call time_push() call energy(ddx_data % params, ddx_data % constants, & & ddx_data % workspace, state, esolv, ddx_error) + call time_pull("energy") if (ddx_error % flag .ne. 
0) then call update_error(ddx_error, "ddrun: energy returned an error, exiting") return @@ -425,16 +434,20 @@ subroutine ddrun(ddx_data, state, electrostatics, psi, tol, esolv, & ! solve the primal linear system if (ddx_data % params % force .eq. 1) then if (do_guess) then + call time_push() call fill_guess_adjoint(ddx_data % params, ddx_data % constants, & & ddx_data % workspace, state, tol, ddx_error) + call time_pull("guess adjoint") if (ddx_error % flag .ne. 0) then call update_error(ddx_error, & & "ddrun: fill_guess_adjoint returned an error, exiting") return end if end if + call time_push() call solve_adjoint(ddx_data % params, ddx_data % constants, & & ddx_data % workspace, state, tol, ddx_error) + call time_pull("solve adjoint") if (ddx_error % flag .ne. 0) then call update_error(ddx_error, & & "ddrun: solve_adjoint returned an error, exiting") @@ -445,8 +458,10 @@ subroutine ddrun(ddx_data, state, electrostatics, psi, tol, esolv, & ! compute the forces if (ddx_data % params % force .eq. 1) then force = zero + call time_push() call solvation_force_terms(ddx_data % params, ddx_data % constants, & & ddx_data % workspace, state, electrostatics, force, ddx_error) + call time_pull("solvation forces") if (ddx_error % flag .ne. 0) then call update_error(ddx_error, & & "ddrun: solvation_force_terms returned an error, exiting") diff --git a/src/ddx_constants.f90 b/src/ddx_constants.f90 index 66577adc..c1934e6a 100644 --- a/src/ddx_constants.f90 +++ b/src/ddx_constants.f90 @@ -223,6 +223,19 @@ module ddx_constants !> Whether the diagonal of the matrices has to be used in the mvp for !! ddCOSMO, ddPCM or inner ddLPB iterations logical :: dodiag + !> List of exposed points in a CSR format. + integer, allocatable :: iexposed(:) + integer, allocatable :: exposed(:) + !> List of buried points in a CSR format. + integer, allocatable :: iburied(:) + integer, allocatable :: buried(:) + !> List of overlapped points in a CSR format. 
+ integer, allocatable :: ioverlap(:) + integer, allocatable :: overlap(:) + !> Adjoint list of overlapped points (the relation is not self + ! adjoint) in a CSR format. + integer, allocatable :: adj_ioverlap(:) + integer, allocatable :: adj_overlap(:) end type ddx_constants_type contains @@ -423,7 +436,7 @@ subroutine constants_init(params, constants, ddx_error) end do end if - ! Generate geometry-related constants (required by the LPB code) + ! Generate geometry-related constants call constants_geometry_init(params, constants, ddx_error) if (ddx_error % flag .ne. 0) then call update_error(ddx_error, "constants_geometry_init returned an " // & @@ -773,9 +786,9 @@ subroutine constants_geometry_init(params, constants, ddx_error) !! Local variables real(dp) :: swthr, v(3), maxv, ssqv, vv, t integer :: i, isph, jsph, inear, igrid, iwork, jwork, lwork, & - & old_lwork, icav, info + & old_lwork, icav, info, ij, adj_iwork integer, allocatable :: work(:, :), tmp_work(:, :) - real(dp) :: start_time + real(dp) :: start_time, thigh, vij(3), vvij, tij !! The code ! Prepare FMM structures if needed start_time = omp_get_wtime() @@ -992,6 +1005,98 @@ subroutine constants_geometry_init(params, constants, ddx_error) end if enddo enddo + + allocate(constants % iburied(params % nsph + 1), & + & constants % iexposed(params % nsph + 1), & + & constants % buried(params % nsph * params % ngrid), & + & constants % exposed(params % nsph * params % ngrid), stat=info) + if (info .ne. 0) then + call update_error(ddx_error, "Buried exposed lists allocation failed") + return + endif + iwork = 1 + do isph = 1, params % nsph + constants % iburied(isph) = iwork + do igrid = 1, params % ngrid + if (constants % ui(igrid, isph) .lt. one) then + !write(7, *) isph, igrid + constants % buried(iwork) = igrid + iwork = iwork + 1 + end if + end do + end do + constants % iburied(params % nsph + 1) = iwork + + !!do isph = 1, params % nsph + !! 
write(6,*) isph, constants % iburied(isph), constants % iburied(isph + 1) - 1 + !! do iwork = constants % iburied(isph), constants % iburied(isph + 1) - 1 + !! igrid = constants % buried(iwork) + !! write(8, *) isph, igrid + !! end do + !!end do + !!stop 0 + + allocate(constants % ioverlap(constants % inl(params % nsph+1)), & + & constants % adj_ioverlap(constants % inl(params % nsph+1)), & + & constants % overlap(constants % inl(params % nsph+1)*params % ngrid), & + & constants % adj_overlap(constants % inl(params % nsph+1)*params % ngrid), & + & stat=info) + if (info .ne. 0) then + call update_error(ddx_error, "Overlapped grid lists allocation failed") + return + end if + thigh = one + pt5*(params % se + one)*params % eta + iwork = 1 + adj_iwork = 1 + do isph = 1, params % nsph + do ij = constants % inl(isph), constants % inl(isph+1) - 1 + jsph = constants % nl(ij) + constants % ioverlap(ij) = iwork + constants % adj_ioverlap(ij) = adj_iwork + do igrid = 1, params % ngrid + if (constants % ui(igrid, isph) .lt. one) then + vij = params % csph(:,isph) & + & + params % rsph(isph)*constants % cgrid(:,igrid) & + & - params % csph(:,jsph) + vvij = sqrt(vij(1)*vij(1) + vij(2)*vij(2) & + & + vij(3)*vij(3)) + tij = vvij / params % rsph(jsph) + if (tij.lt.thigh) then + constants % overlap(iwork) = igrid + iwork = iwork + 1 + end if + end if + if (constants % ui(igrid, jsph) .lt. one) then + vij = params % csph(:,jsph) & + & + params % rsph(jsph)*constants % cgrid(:,igrid) & + & - params % csph(:,isph) + vvij = sqrt(vij(1)*vij(1) + vij(2)*vij(2) & + & + vij(3)*vij(3)) + tij = vvij / params % rsph(isph) + if (tij.lt.thigh) then + constants % adj_overlap(adj_iwork) = igrid + adj_iwork = adj_iwork + 1 + end if + end if + end do + end do + end do + constants % ioverlap(ij) = iwork + constants % adj_ioverlap(ij) = adj_iwork + + !do isph = 1, params % nsph + ! do ij = constants % inl(isph), constants % inl(isph+1) - 1 + ! jsph = constants % nl(ij) + ! 
!do ijgrid = constants % ioverlap(ij), constants % ioverlap(ij+1) - 1 + ! ! igrid = constants % overlap(ijgrid) + ! do ijgrid = constants % adj_ioverlap(ij), constants % adj_ioverlap(ij+1) - 1 + ! igrid = constants % adj_overlap(ijgrid) + ! write(6,*) isph, jsph, igrid + ! end do + ! end do + !end do + !stop 0 + ! Build cavity array. At first get total count for each sphere allocate(constants % ncav_sph(params % nsph), stat=info) if (info .ne. 0) then @@ -2144,6 +2249,34 @@ subroutine constants_free(constants, ddx_error) & "deallocation failed!") end if end if + if (allocated(constants % iburied)) then + deallocate(constants % iburied, stat=istat) + if (istat .ne. 0) then + call update_error(ddx_error, "`iburied` " // & + & "deallocation failed!") + end if + end if + if (allocated(constants % buried)) then + deallocate(constants % buried, stat=istat) + if (istat .ne. 0) then + call update_error(ddx_error, "`buried` " // & + & "deallocation failed!") + end if + end if + if (allocated(constants % iexposed)) then + deallocate(constants % iexposed, stat=istat) + if (istat .ne. 0) then + call update_error(ddx_error, "`iexposed` " // & + & "deallocation failed!") + end if + end if + if (allocated(constants % exposed)) then + deallocate(constants % exposed, stat=istat) + if (istat .ne. 0) then + call update_error(ddx_error, "`exposed` " // & + & "deallocation failed!") + end if + end if end subroutine constants_free end module ddx_constants diff --git a/src/ddx_core.f90 b/src/ddx_core.f90 index cc957e68..15860ef0 100644 --- a/src/ddx_core.f90 +++ b/src/ddx_core.f90 @@ -21,6 +21,7 @@ module ddx_core use ddx_harmonics ! 
Enable OpenMP use omp_lib +use ddx_profiling implicit none !> @defgroup Fortran_interface_core Fortran interface: core routines @@ -1390,15 +1391,22 @@ real(dp) function hnorm(lmax, nbasis, nsph, x) implicit none integer, intent(in) :: lmax, nbasis, nsph real(dp), dimension(nbasis, nsph), intent(in) :: x - integer :: isph - real(dp) :: vrms, fac + integer :: isph, l, ind, m + real(dp) :: vrms, fac, tmp - vrms = 0.0_dp + vrms = 0.0d0 !$omp parallel do default(none) shared(nsph,lmax,nbasis,x) & - !$omp private(isph,fac) schedule(dynamic) reduction(+:vrms) + !$omp private(isph,tmp,l,ind,fac,m) schedule(static,1) reduction(+:vrms) do isph = 1, nsph - call hsnorm(lmax, nbasis, x(:,isph), fac) - vrms = vrms + fac*fac + tmp = 0.0d0 + do l = 0, lmax + ind = l*l + l + 1 + fac = 1.0d0/(1.0d0 + dble(l)) + do m = -l, l + tmp = tmp + fac*x(ind+m,isph)*x(ind+m,isph) + end do + end do + vrms = vrms + tmp enddo hnorm = sqrt(vrms/dble(nsph)) end function hnorm @@ -2104,40 +2112,48 @@ subroutine tree_m2l_bessel_rotation(params, constants, node_m, node_l) real(dp) :: work(6*params % pm**2 + 19*params % pm + 8) complex(dp) :: work_complex(2*params % pm+1) ! Local variables - integer :: i, j, k - real(dp) :: c1(3), c(3), r1, r + integer :: i, j, k, nclusters, pm + real(dp) :: c1(3), c(3), r1, r, kappa ! Any order of this cycle is OK - !$omp parallel do default(none) shared(constants,params,node_m,node_l) & - !$omp private(i,c,r,k,c1,r1,work,work_complex) schedule(dynamic) - do i = 1, constants % nclusters - ! If no far admissible pairs just set output to zero - if (constants % nfar(i) .eq. 0) then - node_l(:, i) = zero - cycle - end if - c = constants % cnode(:, i) - r = constants % rnode(i) - ! 
Use the first far admissible pair to initialize output - k = constants % far(constants % sfar(i)) - c1 = constants % cnode(:, k) - r1 = constants % rnode(k) - c1 = params % kappa*(c1 - c) - call fmm_m2l_bessel_rotation_work(c1, & - & constants % SK_rnode(:, k), constants % SI_rnode(:, i), & - & params % pm, & - & constants % vscales, one, & - & node_m(:, k), zero, node_l(:, i), work, work_complex) - do j = constants % sfar(i)+1, constants % sfar(i+1)-1 - k = constants % far(j) - c1 = constants % cnode(:, k) - r1 = constants % rnode(k) - c1 = params % kappa*(c1 - c) - call fmm_m2l_bessel_rotation_work(c1, constants % SK_rnode(:, k), & - & constants % SI_rnode(:, i), params % pm, & - & constants % vscales, one, & - & node_m(:, k), one, node_l(:, i), work, work_complex) + nclusters = constants % nclusters + kappa = params % kappa + pm = params % pm + associate(nfar => constants % nfar, cnode => constants % cnode, & + & rnode => constants % rnode, far => constants % far, & + & sfar => constants % sfar, sk_rnode => constants % sk_rnode, & + & si_rnode => constants % si_rnode, & + & vscales => constants % vscales) + !$omp parallel do schedule(dynamic,1) firstprivate(nclusters,kappa,pm) & + !$omp private(i,c,r,k,c1,r1,j,work,work_complex) & + !$omp shared(nfar,node_l,cnode,rnode, & + !$omp far,sfar,sk_rnode,si_rnode,vscales,node_m) + do i = 1, nclusters + ! If no far admissible pairs just set output to zero + if (nfar(i) .eq. 0) then + node_l(:, i) = zero + cycle + end if + c = cnode(:, i) + r = rnode(i) + ! 
Use the first far admissible pair to initialize output + k = far(sfar(i)) + c1 = cnode(:, k) + r1 = rnode(k) + c1 = kappa*(c1 - c) + call fmm_m2l_bessel_rotation_work(c1, SK_rnode(:, k), & + & SI_rnode(:, i), pm, vscales, 1.0d0, node_m(:, k), & + & 0.0d0, node_l(:, i), work, work_complex) + do j = sfar(i)+1, sfar(i+1)-1 + k = far(j) + c1 = cnode(:, k) + r1 = rnode(k) + c1 = kappa*(c1 - c) + call fmm_m2l_bessel_rotation_work(c1, SK_rnode(:, k), & + & SI_rnode(:, i), pm, vscales, 1.0d0, node_m(:, k), & + & 1.0d0, node_l(:, i), work, work_complex) + end do end do - end do + end associate end subroutine tree_m2l_bessel_rotation !------------------------------------------------------------------------------ !> Adjoint transfer multipole local coefficients into local over a tree @@ -2298,7 +2314,7 @@ subroutine tree_l2p_bessel(params, constants, alpha, node_l, beta, grid_v) real(dp), intent(inout) :: grid_v(params % ngrid, params % nsph) ! Local variables real(dp) :: sph_l((params % pl+1)**2, params % nsph) - integer :: isph + integer :: isph, nsph external :: dgemm ! Init output if (beta .eq. zero) then @@ -2306,12 +2322,15 @@ subroutine tree_l2p_bessel(params, constants, alpha, node_l, beta, grid_v) else grid_v = beta * grid_v end if + nsph = params % nsph ! Get data from all clusters to spheres - !$omp parallel do default(none) shared(params,constants,node_l,sph_l) & - !$omp private(isph) schedule(dynamic) - do isph = 1, params % nsph - sph_l(:, isph) = node_l(:, constants % snode(isph)) - end do + associate(snode => constants % snode) + !$omp parallel do default(none) schedule(static,100) & + !$omp shared(node_l,sph_l,snode) private(isph) firstprivate(nsph) + do isph = 1, nsph + sph_l(:, isph) = node_l(:, snode(isph)) + end do + end associate ! 
Get values at grid points call dgemm('T', 'N', params % ngrid, params % nsph, & & (params % pl+1)**2, alpha, constants % vgrid, & @@ -2457,8 +2476,8 @@ subroutine tree_m2p_bessel(params, constants, p, alpha, sph_p, sph_m, beta, grid ! Output real(dp), intent(inout) :: grid_v(params % ngrid, params % nsph) ! Local variables - integer :: isph, inode, jnear, jnode, jsph, igrid - real(dp) :: c(3) + integer :: isph, inode, jnear, jnode, jsph, igrid, nsph, ngrid + real(dp) :: c(3), kappa ! Temporary workspace real(dp) :: work(p+1) complex(dp) :: work_complex(p+1) @@ -2468,32 +2487,44 @@ subroutine tree_m2p_bessel(params, constants, p, alpha, sph_p, sph_m, beta, grid else grid_v = beta * grid_v end if + nsph = params % nsph + ngrid = params % ngrid + kappa = params % kappa ! Cycle over all spheres - !$omp parallel do default(none) shared(params,constants,grid_v,p, & - !$omp alpha,sph_m), private(isph,inode,jnear,jnode,jsph,igrid,c,work, & - !$omp work_complex) schedule(dynamic) - do isph = 1, params % nsph - ! Cycle over all near-field admissible pairs of spheres - inode = constants % snode(isph) - do jnear = constants % snear(inode), constants % snear(inode+1)-1 - ! Near-field interactions are possible only between leaf nodes, - ! which must contain only a single input sphere - jnode = constants % near(jnear) - jsph = constants % order(constants % cluster(1, jnode)) - ! Ignore self-interaction - !if(isph .eq. jsph) cycle - ! Accumulate interaction for external grid points only - do igrid = 1, params % ngrid - if(constants % ui(igrid, isph) .eq. 
zero) cycle - c = constants % cgrid(:, igrid)*params % rsph(isph) - & - & params % csph(:, jsph) + params % csph(:, isph) - c = c * params % kappa - call fmm_m2p_bessel_work(c, p, constants % vscales, & - & constants % SK_ri(:, jsph), alpha, sph_m(:, jsph), one, & - & grid_v(igrid, isph), work_complex, work) + associate(snode => constants % snode, snear => constants % snear, & + & near => constants % near, order => constants % order, & + & cluster => constants % cluster, ui => constants % ui, & + & cgrid => constants % cgrid, rsph => params % rsph, & + & csph => params % csph, vscales => constants % vscales, & + & sk_ri => constants % sk_ri) + !$omp parallel do schedule(dynamic,1) default(none) & + !$omp firstprivate(nsph,ngrid,kappa,alpha,p) & + !$omp private(isph,inode,jnear,jnode,jsph,igrid,c, & + !$omp work_complex,work) shared(snode,snear,near,order, & + !$omp csph,cluster,ui,cgrid,rsph,vscales,sk_ri,sph_m,grid_v) + do isph = 1, nsph + ! Cycle over all near-field admissible pairs of spheres + inode = snode(isph) + do jnear = snear(inode), snear(inode+1)-1 + ! Near-field interactions are possible only between leaf nodes, + ! which must contain only a single input sphere + jnode = near(jnear) + jsph = order(cluster(1, jnode)) + ! Ignore self-interaction + !if(isph .eq. jsph) cycle + ! Accumulate interaction for external grid points only + do igrid = 1, ngrid + if(ui(igrid, isph) .eq. 
zero) cycle + c = cgrid(:, igrid)*rsph(isph) - csph(:, jsph) & + & + csph(:, isph) + c = c * kappa + call fmm_m2p_bessel_work(c, p, vscales, & + & sk_ri(:, jsph), alpha, sph_m(:, jsph), 1.0d0, & + & grid_v(igrid, isph), work_complex, work) + end do end do end do - end do + end associate end subroutine tree_m2p_bessel !------------------------------------------------------------------------------ diff --git a/src/ddx_driver.f90 b/src/ddx_driver.f90 index 078551f5..422189b8 100644 --- a/src/ddx_driver.f90 +++ b/src/ddx_driver.f90 @@ -114,6 +114,7 @@ program main finish_time = omp_get_wtime() write(*, 100) "Psi time:", finish_time-start_time, " seconds" +call time_push() if (ddx_data % params % force .eq. 1) then allocate(force(3, ddx_data % params % nsph), stat=info) if (info .ne. 0) then @@ -126,6 +127,7 @@ program main call ddrun(ddx_data, state, electrostatics, psi, tol, esolv, ddx_error) end if call check_error(ddx_error) +call time_pull("ddrun") if (ddx_data % params % force .eq. 1) write(*, 100) & & "solvation force terms time:", state % force_time, " seconds" @@ -134,9 +136,11 @@ program main ! forces. if (ddx_data % params % force .eq. 1) then start_time = omp_get_wtime() + call time_push() call multipole_force_terms(ddx_data % params, ddx_data % constants, & & ddx_data % workspace, state, 0, multipoles, force, ddx_error) - call check_error(ddx_error) + call time_pull("solute forces") + call check_error(ddx_error) finish_time = omp_get_wtime() write(*, 100) "multipolar force terms time:", & & finish_time - start_time, " seconds" diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90 index dedbf00f..89d60296 100644 --- a/src/ddx_operators.f90 +++ b/src/ddx_operators.f90 @@ -34,57 +34,107 @@ subroutine lx(params, constants, workspace, x, y, ddx_error) real(dp), intent(out) :: y(constants % nbasis, params % nsph) type(ddx_error_type), intent(inout) :: ddx_error !! 
Local variables - integer :: isph, jsph, ij, l, ind, iproc + integer :: isph, jsph, ij, l, ind, its, m, ijits + real(dp) :: vij(3), tij, vvij, xij, oij, fac + + integer :: vgrid_nbasis, nbasis, ngrid, nsph, lmax + real(dp) :: se, eta, work(params % lmax + 1), tmp_grid(params % ngrid) + + call time_push() ! dummy operation on unused interface arguments if (ddx_error % flag .eq. 0) continue + if (workspace % xs_time .eq. 0) continue + + nsph = params % nsph + nbasis = constants % nbasis + lmax = params % lmax - !! Initialize - y = zero -! -! incore code: -! if (params % matvecmem .eq. 1) then - !$omp parallel do default(none) shared(params,constants,x,y) & - !$omp private(isph,ij,jsph) schedule(dynamic) - do isph = 1, params % nsph - do ij = constants % inl(isph), constants % inl(isph + 1) - 1 - jsph = constants % nl(ij) - call dgemv('n', constants % nbasis, constants % nbasis, one, & - & constants % l(:,:,ij), constants % nbasis, x(:,jsph), 1, & - & one, y(:,isph), 1) + associate(inl => constants % inl, nl => constants % nl, & + & l => constants % l) + !$omp parallel do default(none) shared(inl,nl,l,x,y) & + !$omp firstprivate(nsph,nbasis) & + !$omp private(isph,ij,jsph) schedule(dynamic,1) + do isph = 1, nsph + y(:,isph) = 0.0d0 + do ij = inl(isph), inl(isph + 1) - 1 + jsph = nl(ij) + call dgemv('n', nbasis, nbasis, 1.0d0, l(:,:,ij), & + & nbasis, x(:,jsph), 1, 1.0d0, y(:,isph), 1) + end do end do - end do + end associate else - !$omp parallel do default(none) shared(params,constants,workspace,x,y) & - !$omp private(isph,iproc) schedule(dynamic) - do isph = 1, params % nsph - iproc = omp_get_thread_num() + 1 - ! Compute NEGATIVE action of off-digonal blocks - call calcv(params, constants, isph, workspace % tmp_pot(:, iproc), x, & - & workspace % tmp_work(:, iproc)) - call ddintegrate(1, constants % nbasis, params % ngrid, & - & constants % vwgrid, constants % vgrid_nbasis, & - & workspace % tmp_pot(:, iproc), y(:, isph)) - ! now, fix the sign. 
- y(:, isph) = - y(:, isph) - end do + se = params % se + eta = params % eta + ngrid = params % ngrid + vgrid_nbasis = constants % vgrid_nbasis + + associate(inl => constants % inl, nl => constants % nl, & + & csph => params % csph, rsph => params % rsph, & + & cgrid => constants % cgrid, fi => constants % fi, & + & vscales_rel => constants % vscales_rel, & + & vwgrid => constants % vwgrid, & + & ioverlap => constants % ioverlap, & + & overlap => constants % overlap) + + !$omp parallel do default(none) schedule(dynamic,1) & + !$omp firstprivate(nsph,se,eta,lmax,nbasis,ngrid, & + !$omp vgrid_nbasis) shared(inl,nl,csph,rsph,cgrid,fi, & + !$omp vscales_rel,x,y,vwgrid,ioverlap,overlap) private( & + !$omp isph,tmp_grid,its,ij,jsph,vij,vvij,tij,xij,oij, & + !$omp work,ijits) + do isph = 1, nsph + tmp_grid(:) = 0.0d0 + do ij = inl(isph), inl(isph+1)-1 + jsph = nl(ij) + do ijits = ioverlap(ij), ioverlap(ij+1) - 1 + its = overlap(ijits) + vij = csph(:,isph) + rsph(isph)*cgrid(:,its) & + & - csph(:,jsph) + vvij = sqrt(vij(1)*vij(1) + vij(2)*vij(2) & + & + vij(3)*vij(3)) + tij = vvij / rsph(jsph) + xij = fsw(tij, se, eta) + if (fi(its, isph).gt.1.0d0) then + oij = xij / fi(its, isph) + else + oij = xij + end if + call fmm_l2p_work(vij, rsph(jsph), lmax, & + & vscales_rel, oij, x(:, jsph), 1.0d0, & + & tmp_grid(its), work) + end do + end do + call ddintegrate(1, nbasis, ngrid, vwgrid, vgrid_nbasis, & + & tmp_grid, y(:, isph)) + y(:, isph) = - y(:, isph) + end do + end associate end if -! -! if required, add the diagonal. -! - if (constants % dodiag) then - do isph = 1, params % nsph - do l = 0, params % lmax - ind = l*l + l + 1 - y(ind-l:ind+l, isph) = y(ind-l:ind+l, isph) + & - x(ind-l:ind+l, isph) / (constants % vscales(ind)**2) + + ! if required, add the diagonal. 
+ if (constants % dodiag) then + associate(vscales => constants % vscales) + !$omp parallel do collapse(2) default(none) & + !$omp shared(vscales,x,y) firstprivate(nsph,lmax) & + !$omp private(isph,l,ind,m,fac) schedule(static,10) + do isph = 1, nsph + do l = 0, lmax + ind = l*l + l + 1 + fac = vscales(ind)**2 + do m = -l, l + y(ind+m, isph) = y(ind+m, isph) + x(ind+m, isph)/fac + end do + end do end do - end do + end associate end if + call time_pull("lx") end subroutine lx -!> Adjoint single layer operator matvec +!> Adjoint single layer operator matvec subroutine lstarx(params, constants, workspace, x, y, ddx_error) implicit none !! Inputs @@ -97,55 +147,113 @@ subroutine lstarx(params, constants, workspace, x, y, ddx_error) real(dp), intent(out) :: y(constants % nbasis, params % nsph) type(ddx_error_type), intent(inout) :: ddx_error !! Local variables - integer :: isph, jsph, ij, indmat, igrid, l, ind, iproc + integer :: isph, jsph, ij, indmat, igrid, l, ind, m, i, ijigrid + real(dp) :: vji(3), vvji, tji, xji, oji, fac + integer :: nsph, ngrid, nbasis, lmax + real(dp) :: thigh, se, eta, work(params % lmax + 1), & + & tmp(constants % nbasis) + + call time_push() ! dummy operation on unused interface arguments if (ddx_error % flag .eq. 0) continue - y = zero + nsph = params % nsph + nbasis = constants % nbasis + if (params % matvecmem .eq. 
1) then - !$omp parallel do default(none) shared(params,constants,x,y) & - !$omp private(isph,ij,jsph,indmat) schedule(dynamic) - do isph = 1, params % nsph - do ij = constants % inl(isph), constants % inl(isph + 1) - 1 - jsph = constants % nl(ij) - indmat = constants % itrnl(ij) - call dgemv('t', constants % nbasis, constants % nbasis, one, & - & constants % l(:,:,indmat), constants % nbasis, x(:,jsph), 1, & - & one, y(:,isph), 1) + associate(inl => constants % inl, nl => constants % nl, & + & itrnl => constants % itrnl, l => constants % l) + !$omp parallel do default(none) schedule(dynamic,1) & + !$omp firstprivate(nsph,nbasis) shared(x,y,inl,nl,itrnl,l) & + !$omp private(isph,ij,jsph,indmat) + do isph = 1, nsph + y(:,isph) = 0.0d0 + do ij = inl(isph), inl(isph + 1) - 1 + jsph = nl(ij) + indmat = itrnl(ij) + call dgemv('t', nbasis, nbasis, 1.0d0, & + & l(:,:,indmat), nbasis, x(:,jsph), 1, & + & 1.0d0, y(:,isph), 1) + end do end do - end do + end associate else + ngrid = params % ngrid + thigh = one + (params % se+one)/two*params % eta + se = params % se + eta = params % eta + lmax = params % lmax ! Expand x over spherical harmonics - !$omp parallel do default(none) shared(params,constants,workspace,x) & - !$omp private(isph,igrid) schedule(dynamic) - do isph = 1, params % nsph - do igrid = 1, params % ngrid - workspace % tmp_grid(igrid, isph) = dot_product(x(:, isph), & - & constants % vgrid(:constants % nbasis, igrid)) + associate(vgrid => constants % vgrid, & + & tmp_grid => workspace % tmp_grid) + !$omp parallel do collapse(2) default(none) & + !$omp firstprivate(nsph,ngrid,nbasis) shared(x,tmp_grid,vgrid) & + !$omp private(isph,igrid) schedule(static,100) + do isph = 1, nsph + do igrid = 1, ngrid + tmp_grid(igrid,isph) = dot_product(x(:, isph), & + & vgrid(:nbasis, igrid)) + end do end do - end do - ! 
Compute (negative) action - !$omp parallel do default(none) shared(params,constants,workspace,x,y) & - !$omp private(isph,iproc) schedule(dynamic) - do isph = 1, params % nsph - iproc = omp_get_thread_num() + 1 - call adjrhs(params, constants, isph, workspace % tmp_grid, & - & y(:, isph), workspace % tmp_work(:, iproc)) - ! fix the sign - y(:, isph) = - y(:, isph) - end do + end associate + associate(inl => constants % inl, nl => constants % nl, & + & csph => params % csph, rsph => params % rsph, & + & cgrid => constants % cgrid, fi => constants % fi, & + & wgrid => constants % wgrid, & + & tmp_grid => workspace % tmp_grid, & + & vscales_rel => constants % vscales_rel, & + & adj_ioverlap => constants % adj_ioverlap, & + & adj_overlap => constants % adj_overlap) + !$omp parallel do default(none) schedule(dynamic,1) & + !$omp shared(inl,nl,csph,rsph,cgrid,fi,wgrid,tmp_grid,y, & + !$omp adj_overlap,adj_ioverlap,vscales_rel) firstprivate( & + !$omp nsph,ngrid,thigh,se,eta,lmax) private(isph,ij,jsph, & + !$omp igrid,vji,vvji,tji,xji,oji,fac,work,i,ijigrid,tmp) + do isph = 1, nsph + tmp = 0.0d0 + do ij = inl(isph), inl(isph+1)-1 + jsph = nl(ij) + do ijigrid = adj_ioverlap(ij), adj_ioverlap(ij+1) - 1 + igrid = adj_overlap(ijigrid) + + vji = csph(:,jsph) + rsph(jsph)*cgrid(:,igrid) & + & - csph(:,isph) + vvji = sqrt(vji(1)*vji(1) + vji(2)*vji(2) & + & + vji(3)*vji(3)) + tji = vvji/rsph(isph) + xji = fsw(tji, se, eta) + if (fi(igrid,jsph).gt.1.0d0) then + oji = xji/fi(igrid,jsph) + else + oji = xji + end if + fac = wgrid(igrid) * tmp_grid(igrid, jsph) * oji + call fmm_l2p_adj_work(vji, fac, rsph(isph), & + & lmax, vscales_rel, 1.0d0, tmp, work) + end do + end do + y(:, isph) = - tmp + end do + end associate end if if (constants % dodiag) then - ! 
Loop over harmonics - do isph = 1, params % nsph - do l = 0, params % lmax - ind = l*l + l + 1 - y(ind-l:ind+l, isph) = y(ind-l:ind+l, isph) + & - & x(ind-l:ind+l, isph) / (constants % vscales(ind)**2) + associate(vscales => constants % vscales) + !$omp parallel do collapse(2) default(none) & + !$omp shared(vscales,x,y) firstprivate(nsph,lmax) & + !$omp private(isph,l,ind,m,fac) schedule(static,10) + do isph = 1, nsph + do l = 0, lmax + ind = l*l + l + 1 + fac = vscales(ind)**2 + do m = -l, l + y(ind+m, isph) = y(ind+m, isph) + x(ind+m, isph)/fac + end do + end do end do - end do + end associate end if + call time_pull("lstarx") end subroutine lstarx !> Diagonal preconditioning for Lx operator @@ -163,22 +271,30 @@ subroutine ldm1x(params, constants, workspace, x, y, ddx_error) real(dp), intent(out) :: y(constants % nbasis, params % nsph) type(ddx_error_type), intent(inout) :: ddx_error !! Local variables - integer :: isph, l, ind + integer :: isph, l, ind, nsph, lmax, m + real(dp) :: fac ! dummy operation on unused interface arguments if ((ddx_error % flag .eq. 0) .or. & - & (allocated(workspace % tmp_pot))) continue + & (allocated(workspace % tmp_pot))) continue - !! 
Loop over harmonics - !$omp parallel do default(none) shared(params,constants,x,y) & - !$omp private(isph,l,ind) schedule(dynamic) - do isph = 1, params % nsph - do l = 0, params % lmax - ind = l*l + l + 1 - y(ind-l:ind+l, isph) = x(ind-l:ind+l, isph) & - & *(constants % vscales(ind)**2) + nsph = params % nsph + lmax = params % lmax + + associate(vscales => constants % vscales) + !$omp parallel do collapse(2) default(none) & + !$omp shared(vscales,x,y) firstprivate(nsph,lmax) & + !$omp private(isph,l,ind,m,fac) schedule(static,10) + do isph = 1, nsph + do l = 0, lmax + ind = l*l + l + 1 + fac = vscales(ind)**2 + do m = -l, l + y(ind+m, isph) = x(ind+m, isph)*fac + end do + end do end do - end do + end associate end subroutine ldm1x !> Double layer operator matvec without diagonal blocks @@ -700,46 +816,133 @@ subroutine bstarx(params, constants, workspace, x, y, ddx_error) real(dp), intent(out) :: y(constants % nbasis, params % nsph) type(ddx_error_type), intent(inout) :: ddx_error ! Local variables - integer :: isph, jsph, ij, indmat, iproc + integer :: isph, jsph, ij, indmat, ind, ijigrid, igrid, l, m, & + & nsph, nbasis, lmax, ngrid + real(dp) :: thigh, se, eta, kappa, vji(3), vvji, sji(3), tji, & + & rho, ctheta, stheta, cphi, sphi, xji, oji, fac, & + & basloc((params % lmax + 1)**2), vcos(params % lmax + 1), & + & vsin(params % lmax + 1), vplm((params % lmax + 1)**2), & + & tmp(constants % nbasis), fac_hsp(constants % nbasis), & + & si_rjin(0:params % lmax), di_rjin(0:params % lmax) + complex(dp) :: work_complex(max(2, params % lmax+1)) + + call time_push() + + nsph = params % nsph + nbasis = constants % nbasis ! dummy operation on unused interface arguments if (ddx_error % flag .eq. 0) continue - y = zero if (params % matvecmem .eq. 
1) then - !$omp parallel do default(none) shared(params,constants,x,y) & - !$omp private(isph,ij,jsph,indmat) schedule(dynamic) - do isph = 1, params % nsph - do ij = constants % inl(isph), constants % inl(isph + 1) - 1 - jsph = constants % nl(ij) - indmat = constants % itrnl(ij) - call dgemv('t', constants % nbasis, constants % nbasis, one, & - & constants % b(:,:,indmat), constants % nbasis, x(:,jsph), 1, & - & one, y(:,isph), 1) + associate(inl => constants % inl, nl => constants % nl, & + & itrnl => constants % itrnl, b => constants % b) + !$omp parallel do default(none) schedule(dynamic,1) & + !$omp firstprivate(nsph,nbasis) shared(x,y,inl,nl,itrnl,b) & + !$omp private(isph,ij,jsph,indmat) + do isph = 1, nsph + y(:,isph) = zero + do ij = inl(isph), inl(isph + 1) - 1 + jsph = nl(ij) + indmat = itrnl(ij) + call dgemv('t', nbasis, nbasis, 1.0d0, & + & b(:,:,indmat), nbasis, x(:,jsph), 1, & + & 1.0d0, y(:,isph), 1) + end do end do - end do + end associate else - !$omp parallel do default(none) shared(params,constants,workspace,x,y) & - !$omp private(isph) schedule(static,1) - do isph = 1, params % nsph - call dgemv('t', constants % nbasis, params % ngrid, one, constants % vgrid, & - & constants % vgrid_nbasis, x(:, isph), 1, zero, & - & workspace % tmp_grid(:, isph), 1) - end do - !$omp parallel do default(none) shared(params,constants,workspace,x,y) & - !$omp private(isph,iproc) schedule(dynamic) - do isph = 1, params % nsph - iproc = omp_get_thread_num() + 1 - call adjrhs_lpb(params, constants, isph, workspace % tmp_grid, & - & y(:, isph), workspace % tmp_vylm(:, iproc), workspace % tmp_vplm(:, iproc), & - & workspace % tmp_vcos(:, iproc), workspace % tmp_vsin(:, iproc), & - & workspace % tmp_bessel(:, iproc)) - y(:,isph) = - y(:,isph) - end do + ngrid = params % ngrid + thigh = one + (params % se+one)/two*params % eta + se = params % se + eta = params % eta + lmax = params % lmax + kappa = params % kappa + + associate(vgrid => constants % vgrid, & + & tmp_grid 
=> workspace % tmp_grid) + !$omp parallel do collapse(2) default(none) & + !$omp firstprivate(nsph,ngrid,nbasis) shared(x,tmp_grid,vgrid) & + !$omp private(isph,igrid) schedule(static,100) + do isph = 1, nsph + do igrid = 1, ngrid + tmp_grid(igrid,isph) = dot_product(x(:, isph), & + & vgrid(:nbasis, igrid)) + end do + end do + end associate + + associate(inl => constants % inl, nl => constants % nl, & + & csph => params % csph, rsph => params % rsph, & + & cgrid => constants % cgrid, fi => constants % fi, & + & wgrid => constants % wgrid, & + & tmp_grid => workspace % tmp_grid, & + & vscales => constants % vscales, & + & adj_ioverlap => constants % adj_ioverlap, & + & adj_overlap => constants % adj_overlap, & + & si_ri => constants % si_ri) + !$omp parallel do default(none) schedule(dynamic,1) & + !$omp firstprivate(nsph,ngrid,thigh,se,eta,lmax,kappa) & + !$omp shared(inl,nl,csph,rsph,cgrid,fi,wgrid,tmp_grid,y, & + !$omp adj_overlap,adj_ioverlap,vscales,si_ri) & + !$omp private(isph,ij,jsph,ijigrid,igrid,vji,vvji,tji,sji, & + !$omp rho,ctheta,stheta,cphi,sphi,basloc,vplm,vcos,vsin,tmp, & + !$omp si_rjin,di_rjin,work_complex,fac_hsp,xji,oji,fac,ind,l,m) + do isph = 1, nsph + tmp = 0.0d0 + do ij = inl(isph), inl(isph+1)-1 + jsph = nl(ij) + do ijigrid = adj_ioverlap(ij), adj_ioverlap(ij+1) - 1 + igrid = adj_overlap(ijigrid) + + vji = csph(:,jsph) + rsph(jsph)*cgrid(:,igrid) & + & - csph(:,isph) + vvji = sqrt(vji(1)*vji(1) + vji(2)*vji(2) & + & + vji(3)*vji(3)) + tji = vvji/rsph(isph) + sji = vji/vvji + + call ylmbas(sji, rho, ctheta, stheta, cphi, sphi, & + & lmax, vscales, basloc, vplm, vcos, vsin) + + si_rjin = 0.0d0 + di_rjin = 0.0d0 + + call modified_spherical_bessel_first_kind(lmax, & + & vvji*kappa, si_rjin, di_rjin, work_complex) + + do l = 0, lmax + ind = l*l + l + 1 + do m = -l, l + fac_hsp(ind+m) = SI_rjin(l)/SI_ri(l,isph)*basloc(ind+m) + end do + end do + + xji = fsw(tji, se, eta) + if (fi(igrid,jsph).gt.1.0d0) then + oji = xji/fi(igrid,jsph) + else + oji = xji
+ end if + + fac = wgrid(igrid) * tmp_grid(igrid,jsph) * oji + do l = 0, lmax + ind = l*l + l + 1 + do m = -l, l + tmp(ind+m) = tmp(ind+m) + fac*fac_hsp(ind+m) + end do + end do + end do + end do + y(:, isph) = - tmp + end do + end associate end if ! add the diagonal if required if (constants % dodiag) y = y + x + call time_pull("bstarx") + end subroutine bstarx !> Primal HSP matrix vector product @@ -751,16 +954,28 @@ subroutine bx(params, constants, workspace, x, y, ddx_error) real(dp), dimension(constants % nbasis, params % nsph), intent(in) :: x real(dp), dimension(constants % nbasis, params % nsph), intent(out) :: y type(ddx_error_type), intent(inout) :: ddx_error - integer :: isph, jsph, ij, iproc + integer :: isph, jsph, ij, ijits, its + + integer :: nsph, nbasis, lmax, ngrid, vgrid_nbasis + real(dp) :: se, eta, vij(3), vvij, tij, xij, oij, vtij(3), kappa + complex(dp) :: work_complex(params % lmax+1) + real(dp) :: work(params % lmax+1), tmp_grid(params % ngrid) + + call time_push() ! dummy operation on unused interface arguments if (ddx_error % flag .eq. 0) continue + if (workspace % xs_time .eq. 0) continue + + nsph = params % nsph + nbasis = constants % nbasis + lmax = params % lmax - y = zero if (params % matvecmem .eq. 
1) then !$omp parallel do default(none) shared(params,constants,x,y) & !$omp private(isph,ij,jsph) schedule(dynamic) do isph = 1, params % nsph + y(:, isph) = 0.0d0 do ij = constants % inl(isph), constants % inl(isph + 1) - 1 jsph = constants % nl(ij) call dgemv('n', constants % nbasis, constants % nbasis, one, & @@ -768,18 +983,58 @@ subroutine bx(params, constants, workspace, x, y, ddx_error) & one, y(:,isph), 1) end do end do - else - !$omp parallel do default(none) shared(params,constants,workspace,x,y) & - !$omp private(isph,iproc) schedule(dynamic) - do isph = 1, params % nsph - iproc = omp_get_thread_num() + 1 - call calcv2_lpb(params, constants, isph, workspace % tmp_pot(:, iproc), x) - call ddintegrate(1, constants % nbasis, params % ngrid, constants % vwgrid, & - & constants % vgrid_nbasis, workspace % tmp_pot(:, iproc), y(:,isph)) - y(:,isph) = - y(:,isph) - end do + else + se = params % se + eta = params % eta + ngrid = params % ngrid + kappa = params % kappa + vgrid_nbasis = constants % vgrid_nbasis + + associate(inl => constants % inl, nl => constants % nl, & + & csph => params % csph, rsph => params % rsph, & + & cgrid => constants % cgrid, fi => constants % fi, & + & vscales => constants % vscales, & + & vwgrid => constants % vwgrid, & + & ioverlap => constants % ioverlap, & + & overlap => constants % overlap, & + & si_ri => constants % si_ri) + !$omp parallel do default(none) schedule(dynamic,1) & + !$omp firstprivate(nsph,se,eta,lmax,nbasis,ngrid, & + !$omp vgrid_nbasis,kappa) shared(inl,nl,csph,rsph,cgrid,fi, & + !$omp vscales,x,y,vwgrid,ioverlap,overlap,si_ri) private( & + !$omp isph,tmp_grid,its,ij,jsph,vij,vvij,tij,xij,oij,vtij, & + !$omp work,work_complex,ijits) + do isph = 1, nsph + tmp_grid(:) = 0.0d0 + do ij = inl(isph), inl(isph+1)-1 + jsph = nl(ij) + do ijits = ioverlap(ij), ioverlap(ij+1) - 1 + its = overlap(ijits) + vij = csph(:,isph) + rsph(isph)*cgrid(:,its) & + & - csph(:,jsph) + vvij = sqrt(vij(1)*vij(1) + vij(2)*vij(2) & + & + 
vij(3)*vij(3)) + tij = vvij/rsph(jsph) + xij = fsw(tij, se, eta) + if (fi(its,isph).gt.1.0d0) then + oij = xij/fi(its, isph) + else + oij = xij + end if + vtij = vij*kappa + call fmm_l2p_bessel_work(vtij, lmax, vscales, & + & si_ri(:, jsph), oij, x(:, jsph), 1.0d0, & + & tmp_grid(its), work_complex, work) + end do + end do + call ddintegrate(1, nbasis, ngrid, vwgrid, vgrid_nbasis, & + & tmp_grid, y(:, isph)) + y(:, isph) = - y(:, isph) + end do + end associate end if if (constants % dodiag) y = y + x + call time_pull("bx") end subroutine bx !> Adjoint ddLPB matrix-vector product @@ -850,8 +1105,10 @@ subroutine prec_tstarx(params, constants, workspace, x, y, ddx_error) y(:,:,1) = x(:,:,1) call convert_ddcosmo(params, constants, 1, y(:,:,1)) n_iter = params % maxiter - call jacobi_diis(params, constants, workspace, constants % inner_tol, & - & y(:,:,1), workspace % ddcosmo_guess, n_iter, x_rel_diff, lstarx, & + ! the empirical 10^-2 factor reduces the number of macro iterations + call jacobi_diis(params, constants, workspace, & + & constants % inner_tol*1.0d-2, y(:,:,1), & + & workspace % ddcosmo_guess, n_iter, x_rel_diff, lstarx, & & ldm1x, hnorm, ddx_error) if (ddx_error % flag .ne. 0) then call update_error(ddx_error, 'prec_tstarx: ddCOSMO failed to ' // & @@ -863,9 +1120,9 @@ subroutine prec_tstarx(params, constants, workspace, x, y, ddx_error) start_time = omp_get_wtime() n_iter = params % maxiter - call jacobi_diis(params, constants, workspace, constants % inner_tol, & - & x(:,:,2), workspace % hsp_guess, n_iter, x_rel_diff, bstarx, & - & bx_prec, hnorm, ddx_error) + call jacobi_diis(params, constants, workspace, & + & constants % inner_tol*1.0d1, x(:,:,2), workspace % hsp_guess, & + & n_iter, x_rel_diff, bstarx, bx_prec, hnorm, ddx_error) if (ddx_error % flag .ne. 0) then call update_error(ddx_error, 'prec_tstarx: HSP failed to ' // & & 'converge, exiting') @@ -899,9 +1156,9 @@ subroutine prec_tx(params, constants, workspace, x, y, ddx_error) ! 
perform A^-1 * Yr start_time = omp_get_wtime() n_iter = params % maxiter - call jacobi_diis(params, constants, workspace, constants % inner_tol, & - & x(:,:,1), workspace % ddcosmo_guess, n_iter, x_rel_diff, lx, & - & ldm1x, hnorm, ddx_error) + call jacobi_diis(params, constants, workspace, & + & constants % inner_tol, x(:,:,1), workspace % ddcosmo_guess, & + & n_iter, x_rel_diff, lx, ldm1x, hnorm, ddx_error) if (ddx_error % flag .ne. 0) then call update_error(ddx_error, 'prec_tx: ddCOSMO failed to ' // & & 'converge, exiting') @@ -916,9 +1173,9 @@ subroutine prec_tx(params, constants, workspace, x, y, ddx_error) ! perform B^-1 * Ye start_time = omp_get_wtime() n_iter = params % maxiter - call jacobi_diis(params, constants, workspace, constants % inner_tol, & - & x(:,:,2), workspace % hsp_guess, n_iter, x_rel_diff, bx, & - & bx_prec, hnorm, ddx_error) + call jacobi_diis(params, constants, workspace, & + & constants % inner_tol, x(:,:,2), workspace % hsp_guess, & + & n_iter, x_rel_diff, bx, bx_prec, hnorm, ddx_error) y(:,:,2) = workspace % hsp_guess workspace % hsp_time = workspace % hsp_time + omp_get_wtime() - start_time @@ -948,6 +1205,8 @@ subroutine cstarx(params, constants, workspace, x, y, ddx_error) real(dp) :: val, epsilon_ratio real(dp), allocatable :: scratch(:,:), scratch0(:,:) + call time_push() + ! dummy operation on unused interface arguments if (ddx_error % flag .eq. 
0) continue @@ -1043,6 +1302,8 @@ subroutine cstarx(params, constants, workspace, x, y, ddx_error) end do end do + call time_pull("cstarx") + end subroutine cstarx !> ddLPB matrix-vector product @@ -1055,15 +1316,15 @@ subroutine cx(params, constants, workspace, x, y, ddx_error) real(dp), dimension(constants % nbasis, params % nsph, 2), intent(out) :: y type(ddx_error_type), intent(inout) :: ddx_error - integer :: isph, jsph, igrid, ind, l, m, ind0 - real(dp), dimension(3) :: vij, vtij - real(dp) :: val + integer :: isph, jsph, igrid, ind, l, m, indl, inode, info, & + & nsph, lmax, nbasis, nbasis0, lmax0, ngrid + real(dp) :: vij(3), vtij(3), val, epsp, eps, fac, & + & work(constants % lmax0 + 1) complex(dp) :: work_complex(constants % lmax0 + 1) - real(dp) :: work(constants % lmax0 + 1) - integer :: indl, inode, info real(dp), allocatable :: diff_re(:,:), diff0(:,:) + call time_push() allocate(diff_re(constants % nbasis, params % nsph), & & diff0(constants % nbasis0, params % nsph), stat=info) if (info.ne.0) then @@ -1071,45 +1332,67 @@ return end if - !
diff_re = params % epsp/eps*l1/ri*Xr - i'(ri)/i(ri)*Xe, - diff_re = zero - !$omp parallel do default(none) shared(params,diff_re, & - !$omp constants,x) private(jsph,l,m,ind) - do jsph = 1, params % nsph - do l = 0, params % lmax - do m = -l,l - ind = l**2 + l + m + 1 - diff_re(ind,jsph) = (params % epsp/params % eps)* & - & (l/params % rsph(jsph))*x(ind,jsph,1) & - & - constants % termimat(l,jsph)*x(ind,jsph,2) + nsph = params % nsph + lmax = params % lmax + lmax0 = constants % lmax0 + epsp = params % epsp + eps = params % eps + nbasis = constants % nbasis + nbasis0 = constants % nbasis0 + ngrid = params % ngrid + + call time_push() + associate(rsph => params % rsph, termimat => constants % termimat) + !$omp parallel do collapse(2) default(none) schedule(static,100) & + !$omp firstprivate(nsph,lmax,epsp,eps) private(jsph,l,ind,m) & + !$omp shared(diff_re,rsph,x,termimat) + do jsph = 1, nsph + do l = 0, lmax + ind = l**2 + l + 1 + do m = -l, l + diff_re(ind+m,jsph) = (epsp/eps)*(l/rsph(jsph)) & + & *x(ind+m,jsph,1) - termimat(l,jsph)*x(ind+m,jsph,2) + end do + end do end do - end do - end do + end associate + call time_pull("cx1") ! diff0 = Pchi * diff_er, linear scaling - !$omp parallel do default(none) shared(constants,params, & - !$omp diff_re,diff0) private(jsph) - do jsph = 1, params % nsph - call dgemv('t', constants % nbasis, constants % nbasis0, one, & - & constants % pchi(1,1,jsph), constants % nbasis, & - & diff_re(1,jsph), 1, zero, diff0(1,jsph), 1) - end do + call time_push() + associate(pchi => constants % pchi) + !$omp parallel do default(none) schedule(static,10) & + !$omp firstprivate(nsph,nbasis,nbasis0) private(jsph) & + !$omp shared(pchi,diff_re,diff0) + do jsph = 1, nsph + call dgemv('t', nbasis, nbasis0, 1.0d0, pchi(1,1,jsph), nbasis, & + & diff_re(1,jsph), 1, 0.0d0, diff0(1,jsph), 1) + end do + end associate + call time_pull("cx2") !
Multiply diff0 by C_ik inplace - do isph = 1, params % nsph - do l = 0, constants % lmax0 - ind0 = l*l+l+1 - diff0(ind0-l:ind0+l, isph) = diff0(ind0-l:ind0+l, isph) * & - & constants % C_ik(l, isph) + call time_push() + associate(c_ik => constants % c_ik) + !$omp parallel do collapse(2) schedule(static,100) & + !$omp firstprivate(nsph,lmax0) private(isph,l,ind,fac,m) & + !$omp shared(diff0,c_ik) + do isph = 1, nsph + do l = 0, lmax0 + ind = l*l+l+1 + fac = c_ik(l, isph) + do m = -l, l + diff0(ind+m, isph) = diff0(ind+m, isph) * fac + end do + end do end do - end do + end associate + call time_pull("cx3") + + ! avoiding N^2 storage, this code does not use the cached coefY - y(:,:,1) = zero if (params % fmm .eq. 0) then - !$omp parallel do default(none) shared(params,constants, & - !$omp diff0,y) private(isph,igrid,val,vij,work, & - !$omp ind0,ind,vtij,work_complex) do isph = 1, params % nsph + y(:,isph,1) = zero do igrid = 1, params % ngrid if (constants % ui(igrid,isph).gt.zero) then val = zero @@ -1134,55 +1417,95 @@ end do else ! Load input harmonics into tree data + call time_push() workspace % tmp_node_m = zero workspace % tmp_node_l = zero workspace % tmp_sph = zero - do isph = 1, params % nsph - do l = 0, constants % lmax0 - ind0 = l*l+l+1 - workspace % tmp_sph(ind0-l:ind0+l, isph) = & - & diff0(ind0-l:ind0+l, isph) + associate(tmp_sph => workspace % tmp_sph) + !$omp parallel do collapse(2) schedule(static,100) & + !$omp firstprivate(nsph,lmax0), shared(tmp_sph,diff0) & + !$omp private(isph,l,ind) + do isph = 1, nsph + do l = 0, lmax0 + ind = l*l+l+1 + tmp_sph(ind-l:ind+l, isph) = diff0(ind-l:ind+l, isph) + end do end do - end do + end associate if(constants % lmax0 .lt.
params % pm) then - do isph = 1, params % nsph - inode = constants % snode(isph) - workspace % tmp_node_m(1:constants % nbasis0, inode) = & - & workspace % tmp_sph(1:constants % nbasis0, isph) - workspace % tmp_node_m(constants % nbasis0+1:, inode) = zero - end do + associate(tmp_node_m => workspace % tmp_node_m, & + & snode => constants % snode) + !$omp parallel do schedule(static,1) default(none) & + !$omp firstprivate(nsph,nbasis0) & + !$omp shared(snode,tmp_node_m,diff0) & + !$omp private(isph,inode) + do isph = 1, nsph + inode = snode(isph) + tmp_node_m(1:nbasis0, inode) = & + & diff0(1:nbasis0, isph) + tmp_node_m(nbasis0+1:, inode) = 0.0d0 + end do + end associate else indl = (params % pm+1)**2 - do isph = 1, params % nsph - inode = constants % snode(isph) - workspace % tmp_node_m(:, inode) = workspace % tmp_sph(1:indl, isph) - end do + associate(tmp_node_m => workspace % tmp_node_m, & + & snode => constants % snode) + !$omp parallel do schedule(static,1) default(none) & + !$omp firstprivate(nsph,nbasis0,indl) & + !$omp shared(snode,tmp_node_m,diff0) & + !$omp private(isph,inode) + do isph = 1, nsph + inode = snode(isph) + tmp_node_m(:, inode) = diff0(1:indl, isph) + end do + end associate end if + call time_pull("cx-fmmprep") + call time_push() !
Do FMM operations call tree_m2m_bessel_rotation(params, constants, workspace % tmp_node_m) + call time_pull("cx-m2m") + call time_push() call tree_m2l_bessel_rotation(params, constants, workspace % tmp_node_m, & & workspace % tmp_node_l) + call time_pull("cx-m2l") + call time_push() call tree_l2l_bessel_rotation(params, constants, workspace % tmp_node_l) + call time_pull("cx-l2l") + call time_push() workspace % tmp_grid = zero call tree_l2p_bessel(params, constants, one, workspace % tmp_node_l, zero, & & workspace % tmp_grid) + call time_pull("cx-l2p") + call time_push() call tree_m2p_bessel(params, constants, constants % lmax0, one, & & params % lmax, workspace % tmp_sph, one, & & workspace % tmp_grid) - - do isph = 1, params % nsph - do igrid = 1, params % ngrid - do ind = 1, constants % nbasis - y(ind,isph,1) = y(ind,isph,1) + workspace % tmp_grid(igrid, isph)*& - & constants % vwgrid(ind, igrid)*& - & constants % ui(igrid,isph) + call time_pull("cx-m2p") + call time_push() + + associate(tmp_grid => workspace % tmp_grid, & + & vwgrid => constants % vwgrid, ui => constants % ui) + !$omp parallel do collapse(2) schedule(static,100) & + !$omp firstprivate(nsph,ngrid,nbasis), shared(y,tmp_grid, & + !$omp vwgrid,ui) private(isph,igrid,ind,val) + do isph = 1, nsph + do ind = 1, nbasis + val = 0.0d0 + do igrid = 1, ngrid + val = val + tmp_grid(igrid, isph)*& + & vwgrid(ind, igrid)*ui(igrid,isph) + end do + y(ind,isph,1) = val end do end do - end do + end associate + call time_pull("cx-fmmend") end if y(:,:,2) = y(:,:,1) + call time_pull("cx") deallocate(diff_re, diff0, stat=info) if (info.ne.0) then call update_error(ddx_error, "Deallocation failed in cx") diff --git a/src/ddx_profiling.f90 b/src/ddx_profiling.f90 new file mode 100644 index 00000000..7e056f3c --- /dev/null +++ b/src/ddx_profiling.f90 @@ -0,0 +1,48 @@ +module ddx_profiling + !! Lightweight stack-based timers (time_push/time_pull) for profiling the code.
+ use ddx_definitions, only: dp + implicit none + private + + integer, parameter:: ntimes = 128 + integer:: tcnt = 1 + real(dp):: times(ntimes) + + public:: time_pull, time_push + + contains + + subroutine time_push() + implicit none + real(dp):: omp_get_wtime + + if(tcnt <= ntimes) then + times(tcnt) = omp_get_wtime() + tcnt = tcnt+1 + else + write(*, *) 'time_push Cannot push another time in the buffer.' + stop 1 + end if + end subroutine + + subroutine time_pull(s) + implicit none + + character(len=*), intent(in):: s + real(dp):: elap + character(len = 2048):: msg + + real(dp):: omp_get_wtime + + if(tcnt > 1) then + elap = omp_get_wtime() - times(tcnt-1) + tcnt = tcnt-1 + write(msg, "(3a, ': ', e14.6E2, ' s')") repeat('-', tcnt), '> ', s, elap + write(*, *) '[ddX-time]', trim(msg) + else + write(*, *) 'time_pull Cannot pull any value.' + stop 1 + end if + end subroutine + +end module ddx_profiling diff --git a/src/ddx_solvers.f90 b/src/ddx_solvers.f90 index 23ee70b0..a58ba1fa 100644 --- a/src/ddx_solvers.f90 +++ b/src/ddx_solvers.f90 @@ -207,7 +207,7 @@ subroutine diis(n, nmat, ndiis, x, e, b, xnew) end do nmat = nmat + 1 - deallocate (bloc, cex, stat=istatus) + deallocate (bloc, cex, ipiv, stat=istatus) end subroutine diis !> DIIS helper routine @@ -331,7 +331,7 @@ subroutine jacobi_diis_external(params, constants, workspace, n, tol, rhs, x, n_ rel_diff = diff / norm end if x_rel_diff(it) = rel_diff - constants % inner_tol = max(rel_diff*sqrt(tol), tol/100.0d0) + constants % inner_tol = max(rel_diff*1d-2, tol*1d-2) ! update x = x_new ! EXIT Jacobi loop here