From f30ec1b7f6dfe145fc53bfaa4fda98aa6f0cbe31 Mon Sep 17 00:00:00 2001 From: Michele Nottoli Date: Mon, 19 Jun 2023 14:56:46 +0200 Subject: [PATCH 01/16] Parallelized the adjoint ddpcm step. --- src/ddx_operators.f90 | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90 index d5ef9a1b..aeaddece 100644 --- a/src/ddx_operators.f90 +++ b/src/ddx_operators.f90 @@ -370,11 +370,14 @@ subroutine dstarx_dense(params, constants, workspace, do_diag, x, y) ! Local variables real(dp) :: vji(3), sji(3) real(dp) :: vvji, tji, tt, f, rho, ctheta, stheta, cphi, sphi - integer :: its, isph, jsph, l, m, ind + integer :: its, isph, jsph, l, m, ind, iproc real(dp), external :: dnrm2 y = zero - ! this loop is easily parallelizable + !$omp parallel do default(none) shared(do_diag,params,constants, & + !$omp workspace,x,y) private(isph,jsph,its,vji,vvji,tji,sji,rho, & + !$omp ctheta,stheta,cphi,sphi,tt,l,ind,f,m,iproc) schedule(dynamic) do isph = 1, params % nsph + iproc = omp_get_thread_num() + 1 do jsph = 1, params % nsph if (jsph.ne.isph) then do its = 1, params % ngrid @@ -389,8 +392,10 @@ subroutine dstarx_dense(params, constants, workspace, do_diag, x, y) ! build the local basis call ylmbas(sji, rho, ctheta, stheta, cphi, sphi, & & params % lmax, constants % vscales, & - & workspace % tmp_vylm, workspace % tmp_vplm, & - & workspace % tmp_vcos, workspace % tmp_vsin) + & workspace % tmp_vylm(:(params % lmax + 1)**2, iproc), & + & workspace % tmp_vplm(:(params % lmax + 1)**2, iproc), & + & workspace % tmp_vcos(:(params % lmax + 1), iproc), & + & workspace % tmp_vsin(:(params % lmax + 1), iproc)) tt = constants % ui(its,jsph) & & *dot_product(constants % vwgrid(:constants % nbasis, its), & & x(:,jsph))/tji @@ -399,7 +404,7 @@ subroutine dstarx_dense(params, constants, workspace, do_diag, x, y) f = dble(l)*tt/ constants % vscales(ind)**2 do m = -l, l y(ind+m,isph) = y(ind+m,isph) + & - & f*workspace % tmp_vylm(ind+m, 1) + & f*workspace % tmp_vylm(ind+m, iproc) end do tt = tt/tji end do From b6ecc7f01e8f1bb83b84d5a83f0637f904d6a5d5 Mon Sep 17 00:00:00 2001 From: Michele Nottoli Date: Mon, 19 Jun 2023 15:11:39 +0200 Subject: [PATCH 02/16] Parallelized tree_m2p_adj. --- src/ddx_core.f90 | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/ddx_core.f90 b/src/ddx_core.f90 index 73c701a6..ab40ef26 100644 --- a/src/ddx_core.f90 +++ b/src/ddx_core.f90 @@ -2720,9 +2720,9 @@ subroutine tree_m2p_adj(params, constants, p, alpha, grid_v, beta, sph_m) sph_m = beta * sph_m end if ! Cycle over all spheres - !!$omp parallel do default(none) shared(params,constants,grid_v,p, & - !!$omp alpha,sph_m), private(isph,inode,jnear,jnode,jsph,igrid,c,work) & - !!$omp schedule(dynamic) + !$omp parallel do default(none) shared(params,constants,grid_v,p, & + !$omp alpha,sph_m), private(isph,inode,jnear,jnode,jsph,igrid,c,work) & + !$omp schedule(dynamic) do isph = 1, params % nsph ! Cycle over all near-field admissible pairs of spheres inode = constants % snode(isph) @@ -2735,12 +2735,12 @@ subroutine tree_m2p_adj(params, constants, p, alpha, grid_v, beta, sph_m) if(isph .eq. jsph) cycle ! Accumulate interaction for external grid points only do igrid = 1, params % ngrid - if(constants % ui(igrid, isph) .eq. zero) cycle - c = constants % cgrid(:, igrid)*params % rsph(isph) - & - & params % csph(:, jsph) + params % csph(:, isph) - call fmm_m2p_adj_work(c, alpha*grid_v(igrid, isph), & - & params % rsph(jsph), p, constants % vscales_rel, one, & - & sph_m(:, jsph), work) + if(constants % ui(igrid, jsph) .eq. zero) cycle + c = constants % cgrid(:, igrid)*params % rsph(jsph) - & + & params % csph(:, isph) + params % csph(:, jsph) + call fmm_m2p_adj_work(c, alpha*grid_v(igrid, jsph), & + & params % rsph(isph), p, constants % vscales_rel, one, & + & sph_m(:, isph), work) end do end do end do From 14717b8992e374d3abbfa1f95de1630e59a127db Mon Sep 17 00:00:00 2001 From: Michele Nottoli Date: Mon, 19 Jun 2023 15:25:22 +0200 Subject: [PATCH 03/16] Parallelized tree_m2l_rotation_adj. --- src/ddx_core.f90 | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/ddx_core.f90 b/src/ddx_core.f90 index ab40ef26..19cb443e 100644 --- a/src/ddx_core.f90 +++ b/src/ddx_core.f90 @@ -2415,8 +2415,9 @@ subroutine tree_m2l_rotation_adj(params, constants, node_l, node_m) ! Local variables integer :: i, j, k real(dp) :: c1(3), c(3), r1, r - ! Any order of this cycle is OK node_m = zero + !$omp parallel do default(none) shared(constants,params,node_m,node_l) & + !$omp private(i,c,r,k,c1,r1,j) schedule(dynamic) do i = 1, constants % nclusters ! If no far admissible pairs just set output to zero if (constants % nfar(i) .eq. 0) then @@ -2428,18 +2429,18 @@ subroutine tree_m2l_rotation_adj(params, constants, node_l, node_m) k = constants % far(constants % sfar(i)) c1 = constants % cnode(:, k) r1 = constants % rnode(k) - c1 = c - c1 - call fmm_m2l_rotation_adj(c1, r, r1, params % pl, params % pm, & + c1 = c1 - c + call fmm_m2l_rotation_adj(c1, r1, r, params % pl, params % pm, & & constants % vscales, constants % m2l_ztranslate_adj_coef, one, & - & node_l(:, i), one, node_m(:, k)) + & node_l(:, k), one, node_m(:, i)) do j = constants % sfar(i)+1, constants % sfar(i+1)-1 k = constants % far(j) c1 = constants % cnode(:, k) r1 = constants % rnode(k) - c1 = c - c1 - call fmm_m2l_rotation_adj(c1, r, r1, params % pl, params % pm, & + c1 = c1 - c + call fmm_m2l_rotation_adj(c1, r1, r, params % pl, params % pm, & & constants % vscales, constants % m2l_ztranslate_adj_coef, one, & - & node_l(:, i), one, node_m(:, k)) + & node_l(:, k), one, node_m(:, i)) end do end do end subroutine tree_m2l_rotation_adj From 8ddd8a98c79f5dc4360be0fe8767178ae7b990cd Mon Sep 17 00:00:00 2001 From: Michele Nottoli Date: Mon, 19 Jun 2023 16:03:57 +0200 Subject: [PATCH 04/16] Increased the printed information about ddLPB timings. --- src/ddx_core.f90 | 2 ++ src/ddx_driver.f90 | 13 +++++++++++-- src/ddx_lpb.f90 | 12 ++++++++++++ src/ddx_operators.f90 | 11 +++++++++++ src/ddx_workspace.f90 | 2 ++ 5 files changed, 38 insertions(+), 2 deletions(-) diff --git a/src/ddx_core.f90 b/src/ddx_core.f90 index 19cb443e..f06f1c91 100644 --- a/src/ddx_core.f90 +++ b/src/ddx_core.f90 @@ -187,6 +187,8 @@ module ddx_core real(dp), allocatable :: x_adj_r_grid(:, :) real(dp), allocatable :: x_adj_e_grid(:, :) real(dp), allocatable :: phi_n(:, :) + real(dp) :: hsp_time + real(dp) :: hsp_adj_time end type ddx_state_type diff --git a/src/ddx_driver.f90 b/src/ddx_driver.f90 index b3fcbe61..3157e3da 100644 --- a/src/ddx_driver.f90 +++ b/src/ddx_driver.f90 @@ -298,7 +298,12 @@ program main & state % x_lpb_rel_diff(i) end do ! Print number of iterations and time - write(6, 100) "ddlpb step time: ", state % x_lpb_time, " seconds" + write(6, 100) "ddlpb step time: ", state % x_lpb_time, & + & " seconds, of which:" + write(6, 100) " ddcosmo: ", state % xs_time, " seconds" + write(6, 100) " hsp: ", state % hsp_time, " seconds" + write(6, 100) " coupling terms: ", state % x_lpb_time & + & - state % xs_time - state % hsp_time, " seconds" write(6, 300) "ddlpb step iterations: ", state % x_lpb_niter end if @@ -348,7 +353,11 @@ program main end do ! Print number of iterations and time write(6, 100) "adjoint ddlpb step time: ", & - & state % x_adj_lpb_time, " seconds" + & state % x_adj_lpb_time, " seconds, of which:" + write(6, 100) " adjoint ddcosmo: ", state % s_time, " seconds" + write(6, 100) " adjoint hsp: ", state % hsp_adj_time, " seconds" + write(6, 100) " adjoint coupling terms: ", state % x_adj_lpb_time & + & - state % s_time - state % hsp_adj_time, " seconds" write(6, 200) "adjoint ddlpb step iterations: ", & & state % x_adj_lpb_niter end if diff --git a/src/ddx_lpb.f90 b/src/ddx_lpb.f90 index c6b15f34..cc9c6909 100644 --- a/src/ddx_lpb.f90 +++ b/src/ddx_lpb.f90 @@ -233,6 +233,8 @@ subroutine ddlpb_solve(params, constants, workspace, state, tol) real(dp) :: start_time state % x_lpb_niter = params % maxiter + workspace % xs_time = zero + workspace % hsp_time = zero ! solve LS using Jacobi/DIIS start_time = omp_get_wtime() @@ -251,6 +253,10 @@ subroutine ddlpb_solve(params, constants, workspace, state, tol) ! the density so that it is consistent with the ddCOSMO and ddPCM ! ones. call convert_ddcosmo(params, constants, -1, state % x_lpb(:,:,1)) + + ! put the timings in the right places + state % xs_time = workspace % xs_time + state % hsp_time = workspace % hsp_time end subroutine ddlpb_solve @@ -275,6 +281,8 @@ subroutine ddlpb_solve_adjoint(params, constants, workspace, state, tol) state % x_adj_lpb_niter = params % maxiter constants % inner_tol = sqrt(tol) + workspace % s_time = zero + workspace % hsp_adj_time = zero ! solve adjoint LS using Jacobi/DIIS start_time = omp_get_wtime() @@ -291,6 +299,10 @@ subroutine ddlpb_solve_adjoint(params, constants, workspace, state, tol) state % q = state % x_adj_lpb(:, :, 1) + ! put the timings in the right places + state % s_time = workspace % s_time + state % hsp_adj_time = workspace % hsp_adj_time + call ddlpb_derivative_setup(params, constants, workspace, state) end subroutine ddlpb_solve_adjoint diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90 index aeaddece..e3cd984f 100644 --- a/src/ddx_operators.f90 +++ b/src/ddx_operators.f90 @@ -1360,7 +1360,10 @@ subroutine prec_tstarx(params, constants, workspace, x, y) real(dp), intent(inout) :: y(constants % nbasis, params % nsph, 2) integer :: n_iter real(dp), dimension(params % maxiter) :: x_rel_diff + real(dp) :: start_time + + start_time = omp_get_wtime() y(:,:,1) = x(:,:,1) call convert_ddcosmo(params, constants, 1, y(:,:,1)) n_iter = params % maxiter @@ -1371,7 +1374,9 @@ subroutine prec_tstarx(params, constants, workspace, x, y) return end if y(:,:,1) = workspace % ddcosmo_guess + workspace % s_time = workspace % s_time + omp_get_wtime() - start_time + start_time = omp_get_wtime() n_iter = params % maxiter call jacobi_diis(params, constants, workspace, constants % inner_tol, x(:,:,2), workspace % hsp_guess, & & n_iter, x_rel_diff, bstarx, bx_prec, hnorm) @@ -1380,6 +1385,7 @@ subroutine prec_tstarx(params, constants, workspace, x, y) return end if y(:,:,2) = workspace % hsp_guess + workspace % hsp_adj_time = workspace % hsp_adj_time + omp_get_wtime() - start_time end subroutine prec_tstarx @@ -1398,8 +1404,10 @@ subroutine prec_tx(params, constants, workspace, x, y) real(dp), intent(inout) :: y(constants % nbasis, params % nsph, 2) integer :: n_iter real(dp), dimension(params % maxiter) :: x_rel_diff + real(dp) :: start_time ! perform A^-1 * Yr + start_time = omp_get_wtime() n_iter = params % maxiter call jacobi_diis(params, constants, workspace, constants % inner_tol, x(:,:,1), & & workspace % ddcosmo_guess, n_iter, x_rel_diff, lx, ldm1x, hnorm) @@ -1411,12 +1419,15 @@ subroutine prec_tx(params, constants, workspace, x, y) ! Scale by the factor of (2l+1)/4Pi y(:,:,1) = workspace % ddcosmo_guess call convert_ddcosmo(params, constants, 1, y(:,:,1)) + workspace % xs_time = workspace % xs_time + omp_get_wtime() - start_time ! perform B^-1 * Ye + start_time = omp_get_wtime() n_iter = params % maxiter call jacobi_diis(params, constants, workspace, constants % inner_tol, x(:,:,2), workspace % hsp_guess, & & n_iter, x_rel_diff, bx, bx_prec, hnorm) y(:,:,2) = workspace % hsp_guess + workspace % hsp_time = workspace % hsp_time + omp_get_wtime() - start_time if (workspace % error_flag.ne.0) then workspace % error_message = 'prec_tx: HSP failed to converge' diff --git a/src/ddx_workspace.f90 b/src/ddx_workspace.f90 index c59c20db..ae01160f 100644 --- a/src/ddx_workspace.f90 +++ b/src/ddx_workspace.f90 @@ -98,6 +98,8 @@ module ddx_workspace real(dp), allocatable :: tmp_bmat(:, :) !> ddLPB solutions for the microiterations real(dp), allocatable :: ddcosmo_guess(:,:), hsp_guess(:,:) + !> ddLPB temporary timings + real(dp) :: xs_time, s_time, hsp_time, hsp_adj_time !> Flag if there were an error integer :: error_flag = 2 !> Last error message From a8046d720f5af5d852b18c944f1ea4bccd6736ac Mon Sep 17 00:00:00 2001 From: Michele Nottoli Date: Mon, 19 Jun 2023 16:29:58 +0200 Subject: [PATCH 05/16] Cleanup. --- src/ddx_lpb.f90 | 1 - src/ddx_operators.f90 | 24 ++++++++++++++---------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/src/ddx_lpb.f90 b/src/ddx_lpb.f90 index cc9c6909..1805293c 100644 --- a/src/ddx_lpb.f90 +++ b/src/ddx_lpb.f90 @@ -280,7 +280,6 @@ subroutine ddlpb_solve_adjoint(params, constants, workspace, state, tol) real(dp) :: start_time state % x_adj_lpb_niter = params % maxiter - constants % inner_tol = sqrt(tol) workspace % s_time = zero workspace % hsp_adj_time = zero diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90 index e3cd984f..50c0e73c 100644 --- a/src/ddx_operators.f90 +++ b/src/ddx_operators.f90 @@ -1362,13 +1362,13 @@ subroutine prec_tstarx(params, constants, workspace, x, y) real(dp), dimension(params % maxiter) :: x_rel_diff real(dp) :: start_time - start_time = omp_get_wtime() y(:,:,1) = x(:,:,1) call convert_ddcosmo(params, constants, 1, y(:,:,1)) n_iter = params % maxiter - call jacobi_diis(params, constants, workspace, constants % inner_tol, y(:,:,1), & - & workspace % ddcosmo_guess, n_iter, x_rel_diff, lstarx, ldm1x, hnorm) + call jacobi_diis(params, constants, workspace, constants % inner_tol, & + & y(:,:,1), workspace % ddcosmo_guess, n_iter, x_rel_diff, lstarx, & + & ldm1x, hnorm) if (workspace % error_flag.ne.0) then workspace % error_message = 'prec_tstarx: ddCOSMO failed to converge' return @@ -1378,14 +1378,16 @@ subroutine prec_tstarx(params, constants, workspace, x, y) start_time = omp_get_wtime() n_iter = params % maxiter - call jacobi_diis(params, constants, workspace, constants % inner_tol, x(:,:,2), workspace % hsp_guess, & - & n_iter, x_rel_diff, bstarx, bx_prec, hnorm) + call jacobi_diis(params, constants, workspace, constants % inner_tol, & + & x(:,:,2), workspace % hsp_guess, n_iter, x_rel_diff, bstarx, & + & bx_prec, hnorm) if (workspace % error_flag.ne.0) then workspace % error_message = 'prec_tstarx: HSP failed to converge' return end if y(:,:,2) = workspace % hsp_guess - workspace % hsp_adj_time = workspace % hsp_adj_time + omp_get_wtime() - start_time + workspace % hsp_adj_time = workspace % hsp_adj_time & + & + omp_get_wtime() - start_time end subroutine prec_tstarx @@ -1409,8 +1411,9 @@ subroutine prec_tx(params, constants, workspace, x, y) ! perform A^-1 * Yr start_time = omp_get_wtime() n_iter = params % maxiter - call jacobi_diis(params, constants, workspace, constants % inner_tol, x(:,:,1), & - & workspace % ddcosmo_guess, n_iter, x_rel_diff, lx, ldm1x, hnorm) + call jacobi_diis(params, constants, workspace, constants % inner_tol, & + & x(:,:,1), workspace % ddcosmo_guess, n_iter, x_rel_diff, lx, & + & ldm1x, hnorm) if (workspace % error_flag.ne.0) then workspace % error_message = 'prec_tx: ddCOSMO failed to converge' return @@ -1424,8 +1427,9 @@ subroutine prec_tx(params, constants, workspace, x, y) ! perform B^-1 * Ye start_time = omp_get_wtime() n_iter = params % maxiter - call jacobi_diis(params, constants, workspace, constants % inner_tol, x(:,:,2), workspace % hsp_guess, & - & n_iter, x_rel_diff, bx, bx_prec, hnorm) + call jacobi_diis(params, constants, workspace, constants % inner_tol, & + & x(:,:,2), workspace % hsp_guess, n_iter, x_rel_diff, bx, & + & bx_prec, hnorm) y(:,:,2) = workspace % hsp_guess workspace % hsp_time = workspace % hsp_time + omp_get_wtime() - start_time From 413bdf1b3e90982d162e0ba70086504a102d5b6a Mon Sep 17 00:00:00 2001 From: Michele Nottoli Date: Mon, 19 Jun 2023 16:37:33 +0200 Subject: [PATCH 06/16] Parallelized tree_m2p_bessel_adj. --- src/ddx_core.f90 | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/src/ddx_core.f90 b/src/ddx_core.f90 index f06f1c91..6a9cceb2 100644 --- a/src/ddx_core.f90 +++ b/src/ddx_core.f90 @@ -2774,28 +2774,25 @@ subroutine tree_m2p_bessel_adj(params, constants, p, alpha, grid_v, beta, sph_p, sph_m = beta * sph_m end if ! Cycle over all spheres - !!$omp parallel do default(none) shared(params,constants,p,sph_m,tmp, & - !!$omp alpha,grid_v) private(isph,inode,jnear,jnode,jsph,igrid,c, & - !!$omp iproc) schedule(dynamic) + !$omp parallel do default(none) shared(params,constants,p,sph_m, & + !$omp alpha,grid_v) private(isph,inode,jnear,jnode,jsph,igrid,c) & + !$omp schedule(dynamic) do isph = 1, params % nsph ! Cycle over all near-field admissible pairs of spheres inode = constants % snode(isph) - ! iproc = omp_get_thread_num() + 1 do jnear = constants % snear(inode), constants % snear(inode+1)-1 ! Near-field interactions are possible only between leaf nodes, ! which must contain only a single input sphere jnode = constants % near(jnear) jsph = constants % order(constants % cluster(1, jnode)) - ! Ignore self-interaction - !if(isph .eq. jsph) cycle ! Accumulate interaction for external grid points only do igrid = 1, params % ngrid - if(constants % ui(igrid, isph) .eq. zero) cycle - c = constants % cgrid(:, igrid)*params % rsph(isph) - & - & params % csph(:, jsph) + params % csph(:, isph) - call fmm_m2p_bessel_adj(c, alpha*grid_v(igrid, isph), & - & params % rsph(jsph), params % kappa, p, & - & constants % vscales, one, sph_m(:, jsph)) + if(constants % ui(igrid, jsph) .eq. zero) cycle + c = constants % cgrid(:, igrid)*params % rsph(jsph) - & + & params % csph(:, isph) + params % csph(:, jsph) + call fmm_m2p_bessel_adj(c, alpha*grid_v(igrid, jsph), & + & params % rsph(isph), params % kappa, p, & + & constants % vscales, one, sph_m(:, isph)) end do end do end do From 0cee714b7a801fa4d1a3a1a90b817070ff0c4c92 Mon Sep 17 00:00:00 2001 From: Michele Nottoli Date: Mon, 19 Jun 2023 16:47:08 +0200 Subject: [PATCH 07/16] Parallelized tree_m2l_bessel_rotation_adj_work. --- src/ddx_core.f90 | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/src/ddx_core.f90 b/src/ddx_core.f90 index 6a9cceb2..3a237498 100644 --- a/src/ddx_core.f90 +++ b/src/ddx_core.f90 @@ -2372,10 +2372,9 @@ subroutine tree_m2l_bessel_rotation_adj_work(params, constants, node_l, node_m) ! Local variables integer :: i, j, k real(dp) :: c1(3), c(3), r - ! Any order of this cycle is OK node_m = zero - !!$omp parallel do shared(constants,params,node_l,node_m) & - !!$omp private(i,c,r,k,c1,work,work_complex,j) schedule(dynamic) + !$omp parallel do shared(constants,params,node_l,node_m) & + !$omp private(i,c,r,k,c1,work,work_complex,j) schedule(dynamic) do i = 1, constants % nclusters ! If no far admissible pairs just set output to zero if (constants % nfar(i) .eq. 0) then @@ -2386,19 +2385,18 @@ subroutine tree_m2l_bessel_rotation_adj_work(params, constants, node_l, node_m) ! Use the first far admissible pair to initialize output k = constants % far(constants % sfar(i)) c1 = constants % cnode(:, k) - c1 = params % kappa*(c1 - c) - call fmm_m2l_bessel_rotation_adj_work(c1, constants % SI_rnode(:, i), & - & constants % SK_rnode(:, k), params % pm, & - & constants % vscales, one, & - & node_l(:, i), one, node_m(:, k), work, work_complex) + c1 = params % kappa*(c - c1) + call fmm_m2l_bessel_rotation_adj_work(c1, constants % SI_rnode(:, k), & + & constants % SK_rnode(:, i), params % pm, constants % vscales, & + & one, node_l(:, k), one, node_m(:, i), work, work_complex) do j = constants % sfar(i)+1, constants % sfar(i+1)-1 k = constants % far(j) c1 = constants % cnode(:, k) - c1 = params % kappa*(c1 - c) - call fmm_m2l_bessel_rotation_adj_work(c1, constants % SI_rnode(:, i), & - & constants % SK_rnode(:, k), params % pm, & - & constants % vscales, one, & - & node_l(:, i), one, node_m(:, k), work, work_complex) + c1 = params % kappa*(c - c1) + call fmm_m2l_bessel_rotation_adj_work(c1, & + & constants % SI_rnode(:, k), constants % SK_rnode(:, i), & + & params % pm, constants % vscales, one, node_l(:, k), & + & one, node_m(:, i), work, work_complex) end do end do end subroutine tree_m2l_bessel_rotation_adj_work From d0457d627e4f9a532f4a37c95a44314680548789 Mon Sep 17 00:00:00 2001 From: Michele Nottoli Date: Tue, 20 Jun 2023 11:36:37 +0200 Subject: [PATCH 08/16] Added some code to compute the layers of the bisection tree. --- src/ddx_constants.f90 | 83 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/src/ddx_constants.f90 b/src/ddx_constants.f90 index debbe22c..2609725c 100644 --- a/src/ddx_constants.f90 +++ b/src/ddx_constants.f90 @@ -219,6 +219,11 @@ module ddx_constants !! computation of forces (gradients). Allocated and used only if !! fmm=1. integer :: grad_nbasis + !> Number of layers in the bisection tree. Defined only if fmm=1. + integer :: nlayers + !> The first and the last clusters of each layer of the tree. The + !! dimension is (2, nlayers). Allocated and used only if fmm=1. + integer, allocatable :: layers(:, :) !> Inner tolerance for microiterations done when using ddLPB real(dp) :: inner_tol !> Whether the diagonal of the matrices has to be used in the mvp for @@ -837,7 +842,21 @@ subroutine constants_geometry_init(params, constants) & constants % parent, constants % cnode, constants % rnode, & & constants % snode, constants % error_message, & & constants % error_flag) + if (constants % error_flag .ne. 0) return + + constants % nlayers = int(log(dble(constants % nclusters))/log(two)) + 2 + allocate(constants % layers(2, constants % nlayers), stat=info) + if (info .ne. 0) then + constants % error_flag = 1 + constants % error_message = "constants_geometry_init: " // & + & "`layers` allocation failed" + return + end if + call compute_layers(constants % nclusters, constants % nlayers, & + & constants % layers, constants % children, & + & constants % error_flag, constants % error_message) if (params % error_flag .ne. 0) return + ! Get number of far and near admissible pairs iwork = 0 jwork = 1 @@ -1429,6 +1448,62 @@ subroutine mkprec(lmax, nbasis, nsph, ngrid, eps, ui, wgrid, vgrid, & error_message = "" end subroutine mkprec +!> Compute and store information about the layers of the tree used in +!! the fmm operations. +!! +subroutine compute_layers(nclusters, nlayers, layers, children, & + & error_flag, error_message) + implicit none + integer, intent(in) :: nclusters, nlayers + integer, intent(in) :: children(2, nclusters) + integer, intent(out) :: layers(2, nlayers) + integer :: i, start_index, stop_index, distance, info + integer, allocatable :: distance_from_root(:) + integer, intent(out) :: error_flag + character(len=255), intent(out) :: error_message + + allocate(distance_from_root(nclusters), stat=info) + if (info.ne.0) then + error_flag = 1 + error_message = "Allocation failed in compute_layers" + return + end if + + distance_from_root(1) = 0 + do i = 1, nclusters + distance = distance_from_root(i) + if (children(1, i) .ne. 0) & + & distance_from_root(children(1, i)) = distance + 1 + if (children(2, i) .ne. 0) & + & distance_from_root(children(2, i)) = distance + 1 + end do + + distance = 0 + layers(1, 1) = 1 + do i = 1, nclusters + if (distance_from_root(i) .lt. distance) then + error_flag = 1 + error_message = "Error in compute_layers" + return + end if + if (distance_from_root(i) .ne. distance) then + layers(2, distance + 1) = i + if (distance_from_root(i) .lt. nlayers) & + & layers(1, distance + 2) = i + 1 + distance = distance_from_root(i) + end if + end do + layers(2, nlayers) = nclusters + + deallocate(distance_from_root, stat=info) + if (info.ne.0) then + error_flag = 1 + error_message = "Deallocation failed in compute_layers" + return + end if + +end subroutine compute_layers + !> Build a recursive inertial binary tree !! !! Uses inertial bisection in a recursive manner until each leaf node has only @@ -2249,6 +2324,14 @@ subroutine constants_free(constants) return end if end if + if (allocated(constants % layers)) then + deallocate(constants % layers, stat=istat) + if (istat .ne. 0) then + constants % error_message = "`layers` deallocation failed!" + constants % error_flag = 1 + return + end if + end if end subroutine constants_free end module ddx_constants From 0f75fcaa9a9a6f97789278f5ed39967222516d34 Mon Sep 17 00:00:00 2001 From: Michele Nottoli Date: Tue, 20 Jun 2023 11:41:51 +0200 Subject: [PATCH 09/16] Parallelized tree_m2m_rotation_work. --- src/ddx_core.f90 | 48 +++++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/src/ddx_core.f90 b/src/ddx_core.f90 index 3a237498..4a9bb6ed 100644 --- a/src/ddx_core.f90 +++ b/src/ddx_core.f90 @@ -1878,34 +1878,36 @@ subroutine tree_m2m_rotation_work(params, constants, node_m, work) ! Temporary workspace real(dp), intent(out) :: work(6*params % pm**2 + 19*params % pm + 8) ! Local variables - integer :: i, j + integer :: i, j, l real(dp) :: c1(3), c(3), r1, r ! Bottom-to-top pass - !!$omp parallel do default(none) shared(constants,params,node_m) & - !!$omp private(i,j,c1,c,r1,r,work) - do i = constants % nclusters, 1, -1 - ! Leaf node does not need any update - if (constants % children(1, i) == 0) cycle - c = constants % cnode(:, i) - r = constants % rnode(i) - ! First child initializes output - j = constants % children(1, i) - c1 = constants % cnode(:, j) - r1 = constants % rnode(j) - c1 = c1 - c - call fmm_m2m_rotation_work(c1, r1, r, & - & params % pm, & - & constants % vscales, & - & constants % vcnk, one, & - & node_m(:, j), zero, node_m(:, i), work) - ! All other children update the same output - do j = constants % children(1, i)+1, constants % children(2, i) + do l = constants % nlayers, 1, -1 + !$omp parallel do default(none) shared(constants,params,node_m) & + !$omp private(i,j,c1,c,r1,r,work) + do i = constants % layers(1, l), constants % layers(2, l) + ! Leaf node does not need any update + if (constants % children(1, i) == 0) cycle + c = constants % cnode(:, i) + r = constants % rnode(i) + ! First child initializes output + j = constants % children(1, i) c1 = constants % cnode(:, j) r1 = constants % rnode(j) c1 = c1 - c - call fmm_m2m_rotation_work(c1, r1, r, params % pm, & - & constants % vscales, constants % vcnk, one, & - & node_m(:, j), one, node_m(:, i), work) + call fmm_m2m_rotation_work(c1, r1, r, & + & params % pm, & + & constants % vscales, & + & constants % vcnk, one, & + & node_m(:, j), zero, node_m(:, i), work) + ! All other children update the same output + do j = constants % children(1, i)+1, constants % children(2, i) + c1 = constants % cnode(:, j) + r1 = constants % rnode(j) + c1 = c1 - c + call fmm_m2m_rotation_work(c1, r1, r, params % pm, & + & constants % vscales, constants % vcnk, one, & + & node_m(:, j), one, node_m(:, i), work) + end do end do end do end subroutine tree_m2m_rotation_work From 7ab99b825c0907c3aa12aa91ae07dbe8aa0a7614 Mon Sep 17 00:00:00 2001 From: Michele Nottoli Date: Tue, 20 Jun 2023 11:52:41 +0200 Subject: [PATCH 10/16] Parallelized tree_l2l_rotation_work and tree_l2l_bessel_rotation_work. --- src/ddx_core.f90 | 52 +++++++++++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/src/ddx_core.f90 b/src/ddx_core.f90 index 4a9bb6ed..f1f23d33 100644 --- a/src/ddx_core.f90 +++ b/src/ddx_core.f90 @@ -1882,7 +1882,7 @@ subroutine tree_m2m_rotation_work(params, constants, node_m, work) real(dp) :: c1(3), c(3), r1, r ! Bottom-to-top pass do l = constants % nlayers, 1, -1 - !$omp parallel do default(none) shared(constants,params,node_m) & + !$omp parallel do default(none) shared(constants,params,node_m,l) & !$omp private(i,j,c1,c,r1,r,work) do i = constants % layers(1, l), constants % layers(2, l) ! Leaf node does not need any update @@ -2090,21 +2090,23 @@ subroutine tree_l2l_rotation_work(params, constants, node_l, work) ! Temporary workspace real(dp), intent(out) :: work(6*params % pl**2 + 19*params % pl + 8) ! Local variables - integer :: i, j + integer :: i, j, l real(dp) :: c1(3), c(3), r1, r ! Top-to-bottom pass - !!$omp parallel do default(none) shared(constants,params,node_l) & - !!$omp private(i,j,c1,c,r1,r,work) - do i = 2, constants % nclusters - j = constants % parent(i) - c = constants % cnode(:, j) - r = constants % rnode(j) - c1 = constants % cnode(:, i) - r1 = constants % rnode(i) - c1 = c - c1 - call fmm_l2l_rotation_work(c1, r, r1, params % pl, & - & constants % vscales, constants % vfact, one, & - & node_l(:, j), one, node_l(:, i), work) + do l = 2, constants % nlayers + !$omp parallel do default(none) shared(constants,params,node_l,l) & + !$omp private(i,j,c1,c,r1,r,work) + do i = constants % layers(1, l), constants % layers(2, l) + j = constants % parent(i) + c = constants % cnode(:, j) + r = constants % rnode(j) + c1 = constants % cnode(:, i) + r1 = constants % rnode(i) + c1 = c - c1 + call fmm_l2l_rotation_work(c1, r, r1, params % pl, & + & constants % vscales, constants % vfact, one, & + & node_l(:, j), one, node_l(:, i), work) + end do end do end subroutine tree_l2l_rotation_work @@ -2142,15 +2144,19 @@ subroutine tree_l2l_bessel_rotation_work(params, constants, node_l) integer :: i, j real(dp) :: c_child(3), c_parent(3), c_diff(3) ! Top-to-bottom pass - do i = 2, constants % nclusters - j = constants % parent(i) - c_child = constants % cnode(:, j) - c_parent = constants % cnode(:, i) - c_diff = params % kappa*(c_child - c_parent) - call fmm_l2l_bessel_rotation_work(c_diff, & - & constants % si_rnode(:, j), constants % si_rnode(:, i), & - & params % pl, constants % vscales, one, & - & node_l(:, j), one, node_l(:, i), work, work_complex) + do l = 2, constants % nlayers + !$omp parallel do default(none) shared(constants,params,node_l,l) & + !$omp private(i,j,c1,c,r1,r,work) + do i = constants % layers(1, l), constants % layers(2, l) + j = constants % parent(i) + c_child = constants % cnode(:, j) + c_parent = constants % cnode(:, i) + c_diff = params % kappa*(c_child - c_parent) + call fmm_l2l_bessel_rotation_work(c_diff, & + & constants % si_rnode(:, j), constants % si_rnode(:, i), & + & params % pl, constants % vscales, one, & + & node_l(:, j), one, node_l(:, i), work, work_complex) + end do end do end subroutine tree_l2l_bessel_rotation_work From 71465a4e44b4988f026de8f16f6a11f57a1f303f Mon Sep 17 00:00:00 2001 From: Michele Nottoli Date: Tue, 20 Jun 2023 13:29:43 +0200 Subject: [PATCH 11/16] Now properly computing the total number of layers. --- src/ddx_constants.f90 | 77 ++++++++++++++++++++++--------------------- 1 file changed, 39 insertions(+), 38 deletions(-) diff --git a/src/ddx_constants.f90 b/src/ddx_constants.f90 index 2609725c..5ff7370b 100644 --- a/src/ddx_constants.f90 +++ b/src/ddx_constants.f90 @@ -844,18 +844,8 @@ subroutine constants_geometry_init(params, constants) & constants % error_flag) if (constants % error_flag .ne. 0) return - constants % nlayers = int(log(dble(constants % nclusters))/log(two)) + 2 - allocate(constants % layers(2, constants % nlayers), stat=info) - if (info .ne. 0) then - constants % error_flag = 1 - constants % error_message = "constants_geometry_init: " // & - & "`layers` allocation failed" - return - end if - call compute_layers(constants % nclusters, constants % nlayers, & - & constants % layers, constants % children, & - & constants % error_flag, constants % error_message) - if (params % error_flag .ne. 0) return + call compute_layers(constants) + if (constants % error_flag .ne. 0) return ! Get number of far and near admissible pairs iwork = 0 @@ -1451,54 +1441,65 @@ end subroutine mkprec !> Compute and store information about the layers of the tree used in !! the fmm operations. !! -subroutine compute_layers(nclusters, nlayers, layers, children, & - & error_flag, error_message) +!! @param[inout] constants: Object containing all constants. +!! +subroutine compute_layers(constants) implicit none - integer, intent(in) :: nclusters, nlayers - integer, intent(in) :: children(2, nclusters) - integer, intent(out) :: layers(2, nlayers) - integer :: i, start_index, stop_index, distance, info + type(ddx_constants_type), intent(inout) :: constants + integer :: i, distance, info, max_distance integer, allocatable :: distance_from_root(:) - integer, intent(out) :: error_flag - character(len=255), intent(out) :: error_message - allocate(distance_from_root(nclusters), stat=info) + allocate(distance_from_root(constants % nclusters), stat=info) if (info.ne.0) then - error_flag = 1 - error_message = "Allocation failed in compute_layers" + constants % error_flag = 1 + constants % error_message = "Allocation failed in compute_layers" return end if + ! first we compute the distances from the root of the tree and + ! the number of layers distance_from_root(1) = 0 - do i = 1, nclusters + do i = 1, constants % nclusters distance = distance_from_root(i) - if (children(1, i) .ne. 0) & - & distance_from_root(children(1, i)) = distance + 1 - if (children(2, i) .ne. 0) & - & distance_from_root(children(2, i)) = distance + 1 + if (constants % children(1, i) .ne. 0) & + & distance_from_root(constants % children(1, i)) = distance + 1 + if (constants % children(2, i) .ne. 0) & + & distance_from_root(constants % children(2, i)) = distance + 1 + if (distance .gt. max_distance) max_distance = distance end do + constants % nlayers = max_distance + 1 + ! now we can allocate the layer information using the actual number + ! of layers + allocate(constants % layers(2, constants % nlayers), stat=info) + if (info.ne.0) then + constants % error_flag = 1 + constants % error_message = "Allocation failed in compute_layers" + return + end if + + ! finally we can populate the layer information distance = 0 - layers(1, 1) = 1 - do i = 1, nclusters + constants % layers(1, 1) = 1 + do i = 1, constants % nclusters if (distance_from_root(i) .lt. distance) then - error_flag = 1 - error_message = "Error in compute_layers" + constants % error_flag = 1 + constants % error_message = "Error in compute_layers" return end if if (distance_from_root(i) .ne. distance) then - layers(2, distance + 1) = i - if (distance_from_root(i) .lt. nlayers) & - & layers(1, distance + 2) = i + 1 + constants % layers(2, distance + 1) = i + if (distance_from_root(i) .lt. constants % nlayers) & + & constants % layers(1, distance + 2) = i + 1 distance = distance_from_root(i) end if end do - layers(2, nlayers) = nclusters + constants % layers(2, constants % nlayers) = constants % nclusters deallocate(distance_from_root, stat=info) if (info.ne.0) then - error_flag = 1 - error_message = "Deallocation failed in compute_layers" + constants % error_flag = 1 + constants % error_message = "Deallocation failed in compute_layers" return end if From c2f10d61d10f4e07a09993c1e19c9972650b599d Mon Sep 17 00:00:00 2001 From: Michele Nottoli Date: Tue, 20 Jun 2023 13:48:18 +0200 Subject: [PATCH 12/16] Fixed a few errors. --- src/ddx_core.f90 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ddx_core.f90 b/src/ddx_core.f90 index f1f23d33..db99eb97 100644 --- a/src/ddx_core.f90 +++ b/src/ddx_core.f90 @@ -2141,12 +2141,12 @@ subroutine tree_l2l_bessel_rotation_work(params, constants, node_l) real(dp) :: work(6*params % pl**2 + 19*params % pl + 8) complex(dp) :: work_complex(2*params % pl+1) ! Local variables - integer :: i, j + integer :: i, j, l real(dp) :: c_child(3), c_parent(3), c_diff(3) ! Top-to-bottom pass do l = 2, constants % nlayers !$omp parallel do default(none) shared(constants,params,node_l,l) & - !$omp private(i,j,c1,c,r1,r,work) + !$omp private(i,j,c_child,c_parent,c_diff,work,work_complex) do i = constants % layers(1, l), constants % layers(2, l) j = constants % parent(i) c_child = constants % cnode(:, j) From 6960ea02ff8a05b90a76ab562216c3431c172d5f Mon Sep 17 00:00:00 2001 From: Michele Nottoli Date: Tue, 20 Jun 2023 14:01:52 +0200 Subject: [PATCH 13/16] Parallelized the remaining l2l, m2m, with Bessel and without. --- src/ddx_core.f90 | 153 ++++++++++++++++++++++++++--------------------- 1 file changed, 86 insertions(+), 67 deletions(-) diff --git a/src/ddx_core.f90 b/src/ddx_core.f90 index db99eb97..c4f72180 100644 --- a/src/ddx_core.f90 +++ b/src/ddx_core.f90 @@ -1943,32 +1943,36 @@ subroutine tree_m2m_bessel_rotation_work(params, constants, node_m) real(dp) :: work(6*params % pm**2 + 19*params % pm + 8) complex(dp) :: work_complex(2*params % pm+1) ! Local variables - integer :: i, j + integer :: i, j, l real(dp) :: c1(3), c(3), r1, r ! Bottom-to-top pass - do i = constants % nclusters, 1, -1 - ! Leaf node does not need any update - if (constants % children(1, i) == 0) cycle - c = constants % cnode(:, i) - r = constants % rnode(i) - ! First child initializes output - j = constants % children(1, i) - c1 = constants % cnode(:, j) - r1 = constants % rnode(j) - c1 = params % kappa*(c1 - c) - call fmm_m2m_bessel_rotation_work(c1, & - & constants % SK_rnode(:, j), constants % SK_rnode(:, i), & - & params % pm, constants % vscales, one, node_m(:, j), zero, & - & node_m(:, i), work, work_complex) - ! All other children update the same output - do j = constants % children(1, i)+1, constants % children(2, i) + do l = constants % nlayers, 1, -1 + !$omp parallel do default(none) shared(constants,params,node_m,l) & + !$omp private(i,j,c1,c,r1,r,work,work_complex) + do i = constants % layers(1, l), constants % layers(2, l) + ! Leaf node does not need any update + if (constants % children(1, i) == 0) cycle + c = constants % cnode(:, i) + r = constants % rnode(i) + ! First child initializes output + j = constants % children(1, i) c1 = constants % cnode(:, j) r1 = constants % rnode(j) c1 = params % kappa*(c1 - c) call fmm_m2m_bessel_rotation_work(c1, & & constants % SK_rnode(:, j), constants % SK_rnode(:, i), & - & params % pm, constants % vscales, one, node_m(:, j), one, & + & params % pm, constants % vscales, one, node_m(:, j), zero, & & node_m(:, i), work, work_complex) + ! All other children update the same output + do j = constants % children(1, i)+1, constants % children(2, i) + c1 = constants % cnode(:, j) + r1 = constants % rnode(j) + c1 = params % kappa*(c1 - c) + call fmm_m2m_bessel_rotation_work(c1, & + & constants % SK_rnode(:, j), constants % SK_rnode(:, i), & + & params % pm, constants % vscales, one, node_m(:, j), one, & + & node_m(:, i), work, work_complex) + end do end do end do end subroutine tree_m2m_bessel_rotation_work @@ -2002,19 +2006,23 @@ subroutine tree_m2m_rotation_adj_work(params, constants, node_m, work) ! Temporary workspace real(dp), intent(out) :: work(6*params % pm**2 + 19*params % pm + 8) ! Local variables - integer :: i, j + integer :: i, j, l real(dp) :: c1(3), c(3), r1, r ! Top-to-bottom pass - do i = 2, constants % nclusters - j = constants % parent(i) - c = constants % cnode(:, j) - r = constants % rnode(j) - c1 = constants % cnode(:, i) - r1 = constants % rnode(i) - c1 = c - c1 - call fmm_m2m_rotation_adj_work(c1, r, r1, params % pm, & - & constants % vscales, constants % vcnk, one, node_m(:, j), one, & - & node_m(:, i), work) + do l = 2, constants % nlayers + !$omp parallel do default(none) shared(constants,params,node_m,l) & + !$omp private(i,j,c1,c,r1,r,work) + do i = constants % layers(1, l), constants % layers(2, l) + j = constants % parent(i) + c = constants % cnode(:, j) + r = constants % rnode(j) + c1 = constants % cnode(:, i) + r1 = constants % rnode(i) + c1 = c - c1 + call fmm_m2m_rotation_adj_work(c1, r, r1, params % pm, & + & constants % vscales, constants % vcnk, one, node_m(:, j), one, & + & node_m(:, i), work) + end do end do end subroutine tree_m2m_rotation_adj_work !------------------------------------------------------------------------------ @@ -2046,18 +2054,21 @@ subroutine tree_m2m_bessel_rotation_adj_work(params, constants, node_m) real(dp) :: work(6*params % pm**2 + 19*params % pm + 8) complex(dp) :: work_complex(2*params % pm+1) ! Local variables - integer :: i, j + integer :: i, j, l real(dp) :: c1(3), c(3) ! Top-to-bottom pass - do i = 2, constants % nclusters - j = constants % parent(i) - c = constants % cnode(:, j) - c1 = constants % cnode(:, i) - c1 = params % kappa*(c1 - c) - ! Little bit confusion about the i and j indices - call fmm_m2m_bessel_rotation_adj_work(c1, constants % SK_rnode(:, i), & - & constants % SK_rnode(:, j), params % pm, constants % vscales, & - & one, node_m(:, j), one, node_m(:, i), work, work_complex) + do l = 2, constants % nlayers + !$omp parallel do default(none) shared(constants,params,node_m,l) & + !$omp private(i,j,c1,c,work,work_complex) + do i = constants % layers(1, l), constants % layers(2, l) + j = constants % parent(i) + c = constants % cnode(:, j) + c1 = constants % cnode(:, i) + c1 = params % kappa*(c1 - c) + call fmm_m2m_bessel_rotation_adj_work(c1, constants % SK_rnode(:, i), & + & constants % SK_rnode(:, j), params % pm, constants % vscales, & + & one, node_m(:, j), one, node_m(:, i), work, work_complex) + end do end do end subroutine tree_m2m_bessel_rotation_adj_work @@ -2189,30 +2200,34 @@ subroutine tree_l2l_rotation_adj_work(params, constants, node_l, work) ! Temporary workspace real(dp), intent(out) :: work(6*params % pl**2 + 19*params % pl + 8) ! Local variables - integer :: i, j + integer :: i, j, l real(dp) :: c1(3), c(3), r1, r ! Bottom-to-top pass - do i = constants % nclusters, 1, -1 - ! Leaf node does not need any update - if (constants % children(1, i) == 0) cycle - c = constants % cnode(:, i) - r = constants % rnode(i) - ! First child initializes output - j = constants % children(1, i) - c1 = constants % cnode(:, j) - r1 = constants % rnode(j) - c1 = c1 - c - call fmm_l2l_rotation_adj_work(c1, r1, r, params % pl, & - & constants % vscales, constants % vfact, one, & - & node_l(:, j), zero, node_l(:, i), work) - ! All other children update the same output - do j = constants % children(1, i)+1, constants % children(2, i) + do l = constants % nlayers, 1, -1 + !$omp parallel do default(none) shared(constants,params,node_l,l) & + !$omp private(i,j,c1,c,r1,r,work) + do i = constants % layers(1, l), constants % layers(2, l) + ! Leaf node does not need any update + if (constants % children(1, i) == 0) cycle + c = constants % cnode(:, i) + r = constants % rnode(i) + ! First child initializes output + j = constants % children(1, i) c1 = constants % cnode(:, j) r1 = constants % rnode(j) c1 = c1 - c call fmm_l2l_rotation_adj_work(c1, r1, r, params % pl, & & constants % vscales, constants % vfact, one, & - & node_l(:, j), one, node_l(:, i), work) + & node_l(:, j), zero, node_l(:, i), work) + ! All other children update the same output + do j = constants % children(1, i)+1, constants % children(2, i) + c1 = constants % cnode(:, j) + r1 = constants % rnode(j) + c1 = c1 - c + call fmm_l2l_rotation_adj_work(c1, r1, r, params % pl, & + & constants % vscales, constants % vfact, one, & + & node_l(:, j), one, node_l(:, i), work) + end do end do end do end subroutine tree_l2l_rotation_adj_work @@ -2231,19 +2246,23 @@ subroutine tree_l2l_bessel_rotation_adj(params, constants, node_l) real(dp) :: work(6*params % pl**2 + 19*params % pl + 8) complex(dp) :: work_complex(2*params % pl+1) ! Local variables - integer :: i, j + integer :: i, j, l real(dp) :: c_parent(3), c_child(3), c_diff(3) ! Bottom-to-top pass - do i = constants % nclusters, 1, -1 - c_child = constants % cnode(:, i) - j = constants % parent(i) - if (j == 0) cycle - c_parent = constants % cnode(:, j) - c_diff = params % kappa*(c_parent - c_child) - call fmm_l2l_bessel_rotation_adj_work(c_diff, & - & constants % si_rnode(:, i), constants % si_rnode(:, j), & - & params % pl, constants % vscales, one, & - & node_l(:, i), one, node_l(:, j), work, work_complex) + do l = constants % nlayers, 1, -1 + !$omp parallel do default(none) shared(constants,params,node_l,l) & + !$omp private(i,j,c_parent,c_child,c_diff,work,work_complex) + do i = constants % layers(1, l), constants % layers(2, l) + c_child = constants % cnode(:, i) + j = constants % parent(i) + if (j == 0) cycle + c_parent = constants % cnode(:, j) + c_diff = params % kappa*(c_parent - c_child) + call fmm_l2l_bessel_rotation_adj_work(c_diff, & + & constants % si_rnode(:, i), constants % si_rnode(:, j), & + & params % pl, constants % vscales, one, & + & node_l(:, i), one, node_l(:, j), work, work_complex) + end do end do end subroutine tree_l2l_bessel_rotation_adj From 6864efa9dafdc2fde5c1703e4bd46d1bfc3c94a8 Mon Sep 17 00:00:00 2001 From: Michele Nottoli Date: Tue, 20 Jun 2023 14:34:43 +0200 Subject: [PATCH 14/16] Subroutine arguments are not declared anymore as omp private. --- src/ddx_core.f90 | 58 +++++++++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 25 deletions(-) diff --git a/src/ddx_core.f90 b/src/ddx_core.f90 index c4f72180..56134ea9 100644 --- a/src/ddx_core.f90 +++ b/src/ddx_core.f90 @@ -1860,7 +1860,7 @@ subroutine tree_m2m_rotation(params, constants, node_m) ! Output real(dp), intent(inout) :: node_m((params % pm+1)**2, constants % nclusters) ! Temporary workspace - real(dp) :: work(6*params % pm**2 + 19*params % pm + 8) + real(dp) :: work(6*params % pm**2 + 19*params % pm + 8, params % nproc) ! Call corresponding work routine call tree_m2m_rotation_work(params, constants, node_m, work) end subroutine tree_m2m_rotation @@ -1876,15 +1876,17 @@ subroutine tree_m2m_rotation_work(params, constants, node_m, work) ! Output real(dp), intent(inout) :: node_m((params % pm+1)**2, constants % nclusters) ! Temporary workspace - real(dp), intent(out) :: work(6*params % pm**2 + 19*params % pm + 8) + real(dp), intent(out) :: work(6*params % pm**2 + 19*params % pm + 8, & + & params % nproc) ! Local variables - integer :: i, j, l + integer :: i, j, l, iproc real(dp) :: c1(3), c(3), r1, r ! Bottom-to-top pass do l = constants % nlayers, 1, -1 - !$omp parallel do default(none) shared(constants,params,node_m,l) & - !$omp private(i,j,c1,c,r1,r,work) + !$omp parallel do default(none) shared(constants,params,node_m,l, & + !$omp work) private(i,j,c1,c,r1,r,iproc) do i = constants % layers(1, l), constants % layers(2, l) + iproc = omp_get_thread_num() + 1 ! Leaf node does not need any update if (constants % children(1, i) == 0) cycle c = constants % cnode(:, i) @@ -1898,7 +1900,7 @@ subroutine tree_m2m_rotation_work(params, constants, node_m, work) & params % pm, & & constants % vscales, & & constants % vcnk, one, & - & node_m(:, j), zero, node_m(:, i), work) + & node_m(:, j), zero, node_m(:, i), work(:, iproc)) ! All other children update the same output do j = constants % children(1, i)+1, constants % children(2, i) c1 = constants % cnode(:, j) @@ -1906,7 +1908,7 @@ subroutine tree_m2m_rotation_work(params, constants, node_m, work) c1 = c1 - c call fmm_m2m_rotation_work(c1, r1, r, params % pm, & & constants % vscales, constants % vcnk, one, & - & node_m(:, j), one, node_m(:, i), work) + & node_m(:, j), one, node_m(:, i), work(:, iproc)) end do end do end do @@ -1988,7 +1990,7 @@ subroutine tree_m2m_rotation_adj(params, constants, node_m) ! Output real(dp), intent(inout) :: node_m((params % pm+1)**2, constants % nclusters) ! Temporary workspace - real(dp) :: work(6*params % pm**2 + 19*params % pm + 8) + real(dp) :: work(6*params % pm**2 + 19*params % pm + 8, params % nproc) ! Call corresponding work routine call tree_m2m_rotation_adj_work(params, constants, node_m, work) end subroutine tree_m2m_rotation_adj @@ -2004,15 +2006,17 @@ subroutine tree_m2m_rotation_adj_work(params, constants, node_m, work) ! Output real(dp), intent(inout) :: node_m((params % pm+1)**2, constants % nclusters) ! Temporary workspace - real(dp), intent(out) :: work(6*params % pm**2 + 19*params % pm + 8) + real(dp), intent(out) :: work(6*params % pm**2 + 19*params % pm + 8, & + & params % nproc) ! Local variables - integer :: i, j, l + integer :: i, j, l, iproc real(dp) :: c1(3), c(3), r1, r ! Top-to-bottom pass do l = 2, constants % nlayers - !$omp parallel do default(none) shared(constants,params,node_m,l) & - !$omp private(i,j,c1,c,r1,r,work) + !$omp parallel do default(none) shared(constants,params,node_m,l, & + !$omp work) private(i,j,c1,c,r1,r,iproc) do i = constants % layers(1, l), constants % layers(2, l) + iproc = omp_get_thread_num() + 1 j = constants % parent(i) c = constants % cnode(:, j) r = constants % rnode(j) @@ -2021,7 +2025,7 @@ subroutine tree_m2m_rotation_adj_work(params, constants, node_m, work) c1 = c - c1 call fmm_m2m_rotation_adj_work(c1, r, r1, params % pm, & & constants % vscales, constants % vcnk, one, node_m(:, j), one, & - & node_m(:, i), work) + & node_m(:, i), work(:, iproc)) end do end do end subroutine tree_m2m_rotation_adj_work @@ -2083,7 +2087,7 @@ subroutine tree_l2l_rotation(params, constants, node_l) ! Output real(dp), intent(inout) :: node_l((params % pl+1)**2, constants % nclusters) ! Temporary workspace - real(dp) :: work(6*params % pl**2 + 19*params % pl + 8) + real(dp) :: work(6*params % pl**2 + 19*params % pl + 8, params % nproc) ! Call corresponding work routine call tree_l2l_rotation_work(params, constants, node_l, work) end subroutine tree_l2l_rotation @@ -2099,15 +2103,17 @@ subroutine tree_l2l_rotation_work(params, constants, node_l, work) ! Output real(dp), intent(inout) :: node_l((params % pl+1)**2, constants % nclusters) ! Temporary workspace - real(dp), intent(out) :: work(6*params % pl**2 + 19*params % pl + 8) + real(dp), intent(out) :: work(6*params % pl**2 + 19*params % pl + 8, & + & params % nproc) ! Local variables - integer :: i, j, l + integer :: i, j, l, iproc real(dp) :: c1(3), c(3), r1, r ! Top-to-bottom pass do l = 2, constants % nlayers - !$omp parallel do default(none) shared(constants,params,node_l,l) & - !$omp private(i,j,c1,c,r1,r,work) + !$omp parallel do default(none) shared(constants,params,node_l,l, & + !$omp work) private(i,j,c1,c,r1,r,iproc) do i = constants % layers(1, l), constants % layers(2, l) + iproc = omp_get_thread_num() + 1 j = constants % parent(i) c = constants % cnode(:, j) r = constants % rnode(j) @@ -2116,7 +2122,7 @@ subroutine tree_l2l_rotation_work(params, constants, node_l, work) c1 = c - c1 call fmm_l2l_rotation_work(c1, r, r1, params % pl, & & constants % vscales, constants % vfact, one, & - & node_l(:, j), one, node_l(:, i), work) + & node_l(:, j), one, node_l(:, i), work(:, iproc)) end do end do end subroutine tree_l2l_rotation_work @@ -2182,7 +2188,7 @@ subroutine tree_l2l_rotation_adj(params, constants, node_l) ! Output real(dp), intent(inout) :: node_l((params % pl+1)**2, constants % nclusters) ! Temporary workspace - real(dp) :: work(6*params % pl**2 + 19*params % pl + 8) + real(dp) :: work(6*params % pl**2 + 19*params % pl + 8, params % nproc) ! Call corresponding work routine call tree_l2l_rotation_adj_work(params, constants, node_l, work) end subroutine tree_l2l_rotation_adj @@ -2198,15 +2204,17 @@ subroutine tree_l2l_rotation_adj_work(params, constants, node_l, work) ! Output real(dp), intent(inout) :: node_l((params % pl+1)**2, constants % nclusters) ! Temporary workspace - real(dp), intent(out) :: work(6*params % pl**2 + 19*params % pl + 8) + real(dp), intent(out) :: work(6*params % pl**2 + 19*params % pl + 8, & + & params % nproc) ! Local variables - integer :: i, j, l + integer :: i, j, l, iproc real(dp) :: c1(3), c(3), r1, r ! Bottom-to-top pass do l = constants % nlayers, 1, -1 - !$omp parallel do default(none) shared(constants,params,node_l,l) & - !$omp private(i,j,c1,c,r1,r,work) + !$omp parallel do default(none) shared(constants,params,node_l,l, & + !$omp work) private(i,j,c1,c,r1,r,iproc) do i = constants % layers(1, l), constants % layers(2, l) + iproc = omp_get_thread_num() + 1 ! Leaf node does not need any update if (constants % children(1, i) == 0) cycle c = constants % cnode(:, i) @@ -2226,7 +2234,7 @@ subroutine tree_l2l_rotation_adj_work(params, constants, node_l, work) c1 = c1 - c call fmm_l2l_rotation_adj_work(c1, r1, r, params % pl, & & constants % vscales, constants % vfact, one, & - & node_l(:, j), one, node_l(:, i), work) + & node_l(:, j), one, node_l(:, i), work(:, iproc)) end do end do end do From 84f1866c61647efbe759eb18ccd92538207fb701 Mon Sep 17 00:00:00 2001 From: Michele Nottoli Date: Tue, 20 Jun 2023 17:02:28 +0200 Subject: [PATCH 15/16] Disabled omp in m2m. --- src/ddx_core.f90 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ddx_core.f90 b/src/ddx_core.f90 index 56134ea9..e376ebcc 100644 --- a/src/ddx_core.f90 +++ b/src/ddx_core.f90 @@ -1883,8 +1883,8 @@ subroutine tree_m2m_rotation_work(params, constants, node_m, work) real(dp) :: c1(3), c(3), r1, r ! Bottom-to-top pass do l = constants % nlayers, 1, -1 - !$omp parallel do default(none) shared(constants,params,node_m,l, & - !$omp work) private(i,j,c1,c,r1,r,iproc) + !!$omp parallel do default(none) shared(constants,params,node_m,l, & + !!$omp work) private(i,j,c1,c,r1,r,iproc) do i = constants % layers(1, l), constants % layers(2, l) iproc = omp_get_thread_num() + 1 ! Leaf node does not need any update From 2eebd24db0eda9c274ce0cda137ac0f9d73271e7 Mon Sep 17 00:00:00 2001 From: Michele Nottoli Date: Tue, 20 Jun 2023 17:21:19 +0200 Subject: [PATCH 16/16] Reenabled openmp in m2m. --- src/ddx_core.f90 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ddx_core.f90 b/src/ddx_core.f90 index e376ebcc..56134ea9 100644 --- a/src/ddx_core.f90 +++ b/src/ddx_core.f90 @@ -1883,8 +1883,8 @@ subroutine tree_m2m_rotation_work(params, constants, node_m, work) real(dp) :: c1(3), c(3), r1, r ! Bottom-to-top pass do l = constants % nlayers, 1, -1 - !!$omp parallel do default(none) shared(constants,params,node_m,l, & - !!$omp work) private(i,j,c1,c,r1,r,iproc) + !$omp parallel do default(none) shared(constants,params,node_m,l, & + !$omp work) private(i,j,c1,c,r1,r,iproc) do i = constants % layers(1, l), constants % layers(2, l) iproc = omp_get_thread_num() + 1 ! Leaf node does not need any update