From 4b23b44cf698eb652766455e4b402b1e258e5db7 Mon Sep 17 00:00:00 2001 From: Jeff Larkin Date: Tue, 6 Dec 2022 10:36:11 -0500 Subject: [PATCH 1/9] Merged from upstream --- cpp/YAKL | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/YAKL b/cpp/YAKL index a298bdef..0d171b2c 160000 --- a/cpp/YAKL +++ b/cpp/YAKL @@ -1 +1 @@ -Subproject commit a298bdefbf353b2a5c49688960b2dce2c24c7fc4 +Subproject commit 0d171b2c4a3b80c2c8bab03bef1228361c1eca07 From a9968ab459c6c2758c37215fac01f0fcb0b96dd7 Mon Sep 17 00:00:00 2001 From: Jeff Larkin Date: Wed, 17 Apr 2024 13:20:37 -0400 Subject: [PATCH 2/9] Added new for_each and for_each_n versions. --- c/miniWeather_serial_for_each.cpp | 885 ++++++++++++++++++++++++++++ c/miniWeather_serial_for_each_n.cpp | 877 +++++++++++++++++++++++++++ 2 files changed, 1762 insertions(+) create mode 100644 c/miniWeather_serial_for_each.cpp create mode 100644 c/miniWeather_serial_for_each_n.cpp diff --git a/c/miniWeather_serial_for_each.cpp b/c/miniWeather_serial_for_each.cpp new file mode 100644 index 00000000..2203aa7a --- /dev/null +++ b/c/miniWeather_serial_for_each.cpp @@ -0,0 +1,885 @@ + +////////////////////////////////////////////////////////////////////////////////////////// +// miniWeather +// Author: Matt Norman , Oak Ridge National Laboratory +// This code simulates dry, stratified, compressible, non-hydrostatic fluid flows +// For documentation, please see the attached documentation in the "documentation" folder +// +////////////////////////////////////////////////////////////////////////////////////////// +#define _NX 200 +#define _NZ 100 +#define _SIM_TIME 100 +#define _OUT_FREQ 200 +#define _DATA_SPEC DATA_SPEC_INJECTION + +#include +#include +#include +#include +#include +#include +#ifndef NO_PNETCDF +#include "pnetcdf.h" +#endif +#include +#include +#include +#include + +// These should become unnecessary with the addition of cartesian_product +constexpr std::tuple idx2d(int idx, int nx) { return {idx%nx, idx/nx}; } +constexpr std::tuple idx3d(int idx, int nx, int nz) { return {idx%nx, (idx/nx)%nz, idx / (nx*nz)}; } + +constexpr double pi = 3.14159265358979323846264338327; //Pi +constexpr double grav = 9.8; //Gravitational acceleration (m / s^2) +constexpr double cp = 1004.; //Specific heat of dry air at constant pressure +constexpr double cv = 717.; //Specific heat of dry air at constant volume +constexpr double rd = 287.; //Dry air constant for equation of state (P=rho*rd*T) +constexpr double p0 = 1.e5; //Standard pressure at the surface in Pascals +constexpr double C0 = 27.5629410929725921310572974482; //Constant to translate potential temperature into pressure (P=C0*(rho*theta)**gamma) +constexpr double gamm = 1.40027894002789400278940027894; //gamma=cp/Rd , have to call this gamm because "gamma" is taken (I hate C so much) +//Define domain and stability-related constants +constexpr double xlen = 2.e4; //Length of the domain in the x-direction (meters) +constexpr double zlen = 1.e4; //Length of the domain in the z-direction (meters) +constexpr double hv_beta = 0.05; //How strong to diffuse the solution: hv_beta \in [0:1] +constexpr double cfl = 1.50; //"Courant, Friedrichs, Lewy" number (for numerical stability) +constexpr double max_speed = 450; //Assumed maximum wave speed during the simulation (speed of sound + speed of wind) (meter / sec) +constexpr int hs = 2; //"Halo" size: number of cells beyond the MPI tasks's domain needed for a full "stencil" of information for reconstruction +constexpr int sten_size = 4; //Size of the stencil used for interpolation + +//Parameters for indexing and flags +constexpr int NUM_VARS = 4; //Number of fluid state variables +constexpr int ID_DENS = 0; //index for density ("rho") +constexpr int ID_UMOM = 1; //index for momentum in the x-direction ("rho * u") +constexpr int ID_WMOM = 2; //index for momentum in the z-direction ("rho * w") +constexpr int ID_RHOT = 3; //index for density * potential temperature ("rho * theta") +constexpr int DIR_X = 1; //Integer constant to express that this operation is in the x-direction +constexpr int DIR_Z = 2; //Integer constant to express that this operation is in the z-direction +constexpr int DATA_SPEC_COLLISION = 1; +constexpr int DATA_SPEC_THERMAL = 2; +constexpr int DATA_SPEC_GRAVITY_WAVES = 3; +constexpr int DATA_SPEC_DENSITY_CURRENT = 5; +constexpr int DATA_SPEC_INJECTION = 6; + +constexpr int nqpoints = 3; +constexpr double qpoints [] = { 0.112701665379258311482073460022E0 , 0.500000000000000000000000000000E0 , 0.887298334620741688517926539980E0 }; +constexpr double qweights[] = { 0.277777777777777777777777777779E0 , 0.444444444444444444444444444444E0 , 0.277777777777777777777777777779E0 }; + +/////////////////////////////////////////////////////////////////////////////////////// +// BEGIN USER-CONFIGURABLE PARAMETERS +/////////////////////////////////////////////////////////////////////////////////////// +//The x-direction length is twice as long as the z-direction length +//So, you'll want to have nx_glob be twice as large as nz_glob +int constexpr nx_glob = _NX; //Number of total cells in the x-direction +int constexpr nz_glob = _NZ; //Number of total cells in the z-direction +double constexpr sim_time = _SIM_TIME; //How many seconds to run the simulation +double constexpr output_freq = _OUT_FREQ; //How frequently to output data to file (in seconds) +int constexpr data_spec_int = _DATA_SPEC; //How to initialize the data +double constexpr dx = xlen / nx_glob; // grid spacing in the x-direction +double constexpr dz = zlen / nz_glob; // grid spacing in the x-direction +/////////////////////////////////////////////////////////////////////////////////////// +// END USER-CONFIGURABLE PARAMETERS +/////////////////////////////////////////////////////////////////////////////////////// + +/////////////////////////////////////////////////////////////////////////////////////// +// Variables that are initialized but remain static over the course of the simulation +/////////////////////////////////////////////////////////////////////////////////////// +double dt; //Model time step (seconds) +int nx, nz; //Number of local grid cells in the x- and z- dimensions for this MPI task +int i_beg, k_beg; //beginning index in the x- and z-directions for this MPI task +int nranks, myrank; //Number of MPI ranks and my rank id +int left_rank, right_rank; //MPI Rank IDs that exist to my left and right in the global domain +int mainproc; //Am I the main process (rank == 0)? +double *hy_dens_cell; //hydrostatic density (vert cell avgs). Dimensions: (1-hs:nz+hs) +double *hy_dens_theta_cell; //hydrostatic rho*t (vert cell avgs). Dimensions: (1-hs:nz+hs) +double *hy_dens_int; //hydrostatic density (vert cell interf). Dimensions: (1:nz+1) +double *hy_dens_theta_int; //hydrostatic rho*t (vert cell interf). Dimensions: (1:nz+1) +double *hy_pressure_int; //hydrostatic press (vert cell interf). Dimensions: (1:nz+1) + +/////////////////////////////////////////////////////////////////////////////////////// +// Variables that are dynamics over the course of the simulation +/////////////////////////////////////////////////////////////////////////////////////// +double etime; //Elapsed model time +double output_counter; //Helps determine when it's time to do output +//Runtime variable arrays +double *state; //Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) +double *state_tmp; //Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) +double *flux; //Cell interface fluxes. Dimensions: (nx+1,nz+1,NUM_VARS) +double *tend; //Fluid state tendencies. Dimensions: (nx,nz,NUM_VARS) +int num_out = 0; //The number of outputs performed so far +int direction_switch = 1; +double mass0, te0; //Initial domain totals for mass and total energy +double mass , te ; //Domain totals for mass and total energy + +//How is this not in the standard?! +double dmin( double a , double b ) { if (a sim_time) { dt = sim_time - etime; } + //Perform a single time step + perform_timestep(state,state_tmp,flux,tend,dt); + //Inform the user +#ifndef NO_INFORM + if (mainproc) { printf( "Elapsed Time: %lf / %lf\n", etime , sim_time ); } +#endif + //Update the elapsed time and output counter + etime = etime + dt; + output_counter = output_counter + dt; + //If it's time for output, reset the counter, and do output + if (output_counter >= output_freq) { + output_counter = output_counter - output_freq; + output(state,etime); + } + } + auto t2 = std::chrono::steady_clock::now(); + if (mainproc) { + std::cout << "CPU Time: " << std::chrono::duration(t2-t1).count() << " sec\n"; + } + + //Final reductions for mass, kinetic energy, and total energy + reductions(mass,te); + + if (mainproc) { + printf( "d_mass: %le\n" , (mass - mass0)/mass0 ); + printf( "d_te: %le\n" , (te - te0 )/te0 ); + } + + finalize(); +} + + +//Performs a single dimensionally split time step using a simple low-storage three-stage Runge-Kutta time integrator +//The dimensional splitting is a second-order-accurate alternating Strang splitting in which the +//order of directions is alternated each time step. +//The Runge-Kutta method used here is defined as follows: +// q* = q[n] + dt/3 * rhs(q[n]) +// q** = q[n] + dt/2 * rhs(q* ) +// q[n+1] = q[n] + dt/1 * rhs(q** ) +void perform_timestep( double *state , double *state_tmp , double *flux , double *tend , double dt ) { + if (direction_switch) { + //x-direction first + semi_discrete_step( state , state , state_tmp , dt / 3 , DIR_X , flux , tend ); + semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_X , flux , tend ); + semi_discrete_step( state , state_tmp , state , dt / 1 , DIR_X , flux , tend ); + //z-direction second + semi_discrete_step( state , state , state_tmp , dt / 3 , DIR_Z , flux , tend ); + semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_Z , flux , tend ); + semi_discrete_step( state , state_tmp , state , dt / 1 , DIR_Z , flux , tend ); + } else { + //z-direction second + semi_discrete_step( state , state , state_tmp , dt / 3 , DIR_Z , flux , tend ); + semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_Z , flux , tend ); + semi_discrete_step( state , state_tmp , state , dt / 1 , DIR_Z , flux , tend ); + //x-direction first + semi_discrete_step( state , state , state_tmp , dt / 3 , DIR_X , flux , tend ); + semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_X , flux , tend ); + semi_discrete_step( state , state_tmp , state , dt / 1 , DIR_X , flux , tend ); + } + if (direction_switch) { direction_switch = 0; } else { direction_switch = 1; } +} + + +//Perform a single semi-discretized step in time with the form: +//state_out = state_init + dt * rhs(state_forcing) +//Meaning the step starts from state_init, computes the rhs using state_forcing, and stores the result in state_out +void semi_discrete_step( double *state_init , double *state_forcing , double *state_out , double dt , int dir , double *flux , double *tend ) { + int i, k, ll, inds, indt, indw; + double x, z, wpert, dist, x0, z0, xrad, zrad, amp; + if (dir == DIR_X) { + //Set the halo values for this MPI task's fluid state in the x-direction + set_halo_values_x(state_forcing); + //Compute the time tendencies for the fluid state in the x-direction + compute_tendencies_x(state_forcing,flux,tend,dt); + } else if (dir == DIR_Z) { + //Set the halo values for this MPI task's fluid state in the z-direction + set_halo_values_z(state_forcing); + //Compute the time tendencies for the fluid state in the z-direction + compute_tendencies_z(state_forcing,flux,tend,dt); + } + + ///////////////////////////////////////////////// + // TODO: THREAD ME + ///////////////////////////////////////////////// + //Apply the tendencies to the fluid state + auto bounds = std::views::iota(0,(int)(NUM_VARS*nz*nx)); + std::for_each(std::execution::par_unseq,bounds.begin(),bounds.end(),[=,nx=nx,nz=nz,hy_dens_cell=hy_dens_cell,i_beg=i_beg,k_beg=k_beg](int idx){ + auto [i,k,ll] = idx3d(idx,nx,nz); + if (data_spec_int == DATA_SPEC_GRAVITY_WAVES) { + double x = (i_beg + i+0.5)*dx; + double z = (k_beg + k+0.5)*dz; + double wpert; + // Using sample_ellipse_cosine requires "acc routine" in OpenACC and "declare target" in OpenMP offload + // Neither of these are particularly well supported. So I'm manually inlining here + // wpert = sample_ellipse_cosine( x,z , 0.01 , xlen/8,1000., 500.,500. ); + { + double x0 = xlen/8; + double z0 = 1000; + double xrad = 500; + double zrad = 500; + double amp = 0.01; + //Compute distance from bubble center + double dist = sqrt( ((x-x0)/xrad)*((x-x0)/xrad) + ((z-z0)/zrad)*((z-z0)/zrad) ) * pi / 2.; + //If the distance from bubble center is less than the radius, create a cos**2 profile + if (dist <= pi / 2.) { + wpert = amp * pow(cos(dist),2.); + } else { + wpert = 0.; + } + } + int indw = ID_WMOM*nz*nx + k*nx + i; + tend[indw] += wpert*hy_dens_cell[hs+k]; + } + int inds = ll*(nz+2*hs)*(nx+2*hs) + (k+hs)*(nx+2*hs) + i+hs; + int indt = ll*nz*nx + k*nx + i; + state_out[inds] = state_init[inds] + dt * tend[indt]; + }); +} + + +//Compute the time tendencies of the fluid state using forcing in the x-direction +//Since the halos are set in a separate routine, this will not require MPI +//First, compute the flux vector at each cell interface in the x-direction (including hyperviscosity) +//Then, compute the tendencies using those fluxes +void compute_tendencies_x( double *state , double *flux , double *tend , double dt) { + double hv_coef; + //Compute the hyperviscosity coefficient + hv_coef = -hv_beta * dx / (16*dt); + ///////////////////////////////////////////////// + // TODO: THREAD ME + ///////////////////////////////////////////////// + //Compute fluxes in the x-direction for each cell + auto bounds1 = std::views::iota(0,(int)(nz*(nx+1))); + std::for_each(std::execution::par_unseq,bounds1.begin(),bounds1.end(),[=,nx=nx,nz=nz,hy_dens_cell=hy_dens_cell,hy_dens_theta_cell=hy_dens_theta_cell](int idx){ + auto [i,k] = idx2d(idx,nx+1); + double stencil[4], d3_vals[NUM_VARS],vals[NUM_VARS]; + //Use fourth-order interpolation from four cell averages to compute the value at the interface in question + for (int ll=0; ll , Oak Ridge National Laboratory +// This code simulates dry, stratified, compressible, non-hydrostatic fluid flows +// For documentation, please see the attached documentation in the "documentation" folder +// +////////////////////////////////////////////////////////////////////////////////////////// +#define _NX 200 +#define _NZ 100 +#define _SIM_TIME 100 +#define _OUT_FREQ 200 +#define _DATA_SPEC DATA_SPEC_INJECTION + +#include +#include +#include +#include +#include +#include +#ifndef NO_PNETCDF +#include "pnetcdf.h" +#endif +#include +#include +#include +#include + +// These should become unnecessary with the addition of cartesian_product +constexpr std::tuple idx2d(int idx, int nx) { return {idx%nx, idx/nx}; } +constexpr std::tuple idx3d(int idx, int nx, int nz) { return {idx%nx, (idx/nx)%nz, idx / (nx*nz)}; } + +constexpr double pi = 3.14159265358979323846264338327; //Pi +constexpr double grav = 9.8; //Gravitational acceleration (m / s^2) +constexpr double cp = 1004.; //Specific heat of dry air at constant pressure +constexpr double cv = 717.; //Specific heat of dry air at constant volume +constexpr double rd = 287.; //Dry air constant for equation of state (P=rho*rd*T) +constexpr double p0 = 1.e5; //Standard pressure at the surface in Pascals +constexpr double C0 = 27.5629410929725921310572974482; //Constant to translate potential temperature into pressure (P=C0*(rho*theta)**gamma) +constexpr double gamm = 1.40027894002789400278940027894; //gamma=cp/Rd , have to call this gamm because "gamma" is taken (I hate C so much) +//Define domain and stability-related constants +constexpr double xlen = 2.e4; //Length of the domain in the x-direction (meters) +constexpr double zlen = 1.e4; //Length of the domain in the z-direction (meters) +constexpr double hv_beta = 0.05; //How strong to diffuse the solution: hv_beta \in [0:1] +constexpr double cfl = 1.50; //"Courant, Friedrichs, Lewy" number (for numerical stability) +constexpr double max_speed = 450; //Assumed maximum wave speed during the simulation (speed of sound + speed of wind) (meter / sec) +constexpr int hs = 2; //"Halo" size: number of cells beyond the MPI tasks's domain needed for a full "stencil" of information for reconstruction +constexpr int sten_size = 4; //Size of the stencil used for interpolation + +//Parameters for indexing and flags +constexpr int NUM_VARS = 4; //Number of fluid state variables +constexpr int ID_DENS = 0; //index for density ("rho") +constexpr int ID_UMOM = 1; //index for momentum in the x-direction ("rho * u") +constexpr int ID_WMOM = 2; //index for momentum in the z-direction ("rho * w") +constexpr int ID_RHOT = 3; //index for density * potential temperature ("rho * theta") +constexpr int DIR_X = 1; //Integer constant to express that this operation is in the x-direction +constexpr int DIR_Z = 2; //Integer constant to express that this operation is in the z-direction +constexpr int DATA_SPEC_COLLISION = 1; +constexpr int DATA_SPEC_THERMAL = 2; +constexpr int DATA_SPEC_GRAVITY_WAVES = 3; +constexpr int DATA_SPEC_DENSITY_CURRENT = 5; +constexpr int DATA_SPEC_INJECTION = 6; + +constexpr int nqpoints = 3; +constexpr double qpoints [] = { 0.112701665379258311482073460022E0 , 0.500000000000000000000000000000E0 , 0.887298334620741688517926539980E0 }; +constexpr double qweights[] = { 0.277777777777777777777777777779E0 , 0.444444444444444444444444444444E0 , 0.277777777777777777777777777779E0 }; + +/////////////////////////////////////////////////////////////////////////////////////// +// BEGIN USER-CONFIGURABLE PARAMETERS +/////////////////////////////////////////////////////////////////////////////////////// +//The x-direction length is twice as long as the z-direction length +//So, you'll want to have nx_glob be twice as large as nz_glob +int constexpr nx_glob = _NX; //Number of total cells in the x-direction +int constexpr nz_glob = _NZ; //Number of total cells in the z-direction +double constexpr sim_time = _SIM_TIME; //How many seconds to run the simulation +double constexpr output_freq = _OUT_FREQ; //How frequently to output data to file (in seconds) +int constexpr data_spec_int = _DATA_SPEC; //How to initialize the data +double constexpr dx = xlen / nx_glob; // grid spacing in the x-direction +double constexpr dz = zlen / nz_glob; // grid spacing in the x-direction +/////////////////////////////////////////////////////////////////////////////////////// +// END USER-CONFIGURABLE PARAMETERS +/////////////////////////////////////////////////////////////////////////////////////// + +/////////////////////////////////////////////////////////////////////////////////////// +// Variables that are initialized but remain static over the course of the simulation +/////////////////////////////////////////////////////////////////////////////////////// +double dt; //Model time step (seconds) +int nx, nz; //Number of local grid cells in the x- and z- dimensions for this MPI task +int i_beg, k_beg; //beginning index in the x- and z-directions for this MPI task +int nranks, myrank; //Number of MPI ranks and my rank id +int left_rank, right_rank; //MPI Rank IDs that exist to my left and right in the global domain +int mainproc; //Am I the main process (rank == 0)? +double *hy_dens_cell; //hydrostatic density (vert cell avgs). Dimensions: (1-hs:nz+hs) +double *hy_dens_theta_cell; //hydrostatic rho*t (vert cell avgs). Dimensions: (1-hs:nz+hs) +double *hy_dens_int; //hydrostatic density (vert cell interf). Dimensions: (1:nz+1) +double *hy_dens_theta_int; //hydrostatic rho*t (vert cell interf). Dimensions: (1:nz+1) +double *hy_pressure_int; //hydrostatic press (vert cell interf). Dimensions: (1:nz+1) + +/////////////////////////////////////////////////////////////////////////////////////// +// Variables that are dynamics over the course of the simulation +/////////////////////////////////////////////////////////////////////////////////////// +double etime; //Elapsed model time +double output_counter; //Helps determine when it's time to do output +//Runtime variable arrays +double *state; //Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) +double *state_tmp; //Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) +double *flux; //Cell interface fluxes. Dimensions: (nx+1,nz+1,NUM_VARS) +double *tend; //Fluid state tendencies. Dimensions: (nx,nz,NUM_VARS) +int num_out = 0; //The number of outputs performed so far +int direction_switch = 1; +double mass0, te0; //Initial domain totals for mass and total energy +double mass , te ; //Domain totals for mass and total energy + +//How is this not in the standard?! +double dmin( double a , double b ) { if (a sim_time) { dt = sim_time - etime; } + //Perform a single time step + perform_timestep(state,state_tmp,flux,tend,dt); + //Inform the user +#ifndef NO_INFORM + if (mainproc) { printf( "Elapsed Time: %lf / %lf\n", etime , sim_time ); } +#endif + //Update the elapsed time and output counter + etime = etime + dt; + output_counter = output_counter + dt; + //If it's time for output, reset the counter, and do output + if (output_counter >= output_freq) { + output_counter = output_counter - output_freq; + output(state,etime); + } + } + auto t2 = std::chrono::steady_clock::now(); + if (mainproc) { + std::cout << "CPU Time: " << std::chrono::duration(t2-t1).count() << " sec\n"; + } + + //Final reductions for mass, kinetic energy, and total energy + reductions(mass,te); + + if (mainproc) { + printf( "d_mass: %le\n" , (mass - mass0)/mass0 ); + printf( "d_te: %le\n" , (te - te0 )/te0 ); + } + + finalize(); +} + + +//Performs a single dimensionally split time step using a simple low-storage three-stage Runge-Kutta time integrator +//The dimensional splitting is a second-order-accurate alternating Strang splitting in which the +//order of directions is alternated each time step. +//The Runge-Kutta method used here is defined as follows: +// q* = q[n] + dt/3 * rhs(q[n]) +// q** = q[n] + dt/2 * rhs(q* ) +// q[n+1] = q[n] + dt/1 * rhs(q** ) +void perform_timestep( double *state , double *state_tmp , double *flux , double *tend , double dt ) { + if (direction_switch) { + //x-direction first + semi_discrete_step( state , state , state_tmp , dt / 3 , DIR_X , flux , tend ); + semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_X , flux , tend ); + semi_discrete_step( state , state_tmp , state , dt / 1 , DIR_X , flux , tend ); + //z-direction second + semi_discrete_step( state , state , state_tmp , dt / 3 , DIR_Z , flux , tend ); + semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_Z , flux , tend ); + semi_discrete_step( state , state_tmp , state , dt / 1 , DIR_Z , flux , tend ); + } else { + //z-direction second + semi_discrete_step( state , state , state_tmp , dt / 3 , DIR_Z , flux , tend ); + semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_Z , flux , tend ); + semi_discrete_step( state , state_tmp , state , dt / 1 , DIR_Z , flux , tend ); + //x-direction first + semi_discrete_step( state , state , state_tmp , dt / 3 , DIR_X , flux , tend ); + semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_X , flux , tend ); + semi_discrete_step( state , state_tmp , state , dt / 1 , DIR_X , flux , tend ); + } + if (direction_switch) { direction_switch = 0; } else { direction_switch = 1; } +} + + +//Perform a single semi-discretized step in time with the form: +//state_out = state_init + dt * rhs(state_forcing) +//Meaning the step starts from state_init, computes the rhs using state_forcing, and stores the result in state_out +void semi_discrete_step( double *state_init , double *state_forcing , double *state_out , double dt , int dir , double *flux , double *tend ) { + int i, k, ll, inds, indt, indw; + double x, z, wpert, dist, x0, z0, xrad, zrad, amp; + if (dir == DIR_X) { + //Set the halo values for this MPI task's fluid state in the x-direction + set_halo_values_x(state_forcing); + //Compute the time tendencies for the fluid state in the x-direction + compute_tendencies_x(state_forcing,flux,tend,dt); + } else if (dir == DIR_Z) { + //Set the halo values for this MPI task's fluid state in the z-direction + set_halo_values_z(state_forcing); + //Compute the time tendencies for the fluid state in the z-direction + compute_tendencies_z(state_forcing,flux,tend,dt); + } + + ///////////////////////////////////////////////// + // TODO: THREAD ME + ///////////////////////////////////////////////// + //Apply the tendencies to the fluid state + std::for_each_n(std::execution::par_unseq,std::views::iota(0).begin(),(NUM_VARS*nz*nx),[=,nx=nx,nz=nz,hy_dens_cell=hy_dens_cell,i_beg=i_beg,k_beg=k_beg](int idx){ + auto [i,k,ll] = idx3d(idx,nx,nz); + if (data_spec_int == DATA_SPEC_GRAVITY_WAVES) { + double x = (i_beg + i+0.5)*dx; + double z = (k_beg + k+0.5)*dz; + double wpert; + // Using sample_ellipse_cosine requires "acc routine" in OpenACC and "declare target" in OpenMP offload + // Neither of these are particularly well supported. So I'm manually inlining here + // wpert = sample_ellipse_cosine( x,z , 0.01 , xlen/8,1000., 500.,500. ); + { + double x0 = xlen/8; + double z0 = 1000; + double xrad = 500; + double zrad = 500; + double amp = 0.01; + //Compute distance from bubble center + double dist = sqrt( ((x-x0)/xrad)*((x-x0)/xrad) + ((z-z0)/zrad)*((z-z0)/zrad) ) * pi / 2.; + //If the distance from bubble center is less than the radius, create a cos**2 profile + if (dist <= pi / 2.) { + wpert = amp * pow(cos(dist),2.); + } else { + wpert = 0.; + } + } + int indw = ID_WMOM*nz*nx + k*nx + i; + tend[indw] += wpert*hy_dens_cell[hs+k]; + } + int inds = ll*(nz+2*hs)*(nx+2*hs) + (k+hs)*(nx+2*hs) + i+hs; + int indt = ll*nz*nx + k*nx + i; + state_out[inds] = state_init[inds] + dt * tend[indt]; + }); +} + + +//Compute the time tendencies of the fluid state using forcing in the x-direction +//Since the halos are set in a separate routine, this will not require MPI +//First, compute the flux vector at each cell interface in the x-direction (including hyperviscosity) +//Then, compute the tendencies using those fluxes +void compute_tendencies_x( double *state , double *flux , double *tend , double dt) { + double hv_coef; + //Compute the hyperviscosity coefficient + hv_coef = -hv_beta * dx / (16*dt); + ///////////////////////////////////////////////// + // TODO: THREAD ME + ///////////////////////////////////////////////// + //Compute fluxes in the x-direction for each cell + std::for_each_n(std::execution::par_unseq,std::views::iota(0).begin(),(nz*(nx+1)),[=,nx=nx,nz=nz,hy_dens_cell=hy_dens_cell,hy_dens_theta_cell=hy_dens_theta_cell](int idx){ + auto [i,k] = idx2d(idx,nx+1); + double stencil[4], d3_vals[NUM_VARS],vals[NUM_VARS]; + //Use fourth-order interpolation from four cell averages to compute the value at the interface in question + for (int ll=0; ll Date: Wed, 17 Apr 2024 13:53:45 -0400 Subject: [PATCH 3/9] Added stdpar to MPI version --- c/miniWeather_mpi_for_each.cpp | 910 ++++++++++++++++++++++++++++++ c/miniWeather_mpi_for_each_n.cpp | 901 +++++++++++++++++++++++++++++ c/miniWeather_serial_for_each.cpp | 4 - 3 files changed, 1811 insertions(+), 4 deletions(-) create mode 100644 c/miniWeather_mpi_for_each.cpp create mode 100644 c/miniWeather_mpi_for_each_n.cpp diff --git a/c/miniWeather_mpi_for_each.cpp b/c/miniWeather_mpi_for_each.cpp new file mode 100644 index 00000000..82ca2acb --- /dev/null +++ b/c/miniWeather_mpi_for_each.cpp @@ -0,0 +1,910 @@ + +////////////////////////////////////////////////////////////////////////////////////////// +// miniWeather +// Author: Matt Norman , Oak Ridge National Laboratory +// This code simulates dry, stratified, compressible, non-hydrostatic fluid flows +// For documentation, please see the attached documentation in the "documentation" folder +// +////////////////////////////////////////////////////////////////////////////////////////// +#define _NX 200 +#define _NZ 100 +#define _SIM_TIME 100 +#define _OUT_FREQ 200 +#define _DATA_SPEC DATA_SPEC_INJECTION + +#include +#include +#include +#include +#include +#include +#ifndef NO_PNETCDF +#include "pnetcdf.h" +#endif +#include +#include +#include +#include + +// These should become unnecessary with the addition of cartesian_product +constexpr std::tuple idx2d(int idx, int nx) { return {idx%nx, idx/nx}; } +constexpr std::tuple idx3d(int idx, int nx, int nz) { return {idx%nx, (idx/nx)%nz, idx / (nx*nz)}; } + +constexpr double pi = 3.14159265358979323846264338327; //Pi +constexpr double grav = 9.8; //Gravitational acceleration (m / s^2) +constexpr double cp = 1004.; //Specific heat of dry air at constant pressure +constexpr double cv = 717.; //Specific heat of dry air at constant volume +constexpr double rd = 287.; //Dry air constant for equation of state (P=rho*rd*T) +constexpr double p0 = 1.e5; //Standard pressure at the surface in Pascals +constexpr double C0 = 27.5629410929725921310572974482; //Constant to translate potential temperature into pressure (P=C0*(rho*theta)**gamma) +constexpr double gamm = 1.40027894002789400278940027894; //gamma=cp/Rd , have to call this gamm because "gamma" is taken (I hate C so much) +//Define domain and stability-related constants +constexpr double xlen = 2.e4; //Length of the domain in the x-direction (meters) +constexpr double zlen = 1.e4; //Length of the domain in the z-direction (meters) +constexpr double hv_beta = 0.05; //How strong to diffuse the solution: hv_beta \in [0:1] +constexpr double cfl = 1.50; //"Courant, Friedrichs, Lewy" number (for numerical stability) +constexpr double max_speed = 450; //Assumed maximum wave speed during the simulation (speed of sound + speed of wind) (meter / sec) +constexpr int hs = 2; //"Halo" size: number of cells beyond the MPI tasks's domain needed for a full "stencil" of information for reconstruction +constexpr int sten_size = 4; //Size of the stencil used for interpolation + +//Parameters for indexing and flags +constexpr int NUM_VARS = 4; //Number of fluid state variables +constexpr int ID_DENS = 0; //index for density ("rho") +constexpr int ID_UMOM = 1; //index for momentum in the x-direction ("rho * u") +constexpr int ID_WMOM = 2; //index for momentum in the z-direction ("rho * w") +constexpr int ID_RHOT = 3; //index for density * potential temperature ("rho * theta") +constexpr int DIR_X = 1; //Integer constant to express that this operation is in the x-direction +constexpr int DIR_Z = 2; //Integer constant to express that this operation is in the z-direction +constexpr int DATA_SPEC_COLLISION = 1; +constexpr int DATA_SPEC_THERMAL = 2; +constexpr int DATA_SPEC_GRAVITY_WAVES = 3; +constexpr int DATA_SPEC_DENSITY_CURRENT = 5; +constexpr int DATA_SPEC_INJECTION = 6; + +constexpr int nqpoints = 3; +constexpr double qpoints [] = { 0.112701665379258311482073460022E0 , 0.500000000000000000000000000000E0 , 0.887298334620741688517926539980E0 }; +constexpr double qweights[] = { 0.277777777777777777777777777779E0 , 0.444444444444444444444444444444E0 , 0.277777777777777777777777777779E0 }; + +/////////////////////////////////////////////////////////////////////////////////////// +// BEGIN USER-CONFIGURABLE PARAMETERS +/////////////////////////////////////////////////////////////////////////////////////// +//The x-direction length is twice as long as the z-direction length +//So, you'll want to have nx_glob be twice as large as nz_glob +int constexpr nx_glob = _NX; //Number of total cells in the x-direction +int constexpr nz_glob = _NZ; //Number of total cells in the z-direction +double constexpr sim_time = _SIM_TIME; //How many seconds to run the simulation +double constexpr output_freq = _OUT_FREQ; //How frequently to output data to file (in seconds) +int constexpr data_spec_int = _DATA_SPEC; //How to initialize the data +double constexpr dx = xlen / nx_glob; // grid spacing in the x-direction +double constexpr dz = zlen / nz_glob; // grid spacing in the x-direction +/////////////////////////////////////////////////////////////////////////////////////// +// END USER-CONFIGURABLE PARAMETERS +/////////////////////////////////////////////////////////////////////////////////////// + +/////////////////////////////////////////////////////////////////////////////////////// +// Variables that are initialized but remain static over the course of the simulation +/////////////////////////////////////////////////////////////////////////////////////// +double dt; //Model time step (seconds) +int nx, nz; //Number of local grid cells in the x- and z- dimensions for this MPI task +int i_beg, k_beg; //beginning index in the x- and z-directions for this MPI task +int nranks, myrank; //Number of MPI ranks and my rank id +int left_rank, right_rank; //MPI Rank IDs that exist to my left and right in the global domain +int mainproc; //Am I the main process (rank == 0)? +double *hy_dens_cell; //hydrostatic density (vert cell avgs). Dimensions: (1-hs:nz+hs) +double *hy_dens_theta_cell; //hydrostatic rho*t (vert cell avgs). Dimensions: (1-hs:nz+hs) +double *hy_dens_int; //hydrostatic density (vert cell interf). Dimensions: (1:nz+1) +double *hy_dens_theta_int; //hydrostatic rho*t (vert cell interf). Dimensions: (1:nz+1) +double *hy_pressure_int; //hydrostatic press (vert cell interf). Dimensions: (1:nz+1) + +/////////////////////////////////////////////////////////////////////////////////////// +// Variables that are dynamics over the course of the simulation +/////////////////////////////////////////////////////////////////////////////////////// +double etime; //Elapsed model time +double output_counter; //Helps determine when it's time to do output +//Runtime variable arrays +double *state; //Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) +double *state_tmp; //Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) +double *flux; //Cell interface fluxes. Dimensions: (nx+1,nz+1,NUM_VARS) +double *tend; //Fluid state tendencies. Dimensions: (nx,nz,NUM_VARS) +double *sendbuf_l; //Buffer to send data to the left MPI rank +double *sendbuf_r; //Buffer to send data to the right MPI rank +double *recvbuf_l; //Buffer to receive data from the left MPI rank +double *recvbuf_r; //Buffer to receive data from the right MPI rank +int num_out = 0; //The number of outputs performed so far +int direction_switch = 1; +double mass0, te0; //Initial domain totals for mass and total energy +double mass , te ; //Domain totals for mass and total energy + +//How is this not in the standard?! +double dmin( double a , double b ) { if (a= 0) output(state,etime); + + //////////////////////////////////////////////////// + // MAIN TIME STEP LOOP + //////////////////////////////////////////////////// + auto t1 = std::chrono::steady_clock::now(); + while (etime < sim_time) { + //If the time step leads to exceeding the simulation time, shorten it for the last step + if (etime + dt > sim_time) { dt = sim_time - etime; } + //Perform a single time step + perform_timestep(state,state_tmp,flux,tend,dt); + //Inform the user +#ifndef NO_INFORM + if (mainproc) { printf( "Elapsed Time: %lf / %lf\n", etime , sim_time ); } +#endif + //Update the elapsed time and output counter + etime = etime + dt; + output_counter = output_counter + dt; + //If it's time for output, reset the counter, and do output + if (output_freq >= 0 && output_counter >= output_freq) { + output_counter = output_counter - output_freq; + output(state,etime); + } + } + auto t2 = std::chrono::steady_clock::now(); + if (mainproc) { + std::cout << "CPU Time: " << std::chrono::duration(t2-t1).count() << " sec\n"; + } + + //Final reductions for mass, kinetic energy, and total energy + reductions(mass,te); + + if (mainproc) { + printf( "d_mass: %le\n" , (mass - mass0)/mass0 ); + printf( "d_te: %le\n" , (te - te0 )/te0 ); + } + + finalize(); +} + + +//Performs a single dimensionally split time step using a simple low-storage three-stage Runge-Kutta time integrator +//The dimensional splitting is a second-order-accurate alternating Strang splitting in which the +//order of directions is alternated each time step. +//The Runge-Kutta method used here is defined as follows: +// q* = q[n] + dt/3 * rhs(q[n]) +// q** = q[n] + dt/2 * rhs(q* ) +// q[n+1] = q[n] + dt/1 * rhs(q** ) +void perform_timestep( double *state , double *state_tmp , double *flux , double *tend , double dt ) { + if (direction_switch) { + //x-direction first + semi_discrete_step( state , state , state_tmp , dt / 3 , DIR_X , flux , tend ); + semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_X , flux , tend ); + semi_discrete_step( state , state_tmp , state , dt / 1 , DIR_X , flux , tend ); + //z-direction second + semi_discrete_step( state , state , state_tmp , dt / 3 , DIR_Z , flux , tend ); + semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_Z , flux , tend ); + semi_discrete_step( state , state_tmp , state , dt / 1 , DIR_Z , flux , tend ); + } else { + //z-direction second + semi_discrete_step( state , state , state_tmp , dt / 3 , DIR_Z , flux , tend ); + semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_Z , flux , tend ); + semi_discrete_step( state , state_tmp , state , dt / 1 , DIR_Z , flux , tend ); + //x-direction first + semi_discrete_step( state , state , state_tmp , dt / 3 , DIR_X , flux , tend ); + semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_X , flux , tend ); + semi_discrete_step( state , state_tmp , state , dt / 1 , DIR_X , flux , tend ); + } + if (direction_switch) { direction_switch = 0; } else { direction_switch = 1; } +} + + +//Perform a single semi-discretized step in time with the form: +//state_out = state_init + dt * rhs(state_forcing) +//Meaning the step starts from state_init, computes the rhs using state_forcing, and stores the result in state_out +void semi_discrete_step( double *state_init , double *state_forcing , double *state_out , double dt , int dir , double *flux , double *tend ) { + if (dir == DIR_X) { + //Set the halo values for this MPI task's fluid state in the x-direction + set_halo_values_x(state_forcing); + //Compute the time tendencies for the fluid state in the x-direction + compute_tendencies_x(state_forcing,flux,tend,dt); + } else if (dir == DIR_Z) { + //Set the halo values for this MPI task's fluid state in the z-direction + set_halo_values_z(state_forcing); + //Compute the time tendencies for the fluid state in the z-direction + compute_tendencies_z(state_forcing,flux,tend,dt); + } + + ///////////////////////////////////////////////// + // TODO: THREAD ME + ///////////////////////////////////////////////// + //Apply the tendencies to the fluid state + auto bounds = std::views::iota(0,(int)(NUM_VARS*nz*nx)); + std::for_each(std::execution::par_unseq,bounds.begin(),bounds.end(),[=,nx=nx,nz=nz,hy_dens_cell=hy_dens_cell,i_beg=i_beg,k_beg=k_beg](int idx){ + auto [i,k,ll] = idx3d(idx,nx,nz); + if (data_spec_int == DATA_SPEC_GRAVITY_WAVES) { + double x = (i_beg + i+0.5)*dx; + double z = (k_beg + k+0.5)*dz; + double wpert; + // Using sample_ellipse_cosine requires "acc routine" in OpenACC and "declare target" in OpenMP offload + // Neither of these are particularly well supported. So I'm manually inlining here + // wpert = sample_ellipse_cosine( x,z , 0.01 , xlen/8,1000., 500.,500. ); + { + double x0 = xlen/8; + double z0 = 1000; + double xrad = 500; + double zrad = 500; + double amp = 0.01; + //Compute distance from bubble center + double dist = sqrt( ((x-x0)/xrad)*((x-x0)/xrad) + ((z-z0)/zrad)*((z-z0)/zrad) ) * pi / 2.; + //If the distance from bubble center is less than the radius, create a cos**2 profile + if (dist <= pi / 2.) { + wpert = amp * pow(cos(dist),2.); + } else { + wpert = 0.; + } + } + int indw = ID_WMOM*nz*nx + k*nx + i; + tend[indw] += wpert*hy_dens_cell[hs+k]; + } + int inds = ll*(nz+2*hs)*(nx+2*hs) + (k+hs)*(nx+2*hs) + i+hs; + int indt = ll*nz*nx + k*nx + i; + state_out[inds] = state_init[inds] + dt * tend[indt]; + }); +} + + +//Compute the time tendencies of the fluid state using forcing in the x-direction +//Since the halos are set in a separate routine, this will not require MPI +//First, compute the flux vector at each cell interface in the x-direction (including hyperviscosity) +//Then, compute the tendencies using those fluxes +void compute_tendencies_x( double *state , double *flux , double *tend , double dt) { + double hv_coef; + //Compute the hyperviscosity coefficient + hv_coef = -hv_beta * dx / (16*dt); + ///////////////////////////////////////////////// + // TODO: THREAD ME + ///////////////////////////////////////////////// + //Compute fluxes in the x-direction for each cell + auto bounds1 = std::views::iota(0,(int)(nz*(nx+1))); + std::for_each(std::execution::par_unseq,bounds1.begin(),bounds1.end(),[=,nx=nx,nz=nz,hy_dens_cell=hy_dens_cell,hy_dens_theta_cell=hy_dens_theta_cell](int idx){ + auto [i,k] = idx2d(idx,nx+1); + double stencil[4], d3_vals[NUM_VARS],vals[NUM_VARS]; + //Use fourth-order interpolation from four cell averages to compute the value at the interface in question + for (int ll=0; ll , Oak Ridge National Laboratory +// This code simulates dry, stratified, compressible, non-hydrostatic fluid flows +// For documentation, please see the attached documentation in the "documentation" folder +// +////////////////////////////////////////////////////////////////////////////////////////// +#define _NX 200 +#define _NZ 100 +#define _SIM_TIME 100 +#define _OUT_FREQ 200 +#define _DATA_SPEC DATA_SPEC_INJECTION + +#include +#include +#include +#include +#include +#include +#ifndef NO_PNETCDF +#include "pnetcdf.h" +#endif +#include +#include +#include +#include + +// These should become unnecessary with the addition of cartesian_product +constexpr std::tuple idx2d(int idx, int nx) { return {idx%nx, idx/nx}; } +constexpr std::tuple idx3d(int idx, int nx, int nz) { return {idx%nx, (idx/nx)%nz, idx / (nx*nz)}; } + +constexpr double pi = 3.14159265358979323846264338327; //Pi +constexpr double grav = 9.8; //Gravitational acceleration (m / s^2) +constexpr double cp = 1004.; //Specific heat of dry air at constant pressure +constexpr double cv = 717.; //Specific heat of dry air at constant volume +constexpr double rd = 287.; //Dry air constant for equation of state (P=rho*rd*T) +constexpr double p0 = 1.e5; //Standard pressure at the surface in Pascals +constexpr double C0 = 27.5629410929725921310572974482; //Constant to translate potential temperature into pressure (P=C0*(rho*theta)**gamma) +constexpr double gamm = 1.40027894002789400278940027894; //gamma=cp/Rd , have to call this gamm because "gamma" is taken (I hate C so much) +//Define domain and stability-related constants +constexpr double xlen = 2.e4; //Length of the domain in the x-direction (meters) +constexpr double zlen = 1.e4; //Length of the domain in the z-direction (meters) +constexpr double hv_beta = 0.05; //How strong to diffuse the solution: hv_beta \in [0:1] +constexpr double cfl = 1.50; //"Courant, Friedrichs, Lewy" number (for numerical stability) +constexpr double max_speed = 450; //Assumed maximum wave speed during the simulation (speed of sound + speed of wind) (meter / sec) +constexpr int hs = 2; //"Halo" size: number of cells beyond the MPI tasks's domain needed for a full "stencil" of information for reconstruction +constexpr int sten_size = 4; //Size of the stencil used for interpolation + +//Parameters for indexing and flags +constexpr int NUM_VARS = 4; //Number of fluid state variables +constexpr int ID_DENS = 0; //index for density ("rho") +constexpr int ID_UMOM = 1; //index for momentum in the x-direction ("rho * u") +constexpr int ID_WMOM = 2; //index for momentum in the z-direction ("rho * w") +constexpr int ID_RHOT = 3; //index for density * potential temperature ("rho * theta") +constexpr int DIR_X = 1; //Integer constant to express that this operation is in the x-direction +constexpr int DIR_Z = 2; //Integer constant to express that this operation is in the z-direction +constexpr int DATA_SPEC_COLLISION = 1; +constexpr int DATA_SPEC_THERMAL = 2; +constexpr int DATA_SPEC_GRAVITY_WAVES = 3; +constexpr int DATA_SPEC_DENSITY_CURRENT = 5; +constexpr int DATA_SPEC_INJECTION = 6; + +constexpr int nqpoints = 3; +constexpr double qpoints [] = { 0.112701665379258311482073460022E0 , 0.500000000000000000000000000000E0 , 0.887298334620741688517926539980E0 }; +constexpr double qweights[] = { 0.277777777777777777777777777779E0 , 0.444444444444444444444444444444E0 , 0.277777777777777777777777777779E0 }; + +/////////////////////////////////////////////////////////////////////////////////////// +// BEGIN USER-CONFIGURABLE PARAMETERS +/////////////////////////////////////////////////////////////////////////////////////// +//The x-direction length is twice as long as the z-direction length +//So, you'll want to have nx_glob be twice as large as nz_glob +int constexpr nx_glob = _NX; //Number of total cells in the x-direction +int constexpr nz_glob = _NZ; //Number of total cells in the z-direction +double constexpr sim_time = _SIM_TIME; //How many seconds to run the simulation +double constexpr output_freq = _OUT_FREQ; //How frequently to output data to file (in seconds) +int constexpr data_spec_int = _DATA_SPEC; //How to initialize the data +double constexpr dx = xlen / nx_glob; // grid spacing in the x-direction +double constexpr dz = zlen / nz_glob; // grid spacing in the x-direction +/////////////////////////////////////////////////////////////////////////////////////// +// END USER-CONFIGURABLE PARAMETERS +/////////////////////////////////////////////////////////////////////////////////////// + +/////////////////////////////////////////////////////////////////////////////////////// +// Variables that are initialized but remain static over the course of the simulation +/////////////////////////////////////////////////////////////////////////////////////// +double dt; //Model time step (seconds) +int nx, nz; //Number of local grid cells in the x- and z- dimensions for this MPI task +int i_beg, k_beg; //beginning index in the x- and z-directions for this MPI task +int nranks, myrank; //Number of MPI ranks and my rank id +int left_rank, right_rank; //MPI Rank IDs that exist to my left and right in the global domain +int mainproc; //Am I the main process (rank == 0)? +double *hy_dens_cell; //hydrostatic density (vert cell avgs). Dimensions: (1-hs:nz+hs) +double *hy_dens_theta_cell; //hydrostatic rho*t (vert cell avgs). Dimensions: (1-hs:nz+hs) +double *hy_dens_int; //hydrostatic density (vert cell interf). Dimensions: (1:nz+1) +double *hy_dens_theta_int; //hydrostatic rho*t (vert cell interf). Dimensions: (1:nz+1) +double *hy_pressure_int; //hydrostatic press (vert cell interf). Dimensions: (1:nz+1) + +/////////////////////////////////////////////////////////////////////////////////////// +// Variables that are dynamics over the course of the simulation +/////////////////////////////////////////////////////////////////////////////////////// +double etime; //Elapsed model time +double output_counter; //Helps determine when it's time to do output +//Runtime variable arrays +double *state; //Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) +double *state_tmp; //Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) +double *flux; //Cell interface fluxes. Dimensions: (nx+1,nz+1,NUM_VARS) +double *tend; //Fluid state tendencies. Dimensions: (nx,nz,NUM_VARS) +double *sendbuf_l; //Buffer to send data to the left MPI rank +double *sendbuf_r; //Buffer to send data to the right MPI rank +double *recvbuf_l; //Buffer to receive data from the left MPI rank +double *recvbuf_r; //Buffer to receive data from the right MPI rank +int num_out = 0; //The number of outputs performed so far +int direction_switch = 1; +double mass0, te0; //Initial domain totals for mass and total energy +double mass , te ; //Domain totals for mass and total energy + +//How is this not in the standard?! +double dmin( double a , double b ) { if (a= 0) output(state,etime); + + //////////////////////////////////////////////////// + // MAIN TIME STEP LOOP + //////////////////////////////////////////////////// + auto t1 = std::chrono::steady_clock::now(); + while (etime < sim_time) { + //If the time step leads to exceeding the simulation time, shorten it for the last step + if (etime + dt > sim_time) { dt = sim_time - etime; } + //Perform a single time step + perform_timestep(state,state_tmp,flux,tend,dt); + //Inform the user +#ifndef NO_INFORM + if (mainproc) { printf( "Elapsed Time: %lf / %lf\n", etime , sim_time ); } +#endif + //Update the elapsed time and output counter + etime = etime + dt; + output_counter = output_counter + dt; + //If it's time for output, reset the counter, and do output + if (output_freq >= 0 && output_counter >= output_freq) { + output_counter = output_counter - output_freq; + output(state,etime); + } + } + auto t2 = std::chrono::steady_clock::now(); + if (mainproc) { + std::cout << "CPU Time: " << std::chrono::duration(t2-t1).count() << " sec\n"; + } + + //Final reductions for mass, kinetic energy, and total energy + reductions(mass,te); + + if (mainproc) { + printf( "d_mass: %le\n" , (mass - mass0)/mass0 ); + printf( "d_te: %le\n" , (te - te0 )/te0 ); + } + + finalize(); +} + + +//Performs a single dimensionally split time step using a simple low-storage three-stage Runge-Kutta time integrator +//The dimensional splitting is a second-order-accurate alternating Strang splitting in which the +//order of directions is alternated each time step. +//The Runge-Kutta method used here is defined as follows: +// q* = q[n] + dt/3 * rhs(q[n]) +// q** = q[n] + dt/2 * rhs(q* ) +// q[n+1] = q[n] + dt/1 * rhs(q** ) +void perform_timestep( double *state , double *state_tmp , double *flux , double *tend , double dt ) { + if (direction_switch) { + //x-direction first + semi_discrete_step( state , state , state_tmp , dt / 3 , DIR_X , flux , tend ); + semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_X , flux , tend ); + semi_discrete_step( state , state_tmp , state , dt / 1 , DIR_X , flux , tend ); + //z-direction second + semi_discrete_step( state , state , state_tmp , dt / 3 , DIR_Z , flux , tend ); + semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_Z , flux , tend ); + semi_discrete_step( state , state_tmp , state , dt / 1 , DIR_Z , flux , tend ); + } else { + //z-direction second + semi_discrete_step( state , state , state_tmp , dt / 3 , DIR_Z , flux , tend ); + semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_Z , flux , tend ); + semi_discrete_step( state , state_tmp , state , dt / 1 , DIR_Z , flux , tend ); + //x-direction first + semi_discrete_step( state , state , state_tmp , dt / 3 , DIR_X , flux , tend ); + semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_X , flux , tend ); + semi_discrete_step( state , state_tmp , state , dt / 1 , DIR_X , flux , tend ); + } + if (direction_switch) { direction_switch = 0; } else { direction_switch = 1; } +} + + +//Perform a single semi-discretized step in time with the form: +//state_out = state_init + dt * rhs(state_forcing) +//Meaning the step starts from state_init, computes the rhs using state_forcing, and stores the result in state_out +void semi_discrete_step( double *state_init , double *state_forcing , double *state_out , double dt , int dir , double *flux , double *tend ) { + if (dir == DIR_X) { + //Set the halo values for this MPI task's fluid state in the x-direction + set_halo_values_x(state_forcing); + //Compute the time tendencies for the fluid state in the x-direction + compute_tendencies_x(state_forcing,flux,tend,dt); + } else if (dir == DIR_Z) { + //Set the halo values for this MPI task's fluid state in the z-direction + set_halo_values_z(state_forcing); + //Compute the time tendencies for the fluid state in the z-direction + compute_tendencies_z(state_forcing,flux,tend,dt); + } + + ///////////////////////////////////////////////// + // TODO: THREAD ME + ///////////////////////////////////////////////// + //Apply the tendencies to the fluid state + std::for_each_n(std::execution::par_unseq,std::views::iota(0).begin(),(NUM_VARS*nz*nx),[=,nx=nx,nz=nz,hy_dens_cell=hy_dens_cell,i_beg=i_beg,k_beg=k_beg](int idx){ + auto [i,k,ll] = idx3d(idx,nx,nz); + if (data_spec_int == DATA_SPEC_GRAVITY_WAVES) { + double x = (i_beg + i+0.5)*dx; + double z = (k_beg + k+0.5)*dz; + double wpert; + // Using sample_ellipse_cosine requires "acc routine" in OpenACC and "declare target" in OpenMP offload + // Neither of these are particularly well supported. So I'm manually inlining here + // wpert = sample_ellipse_cosine( x,z , 0.01 , xlen/8,1000., 500.,500. ); + { + double x0 = xlen/8; + double z0 = 1000; + double xrad = 500; + double zrad = 500; + double amp = 0.01; + //Compute distance from bubble center + double dist = sqrt( ((x-x0)/xrad)*((x-x0)/xrad) + ((z-z0)/zrad)*((z-z0)/zrad) ) * pi / 2.; + //If the distance from bubble center is less than the radius, create a cos**2 profile + if (dist <= pi / 2.) { + wpert = amp * pow(cos(dist),2.); + } else { + wpert = 0.; + } + } + int indw = ID_WMOM*nz*nx + k*nx + i; + tend[indw] += wpert*hy_dens_cell[hs+k]; + } + int inds = ll*(nz+2*hs)*(nx+2*hs) + (k+hs)*(nx+2*hs) + i+hs; + int indt = ll*nz*nx + k*nx + i; + state_out[inds] = state_init[inds] + dt * tend[indt]; + }); +} + + +//Compute the time tendencies of the fluid state using forcing in the x-direction +//Since the halos are set in a separate routine, this will not require MPI +//First, compute the flux vector at each cell interface in the x-direction (including hyperviscosity) +//Then, compute the tendencies using those fluxes +void compute_tendencies_x( double *state , double *flux , double *tend , double dt) { + double hv_coef; + //Compute the hyperviscosity coefficient + hv_coef = -hv_beta * dx / (16*dt); + ///////////////////////////////////////////////// + // TODO: THREAD ME + ///////////////////////////////////////////////// + //Compute fluxes in the x-direction for each cell + std::for_each_n(std::execution::par_unseq,std::views::iota(0).begin(),(nz*(nx+1)),[=,nx=nx,nz=nz,hy_dens_cell=hy_dens_cell,hy_dens_theta_cell=hy_dens_theta_cell](int idx){ + auto [i,k] = idx2d(idx,nx+1); + double stencil[4], d3_vals[NUM_VARS],vals[NUM_VARS]; + //Use fourth-order interpolation from four cell averages to compute the value at the interface in question + for (int ll=0; ll Date: Wed, 17 Apr 2024 13:55:15 -0400 Subject: [PATCH 4/9] Removed hard-coded settings. --- c/miniWeather_mpi_for_each.cpp | 6 +----- c/miniWeather_mpi_for_each_n.cpp | 6 +----- c/miniWeather_serial_for_each.cpp | 6 +----- c/miniWeather_serial_for_each_n.cpp | 6 +----- 4 files changed, 4 insertions(+), 20 deletions(-) diff --git a/c/miniWeather_mpi_for_each.cpp b/c/miniWeather_mpi_for_each.cpp index 82ca2acb..44711189 100644 --- a/c/miniWeather_mpi_for_each.cpp +++ b/c/miniWeather_mpi_for_each.cpp @@ -2,15 +2,11 @@ ////////////////////////////////////////////////////////////////////////////////////////// // miniWeather // Author: Matt Norman , Oak Ridge National Laboratory +// Jeff Larkin , NVIDIA Corporation // This code simulates dry, stratified, compressible, non-hydrostatic fluid flows // For documentation, please see the attached documentation in the "documentation" folder // ////////////////////////////////////////////////////////////////////////////////////////// -#define _NX 200 -#define _NZ 100 -#define _SIM_TIME 100 -#define _OUT_FREQ 200 -#define _DATA_SPEC DATA_SPEC_INJECTION #include #include diff --git a/c/miniWeather_mpi_for_each_n.cpp b/c/miniWeather_mpi_for_each_n.cpp index 76c1d34a..9d8a0ab6 100644 --- a/c/miniWeather_mpi_for_each_n.cpp +++ b/c/miniWeather_mpi_for_each_n.cpp @@ -2,15 +2,11 @@ ////////////////////////////////////////////////////////////////////////////////////////// // miniWeather // Author: Matt Norman , Oak Ridge National Laboratory +// Jeff Larkin , NVIDIA Corporation // This code simulates dry, stratified, compressible, non-hydrostatic fluid flows // For documentation, please see the attached documentation in the "documentation" folder // ////////////////////////////////////////////////////////////////////////////////////////// -#define _NX 200 -#define _NZ 100 -#define _SIM_TIME 100 -#define _OUT_FREQ 200 -#define _DATA_SPEC DATA_SPEC_INJECTION #include #include diff --git a/c/miniWeather_serial_for_each.cpp b/c/miniWeather_serial_for_each.cpp index 9ddde9df..81b55e0e 100644 --- a/c/miniWeather_serial_for_each.cpp +++ b/c/miniWeather_serial_for_each.cpp @@ -2,15 +2,11 @@ ////////////////////////////////////////////////////////////////////////////////////////// // miniWeather // Author: Matt Norman , Oak Ridge National Laboratory +// Jeff Larkin , NVIDIA Corporation // This code simulates dry, stratified, compressible, non-hydrostatic fluid flows // For documentation, please see the attached documentation in the "documentation" folder // ////////////////////////////////////////////////////////////////////////////////////////// -#define _NX 200 -#define _NZ 100 -#define _SIM_TIME 100 -#define _OUT_FREQ 200 -#define _DATA_SPEC DATA_SPEC_INJECTION #include #include diff --git a/c/miniWeather_serial_for_each_n.cpp b/c/miniWeather_serial_for_each_n.cpp index 98fe2ba2..e8a9bf54 100644 --- a/c/miniWeather_serial_for_each_n.cpp +++ b/c/miniWeather_serial_for_each_n.cpp @@ -2,15 +2,11 @@ ////////////////////////////////////////////////////////////////////////////////////////// // miniWeather // Author: Matt Norman , Oak Ridge National Laboratory +// Jeff Larkin , NVIDIA Corporation // This code simulates dry, stratified, compressible, non-hydrostatic fluid flows // For documentation, please see the attached documentation in the "documentation" folder // ////////////////////////////////////////////////////////////////////////////////////////// -#define _NX 200 -#define _NZ 100 -#define _SIM_TIME 100 -#define _OUT_FREQ 200 -#define _DATA_SPEC DATA_SPEC_INJECTION #include #include From ac41bcb9df8af483da9a329bd41564ca3e8bfc4c Mon Sep 17 00:00:00 2001 From: Jeff Larkin Date: Wed, 17 Apr 2024 15:18:32 -0400 Subject: [PATCH 5/9] Added stdpar to cmake and fixed bug in mpi_for_each_n --- c/CMakeLists.txt | 88 ++++++++++++++++++++++++++++++++ c/miniWeather_mpi_for_each_n.cpp | 2 +- 2 files changed, 89 insertions(+), 1 deletion(-) diff --git a/c/CMakeLists.txt b/c/CMakeLists.txt index 3d469143..ddcb6f5e 100644 --- a/c/CMakeLists.txt +++ b/c/CMakeLists.txt @@ -54,6 +54,50 @@ endif() add_test(NAME SERIAL_TEST COMMAND ./check_output.sh ./serial_test 1e-13 4.5e-5 ) +############################################################ +## Compile the serial for_each version +############################################################ +if (NOT ("${STDPAR_FLAGS}" STREQUAL "") ) + add_executable(serial_for_each miniWeather_serial_for_each.cpp) + set_target_properties(serial_for_each PROPERTIES COMPILE_FLAGS "${EXE_DEFS} ${STDPAR_FLAGS}") + + add_executable(serial_for_each_test miniWeather_serial_for_each.cpp) + set_target_properties(serial_for_each_test PROPERTIES COMPILE_FLAGS "${TEST_DEFS} ${STDPAR_FLAGS}") + + if (NOT ("${LDFLAGS}" STREQUAL "") ) + target_link_libraries(serial_for_each "${LDFLAGS} ${STDPAR_FLAGS}") + target_link_libraries(serial_for_each_test "${LDFLAGS} ${STDPAR_FLAGS}") + endif() + if (NOT ("${SERIAL_LINK_FLAGS}" STREQUAL "") ) + target_link_libraries(serial_for_each "${SERIAL_LINK_FLAGS} ${STDPAR_FLAGS}") + target_link_libraries(serial_for_each_test "${SERIAL_LINK_FLAGS} ${STDPAR_FLAGS}") + endif() + + add_test(NAME SERIAL_FOR_EACH_TEST COMMAND ./check_output.sh ./serial_for_each_test 1e-13 4.5e-5 ) +endif() + + +############################################################ +## Compile the serial for_each_n version +############################################################ +if (NOT ("${STDPAR_FLAGS}" STREQUAL "") ) + add_executable(serial_for_each_n miniWeather_serial_for_each_n.cpp) + set_target_properties(serial_for_each_n PROPERTIES COMPILE_FLAGS "${EXE_DEFS} ${STDPAR_FLAGS}") + + add_executable(serial_for_each_n_test miniWeather_serial_for_each_n.cpp) + set_target_properties(serial_for_each_n_test PROPERTIES COMPILE_FLAGS "${TEST_DEFS} ${STDPAR_FLAGS}") + + if (NOT ("${LDFLAGS}" STREQUAL "") ) + target_link_libraries(serial_for_each_n "${LDFLAGS} ${STDPAR_FLAGS}") + target_link_libraries(serial_for_each_n_test "${LDFLAGS} ${STDPAR_FLAGS}") + endif() + if (NOT ("${SERIAL_LINK_FLAGS}" STREQUAL "") ) + target_link_libraries(serial_for_each_n "${SERIAL_LINK_FLAGS} ${STDPAR_FLAGS}") + target_link_libraries(serial_for_each_n_test "${SERIAL_LINK_FLAGS} ${STDPAR_FLAGS}") + endif() + + add_test(NAME SERIAL_FOR_EACH_N_TEST COMMAND ./check_output.sh ./serial_for_each_n_test 1e-13 4.5e-5 ) +endif() ############################################################ ## Compile the MPI version @@ -75,6 +119,50 @@ endif() add_test(NAME MPI_TEST COMMAND ./check_output.sh ./mpi_test 1e-13 4.5e-5 ) +############################################################ +## Compile the mpi for_each version +############################################################ +if (NOT ("${STDPAR_FLAGS}" STREQUAL "") ) + add_executable(mpi_for_each miniWeather_mpi_for_each.cpp) + set_target_properties(mpi_for_each PROPERTIES COMPILE_FLAGS "${EXE_DEFS} ${STDPAR_FLAGS}") + + add_executable(mpi_for_each_test miniWeather_mpi_for_each.cpp) + set_target_properties(mpi_for_each_test PROPERTIES COMPILE_FLAGS "${TEST_DEFS} ${STDPAR_FLAGS}") + + if (NOT ("${LDFLAGS}" STREQUAL "") ) + target_link_libraries(mpi_for_each "${LDFLAGS} ${STDPAR_FLAGS}") + target_link_libraries(mpi_for_each_test "${LDFLAGS} ${STDPAR_FLAGS}") + endif() + if (NOT ("${SERIAL_LINK_FLAGS}" STREQUAL "") ) + target_link_libraries(mpi_for_each "${MPI_LINK_FLAGS} ${STDPAR_FLAGS}") + target_link_libraries(mpi_for_each_test "${MPI_LINK_FLAGS} ${STDPAR_FLAGS}") + endif() + + add_test(NAME MPI_FOR_EACH_TEST COMMAND ./check_output.sh ./mpi_for_each_test 1e-13 4.5e-5 ) +endif() + + +############################################################ +## Compile the serial for_each_n version +############################################################ +if (NOT ("${STDPAR_FLAGS}" STREQUAL "") ) + add_executable(mpi_for_each_n miniWeather_mpi_for_each_n.cpp) + set_target_properties(mpi_for_each_n PROPERTIES COMPILE_FLAGS "${EXE_DEFS} ${STDPAR_FLAGS}") + + add_executable(mpi_for_each_n_test miniWeather_mpi_for_each_n.cpp) + set_target_properties(mpi_for_each_n_test PROPERTIES COMPILE_FLAGS "${TEST_DEFS} ${STDPAR_FLAGS}") + + if (NOT ("${LDFLAGS}" STREQUAL "") ) + target_link_libraries(mpi_for_each_n "${LDFLAGS} ${STDPAR_FLAGS}") + target_link_libraries(mpi_for_each_n_test "${LDFLAGS} ${STDPAR_FLAGS}") + endif() + if (NOT ("${SERIAL_LINK_FLAGS}" STREQUAL "") ) + target_link_libraries(mpi_for_each_n "${MPI_LINK_FLAGS} ${STDPAR_FLAGS}") + target_link_libraries(mpi_for_each_n_test "${MPI_LINK_FLAGS} ${STDPAR_FLAGS}") + endif() + + add_test(NAME MPI_FOR_EACH_N_TEST COMMAND ./check_output.sh ./mpi_for_each_n_test 1e-13 4.5e-5 ) +endif() ############################################################ ## Compile the MPI + OpenMP version diff --git a/c/miniWeather_mpi_for_each_n.cpp b/c/miniWeather_mpi_for_each_n.cpp index 9d8a0ab6..160c7de6 100644 --- a/c/miniWeather_mpi_for_each_n.cpp +++ b/c/miniWeather_mpi_for_each_n.cpp @@ -467,7 +467,7 @@ void set_halo_values_z( double *state ) { ///////////////////////////////////////////////// // TODO: THREAD ME ///////////////////////////////////////////////// - std::for_each_n(std::execution::par_unseq,std::views::iota(0).begin(),(NUM_VARS*nx+2*hs),[=,nz=nz,nx=nx,hy_dens_cell=hy_dens_cell](int idx){ + std::for_each_n(std::execution::par_unseq,std::views::iota(0).begin(),(NUM_VARS*(nx+2*hs)),[=,nz=nz,nx=nx,hy_dens_cell=hy_dens_cell](int idx){ auto [i,ll] = idx2d(idx,(nx+2*hs)); if (ll == ID_WMOM) { state[ll*(nz+2*hs)*(nx+2*hs) + (0 )*(nx+2*hs) + i] = 0.; From aebda1b62885bcf0e4f21d2d67461cb1c0191a04 Mon Sep 17 00:00:00 2001 From: Jeff Larkin Date: Thu, 18 Apr 2024 12:29:37 -0400 Subject: [PATCH 6/9] Added versions that include for_each and for_each_n with C++23 mdspan. --- cpp/CMakeLists.txt | 45 ++ cpp/miniWeather_serial_for_each.cpp | 874 ++++++++++++++++++++++++++ cpp/miniWeather_serial_for_each_n.cpp | 866 +++++++++++++++++++++++++ 3 files changed, 1785 insertions(+) create mode 100644 cpp/miniWeather_serial_for_each.cpp create mode 100644 cpp/miniWeather_serial_for_each_n.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 9dde59f6..451f18d5 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -70,6 +70,51 @@ target_link_libraries(serial_test "${LDFLAGS}") add_test(NAME SERIAL_TEST COMMAND ./check_output.sh ./serial_test 1e-9 4.5e-5 ) +############################################################ +## Compile the serial for_each version +############################################################ +if (NOT ("${STDPAR_FLAGS}" STREQUAL "") ) + add_executable(serial_for_each miniWeather_serial_for_each.cpp) + set_target_properties(serial_for_each PROPERTIES COMPILE_FLAGS "${EXE_DEFS} ${STDPAR_FLAGS}") + + add_executable(serial_for_each_test miniWeather_serial_for_each.cpp) + set_target_properties(serial_for_each_test PROPERTIES COMPILE_FLAGS "${TEST_DEFS} ${STDPAR_FLAGS}") + + if (NOT ("${LDFLAGS}" STREQUAL "") ) + target_link_libraries(serial_for_each "${LDFLAGS} ${STDPAR_FLAGS}") + target_link_libraries(serial_for_each_test "${LDFLAGS} ${STDPAR_FLAGS}") + endif() + if (NOT ("${SERIAL_LINK_FLAGS}" STREQUAL "") ) + target_link_libraries(serial_for_each "${SERIAL_LINK_FLAGS} ${STDPAR_FLAGS}") + target_link_libraries(serial_for_each_test "${SERIAL_LINK_FLAGS} ${STDPAR_FLAGS}") + endif() + + add_test(NAME SERIAL_FOR_EACH_TEST COMMAND ./check_output.sh ./serial_for_each_test 1e-13 4.5e-5 ) +endif() + +############################################################ +## Compile the serial for_each_n version +############################################################ +if (NOT ("${STDPAR_FLAGS}" STREQUAL "") ) + add_executable(serial_for_each_n miniWeather_serial_for_each_n.cpp) + set_target_properties(serial_for_each_n PROPERTIES COMPILE_FLAGS "${EXE_DEFS} ${STDPAR_FLAGS}") + + add_executable(serial_for_each_n_test miniWeather_serial_for_each_n.cpp) + set_target_properties(serial_for_each_n_test PROPERTIES COMPILE_FLAGS "${TEST_DEFS} ${STDPAR_FLAGS}") + + if (NOT ("${LDFLAGS}" STREQUAL "") ) + target_link_libraries(serial_for_each_n "${LDFLAGS} ${STDPAR_FLAGS}") + target_link_libraries(serial_for_each_n_test "${LDFLAGS} ${STDPAR_FLAGS}") + endif() + if (NOT ("${SERIAL_LINK_FLAGS}" STREQUAL "") ) + target_link_libraries(serial_for_each_n "${SERIAL_LINK_FLAGS} ${STDPAR_FLAGS}") + target_link_libraries(serial_for_each_n_test "${SERIAL_LINK_FLAGS} ${STDPAR_FLAGS}") + endif() + + add_test(NAME SERIAL_FOR_EACH_N_TEST COMMAND ./check_output.sh ./serial_for_each_n_test 1e-13 4.5e-5 ) +endif() + + ############################################################ ## Compile the MPI version diff --git a/cpp/miniWeather_serial_for_each.cpp b/cpp/miniWeather_serial_for_each.cpp new file mode 100644 index 00000000..130b48df --- /dev/null +++ b/cpp/miniWeather_serial_for_each.cpp @@ -0,0 +1,874 @@ + +////////////////////////////////////////////////////////////////////////////////////////// +// miniWeather +// Author: Matt Norman , Oak Ridge National Laboratory +// Jeff Larkin , NVIDIA Corporation +// This code simulates dry, stratified, compressible, non-hydrostatic fluid flows +// For documentation, please see the attached documentation in the "documentation" folder +// +////////////////////////////////////////////////////////////////////////////////////////// + +#include +#include +#include +#include +#include +#include +#ifndef NO_PNETCDF +#include "pnetcdf.h" +#endif +#include +#include +#include +#include + +#include +namespace stdex = std::experimental; + +template +using array_view = stdex::mdspan>; +using real_1d_array_view = array_view; +using real_2d_array_view = array_view; +using real3d_span = array_view; +using const_real_1d_array_view = array_view; +using const_real_2d_array_view = array_view; +using const_real3d_span = array_view; + +// These should become unnecessary with the addition of cartesian_product +constexpr std::tuple idx2d(int idx, int nx) { return {idx%nx, idx/nx}; } +constexpr std::tuple idx3d(int idx, int nx, int nz) { return {idx%nx, (idx/nx)%nz, idx / (nx*nz)}; } + +constexpr double pi = 3.14159265358979323846264338327; //Pi +constexpr double grav = 9.8; //Gravitational acceleration (m / s^2) +constexpr double cp = 1004.; //Specific heat of dry air at constant pressure +constexpr double cv = 717.; //Specific heat of dry air at constant volume +constexpr double rd = 287.; //Dry air constant for equation of state (P=rho*rd*T) +constexpr double p0 = 1.e5; //Standard pressure at the surface in Pascals +constexpr double C0 = 27.5629410929725921310572974482; //Constant to translate potential temperature into pressure (P=C0*(rho*theta)**gamma) +constexpr double gamm = 1.40027894002789400278940027894; //gamma=cp/Rd , have to call this gamm because "gamma" is taken (I hate C so much) +//Define domain and stability-related constants +constexpr double xlen = 2.e4; //Length of the domain in the x-direction (meters) +constexpr double zlen = 1.e4; //Length of the domain in the z-direction (meters) +constexpr double hv_beta = 0.05; //How strong to diffuse the solution: hv_beta \in [0:1] +constexpr double cfl = 1.50; //"Courant, Friedrichs, Lewy" number (for numerical stability) +constexpr double max_speed = 450; //Assumed maximum wave speed during the simulation (speed of sound + speed of wind) (meter / sec) +constexpr int hs = 2; //"Halo" size: number of cells beyond the MPI tasks's domain needed for a full "stencil" of information for reconstruction +constexpr int sten_size = 4; //Size of the stencil used for interpolation + +//Parameters for indexing and flags +constexpr int NUM_VARS = 4; //Number of fluid state variables +constexpr int ID_DENS = 0; //index for density ("rho") +constexpr int ID_UMOM = 1; //index for momentum in the x-direction ("rho * u") +constexpr int ID_WMOM = 2; //index for momentum in the z-direction ("rho * w") +constexpr int ID_RHOT = 3; //index for density * potential temperature ("rho * theta") +constexpr int DIR_X = 1; //Integer constant to express that this operation is in the x-direction +constexpr int DIR_Z = 2; //Integer constant to express that this operation is in the z-direction +constexpr int DATA_SPEC_COLLISION = 1; +constexpr int DATA_SPEC_THERMAL = 2; +constexpr int DATA_SPEC_GRAVITY_WAVES = 3; +constexpr int DATA_SPEC_DENSITY_CURRENT = 5; +constexpr int DATA_SPEC_INJECTION = 6; + +constexpr int nqpoints = 3; +constexpr double qpoints [] = { 0.112701665379258311482073460022E0 , 0.500000000000000000000000000000E0 , 0.887298334620741688517926539980E0 }; +constexpr double qweights[] = { 0.277777777777777777777777777779E0 , 0.444444444444444444444444444444E0 , 0.277777777777777777777777777779E0 }; + +/////////////////////////////////////////////////////////////////////////////////////// +// BEGIN USER-CONFIGURABLE PARAMETERS +/////////////////////////////////////////////////////////////////////////////////////// +//The x-direction length is twice as long as the z-direction length +//So, you'll want to have nx_glob be twice as large as nz_glob +int constexpr nx_glob = _NX; //Number of total cells in the x-direction +int constexpr nz_glob = _NZ; //Number of total cells in the z-direction +double constexpr sim_time = _SIM_TIME; //How many seconds to run the simulation +double constexpr output_freq = _OUT_FREQ; //How frequently to output data to file (in seconds) +int constexpr data_spec_int = _DATA_SPEC; //How to initialize the data +double constexpr dx = xlen / nx_glob; // grid spacing in the x-direction +double constexpr dz = zlen / nz_glob; // grid spacing in the x-direction +/////////////////////////////////////////////////////////////////////////////////////// +// END USER-CONFIGURABLE PARAMETERS +/////////////////////////////////////////////////////////////////////////////////////// + +/////////////////////////////////////////////////////////////////////////////////////// +// Variables that are initialized but remain static over the course of the simulation +/////////////////////////////////////////////////////////////////////////////////////// +double dt; //Model time step (seconds) +int nx, nz; //Number of local grid cells in the x- and z- dimensions for this MPI task +int i_beg, k_beg; //beginning index in the x- and z-directions for this MPI task +int nranks, myrank; //Number of MPI ranks and my rank id +int left_rank, right_rank; //MPI Rank IDs that exist to my left and right in the global domain +int mainproc; //Am I the main process (rank == 0)? +double *hy_dens_cell; //hydrostatic density (vert cell avgs). Dimensions: (1-hs:nz+hs) +double *hy_dens_theta_cell; //hydrostatic rho*t (vert cell avgs). Dimensions: (1-hs:nz+hs) +double *hy_dens_int; //hydrostatic density (vert cell interf). Dimensions: (1:nz+1) +double *hy_dens_theta_int; //hydrostatic rho*t (vert cell interf). Dimensions: (1:nz+1) +double *hy_pressure_int; //hydrostatic press (vert cell interf). Dimensions: (1:nz+1) + +/////////////////////////////////////////////////////////////////////////////////////// +// Variables that are dynamics over the course of the simulation +/////////////////////////////////////////////////////////////////////////////////////// +double etime; //Elapsed model time +double output_counter; //Helps determine when it's time to do output +//Runtime variable arrays +double *state; //Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) +double *state_tmp; //Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) +double *flux; //Cell interface fluxes. Dimensions: (nx+1,nz+1,NUM_VARS) +double *tend; //Fluid state tendencies. Dimensions: (nx,nz,NUM_VARS) +int num_out = 0; //The number of outputs performed so far +int direction_switch = 1; +double mass0, te0; //Initial domain totals for mass and total energy +double mass , te ; //Domain totals for mass and total energy + +//How is this not in the standard?! +double dmin( double a , double b ) { if (a sim_time) { dt = sim_time - etime; } + //Perform a single time step + perform_timestep(real3d_span{state,NUM_VARS,nz+2*hs,nx+2*hs},real3d_span{state_tmp,NUM_VARS,nz+2*hs,nx+2*hs},real3d_span{flux,NUM_VARS, (nz+1) ,(nx+1)},real3d_span{tend,NUM_VARS,nz,nx},dt); + //Inform the user +#ifndef NO_INFORM + if (mainproc) { printf( "Elapsed Time: %lf / %lf\n", etime , sim_time ); } +#endif + //Update the elapsed time and output counter + etime = etime + dt; + output_counter = output_counter + dt; + //If it's time for output, reset the counter, and do output + if (output_counter >= output_freq) { + output_counter = output_counter - output_freq; + output(state,etime); + } + } + auto t2 = std::chrono::steady_clock::now(); + if (mainproc) { + std::cout << "CPU Time: " << std::chrono::duration(t2-t1).count() << " sec\n"; + } + + //Final reductions for mass, kinetic energy, and total energy + reductions(mass,te); + + if (mainproc) { + printf( "d_mass: %le\n" , (mass - mass0)/mass0 ); + printf( "d_te: %le\n" , (te - te0 )/te0 ); + } + + finalize(); +} + + +//Performs a single dimensionally split time step using a simple low-storage three-stage Runge-Kutta time integrator +//The dimensional splitting is a second-order-accurate alternating Strang splitting in which the +//order of directions is alternated each time step. +//The Runge-Kutta method used here is defined as follows: +// q* = q[n] + dt/3 * rhs(q[n]) +// q** = q[n] + dt/2 * rhs(q* ) +// q[n+1] = q[n] + dt/1 * rhs(q** ) +void perform_timestep( real3d_span state , real3d_span state_tmp , real3d_span flux , real3d_span tend , double dt ) { + if (direction_switch) { + //x-direction first + semi_discrete_step( state, state , state_tmp, dt / 3 , DIR_X , flux , tend ); + semi_discrete_step( state, state_tmp, state_tmp, dt / 2 , DIR_X , flux , tend ); + semi_discrete_step( state, state_tmp, state , dt / 1 , DIR_X , flux , tend ); + //z-direction second + semi_discrete_step( state, state , state_tmp, dt / 3 , DIR_Z , flux , tend ); + semi_discrete_step( state, state_tmp, state_tmp, dt / 2 , DIR_Z , flux , tend ); + semi_discrete_step( state, state_tmp, state , dt / 1 , DIR_Z , flux , tend ); + } else { + //z-direction second + semi_discrete_step( state, state , state_tmp, dt / 3 , DIR_Z , flux , tend ); + semi_discrete_step( state, state_tmp, state_tmp, dt / 2 , DIR_Z , flux , tend ); + semi_discrete_step( state, state_tmp, state , dt / 1 , DIR_Z , flux , tend ); + //x-direction first + semi_discrete_step( state, state , state_tmp, dt / 3 , DIR_X , flux , tend ); + semi_discrete_step( state, state_tmp, state_tmp, dt / 2 , DIR_X , flux , tend ); + semi_discrete_step( state, state_tmp, state , dt / 1 , DIR_X , flux , tend ); + } + if (direction_switch) { direction_switch = 0; } else { direction_switch = 1; } +} + + +//Perform a single semi-discretized step in time with the form: +//state_out = state_init + dt * rhs(state_forcing) +//Meaning the step starts from state_init, computes the rhs using state_forcing, and stores the result in state_out +void semi_discrete_step( real3d_span state_init , real3d_span state_forcing , real3d_span state_out , double dt , int dir , real3d_span flux , real3d_span tend ) { + if (dir == DIR_X) { + //Set the halo values for this MPI task's fluid state in the x-direction + set_halo_values_x(state_forcing); + //Compute the time tendencies for the fluid state in the x-direction + compute_tendencies_x(state_forcing,flux,tend,dt); + } else if (dir == DIR_Z) { + //Set the halo values for this MPI task's fluid state in the z-direction + set_halo_values_z(state_forcing); + //Compute the time tendencies for the fluid state in the z-direction + compute_tendencies_z(state_forcing,flux,tend,dt); + } + + ///////////////////////////////////////////////// + // TODO: THREAD ME + ///////////////////////////////////////////////// + //Apply the tendencies to the fluid state + auto bounds = std::views::iota(0,(int)(NUM_VARS*nz*nx)); + std::for_each(std::execution::par_unseq,bounds.begin(),bounds.end(),[=,nx=nx,nz=nz,hy_dens_cell=hy_dens_cell,i_beg=i_beg,k_beg=k_beg](int idx){ + auto [i,k,ll] = idx3d(idx,nx,nz); + if (data_spec_int == DATA_SPEC_GRAVITY_WAVES) { + double x = (i_beg + i+0.5)*dx; + double z = (k_beg + k+0.5)*dz; + double wpert; + // Using sample_ellipse_cosine requires "acc routine" in OpenACC and "declare target" in OpenMP offload + // Neither of these are particularly well supported. So I'm manually inlining here + // wpert = sample_ellipse_cosine( x,z , 0.01 , xlen/8,1000., 500.,500. ); + { + double x0 = xlen/8; + double z0 = 1000; + double xrad = 500; + double zrad = 500; + double amp = 0.01; + //Compute distance from bubble center + double dist = sqrt( ((x-x0)/xrad)*((x-x0)/xrad) + ((z-z0)/zrad)*((z-z0)/zrad) ) * pi / 2.; + //If the distance from bubble center is less than the radius, create a cos**2 profile + if (dist <= pi / 2.) { + wpert = amp * pow(cos(dist),2.); + } else { + wpert = 0.; + } + } + tend(ID_WMOM,k,i) += wpert*hy_dens_cell[hs+k]; + } + state_out(ll, (k+hs), (i+hs)) = state_init(ll, (k+hs), (i+hs)) + dt * tend(ll,k,i); + }); +} + + +//Compute the time tendencies of the fluid state using forcing in the x-direction +//Since the halos are set in a separate routine, this will not require MPI +//First, compute the flux vector at each cell interface in the x-direction (including hyperviscosity) +//Then, compute the tendencies using those fluxes +void compute_tendencies_x( real3d_span state , real3d_span flux , real3d_span tend , double dt) { + double hv_coef; + //Compute the hyperviscosity coefficient + hv_coef = -hv_beta * dx / (16*dt); + ///////////////////////////////////////////////// + // TODO: THREAD ME + ///////////////////////////////////////////////// + //Compute fluxes in the x-direction for each cell + auto bounds1 = std::views::iota(0,(int)(nz*(nx+1))); + std::for_each(std::execution::par_unseq,bounds1.begin(),bounds1.end(),[=,nx=nx,nz=nz,hy_dens_cell=hy_dens_cell,hy_dens_theta_cell=hy_dens_theta_cell](int idx){ + auto [i,k] = idx2d(idx,nx+1); + double stencil[4], d3_vals[NUM_VARS],vals[NUM_VARS]; + //Use fourth-order interpolation from four cell averages to compute the value at the interface in question + for (int ll=0; ll , Oak Ridge National Laboratory +// Jeff Larkin , NVIDIA Corporation +// This code simulates dry, stratified, compressible, non-hydrostatic fluid flows +// For documentation, please see the attached documentation in the "documentation" folder +// +////////////////////////////////////////////////////////////////////////////////////////// + +#include +#include +#include +#include +#include +#include +#ifndef NO_PNETCDF +#include "pnetcdf.h" +#endif +#include +#include +#include +#include + +#include +namespace stdex = std::experimental; + +template +using array_view = stdex::mdspan>; +using real_1d_array_view = array_view; +using real_2d_array_view = array_view; +using real3d_span = array_view; +using const_real_1d_array_view = array_view; +using const_real_2d_array_view = array_view; +using const_real3d_span = array_view; + +// These should become unnecessary with the addition of cartesian_product +constexpr std::tuple idx2d(int idx, int nx) { return {idx%nx, idx/nx}; } +constexpr std::tuple idx3d(int idx, int nx, int nz) { return {idx%nx, (idx/nx)%nz, idx / (nx*nz)}; } + +constexpr double pi = 3.14159265358979323846264338327; //Pi +constexpr double grav = 9.8; //Gravitational acceleration (m / s^2) +constexpr double cp = 1004.; //Specific heat of dry air at constant pressure +constexpr double cv = 717.; //Specific heat of dry air at constant volume +constexpr double rd = 287.; //Dry air constant for equation of state (P=rho*rd*T) +constexpr double p0 = 1.e5; //Standard pressure at the surface in Pascals +constexpr double C0 = 27.5629410929725921310572974482; //Constant to translate potential temperature into pressure (P=C0*(rho*theta)**gamma) +constexpr double gamm = 1.40027894002789400278940027894; //gamma=cp/Rd , have to call this gamm because "gamma" is taken (I hate C so much) +//Define domain and stability-related constants +constexpr double xlen = 2.e4; //Length of the domain in the x-direction (meters) +constexpr double zlen = 1.e4; //Length of the domain in the z-direction (meters) +constexpr double hv_beta = 0.05; //How strong to diffuse the solution: hv_beta \in [0:1] +constexpr double cfl = 1.50; //"Courant, Friedrichs, Lewy" number (for numerical stability) +constexpr double max_speed = 450; //Assumed maximum wave speed during the simulation (speed of sound + speed of wind) (meter / sec) +constexpr int hs = 2; //"Halo" size: number of cells beyond the MPI tasks's domain needed for a full "stencil" of information for reconstruction +constexpr int sten_size = 4; //Size of the stencil used for interpolation + +//Parameters for indexing and flags +constexpr int NUM_VARS = 4; //Number of fluid state variables +constexpr int ID_DENS = 0; //index for density ("rho") +constexpr int ID_UMOM = 1; //index for momentum in the x-direction ("rho * u") +constexpr int ID_WMOM = 2; //index for momentum in the z-direction ("rho * w") +constexpr int ID_RHOT = 3; //index for density * potential temperature ("rho * theta") +constexpr int DIR_X = 1; //Integer constant to express that this operation is in the x-direction +constexpr int DIR_Z = 2; //Integer constant to express that this operation is in the z-direction +constexpr int DATA_SPEC_COLLISION = 1; +constexpr int DATA_SPEC_THERMAL = 2; +constexpr int DATA_SPEC_GRAVITY_WAVES = 3; +constexpr int DATA_SPEC_DENSITY_CURRENT = 5; +constexpr int DATA_SPEC_INJECTION = 6; + +constexpr int nqpoints = 3; +constexpr double qpoints [] = { 0.112701665379258311482073460022E0 , 0.500000000000000000000000000000E0 , 0.887298334620741688517926539980E0 }; +constexpr double qweights[] = { 0.277777777777777777777777777779E0 , 0.444444444444444444444444444444E0 , 0.277777777777777777777777777779E0 }; + +/////////////////////////////////////////////////////////////////////////////////////// +// BEGIN USER-CONFIGURABLE PARAMETERS +/////////////////////////////////////////////////////////////////////////////////////// +//The x-direction length is twice as long as the z-direction length +//So, you'll want to have nx_glob be twice as large as nz_glob +int constexpr nx_glob = _NX; //Number of total cells in the x-direction +int constexpr nz_glob = _NZ; //Number of total cells in the z-direction +double constexpr sim_time = _SIM_TIME; //How many seconds to run the simulation +double constexpr output_freq = _OUT_FREQ; //How frequently to output data to file (in seconds) +int constexpr data_spec_int = _DATA_SPEC; //How to initialize the data +double constexpr dx = xlen / nx_glob; // grid spacing in the x-direction +double constexpr dz = zlen / nz_glob; // grid spacing in the x-direction +/////////////////////////////////////////////////////////////////////////////////////// +// END USER-CONFIGURABLE PARAMETERS +/////////////////////////////////////////////////////////////////////////////////////// + +/////////////////////////////////////////////////////////////////////////////////////// +// Variables that are initialized but remain static over the course of the simulation +/////////////////////////////////////////////////////////////////////////////////////// +double dt; //Model time step (seconds) +int nx, nz; //Number of local grid cells in the x- and z- dimensions for this MPI task +int i_beg, k_beg; //beginning index in the x- and z-directions for this MPI task +int nranks, myrank; //Number of MPI ranks and my rank id +int left_rank, right_rank; //MPI Rank IDs that exist to my left and right in the global domain +int mainproc; //Am I the main process (rank == 0)? +double *hy_dens_cell; //hydrostatic density (vert cell avgs). Dimensions: (1-hs:nz+hs) +double *hy_dens_theta_cell; //hydrostatic rho*t (vert cell avgs). Dimensions: (1-hs:nz+hs) +double *hy_dens_int; //hydrostatic density (vert cell interf). Dimensions: (1:nz+1) +double *hy_dens_theta_int; //hydrostatic rho*t (vert cell interf). Dimensions: (1:nz+1) +double *hy_pressure_int; //hydrostatic press (vert cell interf). Dimensions: (1:nz+1) + +/////////////////////////////////////////////////////////////////////////////////////// +// Variables that are dynamics over the course of the simulation +/////////////////////////////////////////////////////////////////////////////////////// +double etime; //Elapsed model time +double output_counter; //Helps determine when it's time to do output +//Runtime variable arrays +double *state; //Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) +double *state_tmp; //Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) +double *flux; //Cell interface fluxes. Dimensions: (nx+1,nz+1,NUM_VARS) +double *tend; //Fluid state tendencies. Dimensions: (nx,nz,NUM_VARS) +int num_out = 0; //The number of outputs performed so far +int direction_switch = 1; +double mass0, te0; //Initial domain totals for mass and total energy +double mass , te ; //Domain totals for mass and total energy + +//How is this not in the standard?! +double dmin( double a , double b ) { if (a sim_time) { dt = sim_time - etime; } + //Perform a single time step + perform_timestep(real3d_span{state,NUM_VARS,nz+2*hs,nx+2*hs},real3d_span{state_tmp,NUM_VARS,nz+2*hs,nx+2*hs},real3d_span{flux,NUM_VARS, (nz+1) ,(nx+1)},real3d_span{tend,NUM_VARS,nz,nx},dt); + //Inform the user +#ifndef NO_INFORM + if (mainproc) { printf( "Elapsed Time: %lf / %lf\n", etime , sim_time ); } +#endif + //Update the elapsed time and output counter + etime = etime + dt; + output_counter = output_counter + dt; + //If it's time for output, reset the counter, and do output + if (output_counter >= output_freq) { + output_counter = output_counter - output_freq; + output(state,etime); + } + } + auto t2 = std::chrono::steady_clock::now(); + if (mainproc) { + std::cout << "CPU Time: " << std::chrono::duration(t2-t1).count() << " sec\n"; + } + + //Final reductions for mass, kinetic energy, and total energy + reductions(mass,te); + + if (mainproc) { + printf( "d_mass: %le\n" , (mass - mass0)/mass0 ); + printf( "d_te: %le\n" , (te - te0 )/te0 ); + } + + finalize(); +} + + +//Performs a single dimensionally split time step using a simple low-storage three-stage Runge-Kutta time integrator +//The dimensional splitting is a second-order-accurate alternating Strang splitting in which the +//order of directions is alternated each time step. +//The Runge-Kutta method used here is defined as follows: +// q* = q[n] + dt/3 * rhs(q[n]) +// q** = q[n] + dt/2 * rhs(q* ) +// q[n+1] = q[n] + dt/1 * rhs(q** ) +void perform_timestep( real3d_span state , real3d_span state_tmp , real3d_span flux , real3d_span tend , double dt ) { + if (direction_switch) { + //x-direction first + semi_discrete_step( state, state , state_tmp, dt / 3 , DIR_X , flux , tend ); + semi_discrete_step( state, state_tmp, state_tmp, dt / 2 , DIR_X , flux , tend ); + semi_discrete_step( state, state_tmp, state , dt / 1 , DIR_X , flux , tend ); + //z-direction second + semi_discrete_step( state, state , state_tmp, dt / 3 , DIR_Z , flux , tend ); + semi_discrete_step( state, state_tmp, state_tmp, dt / 2 , DIR_Z , flux , tend ); + semi_discrete_step( state, state_tmp, state , dt / 1 , DIR_Z , flux , tend ); + } else { + //z-direction second + semi_discrete_step( state, state , state_tmp, dt / 3 , DIR_Z , flux , tend ); + semi_discrete_step( state, state_tmp, state_tmp, dt / 2 , DIR_Z , flux , tend ); + semi_discrete_step( state, state_tmp, state , dt / 1 , DIR_Z , flux , tend ); + //x-direction first + semi_discrete_step( state, state , state_tmp, dt / 3 , DIR_X , flux , tend ); + semi_discrete_step( state, state_tmp, state_tmp, dt / 2 , DIR_X , flux , tend ); + semi_discrete_step( state, state_tmp, state , dt / 1 , DIR_X , flux , tend ); + } + if (direction_switch) { direction_switch = 0; } else { direction_switch = 1; } +} + + +//Perform a single semi-discretized step in time with the form: +//state_out = state_init + dt * rhs(state_forcing) +//Meaning the step starts from state_init, computes the rhs using state_forcing, and stores the result in state_out +void semi_discrete_step( real3d_span state_init , real3d_span state_forcing , real3d_span state_out , double dt , int dir , real3d_span flux , real3d_span tend ) { + if (dir == DIR_X) { + //Set the halo values for this MPI task's fluid state in the x-direction + set_halo_values_x(state_forcing); + //Compute the time tendencies for the fluid state in the x-direction + compute_tendencies_x(state_forcing,flux,tend,dt); + } else if (dir == DIR_Z) { + //Set the halo values for this MPI task's fluid state in the z-direction + set_halo_values_z(state_forcing); + //Compute the time tendencies for the fluid state in the z-direction + compute_tendencies_z(state_forcing,flux,tend,dt); + } + + ///////////////////////////////////////////////// + // TODO: THREAD ME + ///////////////////////////////////////////////// + //Apply the tendencies to the fluid state + std::for_each_n(std::execution::par_unseq,std::views::iota(0).begin(),NUM_VARS*nz*nx,[=,nx=nx,nz=nz,hy_dens_cell=hy_dens_cell,i_beg=i_beg,k_beg=k_beg](int idx){ + auto [i,k,ll] = idx3d(idx,nx,nz); + if (data_spec_int == DATA_SPEC_GRAVITY_WAVES) { + double x = (i_beg + i+0.5)*dx; + double z = (k_beg + k+0.5)*dz; + double wpert; + // Using sample_ellipse_cosine requires "acc routine" in OpenACC and "declare target" in OpenMP offload + // Neither of these are particularly well supported. So I'm manually inlining here + // wpert = sample_ellipse_cosine( x,z , 0.01 , xlen/8,1000., 500.,500. ); + { + double x0 = xlen/8; + double z0 = 1000; + double xrad = 500; + double zrad = 500; + double amp = 0.01; + //Compute distance from bubble center + double dist = sqrt( ((x-x0)/xrad)*((x-x0)/xrad) + ((z-z0)/zrad)*((z-z0)/zrad) ) * pi / 2.; + //If the distance from bubble center is less than the radius, create a cos**2 profile + if (dist <= pi / 2.) { + wpert = amp * pow(cos(dist),2.); + } else { + wpert = 0.; + } + } + tend(ID_WMOM,k,i) += wpert*hy_dens_cell[hs+k]; + } + state_out(ll, (k+hs), (i+hs)) = state_init(ll, (k+hs), (i+hs)) + dt * tend(ll,k,i); + }); +} + + +//Compute the time tendencies of the fluid state using forcing in the x-direction +//Since the halos are set in a separate routine, this will not require MPI +//First, compute the flux vector at each cell interface in the x-direction (including hyperviscosity) +//Then, compute the tendencies using those fluxes +void compute_tendencies_x( real3d_span state , real3d_span flux , real3d_span tend , double dt) { + double hv_coef; + //Compute the hyperviscosity coefficient + hv_coef = -hv_beta * dx / (16*dt); + ///////////////////////////////////////////////// + // TODO: THREAD ME + ///////////////////////////////////////////////// + //Compute fluxes in the x-direction for each cell + std::for_each_n(std::execution::par_unseq,std::views::iota(0).begin(),(nz*(nx+1)),[=,nx=nx,nz=nz,hy_dens_cell=hy_dens_cell,hy_dens_theta_cell=hy_dens_theta_cell](int idx){ + auto [i,k] = idx2d(idx,nx+1); + double stencil[4], d3_vals[NUM_VARS],vals[NUM_VARS]; + //Use fourth-order interpolation from four cell averages to compute the value at the interface in question + for (int ll=0; ll Date: Thu, 18 Apr 2024 13:32:03 -0400 Subject: [PATCH 7/9] Added MPI versions with for_each(_n) and mdspan --- cpp/CMakeLists.txt | 44 ++ cpp/miniWeather_mpi_for_each.cpp | 915 +++++++++++++++++++++++++++++ cpp/miniWeather_mpi_for_each_n.cpp | 906 ++++++++++++++++++++++++++++ 3 files changed, 1865 insertions(+) create mode 100644 cpp/miniWeather_mpi_for_each.cpp create mode 100644 cpp/miniWeather_mpi_for_each_n.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 451f18d5..25f09fc9 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -130,6 +130,50 @@ target_link_libraries(mpi_test "${LDFLAGS}") add_test(NAME MPI_TEST COMMAND ./check_output.sh ./mpi_test 1e-9 4.5e-5 ) +############################################################ +## Compile the mpi for_each version +############################################################ +if (NOT ("${STDPAR_FLAGS}" STREQUAL "") ) + add_executable(mpi_for_each miniWeather_mpi_for_each.cpp) + set_target_properties(mpi_for_each PROPERTIES COMPILE_FLAGS "${EXE_DEFS} ${STDPAR_FLAGS}") + + add_executable(mpi_for_each_test miniWeather_mpi_for_each.cpp) + set_target_properties(mpi_for_each_test PROPERTIES COMPILE_FLAGS "${TEST_DEFS} ${STDPAR_FLAGS}") + + if (NOT ("${LDFLAGS}" STREQUAL "") ) + target_link_libraries(mpi_for_each "${LDFLAGS} ${STDPAR_FLAGS}") + target_link_libraries(mpi_for_each_test "${LDFLAGS} ${STDPAR_FLAGS}") + endif() + if (NOT ("${SERIAL_LINK_FLAGS}" STREQUAL "") ) + target_link_libraries(mpi_for_each "${SERIAL_LINK_FLAGS} ${STDPAR_FLAGS}") + target_link_libraries(mpi_for_each_test "${SERIAL_LINK_FLAGS} ${STDPAR_FLAGS}") + endif() + + add_test(NAME MPI_FOR_EACH_TEST COMMAND ./check_output.sh ./mpi_for_each_test 1e-13 4.5e-5 ) +endif() + +############################################################ +## Compile the mpi for_each_n version +############################################################ +if (NOT ("${STDPAR_FLAGS}" STREQUAL "") ) + add_executable(mpi_for_each_n miniWeather_mpi_for_each_n.cpp) + set_target_properties(mpi_for_each_n PROPERTIES COMPILE_FLAGS "${EXE_DEFS} ${STDPAR_FLAGS}") + + add_executable(mpi_for_each_n_test miniWeather_mpi_for_each_n.cpp) + set_target_properties(mpi_for_each_n_test PROPERTIES COMPILE_FLAGS "${TEST_DEFS} ${STDPAR_FLAGS}") + + if (NOT ("${LDFLAGS}" STREQUAL "") ) + target_link_libraries(mpi_for_each_n "${LDFLAGS} ${STDPAR_FLAGS}") + target_link_libraries(mpi_for_each_n_test "${LDFLAGS} ${STDPAR_FLAGS}") + endif() + if (NOT ("${SERIAL_LINK_FLAGS}" STREQUAL "") ) + target_link_libraries(mpi_for_each_n "${SERIAL_LINK_FLAGS} ${STDPAR_FLAGS}") + target_link_libraries(mpi_for_each_n_test "${SERIAL_LINK_FLAGS} ${STDPAR_FLAGS}") + endif() + + add_test(NAME MPI_FOR_EACH_N_TEST COMMAND ./check_output.sh ./mpi_for_each_n_test 1e-13 4.5e-5 ) +endif() + ############################################################ ## YAKL parallel_for Version diff --git a/cpp/miniWeather_mpi_for_each.cpp b/cpp/miniWeather_mpi_for_each.cpp new file mode 100644 index 00000000..245cd4f3 --- /dev/null +++ b/cpp/miniWeather_mpi_for_each.cpp @@ -0,0 +1,915 @@ + +////////////////////////////////////////////////////////////////////////////////////////// +// miniWeather +// Author: Matt Norman , Oak Ridge National Laboratory +// Jeff Larkin , NVIDIA Corporation +// This code simulates dry, stratified, compressible, non-hydrostatic fluid flows +// For documentation, please see the attached documentation in the "documentation" folder +// +////////////////////////////////////////////////////////////////////////////////////////// + +#include +#include +#include +#include +#include +#include +#ifndef NO_PNETCDF +#include "pnetcdf.h" +#endif +#include +#include +#include +#include + +#include +namespace stdex = std::experimental; + +template +using array_view = stdex::mdspan>; +using real_1d_array_view = array_view; +using real_2d_array_view = array_view; +using real3d_span = array_view; +using const_real_1d_array_view = array_view; +using const_real_2d_array_view = array_view; +using const_real3d_span = array_view; + +// These should become unnecessary with the addition of cartesian_product +constexpr std::tuple idx2d(int idx, int nx) { return {idx%nx, idx/nx}; } +constexpr std::tuple idx3d(int idx, int nx, int nz) { return {idx%nx, (idx/nx)%nz, idx / (nx*nz)}; } + +constexpr double pi = 3.14159265358979323846264338327; //Pi +constexpr double grav = 9.8; //Gravitational acceleration (m / s^2) +constexpr double cp = 1004.; //Specific heat of dry air at constant pressure +constexpr double cv = 717.; //Specific heat of dry air at constant volume +constexpr double rd = 287.; //Dry air constant for equation of state (P=rho*rd*T) +constexpr double p0 = 1.e5; //Standard pressure at the surface in Pascals +constexpr double C0 = 27.5629410929725921310572974482; //Constant to translate potential temperature into pressure (P=C0*(rho*theta)**gamma) +constexpr double gamm = 1.40027894002789400278940027894; //gamma=cp/Rd , have to call this gamm because "gamma" is taken (I hate C so much) +//Define domain and stability-related constants +constexpr double xlen = 2.e4; //Length of the domain in the x-direction (meters) +constexpr double zlen = 1.e4; //Length of the domain in the z-direction (meters) +constexpr double hv_beta = 0.05; //How strong to diffuse the solution: hv_beta \in [0:1] +constexpr double cfl = 1.50; //"Courant, Friedrichs, Lewy" number (for numerical stability) +constexpr double max_speed = 450; //Assumed maximum wave speed during the simulation (speed of sound + speed of wind) (meter / sec) +constexpr int hs = 2; //"Halo" size: number of cells beyond the MPI tasks's domain needed for a full "stencil" of information for reconstruction +constexpr int sten_size = 4; //Size of the stencil used for interpolation + +//Parameters for indexing and flags +constexpr int NUM_VARS = 4; //Number of fluid state variables +constexpr int ID_DENS = 0; //index for density ("rho") +constexpr int ID_UMOM = 1; //index for momentum in the x-direction ("rho * u") +constexpr int ID_WMOM = 2; //index for momentum in the z-direction ("rho * w") +constexpr int ID_RHOT = 3; //index for density * potential temperature ("rho * theta") +constexpr int DIR_X = 1; //Integer constant to express that this operation is in the x-direction +constexpr int DIR_Z = 2; //Integer constant to express that this operation is in the z-direction +constexpr int DATA_SPEC_COLLISION = 1; +constexpr int DATA_SPEC_THERMAL = 2; +constexpr int DATA_SPEC_GRAVITY_WAVES = 3; +constexpr int DATA_SPEC_DENSITY_CURRENT = 5; +constexpr int DATA_SPEC_INJECTION = 6; + +constexpr int nqpoints = 3; +constexpr double qpoints [] = { 0.112701665379258311482073460022E0 , 0.500000000000000000000000000000E0 , 0.887298334620741688517926539980E0 }; +constexpr double qweights[] = { 0.277777777777777777777777777779E0 , 0.444444444444444444444444444444E0 , 0.277777777777777777777777777779E0 }; + +/////////////////////////////////////////////////////////////////////////////////////// +// BEGIN USER-CONFIGURABLE PARAMETERS +/////////////////////////////////////////////////////////////////////////////////////// +//The x-direction length is twice as long as the z-direction length +//So, you'll want to have nx_glob be twice as large as nz_glob +int constexpr nx_glob = _NX; //Number of total cells in the x-direction +int constexpr nz_glob = _NZ; //Number of total cells in the z-direction +double constexpr sim_time = _SIM_TIME; //How many seconds to run the simulation +double constexpr output_freq = _OUT_FREQ; //How frequently to output data to file (in seconds) +int constexpr data_spec_int = _DATA_SPEC; //How to initialize the data +double constexpr dx = xlen / nx_glob; // grid spacing in the x-direction +double constexpr dz = zlen / nz_glob; // grid spacing in the x-direction +/////////////////////////////////////////////////////////////////////////////////////// +// END USER-CONFIGURABLE PARAMETERS +/////////////////////////////////////////////////////////////////////////////////////// + +/////////////////////////////////////////////////////////////////////////////////////// +// Variables that are initialized but remain static over the course of the simulation +/////////////////////////////////////////////////////////////////////////////////////// +double dt; //Model time step (seconds) +int nx, nz; //Number of local grid cells in the x- and z- dimensions for this MPI task +int i_beg, k_beg; //beginning index in the x- and z-directions for this MPI task +int nranks, myrank; //Number of MPI ranks and my rank id +int left_rank, right_rank; //MPI Rank IDs that exist to my left and right in the global domain +int mainproc; //Am I the main process (rank == 0)? +double *hy_dens_cell; //hydrostatic density (vert cell avgs). Dimensions: (1-hs:nz+hs) +double *hy_dens_theta_cell; //hydrostatic rho*t (vert cell avgs). Dimensions: (1-hs:nz+hs) +double *hy_dens_int; //hydrostatic density (vert cell interf). Dimensions: (1:nz+1) +double *hy_dens_theta_int; //hydrostatic rho*t (vert cell interf). Dimensions: (1:nz+1) +double *hy_pressure_int; //hydrostatic press (vert cell interf). Dimensions: (1:nz+1) + +/////////////////////////////////////////////////////////////////////////////////////// +// Variables that are dynamics over the course of the simulation +/////////////////////////////////////////////////////////////////////////////////////// +double etime; //Elapsed model time +double output_counter; //Helps determine when it's time to do output +//Runtime variable arrays +double *state; //Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) +double *state_tmp; //Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) +double *flux; //Cell interface fluxes. Dimensions: (nx+1,nz+1,NUM_VARS) +double *tend; //Fluid state tendencies. Dimensions: (nx,nz,NUM_VARS) +double *sendbuf_l; //Buffer to send data to the left MPI rank +double *sendbuf_r; //Buffer to send data to the right MPI rank +double *recvbuf_l; //Buffer to receive data from the left MPI rank +double *recvbuf_r; //Buffer to receive data from the right MPI rank +int num_out = 0; //The number of outputs performed so far +int direction_switch = 1; +double mass0, te0; //Initial domain totals for mass and total energy +double mass , te ; //Domain totals for mass and total energy + +//How is this not in the standard?! +double dmin( double a , double b ) { if (a sim_time) { dt = sim_time - etime; } + //Perform a single time step + perform_timestep(real3d_span{state,NUM_VARS,nz+2*hs,nx+2*hs},real3d_span{state_tmp,NUM_VARS,nz+2*hs,nx+2*hs},real3d_span{flux,NUM_VARS, (nz+1) ,(nx+1)},real3d_span{tend,NUM_VARS,nz,nx},dt); + //Inform the user +#ifndef NO_INFORM + if (mainproc) { printf( "Elapsed Time: %lf / %lf\n", etime , sim_time ); } +#endif + //Update the elapsed time and output counter + etime = etime + dt; + output_counter = output_counter + dt; + //If it's time for output, reset the counter, and do output + if (output_counter >= output_freq) { + output_counter = output_counter - output_freq; + output(state,etime); + } + } + auto t2 = std::chrono::steady_clock::now(); + if (mainproc) { + std::cout << "CPU Time: " << std::chrono::duration(t2-t1).count() << " sec\n"; + } + + //Final reductions for mass, kinetic energy, and total energy + reductions(mass,te); + + if (mainproc) { + printf( "d_mass: %le\n" , (mass - mass0)/mass0 ); + printf( "d_te: %le\n" , (te - te0 )/te0 ); + } + + finalize(); +} + + +//Performs a single dimensionally split time step using a simple low-storage three-stage Runge-Kutta time integrator +//The dimensional splitting is a second-order-accurate alternating Strang splitting in which the +//order of directions is alternated each time step. +//The Runge-Kutta method used here is defined as follows: +// q* = q[n] + dt/3 * rhs(q[n]) +// q** = q[n] + dt/2 * rhs(q* ) +// q[n+1] = q[n] + dt/1 * rhs(q** ) +void perform_timestep( real3d_span state , real3d_span state_tmp , real3d_span flux , real3d_span tend , double dt ) { + if (direction_switch) { + //x-direction first + semi_discrete_step( state, state , state_tmp, dt / 3 , DIR_X , flux , tend ); + semi_discrete_step( state, state_tmp, state_tmp, dt / 2 , DIR_X , flux , tend ); + semi_discrete_step( state, state_tmp, state , dt / 1 , DIR_X , flux , tend ); + //z-direction second + semi_discrete_step( state, state , state_tmp, dt / 3 , DIR_Z , flux , tend ); + semi_discrete_step( state, state_tmp, state_tmp, dt / 2 , DIR_Z , flux , tend ); + semi_discrete_step( state, state_tmp, state , dt / 1 , DIR_Z , flux , tend ); + } else { + //z-direction second + semi_discrete_step( state, state , state_tmp, dt / 3 , DIR_Z , flux , tend ); + semi_discrete_step( state, state_tmp, state_tmp, dt / 2 , DIR_Z , flux , tend ); + semi_discrete_step( state, state_tmp, state , dt / 1 , DIR_Z , flux , tend ); + //x-direction first + semi_discrete_step( state, state , state_tmp, dt / 3 , DIR_X , flux , tend ); + semi_discrete_step( state, state_tmp, state_tmp, dt / 2 , DIR_X , flux , tend ); + semi_discrete_step( state, state_tmp, state , dt / 1 , DIR_X , flux , tend ); + } + if (direction_switch) { direction_switch = 0; } else { direction_switch = 1; } +} + + +//Perform a single semi-discretized step in time with the form: +//state_out = state_init + dt * rhs(state_forcing) +//Meaning the step starts from state_init, computes the rhs using state_forcing, and stores the result in state_out +void semi_discrete_step( real3d_span state_init , real3d_span state_forcing , real3d_span state_out , double dt , int dir , real3d_span flux , real3d_span tend ) { + if (dir == DIR_X) { + //Set the halo values for this MPI task's fluid state in the x-direction + set_halo_values_x(state_forcing); + //Compute the time tendencies for the fluid state in the x-direction + compute_tendencies_x(state_forcing,flux,tend,dt); + } else if (dir == DIR_Z) { + //Set the halo values for this MPI task's fluid state in the z-direction + set_halo_values_z(state_forcing); + //Compute the time tendencies for the fluid state in the z-direction + compute_tendencies_z(state_forcing,flux,tend,dt); + } + + ///////////////////////////////////////////////// + // TODO: THREAD ME + ///////////////////////////////////////////////// + //Apply the tendencies to the fluid state + auto bounds = std::views::iota(0,(int)(NUM_VARS*nz*nx)); + std::for_each(std::execution::par_unseq,bounds.begin(),bounds.end(),[=,nx=nx,nz=nz,hy_dens_cell=hy_dens_cell,i_beg=i_beg,k_beg=k_beg](int idx){ + auto [i,k,ll] = idx3d(idx,nx,nz); + if (data_spec_int == DATA_SPEC_GRAVITY_WAVES) { + double x = (i_beg + i+0.5)*dx; + double z = (k_beg + k+0.5)*dz; + double wpert; + // Using sample_ellipse_cosine requires "acc routine" in OpenACC and "declare target" in OpenMP offload + // Neither of these are particularly well supported. So I'm manually inlining here + // wpert = sample_ellipse_cosine( x,z , 0.01 , xlen/8,1000., 500.,500. ); + { + double x0 = xlen/8; + double z0 = 1000; + double xrad = 500; + double zrad = 500; + double amp = 0.01; + //Compute distance from bubble center + double dist = sqrt( ((x-x0)/xrad)*((x-x0)/xrad) + ((z-z0)/zrad)*((z-z0)/zrad) ) * pi / 2.; + //If the distance from bubble center is less than the radius, create a cos**2 profile + if (dist <= pi / 2.) { + wpert = amp * pow(cos(dist),2.); + } else { + wpert = 0.; + } + } + tend(ID_WMOM,k,i) += wpert*hy_dens_cell[hs+k]; + } + state_out(ll, (k+hs), (i+hs)) = state_init(ll, (k+hs), (i+hs)) + dt * tend(ll,k,i); + }); +} + + +//Compute the time tendencies of the fluid state using forcing in the x-direction +//Since the halos are set in a separate routine, this will not require MPI +//First, compute the flux vector at each cell interface in the x-direction (including hyperviscosity) +//Then, compute the tendencies using those fluxes +void compute_tendencies_x( real3d_span state , real3d_span flux , real3d_span tend , double dt) { + double hv_coef; + //Compute the hyperviscosity coefficient + hv_coef = -hv_beta * dx / (16*dt); + ///////////////////////////////////////////////// + // TODO: THREAD ME + ///////////////////////////////////////////////// + //Compute fluxes in the x-direction for each cell + auto bounds1 = std::views::iota(0,(int)(nz*(nx+1))); + std::for_each(std::execution::par_unseq,bounds1.begin(),bounds1.end(),[=,nx=nx,nz=nz,hy_dens_cell=hy_dens_cell,hy_dens_theta_cell=hy_dens_theta_cell](int idx){ + auto [i,k] = idx2d(idx,nx+1); + double stencil[4], d3_vals[NUM_VARS],vals[NUM_VARS]; + //Use fourth-order interpolation from four cell averages to compute the value at the interface in question + for (int ll=0; ll , Oak Ridge National Laboratory +// Jeff Larkin , NVIDIA Corporation +// This code simulates dry, stratified, compressible, non-hydrostatic fluid flows +// For documentation, please see the attached documentation in the "documentation" folder +// +////////////////////////////////////////////////////////////////////////////////////////// + +#include +#include +#include +#include +#include +#include +#ifndef NO_PNETCDF +#include "pnetcdf.h" +#endif +#include +#include +#include +#include + +#include +namespace stdex = std::experimental; + +template +using array_view = stdex::mdspan>; +using real_1d_array_view = array_view; +using real_2d_array_view = array_view; +using real3d_span = array_view; +using const_real_1d_array_view = array_view; +using const_real_2d_array_view = array_view; +using const_real3d_span = array_view; + +// These should become unnecessary with the addition of cartesian_product +constexpr std::tuple idx2d(int idx, int nx) { return {idx%nx, idx/nx}; } +constexpr std::tuple idx3d(int idx, int nx, int nz) { return {idx%nx, (idx/nx)%nz, idx / (nx*nz)}; } + +constexpr double pi = 3.14159265358979323846264338327; //Pi +constexpr double grav = 9.8; //Gravitational acceleration (m / s^2) +constexpr double cp = 1004.; //Specific heat of dry air at constant pressure +constexpr double cv = 717.; //Specific heat of dry air at constant volume +constexpr double rd = 287.; //Dry air constant for equation of state (P=rho*rd*T) +constexpr double p0 = 1.e5; //Standard pressure at the surface in Pascals +constexpr double C0 = 27.5629410929725921310572974482; //Constant to translate potential temperature into pressure (P=C0*(rho*theta)**gamma) +constexpr double gamm = 1.40027894002789400278940027894; //gamma=cp/Rd , have to call this gamm because "gamma" is taken (I hate C so much) +//Define domain and stability-related constants +constexpr double xlen = 2.e4; //Length of the domain in the x-direction (meters) +constexpr double zlen = 1.e4; //Length of the domain in the z-direction (meters) +constexpr double hv_beta = 0.05; //How strong to diffuse the solution: hv_beta \in [0:1] +constexpr double cfl = 1.50; //"Courant, Friedrichs, Lewy" number (for numerical stability) +constexpr double max_speed = 450; //Assumed maximum wave speed during the simulation (speed of sound + speed of wind) (meter / sec) +constexpr int hs = 2; //"Halo" size: number of cells beyond the MPI tasks's domain needed for a full "stencil" of information for reconstruction +constexpr int sten_size = 4; //Size of the stencil used for interpolation + +//Parameters for indexing and flags +constexpr int NUM_VARS = 4; //Number of fluid state variables +constexpr int ID_DENS = 0; //index for density ("rho") +constexpr int ID_UMOM = 1; //index for momentum in the x-direction ("rho * u") +constexpr int ID_WMOM = 2; //index for momentum in the z-direction ("rho * w") +constexpr int ID_RHOT = 3; //index for density * potential temperature ("rho * theta") +constexpr int DIR_X = 1; //Integer constant to express that this operation is in the x-direction +constexpr int DIR_Z = 2; //Integer constant to express that this operation is in the z-direction +constexpr int DATA_SPEC_COLLISION = 1; +constexpr int DATA_SPEC_THERMAL = 2; +constexpr int DATA_SPEC_GRAVITY_WAVES = 3; +constexpr int DATA_SPEC_DENSITY_CURRENT = 5; +constexpr int DATA_SPEC_INJECTION = 6; + +constexpr int nqpoints = 3; +constexpr double qpoints [] = { 0.112701665379258311482073460022E0 , 0.500000000000000000000000000000E0 , 0.887298334620741688517926539980E0 }; +constexpr double qweights[] = { 0.277777777777777777777777777779E0 , 0.444444444444444444444444444444E0 , 0.277777777777777777777777777779E0 }; + +/////////////////////////////////////////////////////////////////////////////////////// +// BEGIN USER-CONFIGURABLE PARAMETERS +/////////////////////////////////////////////////////////////////////////////////////// +//The x-direction length is twice as long as the z-direction length +//So, you'll want to have nx_glob be twice as large as nz_glob +int constexpr nx_glob = _NX; //Number of total cells in the x-direction +int constexpr nz_glob = _NZ; //Number of total cells in the z-direction +double constexpr sim_time = _SIM_TIME; //How many seconds to run the simulation +double constexpr output_freq = _OUT_FREQ; //How frequently to output data to file (in seconds) +int constexpr data_spec_int = _DATA_SPEC; //How to initialize the data +double constexpr dx = xlen / nx_glob; // grid spacing in the x-direction +double constexpr dz = zlen / nz_glob; // grid spacing in the x-direction +/////////////////////////////////////////////////////////////////////////////////////// +// END USER-CONFIGURABLE PARAMETERS +/////////////////////////////////////////////////////////////////////////////////////// + +/////////////////////////////////////////////////////////////////////////////////////// +// Variables that are initialized but remain static over the course of the simulation +/////////////////////////////////////////////////////////////////////////////////////// +double dt; //Model time step (seconds) +int nx, nz; //Number of local grid cells in the x- and z- dimensions for this MPI task +int i_beg, k_beg; //beginning index in the x- and z-directions for this MPI task +int nranks, myrank; //Number of MPI ranks and my rank id +int left_rank, right_rank; //MPI Rank IDs that exist to my left and right in the global domain +int mainproc; //Am I the main process (rank == 0)? +double *hy_dens_cell; //hydrostatic density (vert cell avgs). Dimensions: (1-hs:nz+hs) +double *hy_dens_theta_cell; //hydrostatic rho*t (vert cell avgs). Dimensions: (1-hs:nz+hs) +double *hy_dens_int; //hydrostatic density (vert cell interf). Dimensions: (1:nz+1) +double *hy_dens_theta_int; //hydrostatic rho*t (vert cell interf). Dimensions: (1:nz+1) +double *hy_pressure_int; //hydrostatic press (vert cell interf). Dimensions: (1:nz+1) + +/////////////////////////////////////////////////////////////////////////////////////// +// Variables that are dynamics over the course of the simulation +/////////////////////////////////////////////////////////////////////////////////////// +double etime; //Elapsed model time +double output_counter; //Helps determine when it's time to do output +//Runtime variable arrays +double *state; //Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) +double *state_tmp; //Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) +double *flux; //Cell interface fluxes. Dimensions: (nx+1,nz+1,NUM_VARS) +double *tend; //Fluid state tendencies. Dimensions: (nx,nz,NUM_VARS) +double *sendbuf_l; //Buffer to send data to the left MPI rank +double *sendbuf_r; //Buffer to send data to the right MPI rank +double *recvbuf_l; //Buffer to receive data from the left MPI rank +double *recvbuf_r; //Buffer to receive data from the right MPI rank +int num_out = 0; //The number of outputs performed so far +int direction_switch = 1; +double mass0, te0; //Initial domain totals for mass and total energy +double mass , te ; //Domain totals for mass and total energy + +//How is this not in the standard?! +double dmin( double a , double b ) { if (a sim_time) { dt = sim_time - etime; } + //Perform a single time step + perform_timestep(real3d_span{state,NUM_VARS,nz+2*hs,nx+2*hs},real3d_span{state_tmp,NUM_VARS,nz+2*hs,nx+2*hs},real3d_span{flux,NUM_VARS, (nz+1) ,(nx+1)},real3d_span{tend,NUM_VARS,nz,nx},dt); + //Inform the user +#ifndef NO_INFORM + if (mainproc) { printf( "Elapsed Time: %lf / %lf\n", etime , sim_time ); } +#endif + //Update the elapsed time and output counter + etime = etime + dt; + output_counter = output_counter + dt; + //If it's time for output, reset the counter, and do output + if (output_counter >= output_freq) { + output_counter = output_counter - output_freq; + output(state,etime); + } + } + auto t2 = std::chrono::steady_clock::now(); + if (mainproc) { + std::cout << "CPU Time: " << std::chrono::duration(t2-t1).count() << " sec\n"; + } + + //Final reductions for mass, kinetic energy, and total energy + reductions(mass,te); + + if (mainproc) { + printf( "d_mass: %le\n" , (mass - mass0)/mass0 ); + printf( "d_te: %le\n" , (te - te0 )/te0 ); + } + + finalize(); +} + + +//Performs a single dimensionally split time step using a simple low-storage three-stage Runge-Kutta time integrator +//The dimensional splitting is a second-order-accurate alternating Strang splitting in which the +//order of directions is alternated each time step. +//The Runge-Kutta method used here is defined as follows: +// q* = q[n] + dt/3 * rhs(q[n]) +// q** = q[n] + dt/2 * rhs(q* ) +// q[n+1] = q[n] + dt/1 * rhs(q** ) +void perform_timestep( real3d_span state , real3d_span state_tmp , real3d_span flux , real3d_span tend , double dt ) { + if (direction_switch) { + //x-direction first + semi_discrete_step( state, state , state_tmp, dt / 3 , DIR_X , flux , tend ); + semi_discrete_step( state, state_tmp, state_tmp, dt / 2 , DIR_X , flux , tend ); + semi_discrete_step( state, state_tmp, state , dt / 1 , DIR_X , flux , tend ); + //z-direction second + semi_discrete_step( state, state , state_tmp, dt / 3 , DIR_Z , flux , tend ); + semi_discrete_step( state, state_tmp, state_tmp, dt / 2 , DIR_Z , flux , tend ); + semi_discrete_step( state, state_tmp, state , dt / 1 , DIR_Z , flux , tend ); + } else { + //z-direction second + semi_discrete_step( state, state , state_tmp, dt / 3 , DIR_Z , flux , tend ); + semi_discrete_step( state, state_tmp, state_tmp, dt / 2 , DIR_Z , flux , tend ); + semi_discrete_step( state, state_tmp, state , dt / 1 , DIR_Z , flux , tend ); + //x-direction first + semi_discrete_step( state, state , state_tmp, dt / 3 , DIR_X , flux , tend ); + semi_discrete_step( state, state_tmp, state_tmp, dt / 2 , DIR_X , flux , tend ); + semi_discrete_step( state, state_tmp, state , dt / 1 , DIR_X , flux , tend ); + } + if (direction_switch) { direction_switch = 0; } else { direction_switch = 1; } +} + + +//Perform a single semi-discretized step in time with the form: +//state_out = state_init + dt * rhs(state_forcing) +//Meaning the step starts from state_init, computes the rhs using state_forcing, and stores the result in state_out +void semi_discrete_step( real3d_span state_init , real3d_span state_forcing , real3d_span state_out , double dt , int dir , real3d_span flux , real3d_span tend ) { + if (dir == DIR_X) { + //Set the halo values for this MPI task's fluid state in the x-direction + set_halo_values_x(state_forcing); + //Compute the time tendencies for the fluid state in the x-direction + compute_tendencies_x(state_forcing,flux,tend,dt); + } else if (dir == DIR_Z) { + //Set the halo values for this MPI task's fluid state in the z-direction + set_halo_values_z(state_forcing); + //Compute the time tendencies for the fluid state in the z-direction + compute_tendencies_z(state_forcing,flux,tend,dt); + } + + ///////////////////////////////////////////////// + // TODO: THREAD ME + ///////////////////////////////////////////////// + //Apply the tendencies to the fluid state + std::for_each_n(std::execution::par_unseq,std::views::iota(0).begin(),(NUM_VARS*nz*nx),[=,nx=nx,nz=nz,hy_dens_cell=hy_dens_cell,i_beg=i_beg,k_beg=k_beg](int idx){ + auto [i,k,ll] = idx3d(idx,nx,nz); + if (data_spec_int == DATA_SPEC_GRAVITY_WAVES) { + double x = (i_beg + i+0.5)*dx; + double z = (k_beg + k+0.5)*dz; + double wpert; + // Using sample_ellipse_cosine requires "acc routine" in OpenACC and "declare target" in OpenMP offload + // Neither of these are particularly well supported. So I'm manually inlining here + // wpert = sample_ellipse_cosine( x,z , 0.01 , xlen/8,1000., 500.,500. ); + { + double x0 = xlen/8; + double z0 = 1000; + double xrad = 500; + double zrad = 500; + double amp = 0.01; + //Compute distance from bubble center + double dist = sqrt( ((x-x0)/xrad)*((x-x0)/xrad) + ((z-z0)/zrad)*((z-z0)/zrad) ) * pi / 2.; + //If the distance from bubble center is less than the radius, create a cos**2 profile + if (dist <= pi / 2.) { + wpert = amp * pow(cos(dist),2.); + } else { + wpert = 0.; + } + } + tend(ID_WMOM,k,i) += wpert*hy_dens_cell[hs+k]; + } + state_out(ll, (k+hs), (i+hs)) = state_init(ll, (k+hs), (i+hs)) + dt * tend(ll,k,i); + }); +} + + +//Compute the time tendencies of the fluid state using forcing in the x-direction +//Since the halos are set in a separate routine, this will not require MPI +//First, compute the flux vector at each cell interface in the x-direction (including hyperviscosity) +//Then, compute the tendencies using those fluxes +void compute_tendencies_x( real3d_span state , real3d_span flux , real3d_span tend , double dt) { + double hv_coef; + //Compute the hyperviscosity coefficient + hv_coef = -hv_beta * dx / (16*dt); + ///////////////////////////////////////////////// + // TODO: THREAD ME + ///////////////////////////////////////////////// + //Compute fluxes in the x-direction for each cell + std::for_each_n(std::execution::par_unseq,std::views::iota(0).begin(),(nz*(nx+1)),[=,nx=nx,nz=nz,hy_dens_cell=hy_dens_cell,hy_dens_theta_cell=hy_dens_theta_cell](int idx){ + auto [i,k] = idx2d(idx,nx+1); + double stencil[4], d3_vals[NUM_VARS],vals[NUM_VARS]; + //Use fourth-order interpolation from four cell averages to compute the value at the interface in question + for (int ll=0; ll Date: Thu, 18 Apr 2024 14:32:14 -0400 Subject: [PATCH 8/9] Updated to use mdspan for MPI too --- cpp/miniWeather_mpi_for_each.cpp | 13 +++++++++---- cpp/miniWeather_mpi_for_each_n.cpp | 13 +++++++++---- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/cpp/miniWeather_mpi_for_each.cpp b/cpp/miniWeather_mpi_for_each.cpp index 245cd4f3..a1d10101 100644 --- a/cpp/miniWeather_mpi_for_each.cpp +++ b/cpp/miniWeather_mpi_for_each.cpp @@ -405,6 +405,11 @@ void compute_tendencies_z( real3d_span state , real3d_span flux , real3d_span te void set_halo_values_x( real3d_span state ) { int ierr; + auto sendspan_l = real3d_span(sendbuf_l,NUM_VARS,nz,hs); + auto sendspan_r = real3d_span(sendbuf_r,NUM_VARS,nz,hs); + auto recvspan_l = real3d_span(recvbuf_l,NUM_VARS,nz,hs); + auto recvspan_r = real3d_span(recvbuf_r,NUM_VARS,nz,hs); + if (nranks == 1) { auto bounds1 = std::views::iota(0,(int)(NUM_VARS*nz)); @@ -428,8 +433,8 @@ void set_halo_values_x( real3d_span state ) { auto bounds1 = std::views::iota(0,(int)(NUM_VARS*nz*hs)); std::for_each(std::execution::par_unseq,bounds1.begin(),bounds1.end(),[=,nx=nx,nz=nz,sendbuf_l=sendbuf_l,sendbuf_r=sendbuf_r](int idx){ auto [s,k,ll] = idx3d(idx,hs,nz); - sendbuf_l[ll*nz*hs + k*hs + s] = state(ll, (k+hs), hs+s); - sendbuf_r[ll*nz*hs + k*hs + s] = state(ll, (k+hs), nx+s); + sendspan_l(ll, k, s) = state(ll, (k+hs), hs+s); + sendspan_r(ll, k, s) = state(ll, (k+hs), nx+s); }); //Fire off the sends @@ -442,8 +447,8 @@ void set_halo_values_x( real3d_span state ) { //Unpack the receive buffers std::for_each(std::execution::par_unseq,bounds1.begin(),bounds1.end(),[=,nx=nx,nz=nz,recvbuf_l=recvbuf_l,recvbuf_r=recvbuf_r](int idx){ auto [s,k,ll] = idx3d(idx,hs,nz); - state(ll, (k+hs), s ) = recvbuf_l[ll*nz*hs + k*hs + s]; - state(ll, (k+hs), nx+hs+s) = recvbuf_r[ll*nz*hs + k*hs + s]; + state(ll, (k+hs), s ) = recvspan_l(ll, k, s); + state(ll, (k+hs), nx+hs+s) = recvspan_r(ll, k, s); }); //Wait for sends to finish diff --git a/cpp/miniWeather_mpi_for_each_n.cpp b/cpp/miniWeather_mpi_for_each_n.cpp index d401aaf9..1950a1c7 100644 --- a/cpp/miniWeather_mpi_for_each_n.cpp +++ b/cpp/miniWeather_mpi_for_each_n.cpp @@ -400,6 +400,11 @@ void compute_tendencies_z( real3d_span state , real3d_span flux , real3d_span te void set_halo_values_x( real3d_span state ) { int ierr; + auto sendspan_l = real3d_span(sendbuf_l,NUM_VARS,nz,hs); + auto sendspan_r = real3d_span(sendbuf_r,NUM_VARS,nz,hs); + auto recvspan_l = real3d_span(recvbuf_l,NUM_VARS,nz,hs); + auto recvspan_r = real3d_span(recvbuf_r,NUM_VARS,nz,hs); + if (nranks == 1) { std::for_each_n(std::execution::par_unseq,std::views::iota(0).begin(),(NUM_VARS*nz),[=,nx=nx,nz=nz](int idx){ @@ -421,8 +426,8 @@ void set_halo_values_x( real3d_span state ) { //Pack the send buffers std::for_each_n(std::execution::par_unseq,std::views::iota(0).begin(),(NUM_VARS*nz*hs),[=,nx=nx,nz=nz,sendbuf_l=sendbuf_l,sendbuf_r=sendbuf_r](int idx){ auto [s,k,ll] = idx3d(idx,hs,nz); - sendbuf_l[ll*nz*hs + k*hs + s] = state(ll, (k+hs), hs+s); - sendbuf_r[ll*nz*hs + k*hs + s] = state(ll, (k+hs), nx+s); + sendspan_l(ll, k, s) = state(ll, (k+hs), hs+s); + sendspan_r(ll, k, s) = state(ll, (k+hs), nx+s); }); //Fire off the sends @@ -435,8 +440,8 @@ void set_halo_values_x( real3d_span state ) { //Unpack the receive buffers std::for_each_n(std::execution::par_unseq,std::views::iota(0).begin(),(NUM_VARS*nz*hs),[=,nx=nx,nz=nz,recvbuf_l=recvbuf_l,recvbuf_r=recvbuf_r](int idx){ auto [s,k,ll] = idx3d(idx,hs,nz); - state(ll, (k+hs), s ) = recvbuf_l[ll*nz*hs + k*hs + s]; - state(ll, (k+hs), nx+hs+s) = recvbuf_r[ll*nz*hs + k*hs + s]; + state(ll, (k+hs), s ) = recvspan_l(ll, k, s); + state(ll, (k+hs), nx+hs+s) = recvspan_r(ll, k, s); }); //Wait for sends to finish From a3257902f4e4667917cf3d12f2bba8dbdc1bea03 Mon Sep 17 00:00:00 2001 From: Jeff Larkin Date: Thu, 18 Apr 2024 17:21:58 -0400 Subject: [PATCH 9/9] Code clean-up to help with diffing --- c/miniWeather_mpi_for_each.cpp | 44 ++++++++++----------- c/miniWeather_mpi_for_each_n.cpp | 44 ++++++++++----------- c/miniWeather_serial_for_each.cpp | 52 ++++++++++++------------ c/miniWeather_serial_for_each_n.cpp | 2 +- cpp/miniWeather_mpi_for_each.cpp | 57 ++++++++++++--------------- cpp/miniWeather_mpi_for_each_n.cpp | 57 ++++++++++++--------------- cpp/miniWeather_serial_for_each.cpp | 57 ++++++++++++--------------- cpp/miniWeather_serial_for_each_n.cpp | 57 ++++++++++++--------------- 8 files changed, 175 insertions(+), 195 deletions(-) diff --git a/c/miniWeather_mpi_for_each.cpp b/c/miniWeather_mpi_for_each.cpp index 44711189..39d3e904 100644 --- a/c/miniWeather_mpi_for_each.cpp +++ b/c/miniWeather_mpi_for_each.cpp @@ -293,30 +293,30 @@ void compute_tendencies_x( double *state , double *flux , double *tend , double std::for_each(std::execution::par_unseq,bounds1.begin(),bounds1.end(),[=,nx=nx,nz=nz,hy_dens_cell=hy_dens_cell,hy_dens_theta_cell=hy_dens_theta_cell](int idx){ auto [i,k] = idx2d(idx,nx+1); double stencil[4], d3_vals[NUM_VARS],vals[NUM_VARS]; - //Use fourth-order interpolation from four cell averages to compute the value at the interface in question - for (int ll=0; ll #include -#include +#include namespace stdex = std::experimental; template -using array_view = stdex::mdspan>; -using real_1d_array_view = array_view; -using real_2d_array_view = array_view; -using real3d_span = array_view; -using const_real_1d_array_view = array_view; -using const_real_2d_array_view = array_view; -using const_real3d_span = array_view; +using ndim_span = stdex::mdspan>; +using real3d_span = ndim_span; // These should become unnecessary with the addition of cartesian_product constexpr std::tuple idx2d(int idx, int nx) { return {idx%nx, idx/nx}; } @@ -302,29 +297,29 @@ void compute_tendencies_x( real3d_span state , real3d_span flux , real3d_span te std::for_each(std::execution::par_unseq,bounds1.begin(),bounds1.end(),[=,nx=nx,nz=nz,hy_dens_cell=hy_dens_cell,hy_dens_theta_cell=hy_dens_theta_cell](int idx){ auto [i,k] = idx2d(idx,nx+1); double stencil[4], d3_vals[NUM_VARS],vals[NUM_VARS]; - //Use fourth-order interpolation from four cell averages to compute the value at the interface in question - for (int ll=0; ll #include -#include +#include namespace stdex = std::experimental; template -using array_view = stdex::mdspan>; -using real_1d_array_view = array_view; -using real_2d_array_view = array_view; -using real3d_span = array_view; -using const_real_1d_array_view = array_view; -using const_real_2d_array_view = array_view; -using const_real3d_span = array_view; +using ndim_span = stdex::mdspan>; +using real3d_span = ndim_span; // These should become unnecessary with the addition of cartesian_product constexpr std::tuple idx2d(int idx, int nx) { return {idx%nx, idx/nx}; } @@ -300,29 +295,29 @@ void compute_tendencies_x( real3d_span state , real3d_span flux , real3d_span te std::for_each_n(std::execution::par_unseq,std::views::iota(0).begin(),(nz*(nx+1)),[=,nx=nx,nz=nz,hy_dens_cell=hy_dens_cell,hy_dens_theta_cell=hy_dens_theta_cell](int idx){ auto [i,k] = idx2d(idx,nx+1); double stencil[4], d3_vals[NUM_VARS],vals[NUM_VARS]; - //Use fourth-order interpolation from four cell averages to compute the value at the interface in question - for (int ll=0; ll #include -#include +#include namespace stdex = std::experimental; template -using array_view = stdex::mdspan>; -using real_1d_array_view = array_view; -using real_2d_array_view = array_view; -using real3d_span = array_view; -using const_real_1d_array_view = array_view; -using const_real_2d_array_view = array_view; -using const_real3d_span = array_view; +using ndim_span = stdex::mdspan>; +using real3d_span = ndim_span; // These should become unnecessary with the addition of cartesian_product constexpr std::tuple idx2d(int idx, int nx) { return {idx%nx, idx/nx}; } @@ -298,29 +293,29 @@ void compute_tendencies_x( real3d_span state , real3d_span flux , real3d_span te std::for_each(std::execution::par_unseq,bounds1.begin(),bounds1.end(),[=,nx=nx,nz=nz,hy_dens_cell=hy_dens_cell,hy_dens_theta_cell=hy_dens_theta_cell](int idx){ auto [i,k] = idx2d(idx,nx+1); double stencil[4], d3_vals[NUM_VARS],vals[NUM_VARS]; - //Use fourth-order interpolation from four cell averages to compute the value at the interface in question - for (int ll=0; ll #include -#include +#include namespace stdex = std::experimental; template -using array_view = stdex::mdspan>; -using real_1d_array_view = array_view; -using real_2d_array_view = array_view; -using real3d_span = array_view; -using const_real_1d_array_view = array_view; -using const_real_2d_array_view = array_view; -using const_real3d_span = array_view; +using ndim_span = stdex::mdspan>; +using real3d_span = ndim_span; // These should become unnecessary with the addition of cartesian_product constexpr std::tuple idx2d(int idx, int nx) { return {idx%nx, idx/nx}; } @@ -296,29 +291,29 @@ void compute_tendencies_x( real3d_span state , real3d_span flux , real3d_span te std::for_each_n(std::execution::par_unseq,std::views::iota(0).begin(),(nz*(nx+1)),[=,nx=nx,nz=nz,hy_dens_cell=hy_dens_cell,hy_dens_theta_cell=hy_dens_theta_cell](int idx){ auto [i,k] = idx2d(idx,nx+1); double stencil[4], d3_vals[NUM_VARS],vals[NUM_VARS]; - //Use fourth-order interpolation from four cell averages to compute the value at the interface in question - for (int ll=0; ll