diff --git a/cudastf/CMakeLists.txt b/cudastf/CMakeLists.txt new file mode 100644 index 0000000..1cae7fa --- /dev/null +++ b/cudastf/CMakeLists.txt @@ -0,0 +1,30 @@ +cmake_minimum_required(VERSION 3.14 FATAL_ERROR) +project(miniWeather CUDA CXX) + +include(cmake/CPM.cmake) + +# This will automatically clone CCCL from GitHub and make the exported cmake targets available +CPMAddPackage( + NAME CCCL + GITHUB_REPOSITORY nvidia/cccl + GIT_TAG main + # The following is required to make the `CCCL::cudax` target available: + OPTIONS "CCCL_ENABLE_UNSTABLE ON" +) + +# Default to building for the GPU on the current system +if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + set(CMAKE_CUDA_ARCHITECTURES native) +endif() + +# If you're building an executable +add_executable(miniWeather_cudastf miniWeather_cudastf.cu) + +target_link_libraries(miniWeather_cudastf PUBLIC cuda) + +if (CMAKE_CUDA_COMPILER) + target_compile_options(miniWeather_cudastf PUBLIC $<$:--expt-relaxed-constexpr>) + target_compile_options(miniWeather_cudastf PUBLIC $<$:--extended-lambda>) +endif() + +target_link_libraries(miniWeather_cudastf PRIVATE CCCL::CCCL CCCL::cudax) diff --git a/cudastf/README.md b/cudastf/README.md new file mode 100644 index 0000000..1b9191c --- /dev/null +++ b/cudastf/README.md @@ -0,0 +1,14 @@ +# CUDASTF port of the miniWeather benchmark + +CUDASTF is a task-based programming model implemented as a header-only C++ +library. It is shipped as part of NVIDIA's CCCL project +. + +After identifying physical fields as `logical data`, and expressing +computations as tasks which operate on these fields, CUDASTF automatically +manages data, and infers concurrency opportunities. + +This example illustrates how to automatically generate CUDA kernels based on +CUDASTF's `ctx.parallel_for`. It also demonstrates how these constructs can be +seamlessly spread over multiple GPUs, and how to leverage CUDA graphs to +improve performance for small problem sizes. diff --git a/cudastf/cmake/CPM.cmake b/cudastf/cmake/CPM.cmake new file mode 100644 index 0000000..baf2d8c --- /dev/null +++ b/cudastf/cmake/CPM.cmake @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: MIT +# +# SPDX-FileCopyrightText: Copyright (c) 2019-2023 Lars Melchior and contributors + +set(CPM_DOWNLOAD_VERSION 0.40.2) +set(CPM_HASH_SUM "c8cdc32c03816538ce22781ed72964dc864b2a34a310d3b7104812a5ca2d835d") + +if(CPM_SOURCE_CACHE) + set(CPM_DOWNLOAD_LOCATION "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake") +elseif(DEFINED ENV{CPM_SOURCE_CACHE}) + set(CPM_DOWNLOAD_LOCATION "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake") +else() + set(CPM_DOWNLOAD_LOCATION "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake") +endif() + +# Expand relative path. This is important if the provided path contains a tilde (~) +get_filename_component(CPM_DOWNLOAD_LOCATION ${CPM_DOWNLOAD_LOCATION} ABSOLUTE) + +file(DOWNLOAD + https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake + ${CPM_DOWNLOAD_LOCATION} EXPECTED_HASH SHA256=${CPM_HASH_SUM} +) + +include(${CPM_DOWNLOAD_LOCATION}) diff --git a/cudastf/miniWeather_cudastf.cu b/cudastf/miniWeather_cudastf.cu new file mode 100644 index 0000000..d8a7fa6 --- /dev/null +++ b/cudastf/miniWeather_cudastf.cu @@ -0,0 +1,846 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +////////////////////////////////////////////////////////////////////////////////////////// +// miniWeather +// Author: Matt Norman , Oak Ridge National Laboratory +// This code simulates dry, stratified, compressible, non-hydrostatic fluid flows +// For documentation, please see the attached documentation in the "documentation" folder +////////////////////////////////////////////////////////////////////////////////////////// + +/* +** Copyright (c) 2018, National Center for Computational Sciences, Oak Ridge National Laboratory. All rights reserved. +** +** Portions Copyright (c) 2020, NVIDIA Corporation. All rights reserved. +*/ + +/** + * @file + * @brief CUDASTF implementation of the ORNL's miniWeather CFD code + */ + +#include + +#include +#include +#include +#include + +#ifdef HAVE_NETCDF +# include +#endif + +using namespace cuda::experimental::stf; + +using policy = blocked_partition_custom<1>; + +struct state_t { + state_t(context& ctx, size_t nx, size_t nz, size_t hs, size_t NUM_VARS) { + l = ctx.logical_data(nx + 2 * hs, nz + 2 * hs, NUM_VARS); + } + + // double *vals; + logical_data> l; +}; + +struct tend_t { + tend_t(context& ctx, size_t nx, size_t nz, size_t NUM_VARS) { + l = ctx.logical_data(nx, nz, NUM_VARS); + l.set_symbol("tend"); + } + + logical_data> l; +}; + +const double grav = 9.8; // Gravitational acceleration (m / s^2) +const double cp = 1004.; // Specific heat of dry air at constant pressure +const double rd = 287.; // Dry air constant for equation of state (P=rho*rd*T) +const double p0 = 1.e5; // Standard pressure at the surface in Pascals +const double C0 = 27.5629410929725921310572974482; // Constant to translate potential temperature into pressure + // (P=C0*(rho*theta)**gamma) +const double gamm = 1.40027894002789400278940027894; // gamma=cp/Rd , have to call this gamm because "gamma" is taken + // (I hate C so much) +// Define domain and stability-related constants +const double xlen = 2.e4; // Length of the domain in the x-direction (meters) +const double zlen = 1.e4; // Length of the domain in the z-direction (meters) +const double hv_beta = 0.25; // How strong to diffuse the solution: hv_beta \in [0:1] +const double cfl = 1.50; //"Courant, Friedrichs, Lewy" number (for numerical stability) +const double max_speed = + 450; // Assumed maximum wave speed during the simulation (speed of sound + speed of wind) (meter / sec) +const int hs = 2; //"Halo" size: number of cells needed for a full "stencil" of information for reconstruction +const int sten_size = 4; // Size of the stencil used for interpolation + +// Parameters for indexing and flags +const int NUM_VARS = 4; // Number of fluid state variables +const int ID_DENS = 0; // index for density ("rho") +const int ID_UMOM = 1; // index for momentum in the x-direction ("rho * u") +const int ID_WMOM = 2; // index for momentum in the z-direction ("rho * w") +const int ID_RHOT = 3; // index for density * potential temperature ("rho * theta") +const int DIR_X = 1; // Integer constant to express that this operation is in the x-direction +const int DIR_Z = 2; // Integer constant to express that this operation is in the z-direction + +/////////////////////////////////////////////////////////////////////////////////////// +// Variables that are initialized but remain static over the course of the simulation +/////////////////////////////////////////////////////////////////////////////////////// +double sim_time; // total simulation time in seconds +double output_freq; // frequency to perform output in seconds +double dt; // Model time step (seconds) +int nx, nz; // Number of local grid cells in the x- and z- dimensions +double dx, dz; // Grid space length in x- and z-dimension (meters) +int nx_glob, nz_glob; // Number of total grid cells in the x- and z- dimensions +int i_beg, k_beg; // beginning index in the x- and z-directions +int nranks, myrank; // my rank id +int left_rank, right_rank; // Rank IDs that exist to my left and right in the global domain + +struct boundaries_t { + boundaries_t(context& ctx, size_t nz, size_t hs) { + lhy_dens_cell = ctx.logical_data(nz + 2 + hs); + lhy_dens_theta_cell = ctx.logical_data(nz + 2 + hs); + lhy_dens_int = ctx.logical_data(nz + 1); + lhy_dens_theta_int = ctx.logical_data(nz + 1); + lhy_pressure_int = ctx.logical_data(nz + 1); + + lhy_dens_cell.set_symbol("hy_dens_cell"); + lhy_dens_theta_cell.set_symbol("hy_dens_theta_cell"); + lhy_dens_int.set_symbol("hy_dens_int"); + lhy_dens_theta_int.set_symbol("hy_dens_theta_int"); + lhy_pressure_int.set_symbol("hy_pressure_int"); + } + + logical_data> lhy_dens_cell; // hydrostatic density (vert cell avgs). Dimensions: (1-hs:nz+hs) + logical_data> lhy_dens_theta_cell; // hydrostatic rho*t (vert cell avgs). Dimensions: + // (1-hs:nz+hs) + logical_data> lhy_dens_int; // hydrostatic density (vert cell interf). Dimensions: (1:nz+1) + logical_data> lhy_dens_theta_int; // hydrostatic rho*t (vert cell interf). Dimensions: (1:nz+1) + logical_data> lhy_pressure_int; // hydrostatic press (vert cell interf). Dimensions: (1:nz+1) + + frozen_logical_data> flhy_dens_cell; + frozen_logical_data> flhy_dens_theta_cell; + frozen_logical_data> flhy_dens_int; + frozen_logical_data> flhy_dens_theta_int; + frozen_logical_data> flhy_pressure_int; + + slice hy_dens_cell; + slice hy_dens_theta_cell; + slice hy_dens_int; + slice hy_dens_theta_int; + slice hy_pressure_int; +}; + +/////////////////////////////////////////////////////////////////////////////////////// +// Variables that are dynamics over the course of the simulation +/////////////////////////////////////////////////////////////////////////////////////// +double etime; // Elapsed model time +double output_counter; // Helps determine when it's time to do output +int num_out = 0; // The number of outputs performed so far +int direction_switch = 1; + +// Declaring the functions defined after "main" +void init(exec_place& where, context& ctx, state_t& state, state_t& state_tmp, boundaries_t& b); + +__host__ __device__ void injection( + double x, double z, double& r, double& u, double& w, double& t, double& hr, double& ht); +__host__ __device__ void hydro_const_theta(double z, double& r, double& t); + +void perform_timestep(exec_place& where, context& ctx, state_t& state, state_t& state_tmp, boundaries_t& b, double dt); + +void semi_discrete_step(exec_place& where, context& ctx, state_t& state_init, state_t& state_forcing, + state_t& state_out, boundaries_t& b, double dt, int dir); + +void compute_tendencies_x(exec_place& where, context& ctx, state_t& state, tend_t& tend, boundaries_t& b); + +void compute_tendencies_z(exec_place& where, context& ctx, state_t& state, tend_t& tend, boundaries_t& b); + +void set_halo_values_x(exec_place& where, context& ctx, state_t& state, boundaries_t& b); + +void set_halo_values_z(exec_place& where, context& ctx, state_t& state); + +#ifdef HAVE_NETCDF +void output(context& ctx, state_t& state, boundaries_t& b, double etime); +void ncwrap(int ierr, int line); +#endif // HAVE_NETCDF + +void simulation(context& ctx, exec_place where) { + // printf("Using ctx %s on %s\n", ctx.to_string().c_str(), where.to_string().c_str()); + + double exe_time; + struct timeval stop_time, start_time; + + // Runtime variable arrays + state_t state(ctx, nx_glob, nz_glob, hs, + NUM_VARS); // Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) + state_t state_tmp(ctx, nx_glob, nz_glob, hs, + NUM_VARS); // Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) + + state.l.set_symbol("state"); + state_tmp.l.set_symbol("state_tmp"); + + boundaries_t b(ctx, nz_glob, hs); + + init(where, ctx, state, state_tmp, b); + +#ifdef HAVE_NETCDF + // Output the initial state + output(ctx, state, b, etime); +#endif + + gettimeofday(&start_time, NULL); + + ctx.repeat([&]() { return etime < sim_time; })->*[&](context ctx, size_t) { + // If the time step leads to exceeding the simulation time, shorten it for the last step + if (etime + dt > sim_time) { + dt = sim_time - etime; + } + + // Perform a single time step + perform_timestep(where, ctx, state, state_tmp, b, dt); + + // Inform the user + fprintf(stderr, "Elapsed Time: %lf / %lf\n", etime, sim_time); + + // Update the elapsed time and output counter + etime = etime + dt; + +#ifdef HAVE_NETCDF + output_counter = output_counter + dt; + // If it's time for output, reset the counter, and do output + + if (output_counter >= output_freq) { + output_counter = output_counter - output_freq; + output(ctx, state, b, etime); + } +#endif + }; + + ctx.finalize(); + + gettimeofday(&stop_time, NULL); + exe_time = (stop_time.tv_sec + (stop_time.tv_usec / 1000000.0)) - + (start_time.tv_sec + (start_time.tv_usec / 1000000.0)); + + printf("Complete. Execution time is = %lf seconds\n", exe_time); +} + +/////////////////////////////////////////////////////////////////////////////////////// +// THE MAIN PROGRAM STARTS HERE +/////////////////////////////////////////////////////////////////////////////////////// +int main(int argc, char** argv) { + /////////////////////////////////////////////////////////////////////////////////////// + // BEGIN USER-CONFIGURABLE PARAMETERS + /////////////////////////////////////////////////////////////////////////////////////// + // The x-direction length is twice as long as the z-direction length + // So, you'll want to have nx_glob be twice as large as nz_glob + nx_glob = 400; // Number of total cells in the x-dirction + nz_glob = 200; // Number of total cells in the z-dirction + sim_time = 2; // How many seconds to run the simulation + output_freq = 10; // How frequently to output data to file (in seconds) + /////////////////////////////////////////////////////////////////////////////////////// + // END USER-CONFIGURABLE PARAMETERS + /////////////////////////////////////////////////////////////////////////////////////// + + if (argc >= 4) { + fprintf(stdout, "The arguments supplied are %s %s %s\n", argv[1], argv[2], argv[3]); + nx_glob = atoi(argv[1]); + nz_glob = atoi(argv[2]); + sim_time = atoi(argv[3]); + } else { + // printf("Using default values ...\n"); + } + + exec_place where = exec_place::current_device(); + + if (argc >= 5) { + int use_gpu = atoi(argv[4]); + switch (use_gpu) { + case 0: where = exec_place::host(); break; + case 1: where = exec_place::current_device(); break; + case 2: where = exec_place::all_devices(); break; + case 3: where = exec_place::repeat(exec_place::current_device(), 8); break; + default: abort(); + } + } + + context ctx; + int use_graph = 0; + if (argc >= 6) { + use_graph = atoi(argv[5]); + if (use_graph) { + ctx = graph_ctx(); + } + } + // fprintf(stderr, "Using %s backend.\n", use_graph ? "graph" : "stream"); + + simulation(ctx, where); +} + +// Performs a single dimensionally split time step using a simple low-storate three-stage Runge-Kutta time integrator +// The dimensional splitting is a second-order-accurate alternating Strang splitting in which the +// order of directions is alternated each time step. +// The Runge-Kutta method used here is defined as follows: +// q* = q[n] + dt/3 * rhs(q[n]) +// q** = q[n] + dt/2 * rhs(q* ) +// q[n+1] = q[n] + dt/1 * rhs(q** ) +void perform_timestep(exec_place& where, context& ctx, state_t& state, state_t& state_tmp, boundaries_t& b, double dt) { + if (direction_switch) { + // x-direction first + semi_discrete_step(where, ctx, state, state, state_tmp, b, dt / 3, DIR_X); + semi_discrete_step(where, ctx, state, state_tmp, state_tmp, b, dt / 2, DIR_X); + semi_discrete_step(where, ctx, state, state_tmp, state, b, dt / 1, DIR_X); + // z-direction second + semi_discrete_step(where, ctx, state, state, state_tmp, b, dt / 3, DIR_Z); + semi_discrete_step(where, ctx, state, state_tmp, state_tmp, b, dt / 2, DIR_Z); + semi_discrete_step(where, ctx, state, state_tmp, state, b, dt / 1, DIR_Z); + } else { + // z-direction second + semi_discrete_step(where, ctx, state, state, state_tmp, b, dt / 3, DIR_Z); + semi_discrete_step(where, ctx, state, state_tmp, state_tmp, b, dt / 2, DIR_Z); + semi_discrete_step(where, ctx, state, state_tmp, state, b, dt / 1, DIR_Z); + // x-direction first + semi_discrete_step(where, ctx, state, state, state_tmp, b, dt / 3, DIR_X); + semi_discrete_step(where, ctx, state, state_tmp, state_tmp, b, dt / 2, DIR_X); + semi_discrete_step(where, ctx, state, state_tmp, state, b, dt / 1, DIR_X); + } + if (direction_switch) { + direction_switch = 0; + } else { + direction_switch = 1; + } +} + +// Perform a single semi-discretized step in time with the form: +// state_out = state_init + dt * rhs(state_forcing) +// Meaning the step starts from state_init, computes the rhs using state_forcing, and stores the result in state_out +void semi_discrete_step(exec_place& where, context& ctx, state_t& state_init, state_t& state_forcing, + state_t& state_out, boundaries_t& b, double dt, int dir) { + // Fluid state tendencies. Dimensions: (nx,nz,NUM_VARS) + tend_t tend(ctx, nx, nz, NUM_VARS); + + if (dir == DIR_X) { + // Set the halo values in the x-direction + set_halo_values_x(where, ctx, state_forcing, b); + // Compute the time tendencies for the fluid state in the x-direction + compute_tendencies_x(where, ctx, state_forcing, tend, b); + } else if (dir == DIR_Z) { + // Set the halo values in the z-direction + set_halo_values_z(where, ctx, state_forcing); + // Compute the time tendencies for the fluid state in the z-direction + compute_tendencies_z(where, ctx, state_forcing, tend, b); + } + + // Apply the tendencies to the fluid state + ctx.parallel_for(policy(), where, tend.l.shape(), state_out.l.write(), state_init.l.read(), tend.l.read()) + .set_symbol("apply tend") + ->*[=] __host__ __device__(size_t i, size_t k, size_t ll, slice dstate_out, + slice dstate_init, slice dtend) { + dstate_out(i + hs, k + hs, ll) = dstate_init(i + hs, k + hs, ll) + dt * dtend(i, k, ll); + }; +} + +// Compute the time tendencies of the fluid state using forcing in the x-direction + +// First, compute the flux vector at each cell interface in the x-direction (including hyperviscosity) +// Then, compute the tendencies using those fluxes +void compute_tendencies_x(exec_place& where, context& ctx, state_t& state, tend_t& tend, boundaries_t& b) { + // Cell interface fluxes. Dimensions: (nx+1,nz+1,NUM_VARS) + // double *flux = (double *)malloc((nx + 1) * (nz + 1) * NUM_VARS * sizeof(double)); + + double dx_ = dx; + auto hy_dens_cell = b.hy_dens_cell; + auto hy_dens_theta_cell = b.hy_dens_theta_cell; + + auto lflux = ctx.logical_data(nx + 1, nz, NUM_VARS); + lflux.set_symbol("flux_x"); + + // int i, k, ll, s, inds, indf1, indf2, indt; + // Compute the hyperviscosity coeficient + double hv_coef = -hv_beta * dx / (16 * dt); + // Compute fluxes in the x-direction for each cell + ctx.parallel_for(policy(), where, box(nx + 1, nz), state.l.read(), lflux.write()) + .set_symbol("comp_tend_x") + ->* + [=] __host__ __device__(size_t i, size_t k, slice dstate, slice dflux) + { + double d3_vals[NUM_VARS], vals[NUM_VARS]; + // Use fourth-order interpolation from four cell averages to compute the value at the interface in + // question + for (size_t ll = 0; ll < NUM_VARS; ll++) { + double stencil[4]; + for (size_t s = 0; s < sten_size; s++) { + stencil[s] = dstate(i + s, k + hs, ll); + } + // Fourth-order-accurate interpolation of the state + vals[ll] = -stencil[0] / 12 + 7 * stencil[1] / 12 + 7 * stencil[2] / 12 - stencil[3] / 12; + // First-order-accurate interpolation of the third spatial derivative of the state (for artificial + // viscosity) + d3_vals[ll] = -stencil[0] + 3 * stencil[1] - 3 * stencil[2] + stencil[3]; + } + + // Compute density, u-wind, w-wind, potential temperature, and pressure (r,u,w,t,p respectively) + double r = vals[ID_DENS] + hy_dens_cell(k + hs); + double u = vals[ID_UMOM] / r; + double w = vals[ID_WMOM] / r; + double t = (vals[ID_RHOT] + hy_dens_theta_cell(k + hs)) / r; + double p = C0 * pow((r * t), gamm); + // fprintf(stderr, "FLUX P %e\n", p); + + // Compute the flux vector + dflux(i, k, ID_DENS) = r * u - hv_coef * d3_vals[ID_DENS]; + dflux(i, k, ID_UMOM) = r * u * u + p - hv_coef * d3_vals[ID_UMOM]; + dflux(i, k, ID_WMOM) = r * u * w - hv_coef * d3_vals[ID_WMOM]; + dflux(i, k, ID_RHOT) = r * u * t - hv_coef * d3_vals[ID_RHOT]; + }; + + // Use the fluxes to compute tendencies for each cell + ctx.parallel_for(policy(), where, tend.l.shape(), tend.l.write(), lflux.read()).set_symbol("update_tend_x") + ->* + [=] __host__ __device__(size_t i, size_t k, size_t ll, slice dtend, slice dflux) { + dtend(i, k, ll) = -(dflux(i + 1, k, ll) - dflux(i, k, ll)) / dx_; + }; +} + +// Compute the time tendencies of the fluid state using forcing in the z-direction + +// First, compute the flux vector at each cell interface in the z-direction (including hyperviscosity) +// Then, compute the tendencies using those fluxes +void compute_tendencies_z(exec_place& where, context& ctx, state_t& state, tend_t& tend, boundaries_t& b) { + double dz_ = dz; + + // Cell interface fluxes. Dimensions: (nx+1,nz+1,NUM_VARS) + auto lflux = ctx.logical_data(nx, nz + 1, NUM_VARS); + lflux.set_symbol("flux_z"); + + // Compute the hyperviscosity coeficient + double hv_coef = -hv_beta * dx / (16 * dt); + // Compute fluxes in the x-direction for each cell + + auto hy_dens_int = b.hy_dens_int; + auto hy_dens_theta_int = b.hy_dens_theta_int; + auto hy_pressure_int = b.hy_pressure_int; + + ctx.parallel_for(policy(), where, box(nx, nz + 1), state.l.read(), lflux.write()) + .set_symbol("comp_tend_z") + ->* + [=] __host__ __device__(size_t i, size_t k, slice dstate, slice dflux) { + double d3_vals[NUM_VARS], vals[NUM_VARS]; + // Use fourth-order interpolation from four cell averages to compute the value at the interface in + // question + for (size_t ll = 0; ll < NUM_VARS; ll++) { + double stencil[4]; + for (size_t s = 0; s < sten_size; s++) { + stencil[s] = dstate(i + hs, k + s, ll); + } + // Fourth-order-accurate interpolation of the state + vals[ll] = -stencil[0] / 12 + 7 * stencil[1] / 12 + 7 * stencil[2] / 12 - stencil[3] / 12; + // First-order-accurate interpolation of the third spatial derivative of the state + d3_vals[ll] = -stencil[0] + 3 * stencil[1] - 3 * stencil[2] + stencil[3]; + } + + // Compute density, u-wind, w-wind, potential temperature, and pressure (r,u,w,t,p respectively) + double r = vals[ID_DENS] + hy_dens_int(k); + double u = vals[ID_UMOM] / r; + double w = vals[ID_WMOM] / r; + double t = (vals[ID_RHOT] + hy_dens_theta_int(k)) / r; + double p = C0 * pow((r * t), gamm) - hy_pressure_int(k); + + // Compute the flux vector with hyperviscosity + dflux(i, k, ID_DENS) = r * w - hv_coef * d3_vals[ID_DENS]; + dflux(i, k, ID_UMOM) = r * w * u - hv_coef * d3_vals[ID_UMOM]; + dflux(i, k, ID_WMOM) = r * w * w + p - hv_coef * d3_vals[ID_WMOM]; + dflux(i, k, ID_RHOT) = r * w * t - hv_coef * d3_vals[ID_RHOT]; + }; + + // Use the fluxes to compute tendencies for each cell + ctx.parallel_for(policy(), where, tend.l.shape(), tend.l.write(), lflux.read(), state.l.read()) + .set_symbol("update_tend_z") + ->*[=] __host__ __device__(size_t i, size_t k, size_t ll, slice dtend, + slice dflux, slice dstate) { + dtend(i, k, ll) = -(dflux(i, k + 1, ll) - dflux(i, k, ll)) / dz_; + + if (ll == ID_WMOM) { + dtend(i, k, ll) -= dstate(i + hs, k + hs, ID_DENS); + } + }; +} + +void set_halo_values_x(exec_place& where, context& ctx, state_t& state, boundaries_t& b) { + // int k, ll, ind_r, ind_u, ind_t, i; + + double dz_ = dz; + int nx_ = nx; + int k_beg_ = k_beg; + + auto hy_dens_theta_cell = b.hy_dens_theta_cell; + auto hy_dens_cell = b.hy_dens_cell; + + ctx.parallel_for(policy(), where, box(nz, NUM_VARS), state.l.rw()).set_symbol("set halo x") + ->*[=] __host__ __device__(size_t k, size_t ll, slice dstate) { + dstate(0, k + hs, ll) = dstate(nx_ + hs - 2, k + hs, ll); + dstate(1, k + hs, ll) = dstate(nx_ + hs - 1, k + hs, ll); + dstate(nx_ + hs, k + hs, ll) = dstate(hs, k + hs, ll); + dstate(nx_ + hs + 1, k + hs, ll) = dstate(hs + 1, k + hs, ll); + }; + + if (myrank == 0) { + ctx.parallel_for( + policy(), where, box(nz, hs), state.l.rw()) + .set_symbol("set halo x(2)") + ->* + [=] __host__ __device__(size_t k, size_t i, slice dstate) { + double z = ((double) k_beg_ + (double) k + 0.5) * dz_; + if (fabs(z - 3.0 * zlen / 4.0) <= zlen / 16.0) { + dstate(i, k + hs, ID_UMOM) = (dstate(i, k + hs, ID_DENS) + hy_dens_cell(k + hs)) * 50.; + dstate(i, k + hs, ID_RHOT) = + (dstate(i, k + hs, ID_DENS) + hy_dens_cell(k + hs)) * 298. - hy_dens_theta_cell(k + hs); + } + }; + } +} + +// Set this task's halo values in the z-direction. +// decomposition in the vertical direction. +void set_halo_values_z(exec_place& where, context& ctx, state_t& state) { + // int i, ll; + // const double mnt_width = xlen / 8; + // double x, xloc, mnt_deriv; + + int nz_ = nz; + + ctx.parallel_for(policy(), where, box(nx + 2 * hs, NUM_VARS), state.l.rw()).set_symbol("set halo z") + ->*[=] __host__ __device__(size_t i, size_t ll, slice dstate) { + if (ll == ID_WMOM) { + dstate(i, 0, ll) = 0.; + dstate(i, 1, ll) = 0.; + dstate(i, nz_ + hs, ll) = 0.; + dstate(i, nz_ + hs + 1, ll) = 0.; + } else { + dstate(i, 0, ll) = dstate(i, hs, ll); + dstate(i, 1, ll) = dstate(i, hs, ll); + dstate(i, nz_ + hs, ll) = dstate(i, nz_ + hs - 1, ll); + dstate(i, nz_ + hs + 1, ll) = dstate(i, nz_ + hs - 1, ll); + } + }; +} + +void init(exec_place& where, context& ctx, state_t& state, state_t& state_tmp, boundaries_t& b) { + int i_end; + double nper; + + // Set the cell grid size + dx = xlen / nx_glob; + dz = zlen / nz_glob; + + nranks = 1; + myrank = 0; + + // For simpler version, replace i_beg = 0, nx = nx_glob, left_rank = 0, right_rank = 0; + + nper = ((double) nx_glob) / nranks; + i_beg = round(nper * (myrank)); + i_end = round(nper * ((myrank) + 1)) - 1; + nx = i_end - i_beg + 1; + left_rank = myrank - 1; + if (left_rank == -1) + left_rank = nranks - 1; + right_rank = myrank + 1; + if (right_rank == nranks) + right_rank = 0; + + //////////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////////// + // YOU DON'T NEED TO ALTER ANYTHING BELOW THIS POINT IN THE CODE + //////////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////////// + + k_beg = 0; + nz = nz_glob; + + // Define the maximum stable time step based on an assumed maximum wind speed + dt = std::min(dx, dz) / max_speed * cfl; + // Set initial elapsed model time and output_counter to zero + etime = 0.; + output_counter = 0.; + + // Display grid information + + printf("nx_glob, nz_glob: %d %d\n", nx_glob, nz_glob); + printf("dx,dz: %lf %lf\n", dx, dz); + printf("dt: %lf\n", dt); + + int nqpoints = 3; + double qpoints[3] = { 0.112701665379258311482073460022E0, 0.500000000000000000000000000000E0, + 0.887298334620741688517926539980E0 }; + double qweights[3] = { 0.277777777777777777777777777779E0, 0.444444444444444444444444444444E0, + 0.277777777777777777777777777779E0 }; + + auto lqweights = ctx.logical_data(qweights); + auto lqpoints = ctx.logical_data(qpoints); + + auto dx_ = dx; + auto dz_ = dz; + auto i_beg_ = i_beg; + auto k_beg_ = k_beg; + + ////////////////////////////////////////////////////////////////////////// + // Initialize the cell-averaged fluid state via Gauss-Legendre quadrature + ////////////////////////////////////////////////////////////////////////// + ctx.parallel_for(policy(), where, box(nx + 2 * hs, nz + 2 * hs), state.l.write(), lqweights.read(), lqpoints.read()) + .set_symbol("init_fluid_cells") + ->* + [=] __host__ __device__( + size_t i, size_t k, slice hstate, slice qweights, slice qpoints) { + // Initialize the state to zero + for (size_t ll = 0; ll < NUM_VARS; ll++) { + hstate(i, k, ll) = 0.; + } + // Use Gauss-Legendre quadrature to initialize a hydrostatic balance + temperature perturbation + for (size_t kk = 0; kk < nqpoints; kk++) { + for (size_t ii = 0; ii < nqpoints; ii++) { + // Compute the x,z location within the global domain based on cell and quadrature index + double x = ((double) i_beg_ + (double) i - (double) hs + 0.5) * dx_ + (qpoints[ii] - 0.5) * dx_; + double z = ((double) k_beg_ + (double) k - (double) hs + 0.5) * dz_ + (qpoints[kk] - 0.5) * dz_; + + // Set the fluid state based on the user's specification (default is injection in this example) + double r, u, w, t, hr, ht; + injection(x, z, r, u, w, t, hr, ht); + + // Store into the fluid state array + hstate(i, k, ID_DENS) += r * qweights[ii] * qweights[kk]; + hstate(i, k, ID_UMOM) += (r + hr) * u * qweights[ii] * qweights[kk]; + hstate(i, k, ID_WMOM) += (r + hr) * w * qweights[ii] * qweights[kk]; + hstate(i, k, ID_RHOT) += ((r + hr) * (t + ht) - hr * ht) * qweights[ii] * qweights[kk]; + } + } + }; + + ctx.parallel_for(policy(), where, state.l.shape(), state.l.read(), state_tmp.l.write()) + .set_symbol("init_fluid_cells_cpy") + ->*[] __host__ __device__(size_t i, size_t k, size_t ll, slice hstate, + slice hstate_tmp) { hstate_tmp(i, k, ll) = hstate(i, k, ll); }; + + // Compute the hydrostatic background state over vertical cell averages + ctx.parallel_for(policy(), where, b.lhy_dens_cell.shape(), b.lhy_dens_cell.write(), b.lhy_dens_theta_cell.write()) + .set_symbol("init_hydro_background") + ->*[=] __host__ __device__(size_t k, slice hy_dens_cell, slice hy_dens_theta_cell) { + hy_dens_cell(k) = 0.; + hy_dens_theta_cell(k) = 0.; + for (int kk = 0; kk < nqpoints; kk++) { + double z = (k_beg_ + (double) k - (double) hs + 0.5) * dz_; + + // Set the fluid state based on the user's specification (default is injection in this + // example) + double r, u, w, t, hr, ht; + injection(0., z, r, u, w, t, hr, ht); + + hy_dens_cell(k) += hr * qweights[kk]; + hy_dens_theta_cell(k) += hr * ht * qweights[kk]; + } + }; + + // Compute the hydrostatic background state at vertical cell interfaces + ctx.parallel_for(policy(), where, b.lhy_dens_int.shape(), b.lhy_dens_int.write(), b.lhy_dens_theta_int.write(), + b.lhy_pressure_int.write()) + .set_symbol("init_hydro_background_interfaces") + ->*[=] __host__ __device__(size_t k, slice hy_dens_int, slice hy_dens_theta_int, + slice hy_pressure_int) { + double z = ((double) k_beg_ + (double) k) * dz_; + + // Set the fluid state based on the user's specification (default is injection in this + // example) + double r, u, w, t, hr, ht; + injection(0., z, r, u, w, t, hr, ht); + + hy_dens_int(k) = hr; + hy_dens_theta_int(k) = hr * ht; + hy_pressure_int(k) = C0 * pow((hr * ht), gamm); + }; + + b.flhy_dens_cell = ctx.freeze(b.lhy_dens_cell); + b.flhy_dens_theta_cell = ctx.freeze(b.lhy_dens_theta_cell); + b.flhy_dens_int = ctx.freeze(b.lhy_dens_int); + b.flhy_dens_theta_int = ctx.freeze(b.lhy_dens_theta_int); + b.flhy_pressure_int = ctx.freeze(b.lhy_pressure_int); + + b.flhy_dens_cell.set_automatic_unfreeze(); + b.flhy_dens_theta_cell.set_automatic_unfreeze(); + b.flhy_dens_int.set_automatic_unfreeze(); + b.flhy_dens_theta_int.set_automatic_unfreeze(); + b.flhy_pressure_int.set_automatic_unfreeze(); + + auto dplace = where.is_grid()? + data_place::composite(policy(), where.as_grid()) + :where.affine_data_place(); + + b.hy_dens_cell = b.flhy_dens_cell.get(dplace).first; + b.hy_dens_theta_cell = b.flhy_dens_theta_cell.get(dplace).first; + b.hy_dens_int = b.flhy_dens_int.get(dplace).first; + b.hy_dens_theta_int = b.flhy_dens_theta_int.get(dplace).first; + b.hy_pressure_int = b.flhy_pressure_int.get(dplace).first; + + // Ensure get operations are completed + cuda_safe_call(cudaStreamSynchronize(ctx.fence())); +} + +// This test case is initially balanced but injects fast, cold air from the left boundary near the model top +// x and z are input coordinates at which to sample +// r,u,w,t are output density, u-wind, w-wind, and potential temperature at that location +// hr and ht are output background hydrostatic density and potential temperature at that location +__host__ __device__ void injection( + double /* unused x */, double z, double& r, double& u, double& w, double& t, double& hr, double& ht) { + hydro_const_theta(z, hr, ht); + r = 0.; + t = 0.; + u = 0.; + w = 0.; +} + +// Establish hydrstatic balance using constant potential temperature (thermally neutral atmosphere) +// z is the input coordinate +// r and t are the output background hydrostatic density and potential temperature +__host__ __device__ void hydro_const_theta(double z, double& r, double& t) { + const double theta0 = 300.; // Background potential temperature + const double exner0 = 1.; // Surface-level Exner pressure + double p, exner, rt; + // Establish hydrostatic balance first using Exner pressure + t = theta0; // Potential Temperature at z + exner = exner0 - grav * z / (cp * theta0); // Exner pressure at z + p = p0 * pow(exner, (cp / rd)); // Pressure at z + rt = pow((p / C0), (1. / gamm)); // rho*theta at z + r = rt / t; // Density at z +} + +#ifdef HAVE_NETCDF +// Output the fluid state (state) to a NetCDF file at a given elapsed model time (etime) +// The file I/O uses netcdf, the only external library required for this mini-app. +// If it's too cumbersome, you can comment the I/O out, but you'll miss out on some potentially cool graphics +void output(context& ctx, state_t& state, boundaries_t& b, double etime) { + auto hy_dens_cell = b.hy_dens_cell; + auto hy_dens_theta_cell = b.hy_dens_theta_cell; + + ctx.host_launch(state.l.read()) + ->*[=](slice hstate) { + int ncid, t_dimid, x_dimid, z_dimid, dens_varid, uwnd_varid, wwnd_varid, theta_varid, + t_varid, dimids[3]; + int i, k; + + size_t st1[1], ct1[1], st3[3], ct3[3]; + + // Temporary arrays to hold density, u-wind, w-wind, and potential temperature (theta) + double *dens, *uwnd, *wwnd, *theta; + double* etimearr; + // Inform the user + + printf("*** OUTPUT ***\n"); + + // Allocate some (big) temp arrays + dens = (double*) malloc(nx * nz * sizeof(double)); + uwnd = (double*) malloc(nx * nz * sizeof(double)); + wwnd = (double*) malloc(nx * nz * sizeof(double)); + theta = (double*) malloc(nx * nz * sizeof(double)); + etimearr = (double*) malloc(1 * sizeof(double)); + + // If the elapsed time is zero, create the file. Otherwise, open the file + if (etime == 0) { + // Create the file + ncwrap(nc_create("reference.nc", NC_CLOBBER, &ncid), __LINE__); + + // Create the dimensions + ncwrap(nc_def_dim(ncid, "t", NC_UNLIMITED, &t_dimid), __LINE__); + ncwrap(nc_def_dim(ncid, "x", nx_glob, &x_dimid), __LINE__); + ncwrap(nc_def_dim(ncid, "z", nz_glob, &z_dimid), __LINE__); + + // Create the variables + dimids[0] = t_dimid; + ncwrap(nc_def_var(ncid, "t", NC_DOUBLE, 1, dimids, &t_varid), __LINE__); + + dimids[0] = t_dimid; + dimids[1] = z_dimid; + dimids[2] = x_dimid; + + ncwrap(nc_def_var(ncid, "dens", NC_DOUBLE, 3, dimids, &dens_varid), __LINE__); + ncwrap(nc_def_var(ncid, "uwnd", NC_DOUBLE, 3, dimids, &uwnd_varid), __LINE__); + ncwrap(nc_def_var(ncid, "wwnd", NC_DOUBLE, 3, dimids, &wwnd_varid), __LINE__); + ncwrap(nc_def_var(ncid, "theta", NC_DOUBLE, 3, dimids, &theta_varid), __LINE__); + + // End "define" mode + ncwrap(nc_enddef(ncid), __LINE__); + } else { + // Open the file + ncwrap(nc_open("reference.nc", NC_WRITE, &ncid), __LINE__); + + // Get the variable IDs + ncwrap(nc_inq_varid(ncid, "dens", &dens_varid), __LINE__); + ncwrap(nc_inq_varid(ncid, "uwnd", &uwnd_varid), __LINE__); + ncwrap(nc_inq_varid(ncid, "wwnd", &wwnd_varid), __LINE__); + ncwrap(nc_inq_varid(ncid, "theta", &theta_varid), __LINE__); + ncwrap(nc_inq_varid(ncid, "t", &t_varid), __LINE__); + } + + // Store perturbed values in the temp arrays for output + for (k = 0; k < nz; k++) { + for (i = 0; i < nx; i++) { + auto r = hstate(i + hs, k + hs, ID_DENS); + auto u = hstate(i + hs, k + hs, ID_UMOM); + auto w = hstate(i + hs, k + hs, ID_WMOM); + auto t = hstate(i + hs, k + hs, ID_RHOT); + + dens[k * nx + i] = r; + uwnd[k * nx + i] = u / (hy_dens_cell(k + hs) + r); + wwnd[k * nx + i] = w / (hy_dens_cell(k + hs) + r); + theta[k * nx + i] = (t + hy_dens_theta_cell[k + hs]) / (hy_dens_cell(k + hs) + r) - + hy_dens_theta_cell(k + hs) / hy_dens_cell(k + hs); + // fprintf(stderr, "DUMP DENS(%d, %d) = %e\n", i, k, r); + } + } + + // Write the grid data to file with all the processes writing collectively + st3[0] = num_out; + st3[1] = k_beg; + st3[2] = i_beg; + ct3[0] = 1; + ct3[1] = nz; + ct3[2] = nx; + + ncwrap(nc_put_vara_double(ncid, dens_varid, st3, ct3, dens), __LINE__); + ncwrap(nc_put_vara_double(ncid, uwnd_varid, st3, ct3, uwnd), __LINE__); + ncwrap(nc_put_vara_double(ncid, wwnd_varid, st3, ct3, wwnd), __LINE__); + ncwrap(nc_put_vara_double(ncid, theta_varid, st3, ct3, theta), __LINE__); + + // Only the master process needs to write the elapsed time + // write elapsed time to file + + st1[0] = num_out; + ct1[0] = 1; + etimearr[0] = etime; + ncwrap(nc_put_vara_double(ncid, t_varid, st1, ct1, etimearr), __LINE__); + + // Close the file + ncwrap(nc_close(ncid), __LINE__); + + // Increment the number of outputs + num_out = num_out + 1; + + // Deallocate the temp arrays + free(dens); + free(uwnd); + free(wwnd); + free(theta); + free(etimearr); + }; +} + +// Error reporting routine for the NetCDF I/O +void ncwrap(int ierr, int line) { + if (ierr != NC_NOERR) { + printf("NetCDF Error at line: %d\n", line); + printf("%s\n", nc_strerror(ierr)); + exit(-1); + } +} +#endif // HAVE_NETCDF