diff --git a/docs/env_vars.rst b/docs/env_vars.rst
index 2123b57..502858f 100644
--- a/docs/env_vars.rst
+++ b/docs/env_vars.rst
@@ -90,3 +90,11 @@ The following CSV files are generated:
 Each CSV file includes grid configuration information as comments at the top, followed by performance data in comma-separated format.

 Default setting is unset (no CSV files written). Setting this variable to a directory path will enable CSV file output.
+
+CUDECOMP_USE_COL_MAJOR_RANK_ORDER
+--------------------------------------
+(since v0.6.0)
+
+:code:`CUDECOMP_USE_COL_MAJOR_RANK_ORDER` controls the rank assignment order in the process grid. By default, ranks are assigned in row-major order for consistency with :code:`MPI_Cart_*` routines. When enabled, ranks are assigned in column-major order.
+
+Default setting is off (:code:`0`). Setting this variable to :code:`1` will enable column-major rank assignment.
\ No newline at end of file
diff --git a/docs/overview.rst b/docs/overview.rst
index 256f7d1..993d7dd 100644
--- a/docs/overview.rst
+++ b/docs/overview.rst
@@ -33,8 +33,8 @@
 cuDecomp can distribute 3D Cartesian domains with dimensions :math:`[X, Y, Z]`, the *global grid*. The global grid is
 decomposed across :math:`N_{\text{GPU}}` processes in a 2D **process grid** with dimensions :math:`P_{\text{row}} \times P_{\text{col}}`.
 The processes are logically grouped by column and row index into :math:`P_{\text{row}}` *row* communicators and :math:`P_{\text{col}}` *column* communicators.
-For consistency with :code:`MPI_Cart_*` routines, the ranks are assigned in a row-major ordering (i.e. row communicators
-are composed of sequential ranks).
+By default, for consistency with :code:`MPI_Cart_*` routines, the ranks are assigned in a row-major ordering (i.e. row communicators
+are composed of sequential ranks). This can be changed to column-major ordering using the :code:`CUDECOMP_USE_COL_MAJOR_RANK_ORDER` environment variable (see :ref:`env-var-section-ref`).
 cuDecomp will distribute the global domain data so that each process is assigned a unique *pencil* of data, with three
 different pencil configurations corresponding to different transposed configurations of the global domain.
 The domain can be
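To make the documented behavior change concrete: under row-major ordering each row communicator holds consecutive global ranks, while under column-major ordering row communicators hold strided ranks (and column communicators become consecutive instead). The following is a minimal standalone C++ sketch, not part of the patch, that applies the two getGlobalRank formulas from include/internal/common.h below to a hypothetical 2 x 3 process grid (P_row = 2, P_col = 3) and prints row communicator membership for each ordering:

// Standalone illustration (not part of the patch): membership of each row
// communicator under both rank orderings, for a hypothetical 2 x 3 grid.
#include <cstdio>

int main() {
  const int pdims[2] = {2, 3};
  for (int col_major = 0; col_major <= 1; ++col_major) {
    std::printf("%s rank order:\n", col_major ? "Column-major" : "Row-major");
    for (int row = 0; row < pdims[0]; ++row) {
      std::printf("  row communicator %d:", row);
      for (int i = 0; i < pdims[1]; ++i) {
        // CUDECOMP_COMM_ROW branch of getGlobalRank, with pidx[0] == row
        int g = col_major ? row + i * pdims[0] : pdims[1] * row + i;
        std::printf(" %d", g);
      }
      std::printf("\n");
    }
  }
  return 0;
}

This prints ranks 0 1 2 / 3 4 5 per row in the default order, versus 0 2 4 / 1 3 5 when column-major is enabled, matching the "sequential ranks" caveat added to docs/overview.rst.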
diff --git a/include/internal/comm_routines.h b/include/internal/comm_routines.h
index 6825709..4e51221 100644
--- a/include/internal/comm_routines.h
+++ b/include/internal/comm_routines.h
@@ -113,7 +113,7 @@ nvshmemAlltoallV(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_

   for (int i = 1; i < send_counts.size(); ++i) {
     int src_rank, dst_rank;
     getAlltoallPeerRanks(grid_desc, comm_axis, i, src_rank, dst_rank);
-    int dst_rank_global = getGlobalRank(grid_desc, comm_axis, dst_rank);
+    int dst_rank_global = getGlobalRank(handle, grid_desc, comm_axis, dst_rank);
     if (nvshmem_ptr(recv_buff, dst_rank_global)) { continue; }
     params.send_offsets[count] = send_offsets[dst_rank];
@@ -140,7 +140,7 @@ nvshmemAlltoallV(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_

   for (int i = 1; i < send_counts.size(); ++i) {
     int src_rank, dst_rank;
     getAlltoallPeerRanks(grid_desc, comm_axis, i, src_rank, dst_rank);
-    int dst_rank_global = getGlobalRank(grid_desc, comm_axis, dst_rank);
+    int dst_rank_global = getGlobalRank(handle, grid_desc, comm_axis, dst_rank);
     if (nvshmem_ptr(recv_buff, dst_rank_global)) {
       if (comm_info.ngroups == 1 && handle->device_p2p_ce_count == 1 &&
@@ -242,7 +242,7 @@ static void cudecompAlltoall(const cudecompHandle_t& handle, const cudecompGridD
   CHECK_NCCL(ncclGroupStart());

   for (int i = 0; i < send_counts.size(); ++i) {
-    int peer_rank_global = getGlobalRank(grid_desc, comm_axis, i);
+    int peer_rank_global = getGlobalRank(handle, grid_desc, comm_axis, i);
     if (comm_info.ngroups == 1) { peer_rank_global = handle->rank_to_clique_rank[peer_rank_global]; }
     if (send_counts[i] != 0) {
       CHECK_NCCL(ncclSend(send_buff + send_offsets[i], send_counts[i] * sizeof(T), ncclChar, peer_rank_global, comm,
@@ -400,7 +400,7 @@ cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cudecompGridDesc
       synced = true;
     }

-    int dst_rank_global = getGlobalRank(grid_desc, comm_axis, dst_rank);
+    int dst_rank_global = getGlobalRank(handle, grid_desc, comm_axis, dst_rank);
     // Need to chunk host API calls due to 2 GiB limitation in API
     size_t send_bytes = send_counts[dst_rank] * sizeof(T);
     int nchunks = (send_bytes + CUDECOMP_NVSHMEM_CHUNK_SZ - 1) / CUDECOMP_NVSHMEM_CHUNK_SZ;
@@ -463,8 +463,8 @@ cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cudecompGridDesc
       CHECK_NCCL(ncclGroupStart());
       group_started = true;
     }
-    int src_rank_global = getGlobalRank(grid_desc, comm_axis, src_rank);
-    int dst_rank_global = getGlobalRank(grid_desc, comm_axis, dst_rank);
+    int src_rank_global = getGlobalRank(handle, grid_desc, comm_axis, src_rank);
+    int dst_rank_global = getGlobalRank(handle, grid_desc, comm_axis, dst_rank);
     if (comm_info.ngroups == 1) {
       src_rank_global = handle->rank_to_clique_rank[src_rank_global];
       dst_rank_global = handle->rank_to_clique_rank[dst_rank_global];
diff --git a/include/internal/common.h b/include/internal/common.h
index 5ae74b0..4cd7685 100644
--- a/include/internal/common.h
+++ b/include/internal/common.h
@@ -125,7 +125,8 @@ struct cudecompHandle {
       ""; // directory to write CSV performance reports, empty means no file writing

   // Miscellaneous
-  int32_t device_p2p_ce_count = 0; // number of P2P CEs available
+  int32_t device_p2p_ce_count = 0;       // number of P2P CEs available
+  bool use_col_major_rank_order = false; // Flag to control whether to use column-major rank order
 };

 // Structure with information about row/column communicator
@@ -224,9 +225,17 @@ using comm_count_t = int32_t;
 enum cudecompCommAxis { CUDECOMP_COMM_COL = 0, CUDECOMP_COMM_ROW = 1 };

 // Helper function to convert row or column rank to global rank
-static inline int getGlobalRank(const cudecompGridDesc_t grid_desc, cudecompCommAxis axis, int axis_rank) {
-  return (axis == CUDECOMP_COMM_ROW) ? grid_desc->config.pdims[1] * grid_desc->pidx[0] + axis_rank
-                                     : grid_desc->pidx[1] + axis_rank * grid_desc->config.pdims[1];
+static inline int getGlobalRank(const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc,
+                                cudecompCommAxis axis, int axis_rank) {
+  if (handle->use_col_major_rank_order) {
+    // Column-major rank order
+    return (axis == CUDECOMP_COMM_ROW) ? grid_desc->pidx[0] + axis_rank * grid_desc->config.pdims[0]
+                                       : grid_desc->config.pdims[0] * grid_desc->pidx[1] + axis_rank;
+  } else {
+    // Row-major rank order (default)
+    return (axis == CUDECOMP_COMM_ROW) ? grid_desc->config.pdims[1] * grid_desc->pidx[0] + axis_rank
+                                       : grid_desc->pidx[1] + axis_rank * grid_desc->config.pdims[1];
+  }
 }

 // Helper function to return maximum pencil size across all processes for a given axis
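The new column-major branch of getGlobalRank is the exact inverse of the column-major pidx assignment added in src/autotune.cc and src/cudecomp.cc below, just as the original expressions invert the row-major assignment. A self-contained consistency sketch (MockDesc and globalRank are hypothetical stand-ins for the real handle and descriptor types; not part of the patch) asserting the round trip for every rank of a 2 x 3 grid:

// Self-contained consistency check (not part of the patch): for each rank
// order, the getGlobalRank expressions must invert the rank -> pidx mapping.
#include <cassert>

struct MockDesc { int pdims[2]; int pidx[2]; };

int globalRank(bool col_major, const MockDesc& d, int axis, int axis_rank) {
  // axis == 1 mirrors CUDECOMP_COMM_ROW, axis == 0 mirrors CUDECOMP_COMM_COL
  if (col_major) {
    return (axis == 1) ? d.pidx[0] + axis_rank * d.pdims[0]
                       : d.pdims[0] * d.pidx[1] + axis_rank;
  }
  return (axis == 1) ? d.pdims[1] * d.pidx[0] + axis_rank
                     : d.pidx[1] + axis_rank * d.pdims[1];
}

int main() {
  MockDesc d{{2, 3}, {0, 0}};
  for (int col_major = 0; col_major <= 1; ++col_major) {
    for (int rank = 0; rank < d.pdims[0] * d.pdims[1]; ++rank) {
      // pidx assignment, mirroring the branches added in src/cudecomp.cc
      d.pidx[0] = col_major ? rank % d.pdims[0] : rank / d.pdims[1];
      d.pidx[1] = col_major ? rank / d.pdims[0] : rank % d.pdims[1];
      // Querying my own position along either axis must return my own rank
      assert(globalRank(col_major, d, 1, d.pidx[1]) == rank);
      assert(globalRank(col_major, d, 0, d.pidx[0]) == rank);
    }
  }
  return 0;
}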
@@ -320,7 +329,7 @@ static void setCommInfo(cudecompHandle_t& handle, cudecompGridDesc_t& grid_desc,
   // Count occurences of hostname in row/col communicator
   std::map<std::string, int> host_counts;
   for (int i = 0; i < info.nranks; ++i) {
-    int peer_rank_global = getGlobalRank(grid_desc, comm_axis, i);
+    int peer_rank_global = getGlobalRank(handle, grid_desc, comm_axis, i);
     std::string hostname = std::string(handle->hostnames[peer_rank_global].data());
     host_counts[hostname]++;
   }
@@ -338,7 +347,7 @@ static void setCommInfo(cudecompHandle_t& handle, cudecompGridDesc_t& grid_desc,
   // For MNNVL configurations, count occurences of clique in row/col communicator
   std::map<unsigned int, int> clique_counts;
   for (int i = 0; i < info.nranks; ++i) {
-    int peer_rank_global = getGlobalRank(grid_desc, comm_axis, i);
+    int peer_rank_global = getGlobalRank(handle, grid_desc, comm_axis, i);
     unsigned int clique = handle->rank_to_clique[peer_rank_global];
     clique_counts[clique]++;
   }
diff --git a/src/autotune.cc b/src/autotune.cc
index 4cb0dbe..9f11f6d 100644
--- a/src/autotune.cc
+++ b/src/autotune.cc
@@ -155,8 +155,13 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
   for (auto& pdim1 : pdim1_list) {
     grid_desc->config.pdims[0] = handle->nranks / pdim1;
     grid_desc->config.pdims[1] = pdim1;
-    grid_desc->pidx[0] = handle->rank / grid_desc->config.pdims[1];
-    grid_desc->pidx[1] = handle->rank % grid_desc->config.pdims[1];
+    if (handle->use_col_major_rank_order) {
+      grid_desc->pidx[0] = handle->rank % grid_desc->config.pdims[0];
+      grid_desc->pidx[1] = handle->rank / grid_desc->config.pdims[0];
+    } else {
+      grid_desc->pidx[0] = handle->rank / grid_desc->config.pdims[1];
+      grid_desc->pidx[1] = handle->rank % grid_desc->config.pdims[1];
+    }

     cudecompPencilInfo_t pinfo_x0, pinfo_x3;
     cudecompPencilInfo_t pinfo_y0, pinfo_y1, pinfo_y2, pinfo_y3;
@@ -583,8 +588,13 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
   for (auto& pdim1 : pdim1_list) {
     grid_desc->config.pdims[0] = handle->nranks / pdim1;
     grid_desc->config.pdims[1] = pdim1;
-    grid_desc->pidx[0] = handle->rank / grid_desc->config.pdims[1];
-    grid_desc->pidx[1] = handle->rank % grid_desc->config.pdims[1];
+    if (handle->use_col_major_rank_order) {
+      grid_desc->pidx[0] = handle->rank % grid_desc->config.pdims[0];
+      grid_desc->pidx[1] = handle->rank / grid_desc->config.pdims[0];
+    } else {
+      grid_desc->pidx[0] = handle->rank / grid_desc->config.pdims[1];
+      grid_desc->pidx[1] = handle->rank % grid_desc->config.pdims[1];
+    }

     cudecompPencilInfo_t pinfo;
     CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo, options->halo_axis, options->halo_extents,
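These pidx branches determine where each rank lands in the process grid during the autotuning trials. A standalone sketch (not part of the patch; 2 x 3 grid assumed) that prints the resulting placement, showing ranks advancing along rows in the default order and down columns when the flag is set:

// Standalone illustration (not part of the patch): rank placement in the
// process grid under both orderings, using the pidx branches above.
#include <cstdio>

int main() {
  const int pdims[2] = {2, 3};
  const int nranks = pdims[0] * pdims[1];
  for (int col_major = 0; col_major <= 1; ++col_major) {
    int grid[2][3]; // grid[pidx[0]][pidx[1]] = global rank
    for (int rank = 0; rank < nranks; ++rank) {
      int pidx0 = col_major ? rank % pdims[0] : rank / pdims[1];
      int pidx1 = col_major ? rank / pdims[0] : rank % pdims[1];
      grid[pidx0][pidx1] = rank;
    }
    std::printf("%s:\n", col_major ? "Column-major" : "Row-major");
    for (int r = 0; r < pdims[0]; ++r) {
      for (int c = 0; c < pdims[1]; ++c) { std::printf(" %d", grid[r][c]); }
      std::printf("\n");
    }
  }
  return 0;
}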
diff --git a/src/cudecomp.cc b/src/cudecomp.cc
index 70215bb..c234073 100644
--- a/src/cudecomp.cc
+++ b/src/cudecomp.cc
@@ -374,6 +374,10 @@ static void getCudecompEnvVars(cudecompHandle_t& handle) {
   // Check CUDECOMP_PERFORMANCE_REPORT_WRITE_DIR (Directory for CSV performance reports)
   const char* performance_write_dir_str = std::getenv("CUDECOMP_PERFORMANCE_REPORT_WRITE_DIR");
   if (performance_write_dir_str) { handle->performance_report_write_dir = std::string(performance_write_dir_str); }
+
+  // Check CUDECOMP_USE_COL_MAJOR_RANK_ORDER (Column-major rank assignment)
+  const char* col_major_rank_str = std::getenv("CUDECOMP_USE_COL_MAJOR_RANK_ORDER");
+  if (col_major_rank_str) { handle->use_col_major_rank_order = std::strtol(col_major_rank_str, nullptr, 10) == 1; }
 }

 #ifdef ENABLE_NVSHMEM
@@ -634,8 +638,13 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes
   if (grid_desc->config.pdims[0] > 0 && grid_desc->config.pdims[1] > 0) {
     // If pdims are set, temporarily set up comm info stuctures to determine if we need to create a local NCCL
     // communicator
-    grid_desc->pidx[0] = handle->rank / grid_desc->config.pdims[1];
-    grid_desc->pidx[1] = handle->rank % grid_desc->config.pdims[1];
+    if (handle->use_col_major_rank_order) {
+      grid_desc->pidx[0] = handle->rank % grid_desc->config.pdims[0];
+      grid_desc->pidx[1] = handle->rank / grid_desc->config.pdims[0];
+    } else {
+      grid_desc->pidx[0] = handle->rank / grid_desc->config.pdims[1];
+      grid_desc->pidx[1] = handle->rank % grid_desc->config.pdims[1];
+    }
     int color_row = grid_desc->pidx[0];
     MPI_Comm row_comm;
     CHECK_MPI(MPI_Comm_split(handle->mpi_comm, color_row, handle->rank, &row_comm));
@@ -721,8 +730,13 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes
     THROW_NOT_SUPPORTED("No valid decomposition found during autotuning with provided arguments.");
   }

-  grid_desc->pidx[0] = handle->rank / grid_desc->config.pdims[1];
-  grid_desc->pidx[1] = handle->rank % grid_desc->config.pdims[1];
+  if (handle->use_col_major_rank_order) {
+    grid_desc->pidx[0] = handle->rank % grid_desc->config.pdims[0];
+    grid_desc->pidx[1] = handle->rank / grid_desc->config.pdims[0];
+  } else {
+    grid_desc->pidx[0] = handle->rank / grid_desc->config.pdims[1];
+    grid_desc->pidx[1] = handle->rank % grid_desc->config.pdims[1];
+  }

   // Setup final row and column communicators
   int color_row = grid_desc->pidx[0];
@@ -1341,7 +1355,7 @@ cudecompResult_t cudecompGetShiftedRank(cudecompHandle_t handle, cudecompGridDes
     *shifted_rank = -1; // "null" case
   } else {
     int comm_peer = (shifted + grid_desc->config.pdims[comm_axis]) % grid_desc->config.pdims[comm_axis];
-    int global_peer = getGlobalRank(grid_desc, comm_axis, comm_peer);
+    int global_peer = getGlobalRank(handle, grid_desc, comm_axis, comm_peer);
     *shifted_rank = global_peer;
   }
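Because getCudecompEnvVars reads the variable while the handle is being initialized, it must be in the environment before cudecompInit is called. A hedged usage sketch, assuming the public cudecompInit/cudecompFinalize entry points of the cuDecomp C API; exporting the variable from the job script or via the MPI launcher is equivalent:

// Usage sketch (not part of the patch): enable column-major rank ordering
// before handle creation. Error handling abbreviated; assumes a POSIX setenv.
#include <cstdlib>
#include <mpi.h>
#include <cudecomp.h>

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);

  // Must be set before cudecompInit, which reads the environment once
  // via getCudecompEnvVars while setting up the handle.
  setenv("CUDECOMP_USE_COL_MAJOR_RANK_ORDER", "1", 1);

  cudecompHandle_t handle;
  if (cudecompInit(&handle, MPI_COMM_WORLD) != CUDECOMP_RESULT_SUCCESS) return 1;

  // ... grid descriptors created from this handle now place ranks in the
  // process grid in column-major order ...

  cudecompFinalize(handle);
  MPI_Finalize();
  return 0;
}

Note the parsing semantics in the hunk above: the flag is enabled only when std::strtol yields exactly 1, so values like "true" or "yes" leave the default row-major ordering in place.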