diff --git a/docs/env_vars.rst b/docs/env_vars.rst
index 2123b57..502858f 100644
--- a/docs/env_vars.rst
+++ b/docs/env_vars.rst
@@ -90,3 +90,11 @@ The following CSV files are generated:
 Each CSV file includes grid configuration information as comments at the top, followed by performance data in comma-separated format.

 Default setting is unset (no CSV files written). Setting this variable to a directory path will enable CSV file output.
+
+CUDECOMP_USE_COL_MAJOR_RANK_ORDER
+--------------------------------------
+(since v0.6.0)
+
+:code:`CUDECOMP_USE_COL_MAJOR_RANK_ORDER` controls the rank assignment order in the process grid. By default, ranks are assigned in row-major order for consistency with :code:`MPI_Cart_*` routines. When enabled, ranks are assigned in column-major order.
+
+Default setting is off (:code:`0`). Setting this variable to :code:`1` will enable column-major rank assignment.
\ No newline at end of file
diff --git a/docs/overview.rst b/docs/overview.rst
index 256f7d1..993d7dd 100644
--- a/docs/overview.rst
+++ b/docs/overview.rst
@@ -33,8 +33,8 @@
 cuDecomp can distribute 3D Cartesian domains with dimensions :math:`[X, Y, Z]`, the *global grid*. The global grid is
 decomposed across :math:`N_{\text{GPU}}` processes in a 2D **process grid** with dimensions :math:`P_{\text{row}} \times P_{\text{col}}`.
 The processes are logically grouped by column and row index into :math:`P_{\text{row}}` *row* communicators and :math:`P_{\text{col}}` *column* communicators.
-For consistency with :code:`MPI_Cart_*` routines, the ranks are assigned in a row-major ordering (i.e. row communicators
-are composed of sequential ranks).
+By default, for consistency with :code:`MPI_Cart_*` routines, the ranks are assigned in a row-major ordering (i.e. row communicators
+are composed of sequential ranks). This can be changed to column-major ordering using the :code:`CUDECOMP_USE_COL_MAJOR_RANK_ORDER` environment variable (see :ref:`env-var-section-ref`).
 cuDecomp will distribute the global domain data so that each process is assigned a unique *pencil* of data, with three
 different pencil configurations corresponding to different transposed configurations of the global domain.
 The domain can be
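To make the documented behavior change concrete: under row-major ordering each row communicator holds consecutive global ranks, while under column-major ordering row communicators hold strided ranks (and column communicators become consecutive instead). The following is a minimal standalone C++ sketch, not part of the patch, that applies the two getGlobalRank formulas from include/internal/common.h below to a hypothetical 2 x 3 process grid (P_row = 2, P_col = 3) and prints row communicator membership for each ordering:

// Standalone illustration (not part of the patch): membership of each row
// communicator under both rank orderings, for a hypothetical 2 x 3 grid.
#include <cstdio>

int main() {
  const int pdims[2] = {2, 3};
  for (int col_major = 0; col_major <= 1; ++col_major) {
    std::printf("%s rank order:\n", col_major ? "Column-major" : "Row-major");
    for (int row = 0; row < pdims[0]; ++row) {
      std::printf("  row communicator %d:", row);
      for (int i = 0; i < pdims[1]; ++i) {
        // CUDECOMP_COMM_ROW branch of getGlobalRank, with pidx[0] == row
        int g = col_major ? row + i * pdims[0] : pdims[1] * row + i;
        std::printf(" %d", g);
      }
      std::printf("\n");
    }
  }
  return 0;
}

This prints ranks 0 1 2 / 3 4 5 per row in the default order, versus 0 2 4 / 1 3 5 when column-major is enabled, matching the "sequential ranks" caveat added to docs/overview.rst.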
diff --git a/include/internal/comm_routines.h b/include/internal/comm_routines.h
index 6825709..4e51221 100644
--- a/include/internal/comm_routines.h
+++ b/include/internal/comm_routines.h
@@ -113,7 +113,7 @@ nvshmemAlltoallV(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_

   for (int i = 1; i < send_counts.size(); ++i) {
     int src_rank, dst_rank;
     getAlltoallPeerRanks(grid_desc, comm_axis, i, src_rank, dst_rank);
-    int dst_rank_global = getGlobalRank(grid_desc, comm_axis, dst_rank);
+    int dst_rank_global = getGlobalRank(handle, grid_desc, comm_axis, dst_rank);
     if (nvshmem_ptr(recv_buff, dst_rank_global)) { continue; }
     params.send_offsets[count] = send_offsets[dst_rank];
@@ -140,7 +140,7 @@ nvshmemAlltoallV(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_

   for (int i = 1; i < send_counts.size(); ++i) {
     int src_rank, dst_rank;
     getAlltoallPeerRanks(grid_desc, comm_axis, i, src_rank, dst_rank);
-    int dst_rank_global = getGlobalRank(grid_desc, comm_axis, dst_rank);
+    int dst_rank_global = getGlobalRank(handle, grid_desc, comm_axis, dst_rank);
     if (nvshmem_ptr(recv_buff, dst_rank_global)) {
       if (comm_info.ngroups == 1 && handle->device_p2p_ce_count == 1 &&
@@ -242,7 +242,7 @@ static void cudecompAlltoall(const cudecompHandle_t& handle, const cudecompGridD
   CHECK_NCCL(ncclGroupStart());

   for (int i = 0; i < send_counts.size(); ++i) {
-    int peer_rank_global = getGlobalRank(grid_desc, comm_axis, i);
+    int peer_rank_global = getGlobalRank(handle, grid_desc, comm_axis, i);
     if (comm_info.ngroups == 1) { peer_rank_global = handle->rank_to_clique_rank[peer_rank_global]; }
     if (send_counts[i] != 0) {
       CHECK_NCCL(ncclSend(send_buff + send_offsets[i], send_counts[i] * sizeof(T), ncclChar, peer_rank_global, comm,
@@ -400,7 +400,7 @@ cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cudecompGridDesc
       synced = true;
     }

-    int dst_rank_global = getGlobalRank(grid_desc, comm_axis, dst_rank);
+    int dst_rank_global = getGlobalRank(handle, grid_desc, comm_axis, dst_rank);
     // Need to chunk host API calls due to 2 GiB limitation in API
     size_t send_bytes = send_counts[dst_rank] * sizeof(T);
     int nchunks = (send_bytes + CUDECOMP_NVSHMEM_CHUNK_SZ - 1) / CUDECOMP_NVSHMEM_CHUNK_SZ;
@@ -463,8 +463,8 @@ cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cudecompGridDesc
       CHECK_NCCL(ncclGroupStart());
       group_started = true;
     }
-    int src_rank_global = getGlobalRank(grid_desc, comm_axis, src_rank);
-    int dst_rank_global = getGlobalRank(grid_desc, comm_axis, dst_rank);
+    int src_rank_global = getGlobalRank(handle, grid_desc, comm_axis, src_rank);
+    int dst_rank_global = getGlobalRank(handle, grid_desc, comm_axis, dst_rank);
     if (comm_info.ngroups == 1) {
       src_rank_global = handle->rank_to_clique_rank[src_rank_global];
       dst_rank_global = handle->rank_to_clique_rank[dst_rank_global];
diff --git a/include/internal/common.h b/include/internal/common.h
index 5ae74b0..4cd7685 100644
--- a/include/internal/common.h
+++ b/include/internal/common.h
@@ -125,7 +125,8 @@ struct cudecompHandle {
       ""; // directory to write CSV performance reports, empty means no file writing

   // Miscellaneous
-  int32_t device_p2p_ce_count = 0; // number of P2P CEs available
+  int32_t device_p2p_ce_count = 0;       // number of P2P CEs available
+  bool use_col_major_rank_order = false; // Flag to control whether to use column-major rank order
 };

 // Structure with information about row/column communicator
@@ -224,9 +225,17 @@ using comm_count_t = int32_t;
 enum cudecompCommAxis { CUDECOMP_COMM_COL = 0, CUDECOMP_COMM_ROW = 1 };

 // Helper function to convert row or column rank to global rank
-static inline int getGlobalRank(const cudecompGridDesc_t grid_desc, cudecompCommAxis axis, int axis_rank) {
-  return (axis == CUDECOMP_COMM_ROW) ? grid_desc->config.pdims[1] * grid_desc->pidx[0] + axis_rank
-                                     : grid_desc->pidx[1] + axis_rank * grid_desc->config.pdims[1];
+static inline int getGlobalRank(const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc,
+                                cudecompCommAxis axis, int axis_rank) {
+  if (handle->use_col_major_rank_order) {
+    // Column-major rank order
+    return (axis == CUDECOMP_COMM_ROW) ? grid_desc->pidx[0] + axis_rank * grid_desc->config.pdims[0]
+                                       : grid_desc->config.pdims[0] * grid_desc->pidx[1] + axis_rank;
+  } else {
+    // Row-major rank order (default)
+    return (axis == CUDECOMP_COMM_ROW) ? grid_desc->config.pdims[1] * grid_desc->pidx[0] + axis_rank
+                                       : grid_desc->pidx[1] + axis_rank * grid_desc->config.pdims[1];
+  }
 }

 // Helper function to return maximum pencil size across all processes for a given axis
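The new column-major branch of getGlobalRank is the exact inverse of the column-major pidx assignment added in src/autotune.cc and src/cudecomp.cc below, just as the original expressions invert the row-major assignment. A self-contained consistency sketch (MockDesc and globalRank are hypothetical stand-ins for the real handle and descriptor types; not part of the patch) asserting the round trip for every rank of a 2 x 3 grid:

// Self-contained consistency check (not part of the patch): for each rank
// order, the getGlobalRank expressions must invert the rank -> pidx mapping.
#include <cassert>

struct MockDesc { int pdims[2]; int pidx[2]; };

int globalRank(bool col_major, const MockDesc& d, int axis, int axis_rank) {
  // axis == 1 mirrors CUDECOMP_COMM_ROW, axis == 0 mirrors CUDECOMP_COMM_COL
  if (col_major) {
    return (axis == 1) ? d.pidx[0] + axis_rank * d.pdims[0]
                       : d.pdims[0] * d.pidx[1] + axis_rank;
  }
  return (axis == 1) ? d.pdims[1] * d.pidx[0] + axis_rank
                     : d.pidx[1] + axis_rank * d.pdims[1];
}

int main() {
  MockDesc d{{2, 3}, {0, 0}};
  for (int col_major = 0; col_major <= 1; ++col_major) {
    for (int rank = 0; rank < d.pdims[0] * d.pdims[1]; ++rank) {
      // pidx assignment, mirroring the branches added in src/cudecomp.cc
      d.pidx[0] = col_major ? rank % d.pdims[0] : rank / d.pdims[1];
      d.pidx[1] = col_major ? rank / d.pdims[0] : rank % d.pdims[1];
      // Querying my own position along either axis must return my own rank
      assert(globalRank(col_major, d, 1, d.pidx[1]) == rank);
      assert(globalRank(col_major, d, 0, d.pidx[0]) == rank);
    }
  }
  return 0;
}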
@@ -320,7 +329,7 @@ static void setCommInfo(cudecompHandle_t& handle, cudecompGridDesc_t& grid_desc,
   // Count occurences of hostname in row/col communicator
   std::map<std::string, int> host_counts;
   for (int i = 0; i < info.nranks; ++i) {
-    int peer_rank_global = getGlobalRank(grid_desc, comm_axis, i);
+    int peer_rank_global = getGlobalRank(handle, grid_desc, comm_axis, i);
     std::string hostname = std::string(handle->hostnames[peer_rank_global].data());
     host_counts[hostname]++;
   }
@@ -338,7 +347,7 @@ static void setCommInfo(cudecompHandle_t& handle, cudecompGridDesc_t& grid_desc,
   // For MNNVL configurations, count occurences of clique in row/col communicator
   std::map<unsigned int, int> clique_counts;
   for (int i = 0; i < info.nranks; ++i) {
-    int peer_rank_global = getGlobalRank(grid_desc, comm_axis, i);
+    int peer_rank_global = getGlobalRank(handle, grid_desc, comm_axis, i);
     unsigned int clique = handle->rank_to_clique[peer_rank_global];
     clique_counts[clique]++;
   }
diff --git a/src/autotune.cc b/src/autotune.cc
index 4cb0dbe..9f11f6d 100644
--- a/src/autotune.cc
+++ b/src/autotune.cc
@@ -155,8 +155,13 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
   for (auto& pdim1 : pdim1_list) {
     grid_desc->config.pdims[0] = handle->nranks / pdim1;
     grid_desc->config.pdims[1] = pdim1;
-    grid_desc->pidx[0] = handle->rank / grid_desc->config.pdims[1];
-    grid_desc->pidx[1] = handle->rank % grid_desc->config.pdims[1];
+    if (handle->use_col_major_rank_order) {
+      grid_desc->pidx[0] = handle->rank % grid_desc->config.pdims[0];
+      grid_desc->pidx[1] = handle->rank / grid_desc->config.pdims[0];
+    } else {
+      grid_desc->pidx[0] = handle->rank / grid_desc->config.pdims[1];
+      grid_desc->pidx[1] = handle->rank % grid_desc->config.pdims[1];
+    }

     cudecompPencilInfo_t pinfo_x0, pinfo_x3;
     cudecompPencilInfo_t pinfo_y0, pinfo_y1, pinfo_y2, pinfo_y3;
@@ -583,8 +588,13 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
   for (auto& pdim1 : pdim1_list) {
     grid_desc->config.pdims[0] = handle->nranks / pdim1;
     grid_desc->config.pdims[1] = pdim1;
-    grid_desc->pidx[0] = handle->rank / grid_desc->config.pdims[1];
-    grid_desc->pidx[1] = handle->rank % grid_desc->config.pdims[1];
+    if (handle->use_col_major_rank_order) {
+      grid_desc->pidx[0] = handle->rank % grid_desc->config.pdims[0];
+      grid_desc->pidx[1] = handle->rank / grid_desc->config.pdims[0];
+    } else {
+      grid_desc->pidx[0] = handle->rank / grid_desc->config.pdims[1];
+      grid_desc->pidx[1] = handle->rank % grid_desc->config.pdims[1];
+    }

     cudecompPencilInfo_t pinfo;
     CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo, options->halo_axis, options->halo_extents,
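These pidx branches determine where each rank lands in the process grid during the autotuning trials. A standalone sketch (not part of the patch; 2 x 3 grid assumed) that prints the resulting placement, showing ranks advancing along rows in the default order and down columns when the flag is set:

// Standalone illustration (not part of the patch): rank placement in the
// process grid under both orderings, using the pidx branches above.
#include <cstdio>

int main() {
  const int pdims[2] = {2, 3};
  const int nranks = pdims[0] * pdims[1];
  for (int col_major = 0; col_major <= 1; ++col_major) {
    int grid[2][3]; // grid[pidx[0]][pidx[1]] = global rank
    for (int rank = 0; rank < nranks; ++rank) {
      int pidx0 = col_major ? rank % pdims[0] : rank / pdims[1];
      int pidx1 = col_major ? rank / pdims[0] : rank % pdims[1];
      grid[pidx0][pidx1] = rank;
    }
    std::printf("%s:\n", col_major ? "Column-major" : "Row-major");
    for (int r = 0; r < pdims[0]; ++r) {
      for (int c = 0; c < pdims[1]; ++c) { std::printf(" %d", grid[r][c]); }
      std::printf("\n");
    }
  }
  return 0;
}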
diff --git a/src/cudecomp.cc b/src/cudecomp.cc
index 70215bb..c234073 100644
--- a/src/cudecomp.cc
+++ b/src/cudecomp.cc
@@ -374,6 +374,10 @@ static void getCudecompEnvVars(cudecompHandle_t& handle) {
   // Check CUDECOMP_PERFORMANCE_REPORT_WRITE_DIR (Directory for CSV performance reports)
   const char* performance_write_dir_str = std::getenv("CUDECOMP_PERFORMANCE_REPORT_WRITE_DIR");
   if (performance_write_dir_str) { handle->performance_report_write_dir = std::string(performance_write_dir_str); }
+
+  // Check CUDECOMP_USE_COL_MAJOR_RANK_ORDER (Column-major rank assignment)
+  const char* col_major_rank_str = std::getenv("CUDECOMP_USE_COL_MAJOR_RANK_ORDER");
+  if (col_major_rank_str) { handle->use_col_major_rank_order = std::strtol(col_major_rank_str, nullptr, 10) == 1; }
 }

 #ifdef ENABLE_NVSHMEM
@@ -634,8 +638,13 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes
   if (grid_desc->config.pdims[0] > 0 && grid_desc->config.pdims[1] > 0) {
     // If pdims are set, temporarily set up comm info stuctures to determine if we need to create a local NCCL
     // communicator
-    grid_desc->pidx[0] = handle->rank / grid_desc->config.pdims[1];
-    grid_desc->pidx[1] = handle->rank % grid_desc->config.pdims[1];
+    if (handle->use_col_major_rank_order) {
+      grid_desc->pidx[0] = handle->rank % grid_desc->config.pdims[0];
+      grid_desc->pidx[1] = handle->rank / grid_desc->config.pdims[0];
+    } else {
+      grid_desc->pidx[0] = handle->rank / grid_desc->config.pdims[1];
+      grid_desc->pidx[1] = handle->rank % grid_desc->config.pdims[1];
+    }
     int color_row = grid_desc->pidx[0];
     MPI_Comm row_comm;
     CHECK_MPI(MPI_Comm_split(handle->mpi_comm, color_row, handle->rank, &row_comm));
@@ -721,8 +730,13 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes
     THROW_NOT_SUPPORTED("No valid decomposition found during autotuning with provided arguments.");
   }

-  grid_desc->pidx[0] = handle->rank / grid_desc->config.pdims[1];
-  grid_desc->pidx[1] = handle->rank % grid_desc->config.pdims[1];
+  if (handle->use_col_major_rank_order) {
+    grid_desc->pidx[0] = handle->rank % grid_desc->config.pdims[0];
+    grid_desc->pidx[1] = handle->rank / grid_desc->config.pdims[0];
+  } else {
+    grid_desc->pidx[0] = handle->rank / grid_desc->config.pdims[1];
+    grid_desc->pidx[1] = handle->rank % grid_desc->config.pdims[1];
+  }

   // Setup final row and column communicators
   int color_row = grid_desc->pidx[0];
@@ -1341,7 +1355,7 @@ cudecompResult_t cudecompGetShiftedRank(cudecompHandle_t handle, cudecompGridDes
     *shifted_rank = -1; // "null" case
   } else {
     int comm_peer = (shifted + grid_desc->config.pdims[comm_axis]) % grid_desc->config.pdims[comm_axis];
-    int global_peer = getGlobalRank(grid_desc, comm_axis, comm_peer);
+    int global_peer = getGlobalRank(handle, grid_desc, comm_axis, comm_peer);
     *shifted_rank = global_peer;
   }
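Because getCudecompEnvVars reads the variable while the handle is being initialized, it must be in the environment before cudecompInit is called. A hedged usage sketch, assuming the public cudecompInit/cudecompFinalize entry points of the cuDecomp C API; exporting the variable from the job script or via the MPI launcher is equivalent:

// Usage sketch (not part of the patch): enable column-major rank ordering
// before handle creation. Error handling abbreviated; assumes a POSIX setenv.
#include <cstdlib>
#include <mpi.h>
#include <cudecomp.h>

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);

  // Must be set before cudecompInit, which reads the environment once
  // via getCudecompEnvVars while setting up the handle.
  setenv("CUDECOMP_USE_COL_MAJOR_RANK_ORDER", "1", 1);

  cudecompHandle_t handle;
  if (cudecompInit(&handle, MPI_COMM_WORLD) != CUDECOMP_RESULT_SUCCESS) return 1;

  // ... grid descriptors created from this handle now place ranks in the
  // process grid in column-major order ...

  cudecompFinalize(handle);
  MPI_Finalize();
  return 0;
}

Note the parsing semantics in the hunk above: the flag is enabled only when std::strtol yields exactly 1, so values like "true" or "yes" leave the default row-major ordering in place.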