From 786bfc5cdf2cb165ebf437149dbf38bed653f608 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Fri, 27 Feb 2026 15:01:30 -0800 Subject: [PATCH 1/2] Add --mpi=pmix to COMMAND srun in ray template Signed-off-by: Abhishree --- nemo_run/run/ray/templates/ray.sub.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_run/run/ray/templates/ray.sub.j2 b/nemo_run/run/ray/templates/ray.sub.j2 index 8d510cb2..9c6b3ec7 100644 --- a/nemo_run/run/ray/templates/ray.sub.j2 +++ b/nemo_run/run/ray/templates/ray.sub.j2 @@ -454,7 +454,7 @@ COMMAND="${COMMAND:-{{ command | default('', true) }}}" COMMAND_WORKDIR={{ command_workdir | default('$CONTAINER_CWD') }} if [[ -n "$COMMAND" ]]; then - srun {% if heterogeneous %}--het-group=0 {% endif %}--no-container-mount-home --gpus=0 --overlap --container-name=ray-head --container-workdir=$COMMAND_WORKDIR --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/{{ ray_log_prefix }}job.log bash -c "$COMMAND" + srun {% if heterogeneous %}--het-group=0 {% endif %}--no-container-mount-home --mpi=pmix --gpus=0 --overlap --container-name=ray-head --container-workdir=$COMMAND_WORKDIR --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/{{ ray_log_prefix }}job.log bash -c "$COMMAND" else echo "[INFO]: Ray Cluster is idled, run this on the slurm head node to get a shell to the head node:" cat <$CLUSTER_DIR/scripts/${SLURM_JOB_ID}-attach.sh From b38e09c1b88bdaea40526c2a547d0e68c3166290 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Mon, 2 Mar 2026 15:12:30 -0800 Subject: [PATCH 2/2] Update test artifacts with --mpi=pmix Signed-off-by: Abhishree --- test/core/execution/artifacts/expected_ray_cluster.sub | 4 ++-- test/core/execution/artifacts/expected_ray_cluster_ssh.sub | 4 ++-- test/core/execution/artifacts/expected_ray_het_cluster.sub | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/test/core/execution/artifacts/expected_ray_cluster.sub b/test/core/execution/artifacts/expected_ray_cluster.sub index 981640bc..2c6705e5 100644 --- a/test/core/execution/artifacts/expected_ray_cluster.sub +++ b/test/core/execution/artifacts/expected_ray_cluster.sub @@ -423,7 +423,7 @@ COMMAND="${COMMAND:-python train.py}" COMMAND_WORKDIR=/workspace if [[ -n "$COMMAND" ]]; then - srun --no-container-mount-home --gpus=0 --overlap --container-name=ray-head --container-workdir=$COMMAND_WORKDIR --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/ray-job.log bash -c "$COMMAND" + srun --no-container-mount-home --mpi=pmix --gpus=0 --overlap --container-name=ray-head --container-workdir=$COMMAND_WORKDIR --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/ray-job.log bash -c "$COMMAND" else echo "[INFO]: Ray Cluster is idled, run this on the slurm head node to get a shell to the head node:" cat <$CLUSTER_DIR/scripts/${SLURM_JOB_ID}-attach.sh @@ -440,4 +440,4 @@ EOF chmod +x $CLUSTER_DIR/scripts/${SLURM_JOB_ID}-attach.sh echo " bash $CLUSTER_DIR/scripts/${SLURM_JOB_ID}-attach.sh" sleep infinity -fi \ No newline at end of file +fi diff --git a/test/core/execution/artifacts/expected_ray_cluster_ssh.sub b/test/core/execution/artifacts/expected_ray_cluster_ssh.sub index e0d21e39..974ae42d 100644 --- a/test/core/execution/artifacts/expected_ray_cluster_ssh.sub +++ b/test/core/execution/artifacts/expected_ray_cluster_ssh.sub @@ -430,7 +430,7 @@ COMMAND="${COMMAND:-ray job submit --address ray://localhost:10001 --job-id trai COMMAND_WORKDIR=/workspace/training if [[ -n "$COMMAND" ]]; then - srun --no-container-mount-home --gpus=0 --overlap --container-name=ray-head --container-workdir=$COMMAND_WORKDIR --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/ray-job.log bash -c "$COMMAND" + srun --no-container-mount-home --mpi=pmix --gpus=0 --overlap --container-name=ray-head --container-workdir=$COMMAND_WORKDIR --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/ray-job.log bash -c "$COMMAND" else echo "[INFO]: Ray Cluster is idled, run this on the slurm head node to get a shell to the head node:" cat <$CLUSTER_DIR/scripts/${SLURM_JOB_ID}-attach.sh @@ -447,4 +447,4 @@ EOF chmod +x $CLUSTER_DIR/scripts/${SLURM_JOB_ID}-attach.sh echo " bash $CLUSTER_DIR/scripts/${SLURM_JOB_ID}-attach.sh" sleep infinity -fi \ No newline at end of file +fi diff --git a/test/core/execution/artifacts/expected_ray_het_cluster.sub b/test/core/execution/artifacts/expected_ray_het_cluster.sub index 9a1a0cd0..9a8666bc 100644 --- a/test/core/execution/artifacts/expected_ray_het_cluster.sub +++ b/test/core/execution/artifacts/expected_ray_het_cluster.sub @@ -463,7 +463,7 @@ COMMAND="${COMMAND:-}" COMMAND_WORKDIR=None if [[ -n "$COMMAND" ]]; then - srun --het-group=0 --no-container-mount-home --gpus=0 --overlap --container-name=ray-head --container-workdir=$COMMAND_WORKDIR --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/ray-job.log bash -c "$COMMAND" + srun --het-group=0 --no-container-mount-home --mpi=pmix --gpus=0 --overlap --container-name=ray-head --container-workdir=$COMMAND_WORKDIR --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/ray-job.log bash -c "$COMMAND" else echo "[INFO]: Ray Cluster is idled, run this on the slurm head node to get a shell to the head node:" cat <$CLUSTER_DIR/scripts/${SLURM_JOB_ID}-attach.sh @@ -480,4 +480,4 @@ EOF chmod +x $CLUSTER_DIR/scripts/${SLURM_JOB_ID}-attach.sh echo " bash $CLUSTER_DIR/scripts/${SLURM_JOB_ID}-attach.sh" sleep infinity -fi \ No newline at end of file +fi