forked from NVIDIA-NeMo/RL
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathray.sub
More file actions
487 lines (436 loc) · 19.1 KB
/
ray.sub
File metadata and controls
487 lines (436 loc) · 19.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
#!/bin/bash
#SBATCH --nodes=2
#SBATCH --exclusive
#SBATCH --account=ACCOUNT
#SBATCH --job-name=JOB_NAME
#SBATCH --partition=PARTITION
#SBATCH --time=1:0:0
#SBATCH --dependency=singleton
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -eoux pipefail
########################################################
# Function to detect if SLURM cluster uses GRES
########################################################
maybe_gres_arg() {
  # Emit "--gres=gpu:N" when the SLURM partition advertises GPU GRES,
  # otherwise emit nothing. Assumes a homogeneous allocation (not a
  # heterogeneous job), so only the first matching GRES line is consulted.
  #
  # Globals (read): SLURM_JOB_PARTITION, GPUS_PER_NODE
  # Outputs: the srun --gres argument (or an empty string) on stdout
  # Exits: 1 if GPUS_PER_NODE does not fully claim the GPUs reported by GRES
  local gres
  # Query sinfo once instead of re-running it for every check.
  gres=$(sinfo -p "$SLURM_JOB_PARTITION" -h -o "%G" | grep "gpu:" | head -1 || true)
  if [[ -z "$gres" ]]; then
    # No GRES support detected
    echo ""
    return
  fi
  # GRES strings look like "gpu:8" or "gpu:a100:8"; the GPU count is the
  # last ':'-separated field.
  local gres_gpus=${gres##*:}
  # Quick assert that gpus:8 == gpus:$GPUS_PER_NODE. It is probably a user
  # error if someone isn't using GPUS_PER_NODE=8 on our clusters if the
  # partition supports --gres=gpu:8 or gpu:a100:8.
  if [[ "$GPUS_PER_NODE" -ne "$gres_gpus" ]]; then
    echo "Error: GPUS_PER_NODE=$GPUS_PER_NODE but GRES detected is $gres meaning GPUS_PER_NODE is not set to fully claim the GPUs on the nodes." >&2
    exit 1
  fi
  echo "--gres=gpu:${GPUS_PER_NODE}"
}
########################################################
# User defined variables
########################################################
# CONTAINER and MOUNTS are required: referencing them without a default
# aborts the job early under `set -u` rather than failing later inside srun.
CONTAINER=$CONTAINER
MOUNTS=$MOUNTS
COMMAND=${COMMAND:-} # This is a script relative to the SLURM_SUBMIT_DIR. If left empty, it will leave the cluster idle after it's brought up.
########################################################
# Ports for all nodes (should be odd numbers since we place head/worker[0] on the same node) so all workers get the odd ports, but the head will get +1 the ports
NODE_MANAGER_PORT=${NODE_MANAGER_PORT:-53001}
OBJECT_MANAGER_PORT=${OBJECT_MANAGER_PORT:-53003}
RUNTIME_ENV_AGENT_PORT=${RUNTIME_ENV_AGENT_PORT:-53005}
DASHBOARD_AGENT_GRPC_PORT=${DASHBOARD_AGENT_GRPC_PORT:-53007}
METRICS_EXPORT_PORT=${METRICS_EXPORT_PORT:-53009}
# Ports for the head node
PORT=${PORT:-54514}
RAY_CLIENT_SERVER_PORT=${RAY_CLIENT_SERVER_PORT:-10001}
#REDIT_SHARD_PORTS=${REDIT_SHARD_PORTS:-"random"} ??
DASHBOARD_PORT=${DASHBOARD_PORT:-8265} # Also used by debugger
DASHBOARD_AGENT_LISTEN_PORT=${DASHBOARD_AGENT_LISTEN_PORT:-52365}
# Opt into Ray's legacy debugger only when explicitly requested via RAY_DEBUG=legacy.
RAY_DEBUGGER_ARGS=
if [ "${RAY_DEBUG:-}" = "legacy" ]; then
RAY_DEBUGGER_ARGS="--ray-debugger-external"
fi
# After ray>=2.47, this feature is enabled by default which creates uv venvs for any py_executable starting with `uv run`.
# There is severe contention and performance issues with this enabled considering our dependencies are so large and occasionally
# need to be compiled, so NeMo RL has an implementation in nemo_rl/utils/venv.py that does it once per node as opposed to once per task.
export RAY_ENABLE_UV_RUN_RUNTIME_ENV=0
# Setting ulimit is recommended by ray best practices page
# @ https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html
# It's session based and won't affect the system outside the script
# Ensure that the soft limit isn't above the hard limit
if [[ $(ulimit -Hn) == "unlimited" ]] || [[ 65535 -lt $(ulimit -Hn) ]]; then
ulimit -Sn 65535
elif [[ $(ulimit -Hn) != "unlimited" ]] && [[ $(ulimit -Hn) -lt 65535 ]]; then
echo "[WARNING]: Cannot increase ulimit on file descriptors to 65535 according ray recommendation: https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html. Speak to cluster admins to increase, otherwise ray may crash unexpectedly."
fi
# On our clusters, the largest port range on an idle worker appeared between 52369-64607
# (not including the other ports set by this script). So this range is chosen to be
# somewhere in the middle
MIN_WORKER_PORT=${MIN_WORKER_PORT:-54001}
MAX_WORKER_PORT=${MAX_WORKER_PORT:-54513}
########################################################
# Number seconds to sync logs from /tmp/ray/session_*/logs to $LOG_DIR/ray/
RAY_LOG_SYNC_FREQUENCY=${RAY_LOG_SYNC_FREQUENCY:-}
########################################################
# Unset UV_CACHE_DIR to avoid local cache directory interferring with the container cache
unset UV_CACHE_DIR
# Optionally bind-mount a host uv cache into the container's default cache path.
if [[ -n "${UV_CACHE_DIR_OVERRIDE:-}" ]]; then
mkdir -p "$UV_CACHE_DIR_OVERRIDE"
if [[ -n $MOUNTS ]]; then
MOUNTS+=",$UV_CACHE_DIR_OVERRIDE:/root/.cache/uv"
else
MOUNTS="$UV_CACHE_DIR_OVERRIDE:/root/.cache/uv"
fi
fi
# Create logs directory
BASE_LOG_DIR=${BASE_LOG_DIR:-$SLURM_SUBMIT_DIR}
LOG_DIR="$BASE_LOG_DIR/$SLURM_JOB_ID-logs"
# NOTE(review): $LOG_DIR is unquoted here; this breaks if BASE_LOG_DIR
# contains spaces — confirm paths are space-free on all target clusters.
mkdir -p $LOG_DIR
# Number of GPUs per worker node
GPUS_PER_NODE=${GPUS_PER_NODE:-8}
# Detect GRES support and set GRES_ARG
GRES_ARG=$(maybe_gres_arg)
if [[ -n "$GRES_ARG" ]]; then
echo "[INFO] GRES support detected. Using: $GRES_ARG"
else
echo "[INFO] No GRES support detected. Running without --gres flag."
fi
# Arguments shared by every srun below (head, workers, driver probes).
COMMON_SRUN_ARGS="$GRES_ARG"
COMMON_SRUN_ARGS+=" --no-container-mount-home"
COMMON_SRUN_ARGS+=" --mpi=pmix"
COMMON_SRUN_ARGS+=" --container-mounts=$MOUNTS"
COMMON_SRUN_ARGS+=" --container-image=$CONTAINER"
COMMON_SRUN_ARGS+=" --container-workdir=$SLURM_SUBMIT_DIR"
# TODO: delete these (just for debugging)
COMMON_SRUN_ARGS+=" -p $SLURM_JOB_PARTITION"
COMMON_SRUN_ARGS+=" -A $SLURM_JOB_ACCOUNT"
# Number of CPUs per worker node
# (16 CPUs per GPU is a heuristic default — TODO confirm against actual node topology)
CPUS_PER_WORKER=${CPUS_PER_WORKER:-$((GPUS_PER_NODE * 16))}
# How many times the head/worker ray processes are relaunched before giving up.
num_retries=3
# Track backgrounded srun client PIDs for head and workers
declare -A SRUN_PIDS
# Verify all backgrounded srun client processes are still alive; exit fast if any died
check_srun_processes() {
  local srun_name srun_pid
  for srun_name in "${!SRUN_PIDS[@]}"; do
    srun_pid=${SRUN_PIDS[$srun_name]}
    # kill -0 sends no signal; it only probes whether the PID still exists.
    if kill -0 "$srun_pid" 2>/dev/null; then
      continue
    fi
    echo "[ERROR] Background srun '$srun_name' died (pid=$srun_pid). Could be a failure in startup or an issue with the node preventing the srun to start. Attempting to exit." >&2
    # Signal sidecars inside containers to terminate ASAP
    touch "$LOG_DIR/ENDED"
    exit 1
  done
}
# Getting the node names and IP addresses in the SLURM allocation
nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
# Hostnames contain no whitespace, so intentional word-splitting is safe here.
# shellcheck disable=SC2206
nodes_array=($nodes)
ip_addresses_array=()
# Resolve each hostname to an IP, falling back through progressively more
# primitive methods since clusters differ in which resolver tools exist.
for node in $nodes; do
  # Try multiple methods to get IP address - ENHANCED VERSION v2.0
  echo "[DEBUG] Resolving hostname: $node using enhanced resolution methods"
  ip_address=""
  # Method 1: Try host command
  echo "[DEBUG] Method 1: host command"
  ip_address=$(host "$node" 2>/dev/null | awk '/has address/ { print $4 }' | head -1 || true)
  echo "[DEBUG] host result: '$ip_address'"
  # Method 2: If host fails, try getent
  if [[ -z "$ip_address" ]]; then
    echo "[DEBUG] Method 2: getent hosts"
    ip_address=$(getent hosts "$node" 2>/dev/null | awk '{ print $1 }' | head -1 || true)
    echo "[DEBUG] getent result: '$ip_address'"
  fi
  # Method 3: If getent fails, try nslookup
  if [[ -z "$ip_address" ]]; then
    echo "[DEBUG] Method 3: nslookup"
    ip_address=$(nslookup "$node" 2>/dev/null | awk '/^Address: / { print $2 }' | head -1 || true)
    echo "[DEBUG] nslookup result: '$ip_address'"
  fi
  # Method 4: If all DNS methods fail, try ping to extract IP
  if [[ -z "$ip_address" ]]; then
    echo "[DEBUG] Method 4: ping"
    ip_address=$(ping -c 1 "$node" 2>/dev/null | grep "PING" | sed 's/.*(\([^)]*\)).*/\1/' || true)
    echo "[DEBUG] ping result: '$ip_address'"
  fi
  # If still no IP, use the hostname itself (might work if it's already an IP or resolvable)
  if [[ -z "$ip_address" ]]; then
    echo "[WARNING] Could not resolve IP for $node, using hostname as fallback"
    ip_address=$node
  fi
  echo "[INFO] Node: $node -> IP: $ip_address"
  # Add the IP address to the array
  ip_addresses_array+=("$ip_address")
done
# The head runs on the first node of the allocation; workers on the rest.
head_node=${nodes_array[0]}
head_node_ip=${ip_addresses_array[0]}
ip_head=$head_node_ip:$PORT
# First we start the head of the ray cluster on one of the physical nodes
# Give the head node actual resources to make it schedulable
# NOTE: inside this heredoc, bare $VAR is expanded NOW (at submit time on the
# login/batch node), while \$VAR survives into the script executed inside the
# container. Keep that distinction in mind when editing.
head_cmd=$(cat <<EOF
# Touch a file to indicate that the head node has started
# Overlapping srun commands will check this file to determine if we can overlap a container command
touch $LOG_DIR/STARTED_RAY_HEAD
env
exit-dramatically() {
# Use SIGTERM to forcefully terminate the srun process
pkill -P $$ || true
kill -TERM 0 || true
# As a last resort, exit with a non-zero code
exit 1
}
export -f exit-dramatically
# Background process to check for ENDED file
monitor-sidecar() {
set +x
while true; do
sleep 60
if [[ -f "$LOG_DIR/ENDED" ]]; then
echo "Detected ENDED file, terminating..."
exit-dramatically
fi
done
}
monitor-sidecar &
# Background process to sync ray logs every $RAY_LOG_SYNC_FREQUENCY seconds
log-sync-sidecar() {
set +x
if [[ -z "$RAY_LOG_SYNC_FREQUENCY" ]]; then
echo "RAY_LOG_SYNC_FREQUENCY is not set, skipping log sync sidecar"
return
fi
mkdir -p $LOG_DIR/ray
while true; do
sleep $RAY_LOG_SYNC_FREQUENCY
if ls /tmp/ray/session_[0-9]* > /dev/null 2>&1; then
for session_dir in /tmp/ray/session_[0-9]*/; do
if [[ -d "\$session_dir/logs" ]]; then
session_name=\$(basename "\$session_dir")
mkdir -p "$LOG_DIR/ray/\$session_name"
if command -v rsync > /dev/null 2>&1; then
rsync -ahP "\$session_dir/logs/" "$LOG_DIR/ray/\$session_name/logs/" 2>/dev/null || true
else
cp -r "\$session_dir/logs" "$LOG_DIR/ray/\$session_name/"
fi
fi
done
fi
if [[ -f "$LOG_DIR/ENDED" ]]; then
echo "Log sync sidecar terminating..."
break
fi
done
}
log-sync-sidecar &
# Patch nsight.py before starting Ray head
sed -i 's/context\.py_executable = " "\.join(self\.nsight_cmd) + " python"/context.py_executable = " ".join(self.nsight_cmd) + f" {context.py_executable}"/g' /opt/nemo_rl_venv/lib64/python*/site-packages/ray/_private/runtime_env/nsight.py
cat <<EOFINNER | tee /launch-head.sh
ray start --head \
    --disable-usage-stats \
    --resources="{\"worker_units\": $GPUS_PER_NODE, \"slurm_managed_ray_cluster\": 1}" \
    --node-ip-address="$head_node_ip" \
    --port=${PORT} \
    --ray-client-server-port=${RAY_CLIENT_SERVER_PORT} \
    --dashboard-port=${DASHBOARD_PORT} \
    --dashboard-host="$head_node_ip" \
    --include-dashboard=True \
    \
    --node-manager-port=$((${NODE_MANAGER_PORT} + 1)) \
    --object-manager-port=$((${OBJECT_MANAGER_PORT} + 1)) \
    --runtime-env-agent-port=$((${RUNTIME_ENV_AGENT_PORT} + 1)) \
    --dashboard-agent-grpc-port=$((${DASHBOARD_AGENT_GRPC_PORT} + 1)) \
    --dashboard-agent-listen-port=$((${DASHBOARD_AGENT_LISTEN_PORT} + 1)) \
    --metrics-export-port=$((${METRICS_EXPORT_PORT} + 1)) \
    $RAY_DEBUGGER_ARGS \
    \
    --block
EOFINNER
chmod +x /launch-head.sh
count=0
while [[ \$count -lt $num_retries ]]; do
bash /launch-head.sh
count=\$((count+1))
echo "Head node failed \$count/$num_retries times, restarting in 5 seconds..."
sleep 5
done
touch $LOG_DIR/ENDED
exit 1
EOF
)
# Launch the head container in the background and remember its srun PID so
# check_srun_processes can detect an early death.
srun $COMMON_SRUN_ARGS --container-name=ray-head --nodes=1 --ntasks=1 --cpus-per-task=$CPUS_PER_WORKER -w "$head_node" -o $LOG_DIR/ray-head.log bash -x -c "$head_cmd" &
SRUN_PIDS["ray-head"]=$!
# Total "worker_units" expected cluster-wide: one per GPU across all nodes.
NUM_ACTORS=$((GPUS_PER_NODE * SLURM_JOB_NUM_NODES))
# Start Ray worker nodes
# We want 1 Ray worker node per physical node (excluding the head node)
# Worker nodes are started with ray start but without the --head flag
# Start from node 1 since node 0 is running the head
for ((i = 1; i < SLURM_JOB_NUM_NODES; i++)); do
node_i=${nodes_array[$i]}
# Same heredoc convention as the head: bare $VAR expands at submit time,
# \$VAR is deferred to the script running inside the container.
worker_cmd=$(cat <<EOF
env
exit-dramatically() {
# Use SIGTERM to forcefully terminate the srun process
pkill -P $$ || true
kill -TERM 0 || true
# As a last resort, exit with a non-zero code
exit 1
}
# Background process to check for ENDED file
monitor-sidecar() {
set +x
while true; do
sleep 60
if [[ -f "$LOG_DIR/ENDED" ]]; then
echo "Detected ENDED file, terminating..."
exit-dramatically
fi
done
}
monitor-sidecar &
# Background process to sync ray logs every $RAY_LOG_SYNC_FREQUENCY seconds
log-sync-sidecar() {
set +x
if [[ -z "$RAY_LOG_SYNC_FREQUENCY" ]]; then
echo "RAY_LOG_SYNC_FREQUENCY is not set, skipping log sync sidecar"
return
fi
mkdir -p $LOG_DIR/ray/$node_i
while true; do
sleep $RAY_LOG_SYNC_FREQUENCY
if ls /tmp/ray/session_[0-9]* > /dev/null 2>&1; then
for session_dir in /tmp/ray/session_[0-9]*/; do
if [[ -d "\$session_dir/logs" ]]; then
session_name=\$(basename "\$session_dir")
mkdir -p "$LOG_DIR/ray/$node_i/\$session_name"
if command -v rsync > /dev/null 2>&1; then
rsync -ahP "\$session_dir/logs/" $LOG_DIR/ray/$node_i/\$session_name/logs/ 2>/dev/null || true
else
cp -r "\$session_dir/logs" $LOG_DIR/ray/$node_i/\$session_name/
fi
fi
done
fi
if [[ -f "$LOG_DIR/ENDED" ]]; then
echo "Log sync sidecar terminating..."
break
fi
done
}
log-sync-sidecar &
# Patch nsight.py before starting Ray worker
sed -i 's/context\.py_executable = " "\.join(self\.nsight_cmd) + " python"/context.py_executable = " ".join(self.nsight_cmd) + f" {context.py_executable}"/g' /opt/nemo_rl_venv/lib64/python*/site-packages/ray/_private/runtime_env/nsight.py
cat <<EOFINNER | tee /launch-worker.sh
ray start --address "$ip_head" \
    --disable-usage-stats \
    --resources="{\"worker_units\": $GPUS_PER_NODE, \"slurm_managed_ray_cluster\": 1}" \
    --min-worker-port=${MIN_WORKER_PORT} \
    --max-worker-port=${MAX_WORKER_PORT} \
    \
    --node-manager-port=${NODE_MANAGER_PORT} \
    --object-manager-port=${OBJECT_MANAGER_PORT} \
    --runtime-env-agent-port=${RUNTIME_ENV_AGENT_PORT} \
    --dashboard-agent-grpc-port=${DASHBOARD_AGENT_GRPC_PORT} \
    --dashboard-agent-listen-port=${DASHBOARD_AGENT_LISTEN_PORT} \
    --metrics-export-port=${METRICS_EXPORT_PORT} \
    $RAY_DEBUGGER_ARGS \
    \
    --block
EOFINNER
count=0
while [[ \$count -lt $num_retries ]]; do
bash /launch-worker.sh
count=\$((count+1))
echo "Worker failed \$count/$num_retries times, restarting in 5 seconds..."
sleep 5
done
touch $LOG_DIR/ENDED
exit 1
EOF
)
# Launch this worker's container in the background and track its srun PID.
srun $COMMON_SRUN_ARGS --container-name=ray-worker-$i --exact --nodes=1 --ntasks=1 --cpus-per-task=$CPUS_PER_WORKER -w "$node_i" -o $LOG_DIR/ray-worker-$i.log bash -x -c "$worker_cmd" &
SRUN_PIDS["ray-worker-$i"]=$!
# Brief stagger between worker launches — presumably to avoid hammering the
# scheduler/image cache with simultaneous container starts; TODO confirm.
sleep 3
done
# Then we wait here for the file to be created by the head node container
while check_srun_processes && ! srun --overlap --nodes=1 --ntasks=1 -w $head_node test -f $LOG_DIR/STARTED_RAY_HEAD; do
echo "[INFO][$(date)] Waiting for head node container to start..."
sleep 2
done
# At this stage the Ray cluster bringup has started on the physical nodes in the allocation
# Before we launch a job on this cluster we need to make sure that the bringup is complete
# We do so by querying the number of worker_units in the ray cluster and asserting = NUM_ACTORS
# Query the Ray head for how many "worker_units" (one per GPU) are registered.
# `ray status` prints a resource usage line like " 0.0/16.0 worker_units";
# the cluster capacity is the integer after the '/'.
# Globals (read): head_node
# Outputs: the worker_units capacity on stdout, or 0 if not yet reported.
extract_worker_units() {
  local status_output
  status_output=$(srun --overlap --container-name=ray-head --nodes=1 --ntasks=1 -w "$head_node" ray status)
  if echo "$status_output" | grep -q "worker_units"; then
    # awk strips leading blanks before field splitting, so parsing $1 is
    # robust to indentation changes in ray's output — unlike the previous
    # fixed field index, which miscounted if the leading whitespace varied.
    echo "$status_output" | awk '/worker_units/ { split($1, parts, "[/.]"); print parts[3]; exit }'
  else
    echo 0
  fi
}
# Poll to make sure that all Ray worker nodes have connected to the head.
# All workers have connected when number of GPUs in ray cluster
# is equal to NUM_ACTORS. We use the utility function above
# to check how many GPUs have come online in the ray cluster
while true; do
worker_units=$(extract_worker_units)
echo "[INFO] Number of actors online: $worker_units/$NUM_ACTORS"
if [[ "$worker_units" -eq "$NUM_ACTORS" ]]; then
break
fi
check_srun_processes
sleep 2
done
echo "All workers connected!"
# We can now launch a job on this cluster
# We do so by launching a driver process on the physical node that the head node is on
# This driver process is responsible for launching a job on the Ray cluster
# Resolve the job's working directory from scontrol so the driver runs from
# the directory the job was submitted from.
CONTAINER_CWD=$(scontrol show job $SLURM_JOB_ID | grep -oP 'WorkDir=\K[^ ]+' | head -1)
if [[ -n "$COMMAND" ]]; then
srun --no-container-mount-home --overlap --container-name=ray-head --container-workdir=$CONTAINER_CWD --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/ray-driver.log bash -c "$COMMAND"
else
echo "[INFO]: Ray Cluster is idled, run this on the slurm head node to get a shell to the head node:"
# Generate a helper script that attaches an interactive (or COMMAND-driven)
# shell to the running head/worker containers. As with the launch heredocs,
# bare $VAR is baked in now and \$VAR is evaluated when the helper runs.
cat <<EOF >$SLURM_SUBMIT_DIR/${SLURM_JOB_ID}-attach.sh
# No args launches on the head node (node 0)
# Args 1-N launch on worker nodes (nodes 1 through N-1)
# Optional: set COMMAND='...' to run non-interactively instead of opening an interactive shell
WORKER_NUM=\${1:-}
if [[ -z "\$WORKER_NUM" ]]; then
# Empty means we are on the head node
if [[ -n "\${COMMAND:-}" ]]; then
srun --no-container-mount-home $GRES_ARG -A $SLURM_JOB_ACCOUNT -p $SLURM_JOB_PARTITION --overlap --container-name=ray-head --container-workdir=$CONTAINER_CWD --nodes=1 --ntasks=1 -w "$head_node" --jobid $SLURM_JOB_ID bash -c "\$COMMAND"
else
srun --no-container-mount-home $GRES_ARG -A $SLURM_JOB_ACCOUNT -p $SLURM_JOB_PARTITION --overlap --container-name=ray-head --container-workdir=$CONTAINER_CWD --nodes=1 --ntasks=1 -w "$head_node" --jobid $SLURM_JOB_ID --pty bash
fi
else
# Worker numbers 1 through N-1 correspond to ray-worker-1 through ray-worker-(N-1)
# and use nodes_array[1] through nodes_array[N-1]
if [[ \$WORKER_NUM -lt 1 || \$WORKER_NUM -ge $SLURM_JOB_NUM_NODES ]]; then
echo "Error: WORKER_NUM must be between 1 and $((SLURM_JOB_NUM_NODES-1))"
exit 1
fi
nodes_array=($nodes)
if [[ -n "\${COMMAND:-}" ]]; then
srun --no-container-mount-home $GRES_ARG -A $SLURM_JOB_ACCOUNT -p $SLURM_JOB_PARTITION --overlap --container-name=ray-worker-\$WORKER_NUM --container-workdir=$CONTAINER_CWD --nodes=1 --ntasks=1 -w "\${nodes_array[\$WORKER_NUM]}" --jobid $SLURM_JOB_ID bash -c "\$COMMAND"
else
srun --no-container-mount-home $GRES_ARG -A $SLURM_JOB_ACCOUNT -p $SLURM_JOB_PARTITION --overlap --container-name=ray-worker-\$WORKER_NUM --container-workdir=$CONTAINER_CWD --nodes=1 --ntasks=1 -w "\${nodes_array[\$WORKER_NUM]}" --jobid $SLURM_JOB_ID --pty bash
fi
fi
EOF
chmod +x $SLURM_SUBMIT_DIR/${SLURM_JOB_ID}-attach.sh
echo " COMMAND='echo hello' bash $SLURM_SUBMIT_DIR/${SLURM_JOB_ID}-attach.sh # run a non-interactive command on head node"
echo " bash $SLURM_SUBMIT_DIR/${SLURM_JOB_ID}-attach.sh # to attach to head node (i.e., 'worker 0')"
echo " bash $SLURM_SUBMIT_DIR/${SLURM_JOB_ID}-attach.sh 1 # to attach to worker 1"
echo " bash $SLURM_SUBMIT_DIR/${SLURM_JOB_ID}-attach.sh 2 # to attach to worker 2, etc."
# Keep the allocation alive indefinitely so the idle cluster stays up until
# the job is cancelled or hits its time limit.
sleep infinity
fi