forked from NVIDIA-NeMo/RL
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathray.sub
More file actions
487 lines (436 loc) · 19.1 KB
/
ray.sub
File metadata and controls
487 lines (436 loc) · 19.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
#!/bin/bash
#SBATCH --nodes=2
#SBATCH --exclusive
#SBATCH --account=ACCOUNT
#SBATCH --job-name=JOB_NAME
#SBATCH --partition=PARTITION
#SBATCH --time=1:0:0
#SBATCH --dependency=singleton
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -eoux pipefail
########################################################
# Function to detect if SLURM cluster uses GRES
########################################################
maybe_gres_arg() {
  # Emit "--gres=gpu:N" when the SLURM partition advertises GPU GRES,
  # otherwise emit nothing. Assumes a homogeneous allocation (not a
  # heterogeneous job), so only the first matching GRES line is consulted.
  #
  # Globals (read): SLURM_JOB_PARTITION, GPUS_PER_NODE
  # Outputs: the srun --gres argument (or an empty string) on stdout
  # Exits: 1 if GPUS_PER_NODE does not fully claim the GPUs reported by GRES
  local gres
  # Query sinfo once instead of re-running it for every check.
  gres=$(sinfo -p "$SLURM_JOB_PARTITION" -h -o "%G" | grep "gpu:" | head -1 || true)
  if [[ -z "$gres" ]]; then
    # No GRES support detected
    echo ""
    return
  fi
  # GRES strings look like "gpu:8" or "gpu:a100:8"; the GPU count is the
  # last ':'-separated field.
  local gres_gpus=${gres##*:}
  # Quick assert that gpus:8 == gpus:$GPUS_PER_NODE. It is probably a user
  # error if someone isn't using GPUS_PER_NODE=8 on our clusters if the
  # partition supports --gres=gpu:8 or gpu:a100:8.
  if [[ "$GPUS_PER_NODE" -ne "$gres_gpus" ]]; then
    echo "Error: GPUS_PER_NODE=$GPUS_PER_NODE but GRES detected is $gres meaning GPUS_PER_NODE is not set to fully claim the GPUs on the nodes." >&2
    exit 1
  fi
  echo "--gres=gpu:${GPUS_PER_NODE}"
}
########################################################
# User defined variables
########################################################
# CONTAINER and MOUNTS are required: referencing them without a default
# aborts the job early under `set -u` rather than failing later inside srun.
CONTAINER=$CONTAINER
MOUNTS=$MOUNTS
COMMAND=${COMMAND:-} # This is a script relative to the SLURM_SUBMIT_DIR. If left empty, it will leave the cluster idle after it's brought up.
########################################################
# Ports for all nodes (should be odd numbers since we place head/worker[0] on the same node) so all workers get the odd ports, but the head will get +1 the ports
NODE_MANAGER_PORT=${NODE_MANAGER_PORT:-53001}
OBJECT_MANAGER_PORT=${OBJECT_MANAGER_PORT:-53003}
RUNTIME_ENV_AGENT_PORT=${RUNTIME_ENV_AGENT_PORT:-53005}
DASHBOARD_AGENT_GRPC_PORT=${DASHBOARD_AGENT_GRPC_PORT:-53007}
METRICS_EXPORT_PORT=${METRICS_EXPORT_PORT:-53009}
# Ports for the head node
PORT=${PORT:-54514}
RAY_CLIENT_SERVER_PORT=${RAY_CLIENT_SERVER_PORT:-10001}
#REDIT_SHARD_PORTS=${REDIT_SHARD_PORTS:-"random"} ??
DASHBOARD_PORT=${DASHBOARD_PORT:-8265} # Also used by debugger
DASHBOARD_AGENT_LISTEN_PORT=${DASHBOARD_AGENT_LISTEN_PORT:-52365}
# Opt into Ray's legacy debugger only when explicitly requested via RAY_DEBUG=legacy.
RAY_DEBUGGER_ARGS=
if [ "${RAY_DEBUG:-}" = "legacy" ]; then
RAY_DEBUGGER_ARGS="--ray-debugger-external"
fi
# After ray>=2.47, this feature is enabled by default which creates uv venvs for any py_executable starting with `uv run`.
# There is severe contention and performance issues with this enabled considering our dependencies are so large and occasionally
# need to be compiled, so NeMo RL has an implementation in nemo_rl/utils/venv.py that does it once per node as opposed to once per task.
export RAY_ENABLE_UV_RUN_RUNTIME_ENV=0
# Setting ulimit is recommended by ray best practices page
# @ https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html
# It's session based and won't affect the system outside the script
# Ensure that the soft limit isn't above the hard limit
if [[ $(ulimit -Hn) == "unlimited" ]] || [[ 65535 -lt $(ulimit -Hn) ]]; then
ulimit -Sn 65535
elif [[ $(ulimit -Hn) != "unlimited" ]] && [[ $(ulimit -Hn) -lt 65535 ]]; then
echo "[WARNING]: Cannot increase ulimit on file descriptors to 65535 according ray recommendation: https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html. Speak to cluster admins to increase, otherwise ray may crash unexpectedly."
fi
# On our clusters, the largest port range on an idle worker appeared between 52369-64607
# (not including the other ports set by this script). So this range is chosen to be
# somewhere in the middle
MIN_WORKER_PORT=${MIN_WORKER_PORT:-54001}
MAX_WORKER_PORT=${MAX_WORKER_PORT:-54513}
########################################################
# Number seconds to sync logs from /tmp/ray/session_*/logs to $LOG_DIR/ray/
RAY_LOG_SYNC_FREQUENCY=${RAY_LOG_SYNC_FREQUENCY:-}
########################################################
# Unset UV_CACHE_DIR to avoid local cache directory interferring with the container cache
unset UV_CACHE_DIR
# Optionally bind-mount a host uv cache into the container's default cache path.
if [[ -n "${UV_CACHE_DIR_OVERRIDE:-}" ]]; then
mkdir -p "$UV_CACHE_DIR_OVERRIDE"
if [[ -n $MOUNTS ]]; then
MOUNTS+=",$UV_CACHE_DIR_OVERRIDE:/root/.cache/uv"
else
MOUNTS="$UV_CACHE_DIR_OVERRIDE:/root/.cache/uv"
fi
fi
# Create logs directory
BASE_LOG_DIR=${BASE_LOG_DIR:-$SLURM_SUBMIT_DIR}
LOG_DIR="$BASE_LOG_DIR/$SLURM_JOB_ID-logs"
# NOTE(review): $LOG_DIR is unquoted here; this breaks if BASE_LOG_DIR
# contains spaces — confirm paths are space-free on all target clusters.
mkdir -p $LOG_DIR
# Number of GPUs per worker node
GPUS_PER_NODE=${GPUS_PER_NODE:-8}
# Detect GRES support and set GRES_ARG
GRES_ARG=$(maybe_gres_arg)
if [[ -n "$GRES_ARG" ]]; then
echo "[INFO] GRES support detected. Using: $GRES_ARG"
else
echo "[INFO] No GRES support detected. Running without --gres flag."
fi
# Arguments shared by every srun below (head, workers, driver probes).
COMMON_SRUN_ARGS="$GRES_ARG"
COMMON_SRUN_ARGS+=" --no-container-mount-home"
COMMON_SRUN_ARGS+=" --mpi=pmix"
COMMON_SRUN_ARGS+=" --container-mounts=$MOUNTS"
COMMON_SRUN_ARGS+=" --container-image=$CONTAINER"
COMMON_SRUN_ARGS+=" --container-workdir=$SLURM_SUBMIT_DIR"
# TODO: delete these (just for debugging)
COMMON_SRUN_ARGS+=" -p $SLURM_JOB_PARTITION"
COMMON_SRUN_ARGS+=" -A $SLURM_JOB_ACCOUNT"
# Number of CPUs per worker node
# (16 CPUs per GPU is a heuristic default — TODO confirm against actual node topology)
CPUS_PER_WORKER=${CPUS_PER_WORKER:-$((GPUS_PER_NODE * 16))}
# How many times the head/worker ray processes are relaunched before giving up.
num_retries=3
# Track backgrounded srun client PIDs for head and workers
declare -A SRUN_PIDS
# Verify all backgrounded srun client processes are still alive; exit fast if any died
check_srun_processes() {
  local srun_name srun_pid
  for srun_name in "${!SRUN_PIDS[@]}"; do
    srun_pid=${SRUN_PIDS[$srun_name]}
    # kill -0 sends no signal; it only probes whether the PID still exists.
    if kill -0 "$srun_pid" 2>/dev/null; then
      continue
    fi
    echo "[ERROR] Background srun '$srun_name' died (pid=$srun_pid). Could be a failure in startup or an issue with the node preventing the srun to start. Attempting to exit." >&2
    # Signal sidecars inside containers to terminate ASAP
    touch "$LOG_DIR/ENDED"
    exit 1
  done
}
# Getting the node names and IP addresses in the SLURM allocation
nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
# Hostnames contain no whitespace, so intentional word-splitting is safe here.
# shellcheck disable=SC2206
nodes_array=($nodes)
ip_addresses_array=()
# Resolve each hostname to an IP, falling back through progressively more
# primitive methods since clusters differ in which resolver tools exist.
for node in $nodes; do
  # Try multiple methods to get IP address - ENHANCED VERSION v2.0
  echo "[DEBUG] Resolving hostname: $node using enhanced resolution methods"
  ip_address=""
  # Method 1: Try host command
  echo "[DEBUG] Method 1: host command"
  ip_address=$(host "$node" 2>/dev/null | awk '/has address/ { print $4 }' | head -1 || true)
  echo "[DEBUG] host result: '$ip_address'"
  # Method 2: If host fails, try getent
  if [[ -z "$ip_address" ]]; then
    echo "[DEBUG] Method 2: getent hosts"
    ip_address=$(getent hosts "$node" 2>/dev/null | awk '{ print $1 }' | head -1 || true)
    echo "[DEBUG] getent result: '$ip_address'"
  fi
  # Method 3: If getent fails, try nslookup
  if [[ -z "$ip_address" ]]; then
    echo "[DEBUG] Method 3: nslookup"
    ip_address=$(nslookup "$node" 2>/dev/null | awk '/^Address: / { print $2 }' | head -1 || true)
    echo "[DEBUG] nslookup result: '$ip_address'"
  fi
  # Method 4: If all DNS methods fail, try ping to extract IP
  if [[ -z "$ip_address" ]]; then
    echo "[DEBUG] Method 4: ping"
    ip_address=$(ping -c 1 "$node" 2>/dev/null | grep "PING" | sed 's/.*(\([^)]*\)).*/\1/' || true)
    echo "[DEBUG] ping result: '$ip_address'"
  fi
  # If still no IP, use the hostname itself (might work if it's already an IP or resolvable)
  if [[ -z "$ip_address" ]]; then
    echo "[WARNING] Could not resolve IP for $node, using hostname as fallback"
    ip_address=$node
  fi
  echo "[INFO] Node: $node -> IP: $ip_address"
  # Add the IP address to the array
  ip_addresses_array+=("$ip_address")
done
# The head runs on the first node of the allocation; workers on the rest.
head_node=${nodes_array[0]}
head_node_ip=${ip_addresses_array[0]}
ip_head=$head_node_ip:$PORT
# First we start the head of the ray cluster on one of the physical nodes
# Give the head node actual resources to make it schedulable
# NOTE: inside this heredoc, bare $VAR is expanded NOW (at submit time on the
# login/batch node), while \$VAR survives into the script executed inside the
# container. Keep that distinction in mind when editing.
head_cmd=$(cat <<EOF
# Touch a file to indicate that the head node has started
# Overlapping srun commands will check this file to determine if we can overlap a container command
touch $LOG_DIR/STARTED_RAY_HEAD
env
exit-dramatically() {
# Use SIGTERM to forcefully terminate the srun process
pkill -P $$ || true
kill -TERM 0 || true
# As a last resort, exit with a non-zero code
exit 1
}
export -f exit-dramatically
# Background process to check for ENDED file
monitor-sidecar() {
set +x
while true; do
sleep 60
if [[ -f "$LOG_DIR/ENDED" ]]; then
echo "Detected ENDED file, terminating..."
exit-dramatically
fi
done
}
monitor-sidecar &
# Background process to sync ray logs every $RAY_LOG_SYNC_FREQUENCY seconds
log-sync-sidecar() {
set +x
if [[ -z "$RAY_LOG_SYNC_FREQUENCY" ]]; then
echo "RAY_LOG_SYNC_FREQUENCY is not set, skipping log sync sidecar"
return
fi
mkdir -p $LOG_DIR/ray
while true; do
sleep $RAY_LOG_SYNC_FREQUENCY
if ls /tmp/ray/session_[0-9]* > /dev/null 2>&1; then
for session_dir in /tmp/ray/session_[0-9]*/; do
if [[ -d "\$session_dir/logs" ]]; then
session_name=\$(basename "\$session_dir")
mkdir -p "$LOG_DIR/ray/\$session_name"
if command -v rsync > /dev/null 2>&1; then
rsync -ahP "\$session_dir/logs/" "$LOG_DIR/ray/\$session_name/logs/" 2>/dev/null || true
else
cp -r "\$session_dir/logs" "$LOG_DIR/ray/\$session_name/"
fi
fi
done
fi
if [[ -f "$LOG_DIR/ENDED" ]]; then
echo "Log sync sidecar terminating..."
break
fi
done
}
log-sync-sidecar &
# Patch nsight.py before starting Ray head
sed -i 's/context\.py_executable = " "\.join(self\.nsight_cmd) + " python"/context.py_executable = " ".join(self.nsight_cmd) + f" {context.py_executable}"/g' /opt/nemo_rl_venv/lib64/python*/site-packages/ray/_private/runtime_env/nsight.py
cat <<EOFINNER | tee /launch-head.sh
ray start --head \
    --disable-usage-stats \
    --resources="{\"worker_units\": $GPUS_PER_NODE, \"slurm_managed_ray_cluster\": 1}" \
    --node-ip-address="$head_node_ip" \
    --port=${PORT} \
    --ray-client-server-port=${RAY_CLIENT_SERVER_PORT} \
    --dashboard-port=${DASHBOARD_PORT} \
    --dashboard-host="$head_node_ip" \
    --include-dashboard=True \
    \
    --node-manager-port=$((${NODE_MANAGER_PORT} + 1)) \
    --object-manager-port=$((${OBJECT_MANAGER_PORT} + 1)) \
    --runtime-env-agent-port=$((${RUNTIME_ENV_AGENT_PORT} + 1)) \
    --dashboard-agent-grpc-port=$((${DASHBOARD_AGENT_GRPC_PORT} + 1)) \
    --dashboard-agent-listen-port=$((${DASHBOARD_AGENT_LISTEN_PORT} + 1)) \
    --metrics-export-port=$((${METRICS_EXPORT_PORT} + 1)) \
    $RAY_DEBUGGER_ARGS \
    \
    --block
EOFINNER
chmod +x /launch-head.sh
count=0
while [[ \$count -lt $num_retries ]]; do
bash /launch-head.sh
count=\$((count+1))
echo "Head node failed \$count/$num_retries times, restarting in 5 seconds..."
sleep 5
done
touch $LOG_DIR/ENDED
exit 1
EOF
)
# Launch the head container in the background and remember its srun PID so
# check_srun_processes can detect an early death.
srun $COMMON_SRUN_ARGS --container-name=ray-head --nodes=1 --ntasks=1 --cpus-per-task=$CPUS_PER_WORKER -w "$head_node" -o $LOG_DIR/ray-head.log bash -x -c "$head_cmd" &
SRUN_PIDS["ray-head"]=$!
# Total "worker_units" expected cluster-wide: one per GPU across all nodes.
NUM_ACTORS=$((GPUS_PER_NODE * SLURM_JOB_NUM_NODES))
# Start Ray worker nodes
# We want 1 Ray worker node per physical node (excluding the head node)
# Worker nodes are started with ray start but without the --head flag
# Start from node 1 since node 0 is running the head
for ((i = 1; i < SLURM_JOB_NUM_NODES; i++)); do
node_i=${nodes_array[$i]}
# Same heredoc convention as the head: bare $VAR expands at submit time,
# \$VAR is deferred to the script running inside the container.
worker_cmd=$(cat <<EOF
env
exit-dramatically() {
# Use SIGTERM to forcefully terminate the srun process
pkill -P $$ || true
kill -TERM 0 || true
# As a last resort, exit with a non-zero code
exit 1
}
# Background process to check for ENDED file
monitor-sidecar() {
set +x
while true; do
sleep 60
if [[ -f "$LOG_DIR/ENDED" ]]; then
echo "Detected ENDED file, terminating..."
exit-dramatically
fi
done
}
monitor-sidecar &
# Background process to sync ray logs every $RAY_LOG_SYNC_FREQUENCY seconds
log-sync-sidecar() {
set +x
if [[ -z "$RAY_LOG_SYNC_FREQUENCY" ]]; then
echo "RAY_LOG_SYNC_FREQUENCY is not set, skipping log sync sidecar"
return
fi
mkdir -p $LOG_DIR/ray/$node_i
while true; do
sleep $RAY_LOG_SYNC_FREQUENCY
if ls /tmp/ray/session_[0-9]* > /dev/null 2>&1; then
for session_dir in /tmp/ray/session_[0-9]*/; do
if [[ -d "\$session_dir/logs" ]]; then
session_name=\$(basename "\$session_dir")
mkdir -p "$LOG_DIR/ray/$node_i/\$session_name"
if command -v rsync > /dev/null 2>&1; then
rsync -ahP "\$session_dir/logs/" $LOG_DIR/ray/$node_i/\$session_name/logs/ 2>/dev/null || true
else
cp -r "\$session_dir/logs" $LOG_DIR/ray/$node_i/\$session_name/
fi
fi
done
fi
if [[ -f "$LOG_DIR/ENDED" ]]; then
echo "Log sync sidecar terminating..."
break
fi
done
}
log-sync-sidecar &
# Patch nsight.py before starting Ray worker
sed -i 's/context\.py_executable = " "\.join(self\.nsight_cmd) + " python"/context.py_executable = " ".join(self.nsight_cmd) + f" {context.py_executable}"/g' /opt/nemo_rl_venv/lib64/python*/site-packages/ray/_private/runtime_env/nsight.py
cat <<EOFINNER | tee /launch-worker.sh
ray start --address "$ip_head" \
    --disable-usage-stats \
    --resources="{\"worker_units\": $GPUS_PER_NODE, \"slurm_managed_ray_cluster\": 1}" \
    --min-worker-port=${MIN_WORKER_PORT} \
    --max-worker-port=${MAX_WORKER_PORT} \
    \
    --node-manager-port=${NODE_MANAGER_PORT} \
    --object-manager-port=${OBJECT_MANAGER_PORT} \
    --runtime-env-agent-port=${RUNTIME_ENV_AGENT_PORT} \
    --dashboard-agent-grpc-port=${DASHBOARD_AGENT_GRPC_PORT} \
    --dashboard-agent-listen-port=${DASHBOARD_AGENT_LISTEN_PORT} \
    --metrics-export-port=${METRICS_EXPORT_PORT} \
    $RAY_DEBUGGER_ARGS \
    \
    --block
EOFINNER
count=0
while [[ \$count -lt $num_retries ]]; do
bash /launch-worker.sh
count=\$((count+1))
echo "Worker failed \$count/$num_retries times, restarting in 5 seconds..."
sleep 5
done
touch $LOG_DIR/ENDED
exit 1
EOF
)
# Launch this worker's container in the background and track its srun PID.
srun $COMMON_SRUN_ARGS --container-name=ray-worker-$i --exact --nodes=1 --ntasks=1 --cpus-per-task=$CPUS_PER_WORKER -w "$node_i" -o $LOG_DIR/ray-worker-$i.log bash -x -c "$worker_cmd" &
SRUN_PIDS["ray-worker-$i"]=$!
# Brief stagger between worker launches — presumably to avoid hammering the
# scheduler/image cache with simultaneous container starts; TODO confirm.
sleep 3
done
# Then we wait here for the file to be created by the head node container
while check_srun_processes && ! srun --overlap --nodes=1 --ntasks=1 -w $head_node test -f $LOG_DIR/STARTED_RAY_HEAD; do
echo "[INFO][$(date)] Waiting for head node container to start..."
sleep 2
done
# At this stage the Ray cluster bringup has started on the physical nodes in the allocation
# Before we launch a job on this cluster we need to make sure that the bringup is complete
# We do so by querying the number of worker_units in the ray cluster and asserting = NUM_ACTORS
# Query the Ray head for how many "worker_units" (one per GPU) are registered.
# `ray status` prints a resource usage line like " 0.0/16.0 worker_units";
# the cluster capacity is the integer after the '/'.
# Globals (read): head_node
# Outputs: the worker_units capacity on stdout, or 0 if not yet reported.
extract_worker_units() {
  local status_output
  status_output=$(srun --overlap --container-name=ray-head --nodes=1 --ntasks=1 -w "$head_node" ray status)
  if echo "$status_output" | grep -q "worker_units"; then
    # awk strips leading blanks before field splitting, so parsing $1 is
    # robust to indentation changes in ray's output — unlike the previous
    # fixed field index, which miscounted if the leading whitespace varied.
    echo "$status_output" | awk '/worker_units/ { split($1, parts, "[/.]"); print parts[3]; exit }'
  else
    echo 0
  fi
}
# Poll to make sure that all Ray worker nodes have connected to the head.
# All workers have connected when number of GPUs in ray cluster
# is equal to NUM_ACTORS. We use the utility function above
# to check how many GPUs have come online in the ray cluster
while true; do
worker_units=$(extract_worker_units)
echo "[INFO] Number of actors online: $worker_units/$NUM_ACTORS"
if [[ "$worker_units" -eq "$NUM_ACTORS" ]]; then
break
fi
check_srun_processes
sleep 2
done
echo "All workers connected!"
# We can now launch a job on this cluster
# We do so by launching a driver process on the physical node that the head node is on
# This driver process is responsible for launching a job on the Ray cluster
# Resolve the job's working directory from scontrol so the driver runs from
# the directory the job was submitted from.
CONTAINER_CWD=$(scontrol show job $SLURM_JOB_ID | grep -oP 'WorkDir=\K[^ ]+' | head -1)
if [[ -n "$COMMAND" ]]; then
srun --no-container-mount-home --overlap --container-name=ray-head --container-workdir=$CONTAINER_CWD --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/ray-driver.log bash -c "$COMMAND"
else
echo "[INFO]: Ray Cluster is idled, run this on the slurm head node to get a shell to the head node:"
# Generate a helper script that attaches an interactive (or COMMAND-driven)
# shell to the running head/worker containers. As with the launch heredocs,
# bare $VAR is baked in now and \$VAR is evaluated when the helper runs.
cat <<EOF >$SLURM_SUBMIT_DIR/${SLURM_JOB_ID}-attach.sh
# No args launches on the head node (node 0)
# Args 1-N launch on worker nodes (nodes 1 through N-1)
# Optional: set COMMAND='...' to run non-interactively instead of opening an interactive shell
WORKER_NUM=\${1:-}
if [[ -z "\$WORKER_NUM" ]]; then
# Empty means we are on the head node
if [[ -n "\${COMMAND:-}" ]]; then
srun --no-container-mount-home $GRES_ARG -A $SLURM_JOB_ACCOUNT -p $SLURM_JOB_PARTITION --overlap --container-name=ray-head --container-workdir=$CONTAINER_CWD --nodes=1 --ntasks=1 -w "$head_node" --jobid $SLURM_JOB_ID bash -c "\$COMMAND"
else
srun --no-container-mount-home $GRES_ARG -A $SLURM_JOB_ACCOUNT -p $SLURM_JOB_PARTITION --overlap --container-name=ray-head --container-workdir=$CONTAINER_CWD --nodes=1 --ntasks=1 -w "$head_node" --jobid $SLURM_JOB_ID --pty bash
fi
else
# Worker numbers 1 through N-1 correspond to ray-worker-1 through ray-worker-(N-1)
# and use nodes_array[1] through nodes_array[N-1]
if [[ \$WORKER_NUM -lt 1 || \$WORKER_NUM -ge $SLURM_JOB_NUM_NODES ]]; then
echo "Error: WORKER_NUM must be between 1 and $((SLURM_JOB_NUM_NODES-1))"
exit 1
fi
nodes_array=($nodes)
if [[ -n "\${COMMAND:-}" ]]; then
srun --no-container-mount-home $GRES_ARG -A $SLURM_JOB_ACCOUNT -p $SLURM_JOB_PARTITION --overlap --container-name=ray-worker-\$WORKER_NUM --container-workdir=$CONTAINER_CWD --nodes=1 --ntasks=1 -w "\${nodes_array[\$WORKER_NUM]}" --jobid $SLURM_JOB_ID bash -c "\$COMMAND"
else
srun --no-container-mount-home $GRES_ARG -A $SLURM_JOB_ACCOUNT -p $SLURM_JOB_PARTITION --overlap --container-name=ray-worker-\$WORKER_NUM --container-workdir=$CONTAINER_CWD --nodes=1 --ntasks=1 -w "\${nodes_array[\$WORKER_NUM]}" --jobid $SLURM_JOB_ID --pty bash
fi
fi
EOF
chmod +x $SLURM_SUBMIT_DIR/${SLURM_JOB_ID}-attach.sh
echo " COMMAND='echo hello' bash $SLURM_SUBMIT_DIR/${SLURM_JOB_ID}-attach.sh # run a non-interactive command on head node"
echo " bash $SLURM_SUBMIT_DIR/${SLURM_JOB_ID}-attach.sh # to attach to head node (i.e., 'worker 0')"
echo " bash $SLURM_SUBMIT_DIR/${SLURM_JOB_ID}-attach.sh 1 # to attach to worker 1"
echo " bash $SLURM_SUBMIT_DIR/${SLURM_JOB_ID}-attach.sh 2 # to attach to worker 2, etc."
# Keep the allocation alive indefinitely so the idle cluster stays up until
# the job is cancelled or hits its time limit.
sleep infinity
fi