-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathentrypoint.sh
More file actions
39 lines (31 loc) · 917 Bytes
/
entrypoint.sh
File metadata and controls
39 lines (31 loc) · 917 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
#!/bin/bash
# Entrypoint that launches one node of a multi-node torchrun job.
#
# Environment read by this section:
#   MASTER_PORT           value passed to torchrun --master-port (default: 29500)
#   NPROC_PER_NODE        value passed to torchrun --nproc-per-node (default: 1)
#   TRAINING_SCRIPT       script handed to torchrun (default: train.py)
#   NF_DISCOVERY_SERVICE  DNS suffix appended to the rank-0 hostname to build
#                         the master address (required; checked further down)
set -e

# Assign each default only when the variable is unset or empty.
: "${MASTER_PORT:=29500}"
: "${NPROC_PER_NODE:=1}"
: "${TRAINING_SCRIPT:=train.py}"

# Working copy of the discovery service name used by the rest of the script.
HEADLESS_SERVICE=${NF_DISCOVERY_SERVICE}
#######################################
# Write a message to stdout, prefixed with the current UTC time as [HH:MM:SS].
# Arguments: $1 - message text
# Outputs:   one line on stdout
#######################################
log() {
  # The timestamp stays an inline substitution so a failing `date` cannot
  # abort the script under `set -e`.
  printf '[%s] %s\n' "$(date -u +%H:%M:%S)" "$1"
}
# Abort before any work is done when a mandatory setting is missing.
# HEADLESS_SERVICE is populated from NF_DISCOVERY_SERVICE, which is why the
# message names that variable.
[[ -n "${HEADLESS_SERVICE}" ]] || {
  log "ERROR: NF_DISCOVERY_SERVICE not set"
  exit 1
}

# REPLICAS is later passed to torchrun as --nnodes.
[[ -n "${REPLICAS}" ]] || {
  log "ERROR: REPLICAS not set"
  exit 1
}
# Derive this node's rank and the master address from the hostname, which is
# expected to have the form <base>-<ordinal> (for example "trainer-2").
HOSTNAME=$(hostname)
NODE_RANK=${HOSTNAME##*-}   # text after the last '-'
BASE_NAME=${HOSTNAME%-*}    # text before the last '-'

# Without a numeric "-<ordinal>" suffix both values above are meaningless
# (NODE_RANK would be the whole hostname or arbitrary text), so stop here with
# a clear message instead of handing a bad --node-rank to torchrun.
if [[ "${HOSTNAME}" != *-* || ! "${NODE_RANK}" =~ ^[0-9]+$ ]]; then
  log "ERROR: cannot derive node rank from hostname '${HOSTNAME}' (expected <name>-<ordinal>)"
  exit 1
fi

# The node whose ordinal is 0 acts as master; its address is its hostname
# qualified with the discovery service name.
MASTER_HOSTNAME="${BASE_NAME}-0"
MASTER_ADDR="${MASTER_HOSTNAME}.${HEADLESS_SERVICE}"

log "Node ${NODE_RANK}/${REPLICAS} starting, master=${MASTER_ADDR}"
log "Starting torchrun (rendezvous will synchronize nodes)"

# exec replaces this shell process with torchrun.
# Every value is quoted so it reaches torchrun as exactly one argument.
# SCRIPT_ARGS is the deliberate exception: it is left unquoted so that it is
# split into separate arguments for the training script (and expands to
# nothing when unset).
# shellcheck disable=SC2086
exec torchrun \
  --nnodes="${REPLICAS}" \
  --nproc-per-node="${NPROC_PER_NODE}" \
  --node-rank="${NODE_RANK}" \
  --master-addr="${MASTER_ADDR}" \
  --master-port="${MASTER_PORT}" \
  "${TRAINING_SCRIPT}" ${SCRIPT_ARGS}