Skip to content

Commit 588ef59

Browse files
committed
Try adding launcher_args and btl_tcp_opt
1 parent 3512ce2 commit 588ef59

File tree

1 file changed

+15
-2
lines changed

1 file changed

+15
-2
lines changed

deepspeed/launcher/multinode_runner.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ def get_cmd(self, environment, active_resources):
9494
assert not self.args.detect_nvlink_pairs, "openmpi backend does not support remapping visible devices"
9595
total_process_count = sum(self.resource_pool.values())
9696
allow_run_as_root = os.environ.get('RUN_MPI_AS_ROOT', False)
97+
9798
# Default
9899
# mpirun_cmd = [
99100
# 'mpirun',
@@ -109,6 +110,19 @@ def get_cmd(self, environment, active_resources):
109110
# 'eth0',
110111
# ]
111112

113+
# NOTE: Copied from a later version of DeeperSpeed
114+
launcher_args = split(self.args.launcher_args)
115+
116+
# NOTE: Copied from a later version of DeeperSpeed
117+
# If btl_tcp_if_include option is provided through launcher_args, we use it. Otherwise, we add
118+
# `--mca btl_tcp_if_include eth0` option as a default value for compatibility.
119+
btl_tcp_opt = ['--mca', 'btl_tcp_if_include', 'eth0']
120+
if len(launcher_args) >= 2:
121+
for i in range(len(launcher_args) - 1):
122+
if launcher_args[i] in ['-mca', '--mca'] and launcher_args[i + 1] == 'btl_tcp_if_include':
123+
btl_tcp_opt = []
124+
break
125+
112126
# Custom from previous PI cluster
113127
mpirun_cmd = [
114128
'mpirun',
@@ -139,8 +153,7 @@ def get_cmd(self, environment, active_resources):
139153
'CUDA_DEVICE_ORDER=PCI_BUS_ID',
140154
'--bind-to',
141155
'none',
142-
]
143-
# ] + btl_tcp_opt + launcher_args
156+
] + btl_tcp_opt + launcher_args
144157

145158
# Allow running as root
146159
# if allow_run_as_root:

0 commit comments

Comments
 (0)