Skip to content

Commit 3512ce2

Browse files
committed
Try adding custom mpirun arguments
1 parent eb7f5cf commit 3512ce2

File tree

1 file changed

+46
-10
lines changed

1 file changed

+46
-10
lines changed

deepspeed/launcher/multinode_runner.py

Lines changed: 46 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -94,22 +94,58 @@ def get_cmd(self, environment, active_resources):
9494
assert not self.args.detect_nvlink_pairs, "openmpi backend does not support remapping visible devices"
9595
total_process_count = sum(self.resource_pool.values())
9696
allow_run_as_root = os.environ.get('RUN_MPI_AS_ROOT', False)
97+
# Default mpirun command (original arguments, kept commented out for reference)
98+
# mpirun_cmd = [
99+
# 'mpirun',
100+
# '-n',
101+
# f'{total_process_count}',
102+
# '-hostfile',
103+
# f'{self.args.hostfile}',
104+
# '--mca',
105+
# 'btl',
106+
# '^openib',
107+
# '--mca',
108+
# 'btl_tcp_if_include',
109+
# 'eth0',
110+
# ]
111+
112+
# Custom mpirun arguments taken from the previous PI cluster
97113
mpirun_cmd = [
98114
'mpirun',
99-
'-n',
115+
'--allow-run-as-root',
116+
'-np',
100117
f'{total_process_count}',
101118
'-hostfile',
102119
f'{self.args.hostfile}',
103-
'--mca',
104-
'btl',
105-
'^openib',
106-
'--mca',
107-
'btl_tcp_if_include',
108-
'eth0',
120+
'-mca',
121+
'btl tcp,self',
122+
'-mca',
123+
'coll_hcoll_enable 0',
124+
'-mca',
125+
'plm_rsh_args "-p 2222"',
126+
'-x',
127+
'PATH',
128+
'-x',
129+
'LD_LIBRARY_PATH',
130+
'-x',
131+
'NCCL_IB_AR_THRESHOLD=0',
132+
'-x',
133+
'NCCL_IB_PCI_RELAXED_ORDERING=1',
134+
'-x',
135+
'NCCL_IB_SPLIT_DATA_ON_QPS=0',
136+
'-x',
137+
'NCCL_IB_QPS_PER_CONNECTION=2',
138+
'-x',
139+
'CUDA_DEVICE_ORDER=PCI_BUS_ID',
140+
'--bind-to',
141+
'none',
109142
]
110-
if allow_run_as_root:
111-
mpirun_cmd.insert(1, '--allow-run-as-root')
112-
143+
# ] + btl_tcp_opt + launcher_args
144+
145+
# Allow running as root
146+
# if allow_run_as_root:
147+
# mpirun_cmd.insert(1, '--allow-run-as-root')
148+
113149
export_cmd = []
114150
for k, v in self.exports.items():
115151
export_cmd += ['-x', f'{k}={v}']

0 commit comments

Comments
 (0)