@@ -94,6 +94,7 @@ def get_cmd(self, environment, active_resources):
9494 assert not self .args .detect_nvlink_pairs , "openmpi backend does not support remapping visible devices"
9595 total_process_count = sum (self .resource_pool .values ())
9696 allow_run_as_root = os .environ .get ('RUN_MPI_AS_ROOT' , False )
97+
9798 # Default
9899 # mpirun_cmd = [
99100 # 'mpirun',
@@ -109,6 +110,19 @@ def get_cmd(self, environment, active_resources):
109110 # 'eth0',
110111 # ]
111112
113+ # NOTE: Copied from a later version of DeeperSpeed
114+ launcher_args = split (self .args .launcher_args )
115+
116+ # NOTE: Copied from a later version of DeeperSpeed
117+ # If btl_tcp_if_include option is provided through launcher_args, we use it. Otherwise, we add
118+ # `--mca btl_tcp_if_include eth0` option as a default value for compatibility.
119+ btl_tcp_opt = ['--mca' , 'btl_tcp_if_include' , 'eth0' ]
120+ if len (launcher_args ) >= 2 :
121+ for i in range (len (launcher_args ) - 1 ):
122+ if launcher_args [i ] in ['-mca' , '--mca' ] and launcher_args [i + 1 ] == 'btl_tcp_if_include' :
123+ btl_tcp_opt = []
124+ break
125+
112126 # Custom from previous PI cluster
113127 mpirun_cmd = [
114128 'mpirun' ,
@@ -139,8 +153,7 @@ def get_cmd(self, environment, active_resources):
139153 'CUDA_DEVICE_ORDER=PCI_BUS_ID' ,
140154 '--bind-to' ,
141155 'none' ,
142- ]
143- # ] + btl_tcp_opt + launcher_args
156+ ] + btl_tcp_opt + launcher_args
144157
145158 # Allow running as root
146159 # if allow_run_as_root:
0 commit comments