@@ -94,22 +94,58 @@ def get_cmd(self, environment, active_resources):
9494 assert not self .args .detect_nvlink_pairs , "openmpi backend does not support remapping visible devices"
9595 total_process_count = sum (self .resource_pool .values ())
9696 allow_run_as_root = os .environ .get ('RUN_MPI_AS_ROOT' , False )
97+ # Default
98+ # mpirun_cmd = [
99+ # 'mpirun',
100+ # '-n',
101+ # f'{total_process_count}',
102+ # '-hostfile',
103+ # f'{self.args.hostfile}',
104+ # '--mca',
105+ # 'btl',
106+ # '^openib',
107+ # '--mca',
108+ # 'btl_tcp_if_include',
109+ # 'eth0',
110+ # ]
111+
112+ # Custom from previous PI cluster
97113 mpirun_cmd = [
98114 'mpirun' ,
99- '-n' ,
115+ '--allow-run-as-root' ,
116+ '-np' ,
100117 f'{ total_process_count } ' ,
101118 '-hostfile' ,
102119 f'{ self .args .hostfile } ' ,
103- '--mca' ,
104- 'btl' ,
105- '^openib' ,
106- '--mca' ,
107- 'btl_tcp_if_include' ,
108- 'eth0' ,
120+ '-mca' ,
121+ 'btl tcp,self' ,
122+ '-mca' ,
123+ 'coll_hcoll_enable 0' ,
124+ '-mca' ,
125+ 'plm_rsh_args "-p 2222"' ,
126+ '-x' ,
127+ 'PATH' ,
128+ '-x' ,
129+ 'LD_LIBRARY_PATH' ,
130+ '-x' ,
131+ 'NCCL_IB_AR_THRESHOLD=0' ,
132+ '-x' ,
133+ 'NCCL_IB_PCI_RELAXED_ORDERING=1' ,
134+ '-x' ,
135+ 'NCCL_IB_SPLIT_DATA_ON_QPS=0' ,
136+ '-x' ,
137+ 'NCCL_IB_QPS_PER_CONNECTION=2' ,
138+ '-x' ,
139+ 'CUDA_DEVICE_ORDER=PCI_BUS_ID' ,
140+ '--bind-to' ,
141+ 'none' ,
109142 ]
110- if allow_run_as_root :
111- mpirun_cmd .insert (1 , '--allow-run-as-root' )
112-
143+ # ] + btl_tcp_opt + launcher_args
144+
145+ # Allow running as root
146+ # if allow_run_as_root:
147+ # mpirun_cmd.insert(1, '--allow-run-as-root')
148+
113149 export_cmd = []
114150 for k , v in self .exports .items ():
115151 export_cmd += ['-x' , f'{ k } ={ v } ' ]
0 commit comments