@@ -86,8 +86,8 @@ def __init__(self,
         bundles = []
         cpu_bundles = []
 
-        # GPU/NPU placement groups: keep existing strategy (CPU = node_cpu // 2) to avoid affecting training/inference throughput assumptions.
         for i in range(self.nnodes):
+            # TODO: not accurate, because placement_group bundles are not guaranteed to map to nodes in the same order as self.nodes
             node_idx = self.min_node_idx + i if device_type != 'CPU' else i
             node = self.nodes[node_idx]
             node_cpu = int(node['Resources']['CPU'])
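
Note on the TODO above: bundles are packed by Ray's scheduler, so bundle index i need not correspond to self.nodes[i]. The mapping can be verified empirically with a short standalone sketch; the two-bundle group and resource shapes below are illustrative only, and the pinning pattern mirrors the debug snippet removed further down in this diff.

    import ray
    from ray.util import placement_group

    ray.init(address="auto")

    # Hypothetical two-bundle group; STRICT_SPREAD forces distinct nodes.
    pg = placement_group([{"CPU": 1}, {"CPU": 1}], strategy="STRICT_SPREAD")
    ray.get(pg.ready())

    @ray.remote(num_cpus=1)
    def node_of_bundle():
        # Report which Ray node this bundle actually landed on.
        return ray.get_runtime_context().get_node_id()

    for i in range(2):
        node_id = ray.get(
            node_of_bundle.options(
                placement_group=pg,
                placement_group_bundle_index=i,
            ).remote()
        )
        print(f"bundle {i} -> node {node_id[:8]}")
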
@@ -149,6 +149,11 @@ def __init__(self,
 
         self.device_groups = {}
         ray_address = str(ray.get_runtime_context().gcs_address)
+        if 'DEVICE_COUNT_PER_PHYSICAL_NODE' in os.environ:
+            # Multiple Ray nodes may share one physical node; gpu_rank derived from nproc_per_node would then be wrong
+            device_per_node = int(os.environ['DEVICE_COUNT_PER_PHYSICAL_NODE'])
+        else:
+            device_per_node = nproc_per_node
         for group in groups:
             if group.device_type != 'CPU':
                 ranks = group.ranks
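
A quick worked example of why the divisor has to be the physical device count when several Ray nodes are colocated on one machine (the numbers are made up for illustration):

    nproc_per_node = 4      # processes per logical Ray node
    device_per_node = 8     # DEVICE_COUNT_PER_PHYSICAL_NODE: GPUs on the physical host

    rank = 4
    # Old mapping: rank 4 -> node 1, gpu 0, i.e. a second physical machine
    print(rank // nproc_per_node, rank % nproc_per_node)    # 1 0
    # New mapping: rank 4 -> node 0, gpu 4, the 5th GPU of the same physical host
    print(rank // device_per_node, rank % device_per_node)  # 0 4
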
@@ -172,8 +177,8 @@ def __init__(self,
                         worker_ranks = normalized_ranks[start_idx:start_idx + gpus_per_worker]
 
                         # All GPUs for a worker should be on the same node
-                        node_ranks = [r // nproc_per_node for r in worker_ranks]
-                        gpu_ranks_local = [r % nproc_per_node for r in worker_ranks]
+                        node_ranks = [r // device_per_node for r in worker_ranks]
+                        gpu_ranks_local = [r % device_per_node for r in worker_ranks]
 
                         if len(set(node_ranks)) > 1:
                             raise ValueError(
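
The same-node check above now also has to use the physical device count; a minimal sketch of what it accepts and rejects (ranks and sizes below are hypothetical):

    device_per_node = 8

    def check_worker(worker_ranks):
        # Every GPU of one worker must sit on the same physical node.
        node_ranks = [r // device_per_node for r in worker_ranks]
        if len(set(node_ranks)) > 1:
            raise ValueError(f"worker ranks {worker_ranks} span nodes {sorted(set(node_ranks))}")
        return node_ranks[0], [r % device_per_node for r in worker_ranks]

    print(check_worker([0, 1, 2, 3]))   # (0, [0, 1, 2, 3]) -- fine
    print(check_worker([6, 7, 8, 9]))   # raises: ranks straddle node 0 and node 1
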
@@ -190,8 +195,8 @@ def __init__(self,
                                 ray_address=ray_address))
                 else:
                     for alloc_rank in normalized_ranks:
-                        node_rank = alloc_rank // nproc_per_node
-                        gpu_rank = alloc_rank % nproc_per_node
+                        node_rank = alloc_rank // device_per_node
+                        gpu_rank = alloc_rank % device_per_node
                         local_device_groups.append(
                             dict(
                                 node_rank=node_rank,
@@ -218,39 +223,6 @@ def __init__(self,
                     global_cpu_proc_idx += 1
             self.device_groups[group.name] = local_device_groups
 
-        import ray
-
-        @ray.remote(num_gpus=1)
-        def check_gpu_info():
-            import os
-            import torch
-
-            node_id = ray.get_runtime_context().get_node_id()
-            cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", "not set")
-
-            # Get the actual GPU info
-            if torch.cuda.is_available():
-                gpu_name = torch.cuda.get_device_name(0)
-                gpu_uuid = torch.cuda.get_device_properties(0).uuid  # may not be available
-            else:
-                gpu_name = "N/A"
-
-            return {
-                "node_id": node_id,
-                "CUDA_VISIBLE_DEVICES": cuda_visible,
-                "gpu_name": gpu_name,
-            }
-
-        # Run on the specified placement group
-        result = ray.get(
-            check_gpu_info.options(
-                placement_group=self.placement_groups[0],
-                placement_group_bundle_index=0
-            ).remote()
-        )
-        print(result)
-        breakpoint()
-
         self.group_configs = groups
         logger.info(f"nodes: {[n['NodeID'][:8] for n in self.nodes]}")
         logger.info(f"node_ranks: {self.node_ranks}")