Skip to content

Commit de6b8a3

Browse files
committed
fix
1 parent 6488986 commit de6b8a3

File tree

1 file changed

+10
-38
lines changed

1 file changed

+10
-38
lines changed

src/twinkle/infra/_ray/resource_manager.py

Lines changed: 10 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,8 @@ def __init__(self,
8686
bundles = []
8787
cpu_bundles = []
8888

89-
# GPU/NPU placement groups: keep the existing strategy (CPU = node_cpu // 2) to avoid affecting training/inference throughput assumptions.
9089
for i in range(self.nnodes):
90+
# TODO: not accurate — the placement group is not guaranteed to distribute bundles to nodes in the same order as self.nodes
9191
node_idx = self.min_node_idx + i if device_type != 'CPU' else i
9292
node = self.nodes[node_idx]
9393
node_cpu = int(node['Resources']['CPU'])
@@ -149,6 +149,11 @@ def __init__(self,
149149

150150
self.device_groups = {}
151151
ray_address = str(ray.get_runtime_context().gcs_address)
152+
if 'DEVICE_COUNT_PER_PHYSICAL_NODE' in os.environ:
153+
# Sometimes multiple Ray nodes run on one physical node, which can make `gpu_rank` incorrect
154+
device_per_node = int(os.environ['DEVICE_COUNT_PER_PHYSICAL_NODE'])
155+
else:
156+
device_per_node = nproc_per_node
152157
for group in groups:
153158
if group.device_type != 'CPU':
154159
ranks = group.ranks
@@ -172,8 +177,8 @@ def __init__(self,
172177
worker_ranks = normalized_ranks[start_idx:start_idx + gpus_per_worker]
173178

174179
# All GPUs for a worker should be on the same node
175-
node_ranks = [r // nproc_per_node for r in worker_ranks]
176-
gpu_ranks_local = [r % nproc_per_node for r in worker_ranks]
180+
node_ranks = [r // device_per_node for r in worker_ranks]
181+
gpu_ranks_local = [r % device_per_node for r in worker_ranks]
177182

178183
if len(set(node_ranks)) > 1:
179184
raise ValueError(
@@ -190,8 +195,8 @@ def __init__(self,
190195
ray_address=ray_address))
191196
else:
192197
for alloc_rank in normalized_ranks:
193-
node_rank = alloc_rank // nproc_per_node
194-
gpu_rank = alloc_rank % nproc_per_node
198+
node_rank = alloc_rank // device_per_node
199+
gpu_rank = alloc_rank % device_per_node
195200
local_device_groups.append(
196201
dict(
197202
node_rank=node_rank,
@@ -218,39 +223,6 @@ def __init__(self,
218223
global_cpu_proc_idx += 1
219224
self.device_groups[group.name] = local_device_groups
220225

221-
import ray
222-
223-
@ray.remote(num_gpus=1)
224-
def check_gpu_info():
225-
import os
226-
import torch
227-
228-
node_id = ray.get_runtime_context().get_node_id()
229-
cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", "not set")
230-
231-
# 获取实际 GPU 信息
232-
if torch.cuda.is_available():
233-
gpu_name = torch.cuda.get_device_name(0)
234-
gpu_uuid = torch.cuda.get_device_properties(0).uuid # 不一定有
235-
else:
236-
gpu_name = "N/A"
237-
238-
return {
239-
"node_id": node_id,
240-
"CUDA_VISIBLE_DEVICES": cuda_visible,
241-
"gpu_name": gpu_name,
242-
}
243-
244-
# 在指定 PG 上运行
245-
result = ray.get(
246-
check_gpu_info.options(
247-
placement_group=self.placement_groups[0],
248-
placement_group_bundle_index=0
249-
).remote()
250-
)
251-
print(result)
252-
breakpoint()
253-
254226
self.group_configs = groups
255227
logger.info(f"nodes: {[n['NodeID'][:8] for n in self.nodes]}")
256228
logger.info(f"node_ranks: {self.node_ranks}")

0 commit comments

Comments
 (0)