@@ -31,48 +31,13 @@ applications:
3131 ray_actor_options :
3232 num_cpus : 0.1 # CPU resources allocated to this actor
3333
34- # 2. Model Service (commented out) - Would host the base model for training.
35- # Uncomment and configure if you need a training model worker.
36- - name : models-Qwen3-30B-A3B-Instruct-2507
37- route_prefix : /api/v1/model/Qwen/Qwen3-30B-A3B-Instruct-2507
38- import_path : model
39- args :
40- use_megatron : true # Use HuggingFace Transformers backend
41- model_id : " ms://Qwen/Qwen3-30B-A3B-Instruct-2507" # ModelScope model identifier
42- nproc_per_node : 4 # Number of GPU processes per node
43- device_group :
44- name : model
45- ranks : [0, 1, 2, 3] # GPU rank indices
46- device_type : cuda
47- device_mesh :
48- device_type : cuda
49- dp_size : 2
50- tp_size : 2
51- ep_size : 2
52-
53- queue_config :
54- rps_limit : 100 # Max requests per second
55- tps_limit : 10000 # Max tokens per second
56- adapter_config :
57- per_token_adapter_limit : 30 # Max concurrent LoRA adapters
58- adapter_timeout : 1800 # Seconds before idle adapter unload
59- deployments :
60- - name : ModelManagement
61- autoscaling_config :
62- min_replicas : 1
63- max_replicas : 1
64- target_ongoing_requests : 16
65- ray_actor_options :
66- num_cpus : 0.1
67- num_gpus : 1
68-
6934 # 3. Sampler Service - Runs inference / sampling using vLLM engine
7035 # Used for generating text from the model (e.g., evaluating LoRA results).
71- - name : sampler-Qwen3-30B-A3B- Instruct-2507
72- route_prefix : /api/v1/sampler/Qwen/Qwen3-30B-A3B- Instruct-2507
36+ - name : sampler-Qwen2.5-3B-Instruct
37+ route_prefix : /api/v1/sampler/Qwen/Qwen2.5-3B-Instruct
7338 import_path : sampler
7439 args :
75- model_id : " ms://Qwen/Qwen3-30B-A3B- Instruct-2507 " # ModelScope model identifier
40+ model_id : " ms://Qwen/Qwen2.5-3B-Instruct" # ModelScope model identifier
7641 nproc_per_node : 4 # Number of GPU processes per node
7742 sampler_type : vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
7843 engine_args : # vLLM engine-specific settings
@@ -82,12 +47,11 @@ applications:
8247 device_group : # Logical device group for the sampler
8348 name : sampler
8449 gpus_per_worker : 1
85- ranks : [4,5,6,7 ] # GPU rank indices to use
50+ ranks : [0,1,2,3 ] # GPU rank indices to use
8651 device_type : cuda
8752 device_mesh :
8853 device_type : cuda
8954 dp_size : 4
90- tp_size : 1
9155 deployments :
9256 - name : SamplerManagement
9357 autoscaling_config :
@@ -96,4 +60,35 @@ applications:
9660 target_ongoing_requests : 16
9761 ray_actor_options :
9862 num_cpus : 0.1
99- num_gpus : 1 # Sampler needs a full GPU for inference
63+
64+ # 2. Model Service - Hosts the base model for training.
65+ # Comment out this section if you do not need a training model worker.
66+ - name : models-Qwen2.5-3B-Instruct
67+ route_prefix : /api/v1/model/Qwen/Qwen2.5-3B-Instruct
68+ import_path : model
69+ args :
70+ use_megatron : false # Use HuggingFace Transformers backend
71+ model_id : " ms://Qwen/Qwen2.5-3B-Instruct" # ModelScope model identifier
72+ nproc_per_node : 4 # Number of GPU processes per node
73+ device_group :
74+ name : model
75+ ranks : [4,5,6,7] # GPU rank indices
76+ device_type : cuda
77+ device_mesh :
78+ device_type : cuda
79+ dp_size : 4
80+
81+ queue_config :
82+ rps_limit : 100 # Max requests per second
83+ tps_limit : 10000 # Max tokens per second
84+ adapter_config :
85+ per_token_adapter_limit : 30 # Max concurrent LoRA adapters
86+ adapter_timeout : 1800 # Seconds before idle adapter unload
87+ deployments :
88+ - name : ModelManagement
89+ autoscaling_config :
90+ min_replicas : 1
91+ max_replicas : 1
92+ target_ongoing_requests : 16
93+ ray_actor_options :
94+ num_cpus : 0.1
0 commit comments