wip

tastelikefeet · tastelikefeet · commit 946810ad2ef1 · 2026-02-15T19:03:44.000+08:00
diff --git a/cookbook/client/tinker/megatron/server_config.yaml b/cookbook/client/tinker/megatron/server_config.yaml
@@ -33,26 +33,25 @@ applications:
           runtime_env:
             env_vars:
               TWINKLE_TRUST_REMOTE_CODE: "0"
-              DEVICE_COUNT_PER_PHYSICAL_NODE: "8"
 
   # 3. Sampler Service - Runs inference / sampling using vLLM engine
   #    Used for generating text from the model (e.g., evaluating LoRA results).
   - name: sampler-Qwen3-30B-A3B-Instruct-2507
-    route_prefix: /api/v1/sampler/Qwen/Qwen3-30B-A3B-Instruct-2507
+    route_prefix: /api/v1/sampler/Qwen/Qwen2.5-7B-Instruct
     import_path: sampler
     args:
-      model_id: "ms://Qwen/Qwen3-30B-A3B-Instruct-2507"   # ModelScope model identifier
+      model_id: "ms://Qwen/Qwen2.5-7B-Instruct"   # ModelScope model identifier
       nproc_per_node: 4               # Number of GPU processes per node
       sampler_type: vllm              # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
       engine_args:                    # vLLM engine-specific settings
         max_model_len: 16000           # Maximum sequence length the engine supports
-        gpu_memory_utilization: 0.85   # Fraction of GPU memory to use (0.0-1.0)
+        gpu_memory_utilization: 0.7   # Fraction of GPU memory to use (0.0-1.0)
         enable_lora: true             # Allow loading LoRA adapters during inference
         max_loras: 5                  # Max allowed loras working on vLLM at the same time
       device_group:                   # Logical device group for the sampler
         name: sampler
         gpus_per_worker: 1
-        ranks: [0,1,2,3]                    # GPU rank indices to use
+        ranks: 4                   # Number of GPU ranks to use
         device_type: cuda
       device_mesh:
         device_type: cuda
@@ -63,30 +62,30 @@ applications:
     deployments:
       - name: SamplerManagement
         autoscaling_config:
-          min_replicas: 1
-          max_replicas: 1
+          min_replicas: 2
+          max_replicas: 2
           target_ongoing_requests: 16
         ray_actor_options:
           num_cpus: 0.1
           runtime_env:
             env_vars:
               TWINKLE_TRUST_REMOTE_CODE: "0"
-              DEVICE_COUNT_PER_PHYSICAL_NODE: "8"
+              DEVICE_COUNT_PER_PHYSICAL_NODE: "16"
 
  # 2. Model Service - Hosts the base model for training.
  #    Adjust the settings below to match your training model worker.
   - name: models-Qwen3-30B-A3B-Instruct-2507
-    route_prefix: /api/v1/model/Qwen/Qwen3-30B-A3B-Instruct-2507
+    route_prefix: /api/v1/model/Qwen/Qwen2.5-7B-Instruct
     import_path: model
     args:
       use_megatron: true                          # Use the Megatron backend
-      model_id: "ms://Qwen/Qwen3-30B-A3B-Instruct-2507" # ModelScope model identifier
+      model_id: "ms://Qwen/Qwen2.5-7B-Instruct" # ModelScope model identifier
       max_length: 16000                           # model max length
       max_loras: 5                                # model max loras
       nproc_per_node: 4                           # Number of GPU processes per node
       device_group:
         name: model
-        ranks: [4,5,6,7]                              # GPU rank indices
+        ranks: 4       # Number of GPU ranks to use
         device_type: cuda
       device_mesh:
         device_type: cuda
@@ -103,12 +102,12 @@ applications:
     deployments:
       - name: ModelManagement
         autoscaling_config:
-          min_replicas: 1
-          max_replicas: 1
+          min_replicas: 2
+          max_replicas: 2
           target_ongoing_requests: 8
         ray_actor_options:
           num_cpus: 0.1
           runtime_env:
             env_vars:
               TWINKLE_TRUST_REMOTE_CODE: "0"
-              DEVICE_COUNT_PER_PHYSICAL_NODE: "8"
+              DEVICE_COUNT_PER_PHYSICAL_NODE: "16"