@@ -33,12 +33,13 @@ applications:
3333
3434 # 2. Model Service - Hosts the base model for training (Megatron backend)
3535 # This is the actual model worker that performs forward/backward passes.
36- - name: models-Qwen2.5-0.5B-Instruct
37- route_prefix: /api/v1/model/Qwen/Qwen2.5-0.5B-Instruct # REST path for this model
36+ - name: models-Qwen2.5-3B-Instruct
37+ route_prefix: /api/v1/model/Qwen/Qwen2.5-3B-Instruct # REST path for this model
3838 import_path: model
3939 args:
4040 use_megatron: true # Use Megatron-LM backend (not HuggingFace)
41- model_id: "ms://Qwen/Qwen2.5-0.5B-Instruct" # ModelScope model identifier to load
41+ mixed_precision: bf16 # Use bfloat16 precision for training
42+ model_id: "ms://Qwen/Qwen2.5-3B-Instruct" # ModelScope model identifier to load
4243 nproc_per_node: 2 # Number of GPU processes per node
4344 device_group: # Logical device group for this model
4445 name: model
@@ -48,6 +49,9 @@ applications:
4849 device_type: cuda
4950 mesh: [0, 1] # Device indices in the mesh
5051 mesh_dim_names: ['dp'] # Mesh dimension names: 'dp' = data parallel
52+ adapter_config:
53+ per_token_adapter_limit: 30 # Max concurrent LoRA adapters
54+ adapter_timeout: 1800 # Seconds before idle adapter unload
5155 deployments:
5256 - name: ModelManagement
5357 autoscaling_config:
@@ -56,3 +60,34 @@ applications:
5660 target_ongoing_requests: 16
5761 ray_actor_options:
5862 num_cpus: 0.1
63+
64+ # 3. Sampler Service - Runs inference / sampling using vLLM engine
65+ # Used for generating text from the model (e.g., evaluating LoRA results).
66+ - name: sampler-Qwen2.5-3B-Instruct
67+ route_prefix: /api/v1/sampler/Qwen/Qwen2.5-3B-Instruct
68+ import_path: sampler
69+ args:
70+ model_id: "ms://Qwen/Qwen2.5-3B-Instruct" # ModelScope model identifier
71+ nproc_per_node: 1 # Number of GPU processes per node
72+ sampler_type: vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
73+ engine_args: # vLLM engine-specific settings
74+ max_model_len: 4096 # Maximum sequence length the engine supports
75+ gpu_memory_utilization: 0.5 # Fraction of GPU memory to use (0.0-1.0)
76+ enable_lora: true # Allow loading LoRA adapters during inference
77+ device_group: # Logical device group for the sampler
78+ name: sampler
79+ ranks: [0] # GPU rank indices to use
80+ device_type: cuda
81+ device_mesh:
82+ device_type: cuda
83+ mesh: [0]
84+ mesh_dim_names: ['dp']
85+ deployments:
86+ - name: SamplerManagement
87+ autoscaling_config:
88+ min_replicas: 1
89+ max_replicas: 1
90+ target_ongoing_requests: 16
91+ ray_actor_options:
92+ num_cpus: 0.1
93+ num_gpus: 1 # Sampler needs a full GPU for inference
0 commit comments