@@ -33,12 +33,13 @@ applications:
3333
3434 # 2. Model Service - Hosts the base model for training (Megatron backend)
3535 # This is the actual model worker that performs forward/backward passes.
36- - name: models-Qwen2.5-0.5B-Instruct
37- route_prefix: /api/v1/model/Qwen/Qwen2.5-0.5B-Instruct # REST path for this model
36+ - name: models-Qwen2.5-3B-Instruct
37+ route_prefix: /api/v1/model/Qwen/Qwen2.5-3B-Instruct # REST path for this model
3838 import_path: model
3939 args:
4040 use_megatron: true # Use Megatron-LM backend (not HuggingFace)
41- model_id: "ms://Qwen/Qwen2.5-0.5B-Instruct" # ModelScope model identifier to load
41+ mixed_precision: bf16 # Use bfloat16 precision for training
42+ model_id: "ms://Qwen/Qwen2.5-3B-Instruct" # ModelScope model identifier to load
4243 nproc_per_node: 2 # Number of GPU processes per node
4344 device_group: # Logical device group for this model
4445 name: model
@@ -48,6 +49,9 @@ applications:
4849 device_type: cuda
4950 mesh: [0, 1] # Device indices in the mesh
5051 mesh_dim_names: ['dp'] # Mesh dimension names: 'dp' = data parallel
52+ adapter_config:
53+ per_token_adapter_limit: 30 # Max concurrent LoRA adapters
54+ adapter_timeout: 1800 # Seconds before idle adapter unload
5155 deployments:
5256 - name: ModelManagement
5357 autoscaling_config:
@@ -56,3 +60,34 @@ applications:
5660 target_ongoing_requests: 16
5761 ray_actor_options:
5862 num_cpus: 0.1
63+
64+ # 3. Sampler Service - Runs inference / sampling using vLLM engine
65+ # Used for generating text from the model (e.g., evaluating LoRA results).
66+ - name: sampler-Qwen2.5-3B-Instruct
67+ route_prefix: /api/v1/sampler/Qwen/Qwen2.5-3B-Instruct
68+ import_path: sampler
69+ args:
70+ model_id: "ms://Qwen/Qwen2.5-3B-Instruct" # ModelScope model identifier
71+ nproc_per_node: 1 # Number of GPU processes per node
72+ sampler_type: vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
73+ engine_args: # vLLM engine-specific settings
74+ max_model_len: 4096 # Maximum sequence length the engine supports
75+ gpu_memory_utilization: 0.5 # Fraction of GPU memory to use (0.0-1.0)
76+ enable_lora: true # Allow loading LoRA adapters during inference
77+ device_group: # Logical device group for the sampler
78+ name: sampler
79+ ranks: [0] # GPU rank indices to use
80+ device_type: cuda
81+ device_mesh:
82+ device_type: cuda
83+ mesh: [0]
84+ mesh_dim_names: ['dp']
85+ deployments:
86+ - name: SamplerManagement
87+ autoscaling_config:
88+ min_replicas: 1
89+ max_replicas: 1
90+ target_ongoing_requests: 16
91+ ray_actor_options:
92+ num_cpus: 0.1
93+ num_gpus: 1 # Sampler needs a full GPU for inference
0 commit comments