@@ -33,26 +33,25 @@ applications:
3333 runtime_env :
3434 env_vars :
3535 TWINKLE_TRUST_REMOTE_CODE : " 0"
36- DEVICE_COUNT_PER_PHYSICAL_NODE : " 8"
3736
3837 # 3. Sampler Service - Runs inference / sampling using vLLM engine
3938 # Used for generating text from the model (e.g., evaluating LoRA results).
4039 - name : sampler-Qwen3-30B-A3B-Instruct-2507
41- route_prefix : /api/v1/sampler/Qwen/Qwen3-30B-A3B- Instruct-2507
40+ route_prefix : /api/v1/sampler/Qwen/Qwen2.5-7B- Instruct
4241 import_path : sampler
4342 args :
44- model_id : " ms://Qwen/Qwen3-30B-A3B- Instruct-2507 " # ModelScope model identifier
43+ model_id : " ms://Qwen/Qwen2.5-7B- Instruct" # ModelScope model identifier
4544 nproc_per_node : 4 # Number of GPU processes per node
4645 sampler_type : vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
4746 engine_args : # vLLM engine-specific settings
4847 max_model_len : 16000 # Maximum sequence length the engine supports
49- gpu_memory_utilization : 0.85 # Fraction of GPU memory to use (0.0-1.0)
48+ gpu_memory_utilization : 0.7 # Fraction of GPU memory to use (0.0-1.0)
5049 enable_lora : true # Allow loading LoRA adapters during inference
5150 max_loras : 5 # Max allowed loras working on vLLM at the same time
5251 device_group : # Logical device group for the sampler
5352 name : sampler
5453 gpus_per_worker : 1
55- ranks : [0,1,2,3] # GPU rank indices to use
54+ ranks : 4 # Number of GPU ranks to use (a count, not an explicit index list)
5655 device_type : cuda
5756 device_mesh :
5857 device_type : cuda
@@ -63,30 +62,30 @@ applications:
6362 deployments :
6463 - name : SamplerManagement
6564 autoscaling_config :
66- min_replicas : 1
67- max_replicas : 1
65+ min_replicas : 2
66+ max_replicas : 2
6867 target_ongoing_requests : 16
6968 ray_actor_options :
7069 num_cpus : 0.1
7170 runtime_env :
7271 env_vars :
7372 TWINKLE_TRUST_REMOTE_CODE : " 0"
74- DEVICE_COUNT_PER_PHYSICAL_NODE : " 8 "
73+ DEVICE_COUNT_PER_PHYSICAL_NODE : " 16 "
7574
7675 # 2. Model Service - Hosts the base model for training.
7776 # Comment this application out if you do not need a training model worker.
7877 - name : models-Qwen3-30B-A3B-Instruct-2507
79- route_prefix : /api/v1/model/Qwen/Qwen3-30B-A3B- Instruct-2507
78+ route_prefix : /api/v1/model/Qwen/Qwen2.5-7B- Instruct
8079 import_path : model
8180 args :
8281 use_megatron : true # Use the Megatron backend rather than HuggingFace Transformers
83- model_id : " ms://Qwen/Qwen3-30B-A3B- Instruct-2507 " # ModelScope model identifier
82+ model_id : " ms://Qwen/Qwen2.5-7B- Instruct" # ModelScope model identifier
8483 max_length : 16000 # model max length
8584 max_loras : 5 # model max loras
8685 nproc_per_node : 4 # Number of GPU processes per node
8786 device_group :
8887 name : model
89- ranks : [4,5,6,7] # GPU rank indices
88+ ranks : 4 # Number of GPU ranks to use (a count, not an explicit index list)
9089 device_type : cuda
9190 device_mesh :
9291 device_type : cuda
@@ -103,12 +102,12 @@ applications:
103102 deployments :
104103 - name : ModelManagement
105104 autoscaling_config :
106- min_replicas : 1
107- max_replicas : 1
105+ min_replicas : 2
106+ max_replicas : 2
108107 target_ongoing_requests : 8
109108 ray_actor_options :
110109 num_cpus : 0.1
111110 runtime_env :
112111 env_vars :
113112 TWINKLE_TRUST_REMOTE_CODE : " 0"
114- DEVICE_COUNT_PER_PHYSICAL_NODE : " 8 "
113+ DEVICE_COUNT_PER_PHYSICAL_NODE : " 16 "
0 commit comments