@@ -36,11 +36,11 @@ applications:
 
   # 3. Sampler Service - Runs inference / sampling using vLLM engine
   # Used for generating text from the model (e.g., evaluating LoRA results).
-  - name: sampler-Qwen3-30B-A3B-Instruct-2507
-    route_prefix: /api/v1/sampler/Qwen/Qwen3-30B-A3B-Instruct-2507
+  - name: sampler-Qwen3.5-4B
+    route_prefix: /api/v1/sampler/Qwen/Qwen3.5-4B
     import_path: sampler
     args:
-      model_id: "ms://Qwen/Qwen3-30B-A3B-Instruct-2507"  # ModelScope model identifier
+      model_id: "ms://Qwen/Qwen3.5-4B"  # ModelScope model identifier
       nproc_per_node: 4  # Number of GPU processes per node
       sampler_type: vllm  # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
       engine_args:  # vLLM engine-specific settings
@@ -73,12 +73,12 @@ applications:
 
   # 2. Model Service (commented out) - Would host the base model for training.
   # Uncomment and configure if you need a training model worker.
-  - name: models-Qwen3-30B-A3B-Instruct-2507
-    route_prefix: /api/v1/model/Qwen/Qwen3-30B-A3B-Instruct-2507
+  - name: models-Qwen3.5-4B
+    route_prefix: /api/v1/model/Qwen/Qwen3.5-4B
     import_path: model
     args:
       use_megatron: true  # Use HuggingFace Transformers backend
-      model_id: "ms://Qwen/Qwen3-30B-A3B-Instruct-2507"  # ModelScope model identifier
+      model_id: "ms://Qwen/Qwen3.5-4B"  # ModelScope model identifier
       max_length: 16000  # model max length
       max_loras: 5  # model max loras
       nproc_per_node: 4  # Number of GPU processes per node
0 commit comments