1- # Twinkle Server Configuration - Tinker-Compatible Megatron Backend
1+ # Twinkle Server Configuration - Tinker-Compatible Transformers Backend
22
33# Server protocol type: "tinker" enables the Tinker-compatible API
44server_type : tinker
@@ -31,23 +31,30 @@ applications:
3131 ray_actor_options :
3232 num_cpus : 0.1 # CPU resources allocated to this actor
3333
34- # 2. Model Service - Hosts the base model for training (Megatron backend)
35- # This is the actual model worker that performs forward/backward passes.
36- - name : models-Qwen2.5-0.5B-Instruct
37- route_prefix : /api/v1/model/Qwen/Qwen2.5-0.5B-Instruct # REST path for this model
34+ # 2. Model Service - Hosts the base model for training.
35+ # This is the model worker that performs forward/backward passes.
36+ - name : models-Qwen3-30B-A3B-Instruct-2507
37+ route_prefix : /api/v1/model/Qwen/Qwen3-30B-A3B-Instruct-2507 # REST path for this model
3838 import_path : model
3939 args :
40- use_megatron : true # Use Megatron-LM backend (not HuggingFace)
41- model_id : "ms://Qwen/Qwen2.5-0.5B-Instruct" # ModelScope model identifier to load
42- nproc_per_node : 2 # Number of GPU processes per node
43- device_group : # Logical device group for this model
40+ use_megatron : false # Use HuggingFace Transformers backend (not Megatron-LM)
41+ model_id : "ms://Qwen/Qwen3-30B-A3B-Instruct-2507" # ModelScope model identifier
42+ nproc_per_node : 4 # Number of GPU processes per node
43+ device_group :
4444 name : model
45- ranks : [0, 1] # GPU rank indices to use
45+ ranks : [0, 1, 2, 3] # GPU rank indices
4646 device_type : cuda
47- device_mesh : # Distributed training mesh configuration
47+ device_mesh :
4848 device_type : cuda
49- mesh : [0, 1] # Device indices in the mesh
50- mesh_dim_names : ['dp'] # Mesh dimension names: 'dp' = data parallel
49+ dp_size : 2
50+ tp_size : 2
51+
52+ queue_config :
53+ rps_limit : 100 # Max requests per second
54+ tps_limit : 10000 # Max tokens per second
55+ adapter_config :
56+ per_token_adapter_limit : 30 # Max concurrent LoRA adapters
57+ adapter_timeout : 1800 # Seconds before idle adapter unload
5158 deployments :
5259 - name : ModelManagement
5360 autoscaling_config :
@@ -56,3 +63,35 @@ applications:
5663 target_ongoing_requests : 16
5764 ray_actor_options :
5865 num_cpus : 0.1
66+
67+ # 3. Sampler Service - Runs inference / sampling using vLLM engine
68+ # Used for generating text from the model (e.g., evaluating LoRA results).
69+ - name : sampler-Qwen3-30B-A3B-Instruct-2507
70+ route_prefix : /api/v1/sampler/Qwen/Qwen3-30B-A3B-Instruct-2507
71+ import_path : sampler
72+ args :
73+ model_id : "ms://Qwen/Qwen3-30B-A3B-Instruct-2507" # ModelScope model identifier
74+ nproc_per_node : 4 # Number of GPU processes per node
75+ sampler_type : vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
76+ engine_args : # vLLM engine-specific settings
77+ max_model_len : 4096 # Maximum sequence length the engine supports
78+ gpu_memory_utilization : 0.7 # Fraction of GPU memory to use (0.0-1.0)
79+ enable_lora : true # Allow loading LoRA adapters during inference
80+ device_group : # Logical device group for the sampler
81+ name : sampler
82+ gpus_per_worker : 2
83+ ranks : [4,5,6,7] # GPU rank indices to use
84+ device_type : cuda
85+ device_mesh :
86+ device_type : cuda
87+ dp_size : 4
88+ tp_size : 1
89+ deployments :
90+ - name : SamplerManagement
91+ autoscaling_config :
92+ min_replicas : 1
93+ max_replicas : 1
94+ target_ongoing_requests : 16
95+ ray_actor_options :
96+ num_cpus : 0.1
97+ num_gpus : 1 # Sampler needs a full GPU for inference