@@ -24,6 +24,7 @@ applications:
2424 - Qwen/Qwen3.5-4B
2525 deployments :
2626 - name : TinkerCompatServer
27+ max_ongoing_requests : 50
2728 autoscaling_config :
2829 min_replicas : 1 # Minimum number of replicas
2930 max_replicas : 1 # Maximum number of replicas
@@ -70,39 +71,39 @@ applications:
7071
7172 # 3. Sampler Service - Runs inference / sampling using vLLM engine
7273 # Used for generating text from the model (e.g., evaluating LoRA results).
73- # - name: sampler-Qwen3.5-4B
74- # route_prefix: /api/v1/sampler/Qwen/Qwen3.5-4B
75- # import_path: sampler
76- # args:
77- # model_id: "ms://Qwen/Qwen3.5-4B" # ModelScope model identifier
78- # nproc_per_node: 2 # Number of GPU processes per node
79- # sampler_type: vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
80- # engine_args: # vLLM engine-specific settings
81- # max_model_len: 4096 # Maximum sequence length the engine supports
82- # gpu_memory_utilization: 0.5 # Fraction of GPU memory to use (0.0-1.0)
83- # enable_lora: true # Allow loading LoRA adapters during inference
84- # logprobs_mode: processed_logprobs # Logprobs mode for sampling results
85- # device_group: # Logical device group for the sampler
86- # name: sampler
87- # ranks: 1 # Number of GPUs to use
88- # device_type: cuda
89- # device_mesh:
90- # device_type: cuda
91- # dp_size: 1
92- # queue_config:
93- # rps_limit: 100 # Max requests per second
94- # tps_limit: 100000 # Max tokens per second
95- # deployments:
96- # - name: SamplerManagement
97- # autoscaling_config:
98- # min_replicas: 1
99- # max_replicas: 1
100- # target_ongoing_requests: 16
101- # ray_actor_options:
102- # num_cpus: 0.1
103- # runtime_env:
104- # env_vars:
105- # TWINKLE_TRUST_REMOTE_CODE: "0"
74+ - name : sampler-Qwen3.5-4B
75+ route_prefix : /api/v1/sampler/Qwen/Qwen3.5-4B
76+ import_path : sampler
77+ args :
78+ model_id : "ms://Qwen/Qwen3.5-4B" # ModelScope model identifier
79+ nproc_per_node : 2 # Number of GPU processes per node
80+ sampler_type : vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
81+ engine_args : # vLLM engine-specific settings
82+ max_model_len : 4096 # Maximum sequence length the engine supports
83+ gpu_memory_utilization : 0.5 # Fraction of GPU memory to use (0.0-1.0)
84+ enable_lora : true # Allow loading LoRA adapters during inference
85+ logprobs_mode : processed_logprobs # Logprobs mode for sampling results
86+ device_group : # Logical device group for the sampler
87+ name : sampler
88+ ranks : 1 # Number of GPUs to use
89+ device_type : cuda
90+ device_mesh :
91+ device_type : cuda
92+ dp_size : 1
93+ queue_config :
94+ rps_limit : 100 # Max requests per second
95+ tps_limit : 100000 # Max tokens per second
96+ deployments :
97+ - name : SamplerManagement
98+ autoscaling_config :
99+ min_replicas : 1
100+ max_replicas : 1
101+ target_ongoing_requests : 16
102+ ray_actor_options :
103+ num_cpus : 0.1
104+ runtime_env :
105+ env_vars :
106+ TWINKLE_TRUST_REMOTE_CODE : "0"
106107
107108 # 4. Processor Service
108109 - name : processor
0 commit comments