@@ -71,37 +71,37 @@ applications:
7171
7272 # 3. Sampler Service - Runs inference / sampling using vLLM engine
7373 # Used for generating text from the model (e.g., evaluating LoRA results).
74- # - name: sampler-Qwen2.5-7B-Instruct
75- # route_prefix: /api/v1/sampler/Qwen/Qwen2.5-7B-Instruct
76- # import_path: sampler
77- # args:
78- # model_id: "ms://Qwen/Qwen2.5-7B-Instruct" # ModelScope model identifier
79- # nproc_per_node: 2 # Number of GPU processes per node
80- # sampler_type: vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
81- # engine_args: # vLLM engine-specific settings
82- # max_model_len: 4096 # Maximum sequence length the engine supports
83- # gpu_memory_utilization: 0.5 # Fraction of GPU memory to use (0.0-1.0)
84- # enable_lora: true # Allow loading LoRA adapters during inference
85- # logprobs_mode: processed_logprobs # Logprobs mode for sampling results
86- # device_group: # Logical device group for the sampler
87- # name: sampler
88- # ranks: [2] # GPU rank indices to use
89- # device_type: cuda
90- # device_mesh:
91- # device_type: cuda
92- # dp_size: 1
93- # queue_config:
94- # rps_limit: 100 # Max requests per second
95- # tps_limit: 100000 # Max tokens per second
96- # deployments:
97- # - name: SamplerManagement
98- # autoscaling_config:
99- # min_replicas: 1
100- # max_replicas: 1
101- # target_ongoing_requests: 16
102- # ray_actor_options:
103- # num_cpus: 0.1
104- # runtime_env:
105- # env_vars:
106- # TWINKLE_TRUST_REMOTE_CODE: "0"
107- # DEVICE_COUNT_PER_PHYSICAL_NODE: "8"
# Sampler application entry for Qwen2.5-7B-Instruct.
# NOTE(review): the diff view this entry was recovered from dropped all
# indentation; the nesting below is reconstructed from the key names and the
# Ray Serve config layout - confirm against the repository copy, and align the
# list-item indent with the sibling entries under `applications:`.
- name: sampler-Qwen2.5-7B-Instruct
  route_prefix: /api/v1/sampler/Qwen/Qwen2.5-7B-Instruct
  import_path: sampler
  args:
    model_id: "ms://Qwen/Qwen2.5-7B-Instruct"  # ModelScope model identifier
    # NOTE(review): nproc_per_node is 2 while device_group.ranks lists a single
    # index - confirm this is intended.
    nproc_per_node: 2  # Number of GPU processes per node
    sampler_type: vllm  # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
    engine_args:  # vLLM engine-specific settings
      max_model_len: 4096  # Maximum sequence length the engine supports
      gpu_memory_utilization: 0.5  # Fraction of GPU memory to use (0.0-1.0)
      enable_lora: true  # Allow loading LoRA adapters during inference
      logprobs_mode: processed_logprobs  # Logprobs mode for sampling results
    device_group:  # Logical device group for the sampler
      name: sampler
      ranks: [2]  # GPU rank indices to use
      device_type: cuda
    device_mesh:
      device_type: cuda
      dp_size: 1
    queue_config:
      rps_limit: 100  # Max requests per second
      tps_limit: 100000  # Max tokens per second
  deployments:
    - name: SamplerManagement
      autoscaling_config:
        min_replicas: 1
        max_replicas: 1
        target_ongoing_requests: 16
      ray_actor_options:
        num_cpus: 0.1
        runtime_env:
          env_vars:
            # Environment variable values are kept as quoted strings so the
            # YAML loader does not turn them into integers.
            TWINKLE_TRUST_REMOTE_CODE: "0"
            DEVICE_COUNT_PER_PHYSICAL_NODE: "8"
0 commit comments