Skip to content

Commit a3cd140

Browse files
committed
update
1 parent d61482c commit a3cd140

File tree

4 files changed

+9
-9
lines changed

4 files changed

+9
-9
lines changed
Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
export RAY_ROTATION_MAX_BYTES=1024
22
export RAY_ROTATION_BACKUP_COUNT=1
3-
CUDA_VISIBLE_DEVICES=0,1,2,3 ray start --head --port=6379 --num-gpus=4 --disable-usage-stats --temp-dir=/dashscope/caches/application/ray_logs
3+
CUDA_VISIBLE_DEVICES=0,1,2,3 ray start --head --port=6379 --num-gpus=4 --disable-usage-stats --include-dashboard=true --temp-dir=/dashscope/caches/application/ray_logs
44
CUDA_VISIBLE_DEVICES="" ray start --address=127.0.0.1:6379 --num-gpus=0
55
python server.py

cookbook/client/server/megatron/server.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@
1515

1616
# Resolve the path to server_config.yaml relative to this script's location
1717
file_dir = os.path.abspath(os.path.dirname(__file__))
18-
config_path = os.path.join(file_dir, 'server_config.yaml')
18+
config_path = os.path.join(file_dir, 'server_config_4b.yaml')
1919

2020
# Launch the Twinkle server — this call blocks until the server is shut down
2121
launch_server(config_path=config_path)

cookbook/client/server/megatron/server_config_4b.yaml

Lines changed: 5 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@ proxy_location: EveryNode
77
# HTTP listener settings
88
http_options:
99
host: 0.0.0.0 # Listen on all network interfaces
10-
port: 8000 # Port number for the server
10+
port: 9000 # Port number for the server
1111

1212
# Applications: each entry defines a service component deployed on the server
1313
applications:
@@ -39,7 +39,6 @@ applications:
3939
import_path: model
4040
args:
4141
use_megatron: true
42-
model_cls: Qwen3_5ForConditionalGeneration
4342
model_id: "ms://Qwen/Qwen3.5-4B" # ModelScope model identifier
4443
max_length: 10240
4544
nproc_per_node: 2 # Number of GPU processes per node
@@ -52,8 +51,8 @@ applications:
5251
dp_size: 2
5352
queue_config:
5453
rps_limit: 100 # Max requests per second
55-
tps_limit: 10000 # Max tokens per second for a single user
56-
max_input_tokens: 10000 # Maximum input tokens per request
54+
tps_limit: 100000 # Max tokens per second for a single user
55+
max_input_tokens: 60000 # Maximum input tokens per request
5756
adapter_config:
5857
adapter_timeout: 30 # Seconds before idle adapter unload
5958
adapter_max_lifetime: 36000 # Maximum lifetime of an adapter in seconds (e.g., 10 hours)
@@ -80,8 +79,8 @@ applications:
8079
nproc_per_node: 2 # Number of GPU processes per node
8180
sampler_type: vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
8281
engine_args: # vLLM engine-specific settings
83-
max_model_len: 4096 # Maximum sequence length the engine supports
84-
gpu_memory_utilization: 0.5 # Fraction of GPU memory to use (0.0-1.0)
82+
max_model_len: 16000 # Maximum sequence length the engine supports
83+
gpu_memory_utilization: 0.7 # Fraction of GPU memory to use (0.0-1.0)
8584
enable_lora: true # Allow loading LoRA adapters during inference
8685
logprobs_mode: processed_logprobs # Logprobs mode for sampling results
8786
device_group: # Logical device group for the sampler
Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1 +1,2 @@
1-
./prometheus --config.file=/dashscope/caches/application/ray_logs/session_latest/metrics/prometheus/prometheus.yml
1+
# refer to https://docs.ray.io/en/latest/cluster/metrics.html#recommended-use-ray-dashboard-with-embedded-grafana-visualizations
2+
/dashscope/caches/application/monitor/prometheus-3.10.0.linux-amd64/prometheus --config.file=/dashscope/caches/application/ray_logs/session_latest/metrics/prometheus/prometheus.yml

0 commit comments

Comments (0)