@@ -7,7 +7,7 @@ proxy_location: EveryNode
 # HTTP listener settings
 http_options:
   host: 0.0.0.0  # Listen on all network interfaces
-  port: 8000  # Port number for the server
+  port: 9000  # Port number for the server

 # Applications: each entry defines a service component deployed on the server
 applications:
@@ -39,7 +39,6 @@ applications:
     import_path: model
     args:
       use_megatron: true
-      model_cls: Qwen3_5ForConditionalGeneration
       model_id: "ms://Qwen/Qwen3.5-4B"  # ModelScope model identifier
       max_length: 10240
       nproc_per_node: 2  # Number of GPU processes per node
@@ -52,8 +51,8 @@ applications:
       dp_size: 2
       queue_config:
         rps_limit: 100  # Max requests per second
-        tps_limit: 10000  # Max tokens per second for a single user
-        max_input_tokens: 10000  # Maximum input tokens per request
+        tps_limit: 100000  # Max tokens per second for a single user
+        max_input_tokens: 60000  # Maximum input tokens per request
       adapter_config:
         adapter_timeout: 30  # Seconds before idle adapter unload
         adapter_max_lifetime: 36000  # Maximum lifetime of an adapter in seconds (e.g., 10 hours)
@@ -80,8 +79,8 @@ applications:
       nproc_per_node: 2  # Number of GPU processes per node
       sampler_type: vllm  # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
       engine_args:  # vLLM engine-specific settings
-        max_model_len: 4096  # Maximum sequence length the engine supports
-        gpu_memory_utilization: 0.5  # Fraction of GPU memory to use (0.0-1.0)
+        max_model_len: 16000  # Maximum sequence length the engine supports
+        gpu_memory_utilization: 0.7  # Fraction of GPU memory to use (0.0-1.0)
       enable_lora: true  # Allow loading LoRA adapters during inference
       logprobs_mode: processed_logprobs  # Logprobs mode for sampling results
       device_group:  # Logical device group for the sampler