#   --gpu-workers LIST    GPU worker list; separate multiple nodes with semicolons (default: 4,5,6,7)
#   --cpu-workers N       Number of CPU workers (default: 1)
#   --temp-dir DIR        Ray temporary directory (default: /dashscope/caches/application/ray_logs)
#   --save-dir DIR        Twinkle model save directory (default: /dashscope/caches/application/save)
#   --server-config FILE  Path to the Twinkle server config file (default: /twinkle/cookbook/client/server/megatron/server_config.yaml)
#   --help                Show help information
1618#
1719# 示例:
@@ -49,6 +51,8 @@ RAY_ADDRESS="127.0.0.1:$RAY_PORT"
# --- Path configuration ---
# Built-in defaults; --temp-dir / --save-dir / --server-config override the
# values derived from them. Literals must not carry surrounding whitespace,
# otherwise every path built from them starts with a space.
DEFAULT_TEMP_DIR="/dashscope/caches/application/ray_logs"
# The server's stdout/stderr is redirected into this file and then tailed.
LOG_FILE="run.log"
DEFAULT_SAVE_DIR="/dashscope/caches/application/save"
DEFAULT_SERVER_CONFIG_FILE="/twinkle/cookbook/client/server/megatron/server_config.yaml"

# --- Prometheus monitoring configuration ---
PROMETHEUS_BIN="/dashscope/caches/application/monitor/prometheus-3.10.0.linux-amd64/prometheus"
@@ -67,6 +71,8 @@ HEAD_NODE="0,1,2,3"
# Effective option values, initialised to their defaults; the argument
# parser below overwrites them when the matching flag is given.
GPU_WORKERS_INPUT="4,5,6,7"
CPU_WORKER_COUNT="1"
TEMP_DIR="$DEFAULT_TEMP_DIR"
SAVE_DIR="$DEFAULT_SAVE_DIR"
SERVER_CONFIG_FILE="$DEFAULT_SERVER_CONFIG_FILE"
7076
7177# 解析命名参数
7278while [[ $# -gt 0 ]]; do
@@ -103,6 +109,22 @@ while [[ $# -gt 0 ]]; do
103109 TEMP_DIR=" ${1#* =} "
104110 shift
105111 ;;
112+ --save-dir)
113+ SAVE_DIR=" $2 "
114+ shift 2
115+ ;;
116+ --save-dir=* )
117+ SAVE_DIR=" ${1#* =} "
118+ shift
119+ ;;
120+ --server-config)
121+ SERVER_CONFIG_FILE=" $2 "
122+ shift 2
123+ ;;
124+ --server-config=* )
125+ SERVER_CONFIG_FILE=" ${1#* =} "
126+ shift
127+ ;;
106128 --help|-h)
107129 echo " 用法: ./run.sh [选项]"
108130 echo " "
@@ -111,6 +133,8 @@ while [[ $# -gt 0 ]]; do
111133 echo " --gpu-workers LIST GPU Worker 列表,分号分隔多个节点 (默认: 4,5,6,7)"
112134 echo " --cpu-workers N CPU Worker 数量 (默认: 1)"
113135 echo " --temp-dir DIR Ray 临时目录"
136+ echo " --save-dir DIR Twinkle 模型保存目录 (默认: $DEFAULT_SAVE_DIR )"
137+ echo " --server-config FILE Twinkle 服务器配置文件路径 (默认: $DEFAULT_SERVER_CONFIG_FILE )"
114138 echo " --help, -h 显示帮助信息"
115139 echo " "
116140 echo " 示例:"
@@ -129,6 +153,9 @@ while [[ $# -gt 0 ]]; do
129153 esac
130154done
131155
# Export the save directory to child processes (the Python server reads it
# from the environment).
export TWINKLE_DEFAULT_SAVE_DIR="$SAVE_DIR"
158+
132159# 将分号分隔的字符串转为数组
133160if [ -z " $GPU_WORKERS_INPUT " ]; then
134161 GPU_WORKERS=()
@@ -222,6 +249,8 @@ echo ""
# Summarise the effective settings before anything is started.
print_info "运行参数:"
echo "  - Ray 地址: $RAY_ADDRESS"
echo "  - 临时目录: $TEMP_DIR"
echo "  - 保存目录: $TWINKLE_DEFAULT_SAVE_DIR"
echo "  - 服务配置: $SERVER_CONFIG_FILE"
echo "  - 日志文件: $LOG_FILE"
echo ""
227256
235264# 停止已有 Ray 集群和 Prometheus
236265# ============================================
print_header "清理环境"

# `pkill -f` / `pgrep -f` match a regex against the FULL command line of every
# process, which includes this script when one of its arguments happens to
# contain the pattern (e.g. --save-dir /data/vllm_ckpt). The two helpers below
# apply the same matching but always leave this script's own PID ($$) alone.
# They must be called as plain commands (not inside a pipeline or $(...)),
# so that no extra subshell carrying this script's command line is created.

# Send signal $1 (e.g. TERM, KILL) to every process other than this script
# whose command line matches regex $2. Best-effort: never fails.
_kill_others() {
    pgrep -f -- "$2" 2>/dev/null \
        | grep -vx -- "$$" \
        | xargs -r kill "-$1" 2>/dev/null || true
}

# Succeed if some process other than this script matches regex $1.
# grep reads all input (no -q) so pgrep is never cut off by SIGPIPE.
_others_alive() {
    pgrep -f -- "$1" 2>/dev/null | grep -vx -- "$$" >/dev/null
}

# Stop the Twinkle server (the twinkle.server module); the dot is escaped so
# it only matches a literal '.'.
print_info "停止已有的 Twinkle Server..."
_kill_others TERM 'twinkle\.server'

# Stop vLLM processes.
print_info "停止已有的 vLLM 进程..."
_kill_others TERM 'vllm'

# Give the processes a moment to exit on SIGTERM.
sleep 2

# Anything still alive gets SIGKILL.
if _others_alive 'twinkle\.server'; then
    print_warning "Twinkle Server 未退出,强制终止..."
    _kill_others KILL 'twinkle\.server'
fi
if _others_alive 'vllm'; then
    print_warning "vLLM 进程未退出,强制终止..."
    _kill_others KILL 'vllm'
fi

print_info "停止已有的 Ray 集群..."
# Best-effort: there may be no cluster running.
ray stop --force 2>/dev/null || true
240291
@@ -334,8 +385,10 @@ print_info "日志输出到: $LOG_FILE"
334385echo " "
335386
# Start the server and stream its log.
# Fail fast on a missing config file: otherwise the server dies immediately
# and `tail -f` below would sit on a dead log forever.
if [[ ! -f "$SERVER_CONFIG_FILE" ]]; then
    echo "错误: 服务器配置文件不存在: $SERVER_CONFIG_FILE" >&2
    exit 1
fi

# Pre-create the log so `tail -f` cannot race the background redirection.
touch "$LOG_FILE"
# Only the module entry point is launched; the legacy `python server.py`
# invocation it replaces must not run alongside it.
nohup python -m twinkle.server --config "$SERVER_CONFIG_FILE" > "$LOG_FILE" 2>&1 &
SERVER_PID=$!
print_success "Twinkle Server 已启动 (PID: $SERVER_PID)"

# Follow the log in the foreground.
tail -f "$LOG_FILE"
0 commit comments