Skip to content

Commit fe28585

Browse files
committed
update run.sh
1 parent d722128 commit fe28585

File tree

3 files changed

+31
-22
lines changed

3 files changed

+31
-22
lines changed

cookbook/client/server/megatron/run.sh

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
# --gpu-workers LIST GPU Worker 列表,分号分隔多个节点 (默认: 4,5,6,7)
1313
# --cpu-workers N CPU Worker 数量 (默认: 1)
1414
# --temp-dir DIR Ray 临时目录 (默认: /dashscope/caches/application/ray_logs)
15+
# --save-dir DIR Twinkle 模型保存目录 (默认: /dashscope/caches/application/save)
16+
# --server-config FILE Twinkle 服务器配置文件路径 (默认: /twinkle/cookbook/client/server/megatron/server_config.yaml)
1517
# --help 显示帮助信息
1618
#
1719
# 示例:
@@ -49,6 +51,8 @@ RAY_ADDRESS="127.0.0.1:$RAY_PORT"
4951
# --- 路径配置 ---
5052
DEFAULT_TEMP_DIR="/dashscope/caches/application/ray_logs"
5153
LOG_FILE="run.log"
54+
DEFAULT_SAVE_DIR="/dashscope/caches/application/save"
55+
DEFAULT_SERVER_CONFIG_FILE="/twinkle/cookbook/client/server/megatron/server_config.yaml"
5256

5357
# --- Prometheus 监控配置 ---
5458
PROMETHEUS_BIN="/dashscope/caches/application/monitor/prometheus-3.10.0.linux-amd64/prometheus"
@@ -67,6 +71,8 @@ HEAD_NODE="0,1,2,3"
6771
GPU_WORKERS_INPUT="4,5,6,7"
6872
CPU_WORKER_COUNT="1"
6973
TEMP_DIR="$DEFAULT_TEMP_DIR"
74+
SAVE_DIR="$DEFAULT_SAVE_DIR"
75+
SERVER_CONFIG_FILE="$DEFAULT_SERVER_CONFIG_FILE"
7076

7177
# 解析命名参数
7278
while [[ $# -gt 0 ]]; do
@@ -103,6 +109,22 @@ while [[ $# -gt 0 ]]; do
103109
TEMP_DIR="${1#*=}"
104110
shift
105111
;;
112+
--save-dir)
113+
SAVE_DIR="$2"
114+
shift 2
115+
;;
116+
--save-dir=*)
117+
SAVE_DIR="${1#*=}"
118+
shift
119+
;;
120+
--server-config)
121+
SERVER_CONFIG_FILE="$2"
122+
shift 2
123+
;;
124+
--server-config=*)
125+
SERVER_CONFIG_FILE="${1#*=}"
126+
shift
127+
;;
106128
--help|-h)
107129
echo "用法: ./run.sh [选项]"
108130
echo ""
@@ -111,6 +133,8 @@ while [[ $# -gt 0 ]]; do
111133
echo " --gpu-workers LIST GPU Worker 列表,分号分隔多个节点 (默认: 4,5,6,7)"
112134
echo " --cpu-workers N CPU Worker 数量 (默认: 1)"
113135
echo " --temp-dir DIR Ray 临时目录"
136+
echo " --save-dir DIR Twinkle 模型保存目录 (默认: $DEFAULT_SAVE_DIR)"
137+
echo " --server-config FILE Twinkle 服务器配置文件路径 (默认: $DEFAULT_SERVER_CONFIG_FILE)"
114138
echo " --help, -h 显示帮助信息"
115139
echo ""
116140
echo "示例:"
@@ -129,6 +153,9 @@ while [[ $# -gt 0 ]]; do
129153
esac
130154
done
131155

156+
# 将 SAVE_DIR export 给子进程(python server 通过环境变量读取)
157+
export TWINKLE_DEFAULT_SAVE_DIR="$SAVE_DIR"
158+
132159
# 将分号分隔的字符串转为数组
133160
if [ -z "$GPU_WORKERS_INPUT" ]; then
134161
GPU_WORKERS=()
@@ -222,6 +249,8 @@ echo ""
222249
print_info "运行参数:"
223250
echo " - Ray 地址: $RAY_ADDRESS"
224251
echo " - 临时目录: $TEMP_DIR"
252+
echo " - 保存目录: $TWINKLE_DEFAULT_SAVE_DIR"
253+
echo " - 服务配置: $SERVER_CONFIG_FILE"
225254
echo " - 日志文件: $LOG_FILE"
226255
echo ""
227256

@@ -334,7 +363,7 @@ print_info "日志输出到: $LOG_FILE"
334363
echo ""
335364

336365
# 启动服务器并实时显示日志
337-
nohup python server.py > "$LOG_FILE" 2>&1 &
366+
nohup python -m twinkle.server --config "$SERVER_CONFIG_FILE" > "$LOG_FILE" 2>&1 &
338367
SERVER_PID=$!
339368

340369
# 实时显示日志

cookbook/client/server/megatron/server.py

Lines changed: 0 additions & 21 deletions
This file was deleted.

cookbook/client/server/megatron/server_config.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ applications:
5050
enable_lora: true # Allow loading LoRA adapters during inference
5151
max_loras: 5 # Max allowed loras working on vLLM at the same time
5252
max_lora_rank: 32 # Support up to rank 32 LoRA adapters
53+
enable_tower_connector_lora: true
5354
device_group: # Logical device group for the sampler
5455
name: sampler
5556
gpus_per_worker: 2

0 commit comments

Comments
 (0)