Skip to content

Commit dca29d4

Browse files
authored
Add server metrics monitor and DPO client (#132)
1 parent 9b4d0f0 commit dca29d4

File tree

35 files changed

+1717
-261
lines changed

35 files changed

+1717
-261
lines changed
Lines changed: 339 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,341 @@
1+
#!/bin/bash
2+
3+
# ============================================
4+
# Twinkle Megatron 服务启动脚本
5+
# ============================================
6+
# 功能:启动 Ray 集群(支持多 GPU/CPU 节点)、Prometheus 监控和 Twinkle 服务器
7+
#
8+
# 用法:./run.sh [选项]
9+
#
10+
# 选项:
11+
# --head NODE Head 节点 GPU 设备列表,逗号分隔 (默认: 0,1,2,3)
12+
# --gpu-workers LIST GPU Worker 列表,分号分隔多个节点 (默认: 4,5,6,7)
13+
# --cpu-workers N CPU Worker 数量 (默认: 1)
14+
# --temp-dir DIR Ray 临时目录 (默认: /dashscope/caches/application/ray_logs)
15+
# --help 显示帮助信息
16+
#
17+
# 示例:
18+
# ./run.sh # 使用默认配置
19+
# ./run.sh --head "0,1,2,3" --gpu-workers "4,5,6,7" --cpu-workers 1
20+
# ./run.sh --head "0,1,2,3" --gpu-workers "" --cpu-workers 0
21+
# ./run.sh --head "" --cpu-workers 4 # 纯 CPU 模式
22+
# ./run.sh --temp-dir /tmp/my_ray_logs # 自定义临时目录
23+
# ============================================
24+
25+
set -e # 遇到错误立即退出
26+
27+
# ============================================
28+
# 配置区(根据你的环境修改)
29+
# ============================================
30+
31+
# --- Ray 集群配置 ---
32+
# Head 节点(必须是第一个启动)
33+
# 格式:逗号分隔的 GPU 设备列表,如 "0,1,2,3"(GPU 数量按设备个数自动计算)
34+
# 如果不需要 GPU,设为空字符串 ""
35+
# 可通过命令行参数 --head 传入
36+
37+
# GPU Worker 节点列表(可以有多个)
38+
# 格式:用分号分隔多个节点,每个节点为逗号分隔的 GPU 设备列表
39+
# 示例:"4,5,6,7" 或 "4,5,6,7;8,9,10,11"
40+
# 可通过命令行参数 --gpu-workers 传入
41+
42+
# CPU Worker 数量
43+
# 可通过命令行参数 --cpu-workers 传入
44+
45+
# --- Network settings ---
46+
# Port handed to `ray start --head --port=...` for the head node.
RAY_PORT=6379
47+
# Address every worker passes to `ray start --address=...`. It is the loopback
# address, so all Ray nodes started by this script live on the local machine.
RAY_ADDRESS="127.0.0.1:$RAY_PORT"
48+
49+
# --- Paths ---
50+
# Default for the Ray temp directory; used unless --temp-dir overrides it.
DEFAULT_TEMP_DIR="/dashscope/caches/application/ray_logs"
51+
# File that receives the server's stdout/stderr (relative to the current directory).
LOG_FILE="run.log"
52+
53+
# --- Prometheus monitoring ---
54+
# Prometheus executable; monitoring is skipped when this file does not exist.
PROMETHEUS_BIN="/dashscope/caches/application/monitor/prometheus-3.10.0.linux-amd64/prometheus"
55+
# Path of the Prometheus config relative to the Ray temp directory; joined with
# TEMP_DIR to form PROMETHEUS_CONFIG.
PROMETHEUS_CONFIG_SUFFIX="session_latest/metrics/prometheus/prometheus.yml"
56+
57+
# --- Ray log rotation ---
# Exported so that processes started from this script inherit them.
# NOTE(review): presumably read by Ray as the per-file size limit (bytes) and the
# number of rotated backups; 1024 bytes looks very small — confirm it is intended.
158
export RAY_ROTATION_MAX_BYTES=1024
259
export RAY_ROTATION_BACKUP_COUNT=1
3-
CUDA_VISIBLE_DEVICES=0,1,2,3 ray start --head --port=6379 --num-gpus=4 --disable-usage-stats --include-dashboard=false
4-
CUDA_VISIBLE_DEVICES=4,5,6,7 ray start --address=127.0.0.1:6379 --num-gpus=4
5-
CUDA_VISIBLE_DEVICES="" ray start --address=127.0.0.1:6379 --num-gpus=0
6-
python "$(dirname "$0")/server.py"
60+
61+
# ============================================
62+
# 参数解析(支持 --key=value 或 --key value 格式)
63+
# ============================================
64+
65+
# Defaults; each one can be overridden by the matching command-line option.
HEAD_NODE="0,1,2,3"
GPU_WORKERS_INPUT="4,5,6,7"
CPU_WORKER_COUNT="1"
TEMP_DIR="$DEFAULT_TEMP_DIR"

# Print the usage text to stdout.
print_usage() {
  echo "用法: ./run.sh [选项]"
  echo ""
  echo "选项:"
  echo " --head NODE Head 节点 GPU 设备列表,逗号分隔 (默认: 0,1,2,3)"
  echo " --gpu-workers LIST GPU Worker 列表,分号分隔多个节点 (默认: 4,5,6,7)"
  echo " --cpu-workers N CPU Worker 数量 (默认: 1)"
  echo " --temp-dir DIR Ray 临时目录"
  echo " --help, -h 显示帮助信息"
  echo ""
  echo "示例:"
  echo " ./run.sh # 默认配置"
  echo " ./run.sh --head '0,1,2,3' --gpu-workers '4,5,6,7'"
  echo " ./run.sh --head '0,1,2,3,4,5,6,7' # 单机 8 卡"
  echo " ./run.sh --gpu-workers '4,5,6,7;8,9,10,11' # 多 GPU Worker"
  echo " ./run.sh --cpu-workers 4 --head '' # 纯 CPU 模式"
}

# Report a command-line error on stderr and exit with status 1.
# Deliberately self-contained: argument parsing runs before the coloured
# print_* helpers are defined, so calling them here would fail with
# "command not found".
#   $1 - error message
usage_error() {
  echo -e "\033[31m[ERROR]\033[0m $1" >&2
  echo "使用 --help 查看帮助" >&2
  exit 1
}

# Store one option value in the global it controls.
#   $1 - option name (e.g. --head)
#   $2 - value (may be empty, e.g. --head '' selects CPU-only mode)
set_option() {
  case "$1" in
    --head) HEAD_NODE="$2" ;;
    --gpu-workers) GPU_WORKERS_INPUT="$2" ;;
    --cpu-workers) CPU_WORKER_COUNT="$2" ;;
    --temp-dir) TEMP_DIR="$2" ;;
  esac
}

# Parse named options given as "--key value" or "--key=value".
# Globals written: HEAD_NODE, GPU_WORKERS_INPUT, CPU_WORKER_COUNT, TEMP_DIR
# Exits 0 after printing usage for --help/-h; exits 1 on an unknown option,
# a missing option value, or a non-numeric --cpu-workers.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --head | --gpu-workers | --cpu-workers | --temp-dir)
        # Check the argument count, not emptiness: an empty value is legal.
        # Without this check `shift 2` fails and `set -e` exits silently.
        [[ $# -ge 2 ]] || usage_error "参数 $1 缺少取值"
        set_option "$1" "$2"
        shift 2
        ;;
      --head=* | --gpu-workers=* | --cpu-workers=* | --temp-dir=*)
        set_option "${1%%=*}" "${1#*=}"
        shift
        ;;
      --help | -h)
        print_usage
        exit 0
        ;;
      *)
        usage_error "未知参数: $1"
        ;;
    esac
  done

  # The count is later used in integer tests and an arithmetic loop, so
  # reject anything that is not a plain non-negative integer up front.
  [[ "$CPU_WORKER_COUNT" =~ ^[0-9]+$ ]] \
    || usage_error "--cpu-workers 必须是非负整数: $CPU_WORKER_COUNT"
  # Force base 10 so values such as "08" are not misread as invalid octal.
  CPU_WORKER_COUNT=$((10#$CPU_WORKER_COUNT))
}

parse_args "$@"
131+
132+
# 将分号分隔的字符串转为数组
133+
# Turn the ';'-separated worker list into an array; an empty input yields an
# empty array (no GPU workers).
GPU_WORKERS=()
if [ -n "$GPU_WORKERS_INPUT" ]; then
  IFS=';' read -ra GPU_WORKERS <<< "$GPU_WORKERS_INPUT"
fi

# Full path of the Prometheus config inside the Ray temp directory.
PROMETHEUS_CONFIG="$TEMP_DIR/$PROMETHEUS_CONFIG_SUFFIX"
140+
141+
# ============================================
142+
# 辅助函数
143+
# ============================================
144+
# Print an informational message with a cyan "[INFO]" tag.
#   $1 - message text (backslash escapes are interpreted)
print_info() {
  local message="$1"
  local tag='\033[36m[INFO]\033[0m'
  echo -e "${tag} ${message}"
}
147+
148+
# Print a success message with a green "[SUCCESS]" tag.
#   $1 - message text (backslash escapes are interpreted)
print_success() {
  local message="$1"
  local tag='\033[32m[SUCCESS]\033[0m'
  echo -e "${tag} ${message}"
}
151+
152+
# Print a warning message with a yellow "[WARNING]" tag.
#   $1 - message text (backslash escapes are interpreted)
print_warning() {
  local message="$1"
  local tag='\033[33m[WARNING]\033[0m'
  echo -e "${tag} ${message}"
}
155+
156+
# Print an error message with a red "[ERROR]" tag.
# Written to stderr so errors stay visible when stdout is redirected or piped.
#   $1 - message text (backslash escapes are interpreted)
print_error() {
  echo -e "\033[31m[ERROR]\033[0m $1" >&2
}
159+
160+
# Print a horizontal rule made of '=' characters.
print_separator() {
  printf '%s\n' "============================================"
}

# Print a section banner: a blank line, a rule, the bold blue title, a rule.
#   $1 - section title
print_header() {
  local title="$1"
  echo ""
  print_separator
  echo -e "\033[1;34m ${title} \033[0m"
  print_separator
}
170+
171+
# 解析节点配置 "devices" -> 返回 devices 和自动计算 _gpu_count
172+
# 示例: "0,1,2,3" -> devices="0,1,2,3", count=4
173+
# Parse one node specification into the device list and GPU count.
#
# Accepted forms:
#   ""            -> CPU-only node
#   "0,1,2,3"     -> devices "0,1,2,3", count derived from the list (4)
#   "0,1,2,3:4"   -> devices "0,1,2,3", explicit count 4 (the form described
#                    in the header of this script)
#
# Arguments: $1 - node specification
# Globals written:
#   _gpu_devices - comma-separated device list ("" for CPU-only)
#   _gpu_count   - number of GPUs (0 for CPU-only)
# Returns: 0
parse_node_config() {
  local config="$1"
  local explicit_count=""

  _gpu_devices=""
  _gpu_count=0

  # Split off an optional ":count" suffix so it never leaks into the device
  # list (which callers export as CUDA_VISIBLE_DEVICES).
  if [[ "$config" == *:* ]]; then
    explicit_count="${config##*:}"
    config="${config%%:*}"
  fi

  # No devices means a CPU-only node; the globals were already reset above.
  if [[ -z "$config" ]]; then
    return 0
  fi

  _gpu_devices="$config"
  if [[ "$explicit_count" =~ ^[0-9]+$ ]]; then
    _gpu_count=$((10#$explicit_count))
  else
    # Count = number of commas + 1; keep only the commas and measure them.
    local commas="${config//[^,]/}"
    _gpu_count=$((${#commas} + 1))
  fi
}
185+
186+
# ============================================
187+
# 开始启动
188+
# ============================================
189+
print_header "Twinkle Megatron 服务启动脚本"

# ---- Cluster layout summary ----
print_info "集群配置:"
echo ""

# Head node: list its GPUs, or mark it CPU-only when no devices were given.
parse_node_config "$HEAD_NODE"
if [ -z "$_gpu_devices" ]; then
  echo " [Head 节点] CPU only"
else
  printf '%s\n' \
    " [Head 节点]" \
    " - GPU 设备: $_gpu_devices" \
    " - GPU 数量: $_gpu_count"
fi

# GPU workers: one line per configured worker, numbered from 1.
gpu_worker_total=${#GPU_WORKERS[@]}
if [ "$gpu_worker_total" -gt 0 ]; then
  echo ""
  echo " [GPU Worker 节点] 共 ${gpu_worker_total}"
  worker_no=0
  for worker_spec in "${GPU_WORKERS[@]}"; do
    worker_no=$((worker_no + 1))
    parse_node_config "$worker_spec"
    echo " Worker ${worker_no}: GPU=$_gpu_devices, Count=$_gpu_count"
  done
fi

# CPU workers: only mentioned when at least one is requested.
if [ "$CPU_WORKER_COUNT" -gt 0 ]; then
  printf '%s\n' "" " [CPU Worker 节点] $CPU_WORKER_COUNT"
fi

# ---- Runtime parameters ----
echo ""
print_info "运行参数:"
printf '%s\n' \
  " - Ray 地址: $RAY_ADDRESS" \
  " - 临时目录: $TEMP_DIR" \
  " - 日志文件: $LOG_FILE" \
  ""

# Make sure the Ray temp directory exists before anything is started.
if ! [ -d "$TEMP_DIR" ]; then
  print_info "创建临时目录: $TEMP_DIR"
  mkdir -p "$TEMP_DIR"
fi
233+
234+
# ============================================
235+
# 停止已有 Ray 集群和 Prometheus
236+
# ============================================
237+
print_header "清理环境"

# Best effort: a failure only means no Ray cluster was running, so it must not
# abort the script under `set -e`.
print_info "停止已有的 Ray 集群..."
ray stop --force 2>/dev/null || true

# -x matches the process name exactly. A bare `pkill prometheus` is a regex
# match and would also kill unrelated processes whose names merely contain
# "prometheus". Non-zero status (nothing to kill) is expected and ignored.
print_info "停止已有的 Prometheus..."
pkill -x prometheus 2>/dev/null || true
243+
244+
# ============================================
245+
# 启动 Ray Head 节点
246+
# ============================================
247+
print_header "启动 Ray 集群"

# parse_node_config yields devices="" and count=0 for a CPU-only head, so one
# `ray start` invocation covers both the GPU and the CPU-only case; only the
# progress message differs.
parse_node_config "$HEAD_NODE"
if [ -z "$_gpu_devices" ]; then
  print_info "启动 Head 节点 (CPU only)..."
else
  print_info "启动 Head 节点 (GPU: $_gpu_devices)..."
fi

head_args=(
  --head
  --port="$RAY_PORT"
  --num-gpus="$_gpu_count"
  --disable-usage-stats
  --include-dashboard=true
  --temp-dir="$TEMP_DIR"
)
CUDA_VISIBLE_DEVICES="$_gpu_devices" ray start "${head_args[@]}"
print_success "Head 节点启动成功!"
268+
269+
# ============================================
270+
# 启动 GPU Worker 节点
271+
# ============================================
272+
# Start one Ray worker per GPU worker spec, each restricted to its own devices
# through CUDA_VISIBLE_DEVICES. Workers are numbered from 1 in the messages.
gpu_worker_no=0
for gpu_worker_spec in "${GPU_WORKERS[@]}"; do
  gpu_worker_no=$((gpu_worker_no + 1))
  parse_node_config "$gpu_worker_spec"
  print_info "启动 GPU Worker ${gpu_worker_no} (GPU: $_gpu_devices)..."
  CUDA_VISIBLE_DEVICES="$_gpu_devices" ray start \
    --address="$RAY_ADDRESS" \
    --num-gpus="$_gpu_count"
  print_success "GPU Worker ${gpu_worker_no} 启动成功!"
done
280+
281+
# ============================================
282+
# 启动 CPU Worker 节点
283+
# ============================================
284+
# Start the requested number of CPU-only Ray workers (GPUs hidden, --num-gpus=0).
if [ "$CPU_WORKER_COUNT" -gt 0 ]; then
  print_info "启动 $CPU_WORKER_COUNT 个 CPU Worker..."
  for ((started = 0; started < CPU_WORKER_COUNT; started++)); do
    CUDA_VISIBLE_DEVICES="" ray start \
      --address="$RAY_ADDRESS" \
      --num-gpus=0
  done
  print_success "CPU Worker 启动成功!"
fi

# ============================================
# Show cluster status
# ============================================
echo ""
print_info "集群状态:"
# Purely informational: a failing `ray status` must not stop the launch.
ray status 2>/dev/null || true
300+
301+
# ============================================
302+
# 启动 Prometheus 监控(可选)
303+
# ============================================
304+
print_header "启动监控(可选)"

# Monitoring is optional: it is skipped (with a warning) when either the
# Prometheus binary or the Ray-generated config is missing.
PROMETHEUS_PID=""
if [ -f "$PROMETHEUS_BIN" ]; then
  print_info "检测到 Prometheus,正在启动监控服务..."

  # The config is generated by Ray after startup. Poll for it (up to ~10s)
  # rather than relying on one fixed sleep, which skips monitoring whenever
  # Ray is slightly slower than expected.
  for ((waited = 0; waited < 10; waited++)); do
    if [ -f "$PROMETHEUS_CONFIG" ]; then
      break
    fi
    sleep 1
  done

  if [ -f "$PROMETHEUS_CONFIG" ]; then
    # Detached with nohup so Prometheus survives the terminal closing.
    nohup "$PROMETHEUS_BIN" --config.file="$PROMETHEUS_CONFIG" > prometheus.log 2>&1 &
    PROMETHEUS_PID=$!
    print_success "Prometheus 监控已启动 (PID: $PROMETHEUS_PID)"
    echo " - 监控日志: prometheus.log"
    echo " - 配置文件: $PROMETHEUS_CONFIG"
  else
    print_warning "Prometheus 配置文件不存在,跳过监控启动"
    echo " - 预期路径: $PROMETHEUS_CONFIG"
  fi
else
  print_warning "未检测到 Prometheus,跳过监控启动"
  echo " - 预期路径: $PROMETHEUS_BIN"
fi
327+
328+
# ============================================
329+
# 启动 Twinkle 服务器
330+
# ============================================
331+
print_header "启动 Twinkle 服务器"

print_info "日志输出到: $LOG_FILE"
echo ""

# Resolve server.py next to this script so the launch works from any working
# directory, not only when the script is run from its own folder.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Create/truncate the log before launching: the background job opens the file
# asynchronously, so `tail` could otherwise start before it exists and fail.
: > "$LOG_FILE"

# Run the server detached from the terminal, appending to the fresh log.
nohup python "$SCRIPT_DIR/server.py" >> "$LOG_FILE" 2>&1 &
SERVER_PID=$!

# Stream the log until the server exits (GNU tail --pid) instead of hanging
# forever on a dead server, then propagate the server's exit status.
tail --pid="$SERVER_PID" -f "$LOG_FILE"
wait "$SERVER_PID"

cookbook/client/server/megatron/server_config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ applications:
8787
nproc_per_node: 4 # Number of GPU processes per node
8888
device_group:
8989
name: model
90-
ranks: 4 # GPU rank indices
90+
ranks: 4
9191
device_type: cuda
9292
device_mesh:
9393
device_type: cuda

0 commit comments

Comments
 (0)