|
| 1 | +#!/bin/bash |
| 2 | + |
| 3 | +# ============================================ |
| 4 | +# Twinkle Megatron 服务启动脚本 |
| 5 | +# ============================================ |
| 6 | +# 功能:启动 Ray 集群(支持多 GPU/CPU 节点)、Prometheus 监控和 Twinkle 服务器 |
| 7 | +# |
| 8 | +# 用法:./run.sh [选项] |
| 9 | +# |
| 10 | +# 选项: |
| 11 | +# --head NODE Head 节点 GPU 设备列表,逗号分隔 (默认: 0,1,2,3) |
| 12 | +# --gpu-workers LIST GPU Worker 设备列表,分号分隔多个节点 (默认: 4,5,6,7) |
| 13 | +# --cpu-workers N CPU Worker 数量 (默认: 1) |
| 14 | +# --temp-dir DIR Ray 临时目录 (默认: /dashscope/caches/application/ray_logs) |
| 15 | +# --help 显示帮助信息 |
| 16 | +# |
| 17 | +# 示例: |
| 18 | +# ./run.sh # 使用默认配置 |
| 19 | +# ./run.sh --head "0,1,2,3" --gpu-workers "4,5,6,7" --cpu-workers 1 |
| 20 | +# ./run.sh --head "0,1,2,3" --gpu-workers "" --cpu-workers 0 |
| 21 | +# ./run.sh --head "" --cpu-workers 4 # 纯 CPU 模式 |
| 22 | +# ./run.sh --temp-dir /tmp/my_ray_logs # 自定义临时目录 |
| 23 | +# ============================================ |
| 24 | + |
| 25 | +set -e # 遇到错误立即退出 |
| 26 | + |
| 27 | +# ============================================ |
| 28 | +# 配置区(根据你的环境修改) |
| 29 | +# ============================================ |
| 30 | + |
| 31 | +# --- Ray 集群配置 --- |
| 32 | +# Head 节点(必须是第一个启动) |
| 33 | +# 格式:逗号分隔的 GPU 设备列表,如 "0,1,2,3"(GPU 数量按设备个数自动计算) |
| 34 | +# 如果不需要 GPU,设为空字符串 "" |
| 35 | +# 可通过命令行参数 --head 传入 |
| 36 | + |
| 37 | +# GPU Worker 节点列表(可以有多个) |
| 38 | +# 格式:用分号分隔的 GPU 设备列表,每段对应一个 Worker 节点 |
| 39 | +# 示例:"4,5,6,7" 或 "4,5,6,7;8,9,10,11" |
| 40 | +# 可通过命令行参数 --gpu-workers 传入 |
| 41 | + |
| 42 | +# CPU Worker 数量 |
| 43 | +# 可通过命令行参数 --cpu-workers 传入 |
| 44 | + |
| 45 | +# --- 网络配置 --- |
# --- Network ---
# Port used by the Ray head node; workers join the cluster via RAY_ADDRESS.
RAY_PORT=6379
RAY_ADDRESS="127.0.0.1:${RAY_PORT}"

# --- Paths ---
# Ray temp directory used when --temp-dir is not given.
DEFAULT_TEMP_DIR='/dashscope/caches/application/ray_logs'
# File that receives the Twinkle server's stdout/stderr.
LOG_FILE='run.log'

# --- Prometheus monitoring ---
PROMETHEUS_BIN='/dashscope/caches/application/monitor/prometheus-3.10.0.linux-amd64/prometheus'
# Location of the Prometheus config file, relative to the Ray temp directory.
PROMETHEUS_CONFIG_SUFFIX='session_latest/metrics/prometheus/prometheus.yml'

# --- Ray log rotation ---
export RAY_ROTATION_MAX_BYTES=1024 RAY_ROTATION_BACKUP_COUNT=1
3 | | -CUDA_VISIBLE_DEVICES=0,1,2,3 ray start --head --port=6379 --num-gpus=4 --disable-usage-stats --include-dashboard=false |
4 | | -CUDA_VISIBLE_DEVICES=4,5,6,7 ray start --address=127.0.0.1:6379 --num-gpus=4 |
5 | | -CUDA_VISIBLE_DEVICES="" ray start --address=127.0.0.1:6379 --num-gpus=0 |
6 | | -python "$(dirname "$0")/server.py" |
| 60 | + |
| 61 | +# ============================================ |
| 62 | +# 参数解析(支持 --key=value 或 --key value 格式) |
| 63 | +# ============================================ |
| 64 | + |
# Defaults; each can be overridden by the matching command-line option.
HEAD_NODE="0,1,2,3"
GPU_WORKERS_INPUT="4,5,6,7"
CPU_WORKER_COUNT="1"
TEMP_DIR="$DEFAULT_TEMP_DIR"

#######################################
# Print the command-line help text.
# Outputs: usage text on stdout
#######################################
show_usage() {
  echo "用法: ./run.sh [选项]"
  echo ""
  echo "选项:"
  echo " --head NODE Head 节点 GPU 设备列表,逗号分隔 (默认: 0,1,2,3)"
  echo " --gpu-workers LIST GPU Worker 列表,分号分隔多个节点 (默认: 4,5,6,7)"
  echo " --cpu-workers N CPU Worker 数量 (默认: 1)"
  echo " --temp-dir DIR Ray 临时目录"
  echo " --help, -h 显示帮助信息"
  echo ""
  echo "示例:"
  echo " ./run.sh # 默认配置"
  echo " ./run.sh --head '0,1,2,3' --gpu-workers '4,5,6,7'"
  echo " ./run.sh --head '0,1,2,3,4,5,6,7' # 单机 8 卡"
  echo " ./run.sh --gpu-workers '4,5,6,7;8,9,10,11' # 多 GPU Worker"
  echo " ./run.sh --cpu-workers 4 --head '' # 纯 CPU 模式"
}

#######################################
# Parse named options given as "--key value" or "--key=value".
# Globals:   HEAD_NODE, GPU_WORKERS_INPUT, CPU_WORKER_COUNT, TEMP_DIR (written)
# Arguments: the script's command-line arguments
# Outputs:   help text on stdout for --help/-h; error messages on stderr
# Returns:   exits 0 after printing help; exits 1 on an unknown option or
#            when a value option is the last argument
#######################################
parse_args() {
  local key value
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --head=* | --gpu-workers=* | --cpu-workers=* | --temp-dir=*)
        key="${1%%=*}"
        value="${1#*=}"
        shift
        ;;
      --head | --gpu-workers | --cpu-workers | --temp-dir)
        # An explicit empty value ("--head ''") is valid, so only the argument
        # count is checked, not the value itself.
        if [[ $# -lt 2 ]]; then
          printf '\033[31m[ERROR]\033[0m %s\n' "参数 $1 缺少取值" >&2
          echo "使用 --help 查看帮助" >&2
          exit 1
        fi
        key="$1"
        value="$2"
        shift 2
        ;;
      --help | -h)
        show_usage
        exit 0
        ;;
      *)
        # Written with printf rather than a print_* helper: this code runs
        # before the helper functions are defined further down the script.
        printf '\033[31m[ERROR]\033[0m %s\n' "未知参数: $1" >&2
        echo "使用 --help 查看帮助" >&2
        exit 1
        ;;
    esac

    case "$key" in
      --head) HEAD_NODE="$value" ;;
      --gpu-workers) GPU_WORKERS_INPUT="$value" ;;
      --cpu-workers) CPU_WORKER_COUNT="$value" ;;
      --temp-dir) TEMP_DIR="$value" ;;
    esac
  done
}

parse_args "$@"
| 131 | + |
# Split the semicolon-separated worker list into an array; an empty input
# leaves the array empty (no GPU workers).
GPU_WORKERS=()
if [[ -n "$GPU_WORKERS_INPUT" ]]; then
  IFS=';' read -r -a GPU_WORKERS <<< "$GPU_WORKERS_INPUT"
fi

# Full path of the Prometheus config file inside the Ray temp directory.
PROMETHEUS_CONFIG="${TEMP_DIR}/${PROMETHEUS_CONFIG_SUFFIX}"
| 140 | + |
| 141 | +# ============================================ |
| 142 | +# 辅助函数 |
| 143 | +# ============================================ |
# Print an informational message with a cyan [INFO] tag on stdout.
# Arguments: $1 - message text (backslash escapes are expanded)
print_info() {
  local message="$1"
  printf '\033[36m[INFO]\033[0m %b\n' "$message"
}
| 147 | + |
# Print a success message with a green [SUCCESS] tag on stdout.
# Arguments: $1 - message text (backslash escapes are expanded)
print_success() {
  local message="$1"
  printf '\033[32m[SUCCESS]\033[0m %b\n' "$message"
}
| 151 | + |
# Print a warning message with a yellow [WARNING] tag on stdout.
# Arguments: $1 - message text (backslash escapes are expanded)
print_warning() {
  local message="$1"
  printf '\033[33m[WARNING]\033[0m %b\n' "$message"
}
| 155 | + |
# Print an error message with a red [ERROR] tag.
# Arguments: $1 - message text (backslash escapes are expanded)
# Outputs:   the tagged message on stderr, so diagnostics stay out of
#            captured or piped stdout
print_error() {
  echo -e "\033[31m[ERROR]\033[0m $1" >&2
}
| 159 | + |
# Print a horizontal rule made of '=' characters on stdout.
print_separator() {
  printf '%s\n' '============================================'
}
| 163 | + |
# Print a section banner: a blank line, a separator, the bold blue title,
# and a closing separator.
# Arguments: $1 - section title (backslash escapes are expanded)
print_header() {
  local title="$1"
  printf '\n'
  print_separator
  printf '\033[1;34m %b \033[0m\n' "$title"
  print_separator
}
| 170 | + |
#######################################
# Parse one node's GPU specification.
# Accepts a comma-separated device list ("0,1,2,3"), optionally followed by
# ":count" ("0,1,2,3:4"). The suffix is stripped so it never leaks into
# CUDA_VISIBLE_DEVICES; a numeric count overrides the derived one.
# Globals:   _gpu_devices (written) - device list without any ":count" suffix
#            _gpu_count   (written) - number of GPUs; 0 for an empty list
# Arguments: $1 - node specification; empty means a CPU-only node
# Examples:  "0,1,2,3" -> _gpu_devices="0,1,2,3", _gpu_count=4
#            ""        -> _gpu_devices="",        _gpu_count=0
#######################################
parse_node_config() {
  local config="$1"
  local explicit_count=""
  local commas

  if [[ "$config" == *:* ]]; then
    explicit_count="${config##*:}"
    config="${config%%:*}"
  fi

  _gpu_devices="$config"
  if [[ -z "$config" ]]; then
    _gpu_count=0
    return 0
  fi

  if [[ "$explicit_count" =~ ^[0-9]+$ ]]; then
    # Force base 10 so a count such as "08" is not read as octal.
    _gpu_count=$((10#$explicit_count))
  else
    # Device count = number of commas + 1.
    commas="${config//[^,]/}"
    _gpu_count=$((${#commas} + 1))
  fi
}
| 185 | + |
| 186 | +# ============================================ |
| 187 | +# 开始启动 |
| 188 | +# ============================================ |
print_header "Twinkle Megatron 服务启动脚本"

# Reject a non-numeric CPU worker count up front. Without this check the
# numeric tests below only print a "[: integer expression expected" error,
# evaluate as false, and the CPU workers are silently skipped.
if ! [[ "$CPU_WORKER_COUNT" =~ ^[0-9]+$ ]]; then
  print_error "--cpu-workers 必须是非负整数: $CPU_WORKER_COUNT" >&2
  exit 1
fi
# Normalise to base 10 so a value such as "08" is not treated as invalid
# octal by arithmetic contexts later in the script.
CPU_WORKER_COUNT=$((10#$CPU_WORKER_COUNT))

# Print the cluster layout.
print_info "集群配置:"
echo ""

# Head node: parse_node_config fills _gpu_devices and _gpu_count.
parse_node_config "$HEAD_NODE"
if [[ -n "$_gpu_devices" ]]; then
  echo " [Head 节点]"
  echo " - GPU 设备: $_gpu_devices"
  echo " - GPU 数量: $_gpu_count"
else
  echo " [Head 节点] CPU only"
fi

# GPU worker nodes, one line per worker.
if [[ ${#GPU_WORKERS[@]} -gt 0 ]]; then
  echo ""
  echo " [GPU Worker 节点] 共 ${#GPU_WORKERS[@]} 个"
  for i in "${!GPU_WORKERS[@]}"; do
    parse_node_config "${GPU_WORKERS[$i]}"
    echo " Worker $((i+1)): GPU=$_gpu_devices, Count=$_gpu_count"
  done
fi

# CPU-only worker nodes.
if [[ "$CPU_WORKER_COUNT" -gt 0 ]]; then
  echo ""
  echo " [CPU Worker 节点] $CPU_WORKER_COUNT 个"
fi

# Print the runtime parameters.
echo ""
print_info "运行参数:"
echo " - Ray 地址: $RAY_ADDRESS"
echo " - 临时目录: $TEMP_DIR"
echo " - 日志文件: $LOG_FILE"
echo ""
| 227 | + |
# Make sure the Ray temp directory exists, creating it (and any missing
# parents) when it does not.
[[ -d "$TEMP_DIR" ]] || {
  print_info "创建临时目录: $TEMP_DIR"
  mkdir -p "$TEMP_DIR"
}
| 233 | + |
| 234 | +# ============================================ |
| 235 | +# 停止已有 Ray 集群和 Prometheus |
| 236 | +# ============================================ |
print_header "清理环境"

# Both stops are best-effort: nothing may be running yet, so failures and
# their error output are deliberately ignored.
print_info "停止已有的 Ray 集群..."
ray stop --force 2>/dev/null || true

print_info "停止已有的 Prometheus..."
# -x matches the process name exactly; the plain pattern is a substring
# match and would also kill unrelated processes whose name merely contains
# "prometheus".
pkill -x prometheus 2>/dev/null || true
| 243 | + |
| 244 | +# ============================================ |
| 245 | +# 启动 Ray Head 节点 |
| 246 | +# ============================================ |
print_header "启动 Ray 集群"

# Resolve the head node's GPU devices; an empty device list means the head
# runs CPU-only with no visible GPUs.
parse_node_config "$HEAD_NODE"
head_gpu_total=0
if [[ -n "$_gpu_devices" ]]; then
  head_gpu_total="$_gpu_count"
  print_info "启动 Head 节点 (GPU: $_gpu_devices)..."
else
  print_info "启动 Head 节点 (CPU only)..."
fi

head_args=(
  --head
  --port="$RAY_PORT"
  --num-gpus="$head_gpu_total"
  --disable-usage-stats
  --include-dashboard=true
  --temp-dir="$TEMP_DIR"
)
CUDA_VISIBLE_DEVICES="$_gpu_devices" ray start "${head_args[@]}"
print_success "Head 节点启动成功!"
| 268 | + |
| 269 | +# ============================================ |
| 270 | +# 启动 GPU Worker 节点 |
| 271 | +# ============================================ |
# Start one Ray worker per GPU worker entry, each restricted to its own
# devices through CUDA_VISIBLE_DEVICES.
gpu_worker_no=0
for gpu_worker_spec in "${GPU_WORKERS[@]}"; do
  gpu_worker_no=$((gpu_worker_no + 1))
  parse_node_config "$gpu_worker_spec"
  print_info "启动 GPU Worker ${gpu_worker_no} (GPU: $_gpu_devices)..."
  CUDA_VISIBLE_DEVICES="$_gpu_devices" ray start \
    --address="$RAY_ADDRESS" \
    --num-gpus="$_gpu_count"
  print_success "GPU Worker ${gpu_worker_no} 启动成功!"
done
| 280 | + |
| 281 | +# ============================================ |
| 282 | +# 启动 CPU Worker 节点 |
| 283 | +# ============================================ |
# Start the requested number of CPU-only Ray workers (no visible GPUs).
if [ "$CPU_WORKER_COUNT" -gt 0 ]; then
  print_info "启动 $CPU_WORKER_COUNT 个 CPU Worker..."
  i=1
  while ((i <= CPU_WORKER_COUNT)); do
    CUDA_VISIBLE_DEVICES="" ray start --address="$RAY_ADDRESS" --num-gpus=0
    i=$((i + 1))
  done
  print_success "CPU Worker 启动成功!"
fi
| 293 | + |
| 294 | +# ============================================ |
| 295 | +# 显示集群状态 |
| 296 | +# ============================================ |
# Show the cluster status. Best-effort: a failing `ray status` must not
# abort the script, and its error output is discarded.
printf '\n'
print_info "集群状态:"
if ! ray status 2>/dev/null; then
  true
fi
| 300 | + |
| 301 | +# ============================================ |
| 302 | +# 启动 Prometheus 监控(可选) |
| 303 | +# ============================================ |
print_header "启动监控(可选)"

# Monitoring is optional: it starts only when the Prometheus binary is
# present and executable and the config file exists under the Ray temp dir.
PROMETHEUS_PID=""
if [[ -f "$PROMETHEUS_BIN" && -x "$PROMETHEUS_BIN" ]]; then
  print_info "检测到 Prometheus,正在启动监控服务..."

  # Poll (up to ~10 s) for the config file instead of a single fixed sleep,
  # so a slow start does not skip monitoring and a fast one does not wait.
  prometheus_wait=0
  while [[ ! -f "$PROMETHEUS_CONFIG" && "$prometheus_wait" -lt 10 ]]; do
    sleep 1
    prometheus_wait=$((prometheus_wait + 1))
  done

  if [[ -f "$PROMETHEUS_CONFIG" ]]; then
    nohup "$PROMETHEUS_BIN" --config.file="$PROMETHEUS_CONFIG" > prometheus.log 2>&1 &
    PROMETHEUS_PID=$!
    print_success "Prometheus 监控已启动 (PID: $PROMETHEUS_PID)"
    echo " - 监控日志: prometheus.log"
    echo " - 配置文件: $PROMETHEUS_CONFIG"
  else
    print_warning "Prometheus 配置文件不存在,跳过监控启动"
    echo " - 预期路径: $PROMETHEUS_CONFIG"
  fi
elif [[ -f "$PROMETHEUS_BIN" ]]; then
  # Present but not executable: launching it would fail in the background
  # while the script still reported a successful start.
  print_warning "Prometheus 不可执行,跳过监控启动"
  echo " - 预期路径: $PROMETHEUS_BIN"
else
  print_warning "未检测到 Prometheus,跳过监控启动"
  echo " - 预期路径: $PROMETHEUS_BIN"
fi
| 327 | + |
| 328 | +# ============================================ |
| 329 | +# 启动 Twinkle 服务器 |
| 330 | +# ============================================ |
print_header "启动 Twinkle 服务器"

print_info "日志输出到: $LOG_FILE"
echo ""

# Resolve server.py next to this script so the launch does not depend on
# the caller's working directory.
script_dir="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"

# Create/truncate the log first so `tail` never races the background
# redirection or replays lines left over from a previous run.
: > "$LOG_FILE"

# Start the server in the background, sending all of its output to the log.
nohup python "${script_dir}/server.py" > "$LOG_FILE" 2>&1 &
SERVER_PID=$!

# Stream the log until the server exits (GNU tail --pid), then propagate the
# server's exit status instead of following the file forever.
tail --pid="$SERVER_PID" -f "$LOG_FILE"
wait "$SERVER_PID"
0 commit comments