Skip to content

Commit c713631

Browse files
fix short math grpo cookbook (#149)
* update short math grpo
* update run.sh
* update run.sh
* update run.sh
* update
* Update src/twinkle/template/base.py

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>

---------

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
1 parent 2431d25 commit c713631

File tree

11 files changed

+218
-199
lines changed

11 files changed

+218
-199
lines changed

cookbook/client/server/megatron/run.sh

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
# --gpu-workers LIST GPU Worker 列表,分号分隔多个节点 (默认: 4,5,6,7:4)
1313
# --cpu-workers N CPU Worker 数量 (默认: 1)
1414
# --temp-dir DIR Ray 临时目录 (默认: /dashscope/caches/application/ray_logs)
15+
# --save-dir DIR Twinkle 模型保存目录 (默认: /dashscope/caches/application/save)
16+
# --server-config FILE Twinkle 服务器配置文件路径 (默认: /twinkle/cookbook/client/server/megatron/server_config.yaml)
1517
# --help 显示帮助信息
1618
#
1719
# 示例:
@@ -49,6 +51,8 @@ RAY_ADDRESS="127.0.0.1:$RAY_PORT"
4951
# --- 路径配置 ---
5052
DEFAULT_TEMP_DIR="/dashscope/caches/application/ray_logs"
5153
LOG_FILE="run.log"
54+
DEFAULT_SAVE_DIR="/dashscope/caches/application/save"
55+
DEFAULT_SERVER_CONFIG_FILE="/twinkle/cookbook/client/server/megatron/server_config.yaml"
5256

5357
# --- Prometheus 监控配置 ---
5458
PROMETHEUS_BIN="/dashscope/caches/application/monitor/prometheus-3.10.0.linux-amd64/prometheus"
@@ -67,6 +71,8 @@ HEAD_NODE="0,1,2,3"
6771
GPU_WORKERS_INPUT="4,5,6,7"
6872
CPU_WORKER_COUNT="1"
6973
TEMP_DIR="$DEFAULT_TEMP_DIR"
74+
SAVE_DIR="$DEFAULT_SAVE_DIR"
75+
SERVER_CONFIG_FILE="$DEFAULT_SERVER_CONFIG_FILE"
7076

7177
# 解析命名参数
7278
while [[ $# -gt 0 ]]; do
@@ -103,6 +109,22 @@ while [[ $# -gt 0 ]]; do
103109
TEMP_DIR="${1#*=}"
104110
shift
105111
;;
112+
--save-dir)
113+
SAVE_DIR="$2"
114+
shift 2
115+
;;
116+
--save-dir=*)
117+
SAVE_DIR="${1#*=}"
118+
shift
119+
;;
120+
--server-config)
121+
SERVER_CONFIG_FILE="$2"
122+
shift 2
123+
;;
124+
--server-config=*)
125+
SERVER_CONFIG_FILE="${1#*=}"
126+
shift
127+
;;
106128
--help|-h)
107129
echo "用法: ./run.sh [选项]"
108130
echo ""
@@ -111,6 +133,8 @@ while [[ $# -gt 0 ]]; do
111133
echo " --gpu-workers LIST GPU Worker 列表,分号分隔多个节点 (默认: 4,5,6,7)"
112134
echo " --cpu-workers N CPU Worker 数量 (默认: 1)"
113135
echo " --temp-dir DIR Ray 临时目录"
136+
echo " --save-dir DIR Twinkle 模型保存目录 (默认: $DEFAULT_SAVE_DIR)"
137+
echo " --server-config FILE Twinkle 服务器配置文件路径 (默认: $DEFAULT_SERVER_CONFIG_FILE)"
114138
echo " --help, -h 显示帮助信息"
115139
echo ""
116140
echo "示例:"
@@ -129,6 +153,9 @@ while [[ $# -gt 0 ]]; do
129153
esac
130154
done
131155

156+
# 将 SAVE_DIR export 给子进程(python server 通过环境变量读取)
157+
export TWINKLE_DEFAULT_SAVE_DIR="$SAVE_DIR"
158+
132159
# 将分号分隔的字符串转为数组
133160
if [ -z "$GPU_WORKERS_INPUT" ]; then
134161
GPU_WORKERS=()
@@ -222,6 +249,8 @@ echo ""
222249
print_info "运行参数:"
223250
echo " - Ray 地址: $RAY_ADDRESS"
224251
echo " - 临时目录: $TEMP_DIR"
252+
echo " - 保存目录: $TWINKLE_DEFAULT_SAVE_DIR"
253+
echo " - 服务配置: $SERVER_CONFIG_FILE"
225254
echo " - 日志文件: $LOG_FILE"
226255
echo ""
227256

@@ -235,6 +264,28 @@ fi
235264
# 停止已有 Ray 集群和 Prometheus
236265
# ============================================
237266
print_header "清理环境"
267+
268+
# 停止 Twinkle server.py(twinkle.server 模块)
269+
print_info "停止已有的 Twinkle Server..."
270+
pkill -f "twinkle.server" 2>/dev/null || true
271+
272+
# 停止 vLLM 进程
273+
print_info "停止已有的 vLLM 进程..."
274+
pkill -f "vllm" 2>/dev/null || true
275+
276+
# 等待上述进程退出
277+
sleep 2
278+
279+
# 若仍有残留则强制 SIGKILL
280+
if pgrep -f "twinkle.server" > /dev/null 2>&1; then
281+
print_warning "Twinkle Server 未退出,强制终止..."
282+
pkill -9 -f "twinkle.server" 2>/dev/null || true
283+
fi
284+
if pgrep -f "vllm" > /dev/null 2>&1; then
285+
print_warning "vLLM 进程未退出,强制终止..."
286+
pkill -9 -f "vllm" 2>/dev/null || true
287+
fi
288+
238289
print_info "停止已有的 Ray 集群..."
239290
ray stop --force 2>/dev/null || true
240291

@@ -334,8 +385,10 @@ print_info "日志输出到: $LOG_FILE"
334385
echo ""
335386

336387
# 启动服务器并实时显示日志
337-
nohup python server.py > "$LOG_FILE" 2>&1 &
388+
touch "$LOG_FILE" # 预创建文件,避免 tail -f 在文件尚未写入时报错
389+
nohup python -m twinkle.server --config "$SERVER_CONFIG_FILE" > "$LOG_FILE" 2>&1 &
338390
SERVER_PID=$!
391+
print_success "Twinkle Server 已启动 (PID: $SERVER_PID)"
339392

340393
# 实时显示日志
341394
tail -f "$LOG_FILE"

cookbook/client/server/megatron/server.py

Lines changed: 0 additions & 21 deletions
This file was deleted.

cookbook/client/server/megatron/server_config.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ applications:
5050
enable_lora: true # Allow loading LoRA adapters during inference
5151
max_loras: 5 # Max allowed loras working on vLLM at the same time
5252
max_lora_rank: 32 # Support up to rank 64 LoRA adapters
53+
enable_tower_connector_lora: true
5354
device_group: # Logical device group for the sampler
5455
name: sampler
5556
gpus_per_worker: 2

cookbook/client/server/megatron/server_config_4b.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,13 +76,14 @@ applications:
7676
import_path: sampler
7777
args:
7878
model_id: "ms://Qwen/Qwen3.5-4B" # ModelScope model identifier
79-
nproc_per_node: 2 # Number of GPU processes per node
79+
nproc_per_node: 1 # Number of GPU processes per node
8080
sampler_type: vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
8181
engine_args: # vLLM engine-specific settings
8282
max_model_len: 16000 # Maximum sequence length the engine supports
8383
gpu_memory_utilization: 0.7 # Fraction of GPU memory to use (0.0-1.0)
8484
enable_lora: true # Allow loading LoRA adapters during inference
8585
logprobs_mode: processed_logprobs # Logprobs mode for sampling results
86+
enable_tower_connector_lora: true
8687
device_group: # Logical device group for the sampler
8788
name: sampler
8889
ranks: 1 # Number of GPUs to use

0 commit comments

Comments (0)