Reproduction of DisCO #1

@Wloner0809

Hi authors, thank you for your excellent work! I encountered some issues while reproducing DisCO. Below is my script — my codebase is verl 0.5.0, and the implementation follows the DisCO PR in the verl repository.

set -xeuo pipefail

project_name='DEBUG'
exp_name='DisCO-Qwen3-1.7B-Base-DAPO_MATH-A40'

adv_estimator=disco

use_kl_in_reward=False
kl_coef=0.0
use_kl_loss=False
kl_loss_coef=0.0

max_prompt_length=$((1024 * 2))
max_response_length=$((1024 * 8))
enable_overlong_buffer=False
overlong_buffer_len=512
overlong_penalty_factor=1.0

policy_loss_mode="disco"
disco_score_func='logL'
delta=1e-4
beta=1e3
tau=10
loss_agg_mode="token-mean"

#* (ppo_micro_batch_size_per_gpu * nnodes * n_gpus_per_node) % rollout.n = 0
enable_filter_groups=False
train_prompt_bsz=128
train_prompt_mini_bsz=16
n_resp_per_prompt=8
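# Sanity check of the constraint above, using the values set further down in this script:
# train_micro_batch_size_per_gpu=2, NNODES=1, NUM_GPUS=8 -> 2 * 1 * 8 = 16,
# and 16 % n_resp_per_prompt (8) = 0, so the divisibility requirement holds.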

# Ray
RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:9312"}
WORKING_DIR=${WORKING_DIR:-"${PWD}"}
RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
NNODES=${NNODES:-1}
# Paths
RAY_DATA_HOME=xxx
MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/model/Qwen3-1.7B-Base"}
CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/verl-ckpts/${project_name}/${exp_name}"}
TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/verl-data/dapo-math-17k-common_prompt.parquet"}
TEST_FILE=${TEST_FILE:-"[${RAY_DATA_HOME}/data/verl-data/aime-2024.parquet,${RAY_DATA_HOME}/data/verl-data/aime-2025.parquet]"}

export TENSORBOARD_DIR="xxx/verl-tensorboard/${project_name}/${exp_name}"

# Algorithm
train_temperature=0.6
train_top_p=1.0
train_top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
val_temperature=0.6
val_top_p=0.95
val_top_k=-1 # 0 for HF rollout, -1 for vLLM rollout

# Mathematically equivalent settings (only affect throughput and memory)
use_dynamic_bsz=False
infer_micro_batch_size_per_gpu=2
train_micro_batch_size_per_gpu=2
offload=False

NUM_GPUS=8

export RAY_BACKEND_LOG_LEVEL=debug
export RAY_DISABLE_IMPORT_WARNING=1
export RAY_DISABLE_GPU_MONITOR=1
export RAY_DEBUG_POST_MORTEM=1

ray job submit --runtime-env="${RUNTIME_ENV}" \
    --working-dir "${WORKING_DIR}" \
    -- python3 -m recipe.disco.main_disco \
    data.train_files="${TRAIN_FILE}" \
    data.val_files="${TEST_FILE}" \
    data.prompt_key=prompt \
    data.reward_fn_key=data_source \
    data.max_prompt_length=${max_prompt_length} \
    data.max_response_length=${max_response_length} \
    data.train_batch_size=${train_prompt_bsz} \
    data.truncation='left' \
    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
    algorithm.adv_estimator=${adv_estimator} \
    algorithm.use_kl_in_reward=${use_kl_in_reward} \
    algorithm.kl_ctrl.kl_coef=${kl_coef} \
    algorithm.filter_groups.enable=${enable_filter_groups} \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=$((max_prompt_length + max_response_length)) \
    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=$((max_prompt_length + max_response_length)) \
    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=$((max_prompt_length + max_response_length)) \
    actor_rollout_ref.model.path="${MODEL_PATH}" \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    +actor_rollout_ref.ref.enable=False \
    actor_rollout_ref.actor.optim.lr=2e-6 \
    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=${train_micro_batch_size_per_gpu} \
    actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \
    actor_rollout_ref.actor.entropy_coeff=0 \
    actor_rollout_ref.actor.policy_loss.loss_mode=${policy_loss_mode} \
    actor_rollout_ref.actor.policy_loss.disco_score_func=${disco_score_func} \
    actor_rollout_ref.actor.policy_loss.delta=${delta} \
    actor_rollout_ref.actor.policy_loss.beta=${beta} \
    actor_rollout_ref.actor.policy_loss.tau=${tau} \
    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
    actor_rollout_ref.actor.ulysses_sequence_parallel_size=1 \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.85 \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=${infer_micro_batch_size_per_gpu} \
    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
    actor_rollout_ref.rollout.enable_chunked_prefill=True \
    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
    actor_rollout_ref.rollout.temperature=${train_temperature} \
    actor_rollout_ref.rollout.top_p=${train_top_p} \
    actor_rollout_ref.rollout.top_k="${train_top_k}" \
    actor_rollout_ref.rollout.val_kwargs.temperature=${val_temperature} \
    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
    actor_rollout_ref.rollout.val_kwargs.top_k=${val_top_k} \
    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
    actor_rollout_ref.rollout.val_kwargs.n=1 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=${infer_micro_batch_size_per_gpu} \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    actor_rollout_ref.ref.ulysses_sequence_parallel_size=1 \
    actor_rollout_ref.actor.fsdp_config.fsdp_size=-1 \
    reward_model.reward_manager=dapo \
    reward_model.overlong_buffer.enable=${enable_overlong_buffer} \
    reward_model.overlong_buffer.len=${overlong_buffer_len} \
    reward_model.overlong_buffer.penalty_factor=${overlong_penalty_factor} \
    trainer.logger='["console","tensorboard"]' \
    trainer.validation_data_dir="${TENSORBOARD_DIR}" \
    trainer.n_gpus_per_node=${NUM_GPUS} \
    trainer.nnodes="${NNODES}" \
    trainer.balance_batch=False \
    trainer.val_before_train=True \
    trainer.test_freq=5 \
    trainer.save_freq=10 \
    trainer.total_epochs=10 \
    trainer.default_local_dir="${CKPTS_DIR}" \
    trainer.resume_mode=auto

I obtained the following TensorBoard visualization:

[Image: TensorBoard visualization of the training metrics]

Is there an issue with my experimental setup? Or do I need to adjust $\tau$ / $\delta$ / $\beta$ because I changed the model and dataset?
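
For context, my rough (and possibly wrong) reading of the objective is that DisCO maximizes the score gap between correct and incorrect responses under a KL constraint, roughly

$$\max_{\theta}\;\mathbb{E}_q\!\left[\frac{1}{|S_q^+|}\sum_{o\in S_q^+} s_\theta(o,q)\;-\;\frac{1}{|S_q^-|}\sum_{o\in S_q^-} s_\theta(o,q)\right]\quad\text{s.t.}\quad \mathrm{KL}\!\left(\pi_{\theta_{\text{old}}}\,\|\,\pi_\theta\right)\le\delta,$$

with the constraint enforced through a penalty term weighted by $\beta$, and $s_\theta$ the log-likelihood score since I set disco_score_func='logL'. If that reading is correct, I would guess $\delta$ (the KL budget) and $\beta$ (how strictly it is enforced) are the first knobs to retune for a different model and dataset, but please correct me if I am misreading the objective or the role of $\tau$.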
