
Urgent: training encountered an error, "batch size must be positive" #166

@WanFan11

Description

This is my training error log. The underlying error is raised by flash-attention:

Traceback (most recent call last):
File "/record/workspace/Search-R1/verl/trainer/main_ppo.py", line 127, in main
ray.get(main_task.remote(config))
File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 22, in auto_init_wrapper
return fn(*args, **kwargs)
File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
return func(*args, **kwargs)
File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/ray/_private/worker.py", line 2624, in get
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(RuntimeError): ray::main_task() (pid=62145, ip=10.223.3.13)
File "/record/workspace/Search-R1/verl/trainer/main_ppo.py", line 215, in main_task
trainer.fit()
File "/record/workspace/Search-R1/verl/trainer/ppo/ray_trainer.py", line 825, in fit
final_gen_batch_output = generation_manager.run_llm_loop(
File "/record/workspace/Search-R1/search_r1/llm_agent/generation.py", line 282, in run_llm_loop
gen_output = self._generate_with_gpu_padding(rollings_active)
File "/record/workspace/Search-R1/search_r1/llm_agent/generation.py", line 227, in _generate_with_gpu_padding
padded_output = self.actor_rollout_wg.generate_sequences(padded_active_batch)
File "/record/workspace/Search-R1/verl/single_controller/ray/base.py", line 42, in func
output = ray.get(output)
ray.exceptions.RayTaskError(RuntimeError): ray::WorkerDict.actor_rollout_generate_sequences() (pid=64041, ip=10.223.3.13, actor_id=2c150c1cdd8b662ecc1517dd01000000, repr=<verl.single_controller.ray.base.WorkerDict object at 0x7f8eea144940>)
File "/record/workspace/Search-R1/verl/single_controller/ray/base.py", line 399, in func
return getattr(self.worker_dict[key], name)(*args, **kwargs)
File "/record/workspace/Search-R1/verl/single_controller/base/decorator.py", line 404, in inner
return func(*args, **kwargs)
File "/record/workspace/Search-R1/verl/workers/fsdp_workers.py", line 465, in generate_sequences
old_log_probs = self.actor.compute_log_prob(data=output)
File "/record/workspace/Search-R1/verl/workers/actor/dp_actor.py", line 191, in compute_log_prob
_, log_probs = self._forward_micro_batch(micro_batch, temperature=temperature)
File "/record/workspace/Search-R1/verl/workers/actor/dp_actor.py", line 94, in _forward_micro_batch
output = self.actor_module(input_ids=input_ids_rmpad,
File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
return forward_call(*args, **kwargs)
File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 863, in forward
output = self._fsdp_wrapped_module(*args, **kwargs)
File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
return forward_call(*args, **kwargs)
File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 1165, in forward
outputs = self.model(
File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
return forward_call(*args, **kwargs)
File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 895, in forward
layer_outputs = decoder_layer(
File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
return forward_call(*args, **kwargs)
File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 443, in forward
attn_output = _flash_attention_forward(
File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/transformers/modeling_flash_attention_utils.py", line 346, in _flash_attention_forward
attn_output = flash_attn_varlen_func(
File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/flash_attn/flash_attn_interface.py", line 1443, in flash_attn_varlen_func
return FlashAttnVarlenFunc.apply(
File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/torch/autograd/function.py", line 574, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/flash_attn/flash_attn_interface.py", line 925, in forward
out_padded, softmax_lse, S_dmask, rng_state = _wrapped_flash_attn_varlen_forward(
File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/torch/ops.py", line 1061, in __call__
return self._op(*args, **(kwargs or {}))
File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/torch/_library/autograd.py", line 98, in autograd_impl
result = Generated.apply(*args, Metadata(keyset, keyword_only_args)) # type: ignore[attr-defined]
File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/torch/autograd/function.py", line 574, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/torch/_library/autograd.py", line 40, in forward
result = op.redispatch(keyset & _C._after_autograd_keyset, *args, **kwargs)
File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/torch/ops.py", line 672, in redispatch
return self
._handle.redispatch_boxed(keyset, *args, **kwargs)
File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/torch/_library/custom_ops.py", line 236, in backend_impl
result = self._backend_fns[device_type](*args, **kwargs)
File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/flash_attn/flash_attn_interface.py", line 165, in _flash_attn_varlen_forward
out, softmax_lse, S_dmask, rng_state = flash_attn_gpu.varlen_fwd(
RuntimeError: batch size must be positive
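For context on where the message comes from (my own reading of the stack, not a confirmed diagnosis): flash_attn_varlen_func derives the batch size from the cumulative sequence-length tensor, so if an empty micro-batch reaches _forward_micro_batch (for example, if rollings_active ends up with zero rows once finished sequences are filtered out), the kernel sees a batch size of 0 and raises exactly this error. A minimal sketch of that arithmetic, using only plain PyTorch:

```python
import torch

# Hedged sketch: flash-attention's varlen path derives the batch size from the
# cumulative sequence-length tensor as cu_seqlens_q.numel() - 1. An empty
# micro-batch therefore yields cu_seqlens_q == [0], an inferred batch size of 0,
# and flash_attn_gpu.varlen_fwd rejects that with "batch size must be positive".
cu_seqlens_q = torch.tensor([0], dtype=torch.int32)  # no sequences in the micro-batch
inferred_batch_size = cu_seqlens_q.numel() - 1       # -> 0

if inferred_batch_size <= 0:
    # Purely illustrative: this is the condition the CUDA kernel checks.
    # Guarding against empty active batches earlier in the rollout loop would
    # be one way to avoid reaching the kernel at all.
    print("empty micro-batch: flash-attn would raise 'batch size must be positive'")
```

If that is what is happening here, the empty batch would originate upstream in run_llm_loop / _generate_with_gpu_padding rather than in flash-attention itself.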
