Description
Here is my training error log; the underlying error is raised from flash-attention:
```
Traceback (most recent call last):
  File "/record/workspace/Search-R1/verl/trainer/main_ppo.py", line 127, in main
    ray.get(main_task.remote(config))
  File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 22, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/ray/_private/worker.py", line 2624, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(RuntimeError): ray::main_task() (pid=62145, ip=10.223.3.13)
  File "/record/workspace/Search-R1/verl/trainer/main_ppo.py", line 215, in main_task
    trainer.fit()
  File "/record/workspace/Search-R1/verl/trainer/ppo/ray_trainer.py", line 825, in fit
    final_gen_batch_output = generation_manager.run_llm_loop(
  File "/record/workspace/Search-R1/search_r1/llm_agent/generation.py", line 282, in run_llm_loop
    gen_output = self._generate_with_gpu_padding(rollings_active)
  File "/record/workspace/Search-R1/search_r1/llm_agent/generation.py", line 227, in _generate_with_gpu_padding
    padded_output = self.actor_rollout_wg.generate_sequences(padded_active_batch)
  File "/record/workspace/Search-R1/verl/single_controller/ray/base.py", line 42, in func
    output = ray.get(output)
ray.exceptions.RayTaskError(RuntimeError): ray::WorkerDict.actor_rollout_generate_sequences() (pid=64041, ip=10.223.3.13, actor_id=2c150c1cdd8b662ecc1517dd01000000, repr=<verl.single_controller.ray.base.WorkerDict object at 0x7f8eea144940>)
  File "/record/workspace/Search-R1/verl/single_controller/ray/base.py", line 399, in func
    return getattr(self.worker_dict[key], name)(*args, **kwargs)
  File "/record/workspace/Search-R1/verl/single_controller/base/decorator.py", line 404, in inner
    return func(*args, **kwargs)
  File "/record/workspace/Search-R1/verl/workers/fsdp_workers.py", line 465, in generate_sequences
    old_log_probs = self.actor.compute_log_prob(data=output)
  File "/record/workspace/Search-R1/verl/workers/actor/dp_actor.py", line 191, in compute_log_prob
    _, log_probs = self._forward_micro_batch(micro_batch, temperature=temperature)
  File "/record/workspace/Search-R1/verl/workers/actor/dp_actor.py", line 94, in _forward_micro_batch
    output = self.actor_module(input_ids=input_ids_rmpad,
  File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 863, in forward
    output = self._fsdp_wrapped_module(*args, **kwargs)
  File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 1165, in forward
    outputs = self.model(
  File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 895, in forward
    layer_outputs = decoder_layer(
  File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 443, in forward
    attn_output = _flash_attention_forward(
  File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/transformers/modeling_flash_attention_utils.py", line 346, in _flash_attention_forward
    attn_output = flash_attn_varlen_func(
  File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/flash_attn/flash_attn_interface.py", line 1443, in flash_attn_varlen_func
    return FlashAttnVarlenFunc.apply(
  File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/torch/autograd/function.py", line 574, in apply
    return super().apply(*args, **kwargs)  # type: ignore[misc]
  File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/flash_attn/flash_attn_interface.py", line 925, in forward
    out_padded, softmax_lse, S_dmask, rng_state = _wrapped_flash_attn_varlen_forward(
  File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/torch/ops.py", line 1061, in __call__
    return self._op(*args, **(kwargs or {}))
  File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/torch/_library/autograd.py", line 98, in autograd_impl
    result = Generated.apply(*args, Metadata(keyset, keyword_only_args))  # type: ignore[attr-defined]
  File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/torch/autograd/function.py", line 574, in apply
    return super().apply(*args, **kwargs)  # type: ignore[misc]
  File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/torch/_library/autograd.py", line 40, in forward
    result = op.redispatch(keyset & _C._after_autograd_keyset, *args, **kwargs)
  File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/torch/ops.py", line 672, in redispatch
    return self._handle.redispatch_boxed(keyset, *args, **kwargs)
  File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/torch/_library/custom_ops.py", line 236, in backend_impl
    result = self._backend_fns[device_type](*args, **kwargs)
  File "/home/research/miniconda3/envs/r1search_test/lib/python3.9/site-packages/flash_attn/flash_attn_interface.py", line 165, in _flash_attn_varlen_forward
    out, softmax_lse, S_dmask, rng_state = flash_attn_gpu.varlen_fwd(
RuntimeError: batch size must be positive
```
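
For context: the final `RuntimeError` comes from flash-attn's varlen kernel, which derives its batch size from the `cu_seqlens` offsets (`batch_size = cu_seqlens.numel() - 1`), so it typically means a micro-batch with zero sequences reached the forward pass, e.g. a data-parallel rank left with no active rollouts. Below is a minimal, CPU-only sketch of that invariant; the helper names (`varlen_batch_size`, `assert_nonempty`) are hypothetical, and the `cu_seqlens` construction paraphrases the un-padding logic in transformers, which may differ across versions:

```python
# Sketch of why flash-attn reports "batch size must be positive":
# the varlen kernel checks that cu_seqlens implies at least one sequence.
import torch
import torch.nn.functional as F

def varlen_batch_size(attention_mask: torch.Tensor) -> int:
    # Per-sequence token counts, then cumulative offsets with a leading 0,
    # mirroring the cu_seqlens tensor handed to flash_attn_varlen_func.
    seqlens = attention_mask.sum(dim=-1, dtype=torch.int32)
    cu_seqlens = F.pad(torch.cumsum(seqlens, dim=0, dtype=torch.int32), (1, 0))
    return cu_seqlens.numel() - 1  # the kernel requires this to be > 0

empty = torch.zeros(0, 8, dtype=torch.long)  # zero-row micro-batch
print(varlen_batch_size(empty))              # 0 -> kernel would raise

def assert_nonempty(input_ids: torch.Tensor) -> None:
    # Hypothetical guard one could place before the actor forward pass to
    # fail with a readable message instead of the opaque kernel error.
    if input_ids.size(0) == 0:
        raise ValueError("empty micro-batch reached the model forward")
```

If this diagnosis is right, checking the per-rank batch size before dispatching `generate_sequences` / `compute_log_prob` would surface the failure at the Search-R1 level rather than deep inside the kernel.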