-
Notifications
You must be signed in to change notification settings - Fork 14
Why do I keep getting a tensor sequence-length mismatch error in attention when testing the original LLaDA? #9
Description

All four ranks fail inside F.scaled_dot_product_attention with a query/key sequence-length mismatch at dimension 3, and in every case the difference is exactly 256 tokens (1268 vs 1012, 1281 vs 1025, 1322 vs 1066, 1254 vs 998) — presumably the gen_length, suggesting the attention_mask covers only the prompt and is not extended to the masked generation tokens appended by generate(); to be confirmed. Full tracebacks below:
Running generate_until requests: 0%| | 0/330 [00:00<?, ?it/s][rank1]: Traceback (most recent call last):
[rank1]: File "/data/14thdd/users/xiaoliu/code/dLLM-cache/evaluation_script.py", line 34, in <module>
[rank1]: cli_evaluate()
[rank1]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/lm_eval/__main__.py", line 10, in cli_evaluate
[rank1]: parser.execute(args)
[rank1]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/lm_eval/_cli/harness.py", line 60, in execute
[rank1]: args.func(args)
[rank1]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/lm_eval/_cli/run.py", line 379, in _execute
[rank1]: results = simple_evaluate(
[rank1]: ^^^^^^^^^^^^^^^^
[rank1]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/lm_eval/utils.py", line 498, in _wrapper
[rank1]: return fn(*args, **kwargs)
[rank1]: ^^^^^^^^^^^^^^^^^^^
[rank1]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/lm_eval/evaluator.py", line 366, in simple_evaluate
[rank1]: results = evaluate(
[rank1]: ^^^^^^^^^
[rank1]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/lm_eval/utils.py", line 498, in _wrapper
[rank1]: return fn(*args, **kwargs)
[rank1]: ^^^^^^^^^^^^^^^^^^^
[rank1]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/lm_eval/evaluator.py", line 595, in evaluate
[rank1]: resps = getattr(lm, reqtype)(cloned_reqs)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/data/14thdd/users/xiaoliu/code/dLLM-cache/eval_model/LLaDA.py", line 795, in generate_until
[rank1]: out = generate(
[rank1]: ^^^^^^^^^
[rank1]: File "/data/14thdd/users/xiaoliu/code/dLLM-cache/utils/generate_function.py", line 115, in generate
[rank1]: logits = model(x, attention_mask=attention_mask).logits[
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
[rank1]: return self._call_impl(*args, **kwargs)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
[rank1]: return forward_call(*args, **kwargs)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/home/xiaoliu/.cache/huggingface/modules/transformers_modules/GSAI_hyphen_ML/LLaDA_hyphen_8B_hyphen_Instruct/08b83a6feb34df1a6011b80c3c00c7563e963b07/modeling_llada.py", line 1431, in forward
[rank1]: outputs = self.model.forward(
[rank1]: ^^^^^^^^^^^^^^^^^^^
[rank1]: File "/home/xiaoliu/.cache/huggingface/modules/transformers_modules/GSAI_hyphen_ML/LLaDA_hyphen_8B_hyphen_Instruct/08b83a6feb34df1a6011b80c3c00c7563e963b07/modeling_llada.py", line 1328, in forward
[rank1]: x, cache = block(x, attention_bias=attention_bias, layer_past=layer_past, use_cache=use_cache)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
[rank1]: return self._call_impl(*args, **kwargs)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
[rank1]: return forward_call(*args, **kwargs)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/home/xiaoliu/.cache/huggingface/modules/transformers_modules/GSAI_hyphen_ML/LLaDA_hyphen_8B_hyphen_Instruct/08b83a6feb34df1a6011b80c3c00c7563e963b07/modeling_llada.py", line 911, in forward
[rank1]: att, cache = self.attention(q, k, v, attention_bias, layer_past=layer_past, use_cache=use_cache)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/home/xiaoliu/.cache/huggingface/modules/transformers_modules/GSAI_hyphen_ML/LLaDA_hyphen_8B_hyphen_Instruct/08b83a6feb34df1a6011b80c3c00c7563e963b07/modeling_llada.py", line 711, in attention
[rank1]: att = self._scaled_dot_product_attention(
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/home/xiaoliu/.cache/huggingface/modules/transformers_modules/GSAI_hyphen_ML/LLaDA_hyphen_8B_hyphen_Instruct/08b83a6feb34df1a6011b80c3c00c7563e963b07/modeling_llada.py", line 653, in _scaled_dot_product_attention
[rank1]: return F.scaled_dot_product_attention(
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: RuntimeError: The size of tensor a (1268) must match the size of tensor b (1012) at non-singleton dimension 3
[rank3]: Traceback (most recent call last):
[rank3]: File "/data/14thdd/users/xiaoliu/code/dLLM-cache/evaluation_script.py", line 34, in <module>
[rank3]: cli_evaluate()
[rank3]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/lm_eval/__main__.py", line 10, in cli_evaluate
[rank3]: parser.execute(args)
[rank3]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/lm_eval/_cli/harness.py", line 60, in execute
[rank3]: args.func(args)
[rank3]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/lm_eval/_cli/run.py", line 379, in _execute
[rank3]: results = simple_evaluate(
[rank3]: ^^^^^^^^^^^^^^^^
[rank3]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/lm_eval/utils.py", line 498, in _wrapper
[rank3]: return fn(*args, **kwargs)
[rank3]: ^^^^^^^^^^^^^^^^^^^
[rank3]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/lm_eval/evaluator.py", line 366, in simple_evaluate
[rank3]: results = evaluate(
[rank3]: ^^^^^^^^^
[rank3]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/lm_eval/utils.py", line 498, in _wrapper
[rank3]: return fn(*args, **kwargs)
[rank3]: ^^^^^^^^^^^^^^^^^^^
[rank3]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/lm_eval/evaluator.py", line 595, in evaluate
[rank3]: resps = getattr(lm, reqtype)(cloned_reqs)
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]: File "/data/14thdd/users/xiaoliu/code/dLLM-cache/eval_model/LLaDA.py", line 795, in generate_until
[rank3]: out = generate(
[rank3]: ^^^^^^^^^
[rank3]: File "/data/14thdd/users/xiaoliu/code/dLLM-cache/utils/generate_function.py", line 115, in generate
[rank3]: logits = model(x, attention_mask=attention_mask).logits[
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
[rank3]: return self._call_impl(*args, **kwargs)
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
[rank3]: return forward_call(*args, **kwargs)
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]: File "/home/xiaoliu/.cache/huggingface/modules/transformers_modules/GSAI_hyphen_ML/LLaDA_hyphen_8B_hyphen_Instruct/08b83a6feb34df1a6011b80c3c00c7563e963b07/modeling_llada.py", line 1431, in forward
[rank3]: outputs = self.model.forward(
[rank3]: ^^^^^^^^^^^^^^^^^^^
[rank3]: File "/home/xiaoliu/.cache/huggingface/modules/transformers_modules/GSAI_hyphen_ML/LLaDA_hyphen_8B_hyphen_Instruct/08b83a6feb34df1a6011b80c3c00c7563e963b07/modeling_llada.py", line 1328, in forward
[rank3]: x, cache = block(x, attention_bias=attention_bias, layer_past=layer_past, use_cache=use_cache)
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
[rank3]: return self._call_impl(*args, **kwargs)
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
[rank3]: return forward_call(*args, **kwargs)
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]: File "/home/xiaoliu/.cache/huggingface/modules/transformers_modules/GSAI_hyphen_ML/LLaDA_hyphen_8B_hyphen_Instruct/08b83a6feb34df1a6011b80c3c00c7563e963b07/modeling_llada.py", line 911, in forward
[rank3]: att, cache = self.attention(q, k, v, attention_bias, layer_past=layer_past, use_cache=use_cache)
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]: File "/home/xiaoliu/.cache/huggingface/modules/transformers_modules/GSAI_hyphen_ML/LLaDA_hyphen_8B_hyphen_Instruct/08b83a6feb34df1a6011b80c3c00c7563e963b07/modeling_llada.py", line 711, in attention
[rank3]: att = self._scaled_dot_product_attention(
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]: File "/home/xiaoliu/.cache/huggingface/modules/transformers_modules/GSAI_hyphen_ML/LLaDA_hyphen_8B_hyphen_Instruct/08b83a6feb34df1a6011b80c3c00c7563e963b07/modeling_llada.py", line 653, in _scaled_dot_product_attention
[rank3]: return F.scaled_dot_product_attention(
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]: RuntimeError: The size of tensor a (1281) must match the size of tensor b (1025) at non-singleton dimension 3
[rank0]: Traceback (most recent call last):
[rank0]: File "/data/14thdd/users/xiaoliu/code/dLLM-cache/evaluation_script.py", line 34, in <module>
[rank0]: cli_evaluate()
[rank0]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/lm_eval/__main__.py", line 10, in cli_evaluate
[rank0]: parser.execute(args)
[rank0]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/lm_eval/_cli/harness.py", line 60, in execute
[rank0]: args.func(args)
[rank0]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/lm_eval/_cli/run.py", line 379, in _execute
[rank0]: results = simple_evaluate(
[rank0]: ^^^^^^^^^^^^^^^^
[rank0]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/lm_eval/utils.py", line 498, in _wrapper
[rank0]: return fn(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/lm_eval/evaluator.py", line 366, in simple_evaluate
[rank0]: results = evaluate(
[rank0]: ^^^^^^^^^
[rank0]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/lm_eval/utils.py", line 498, in _wrapper
[rank0]: return fn(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/lm_eval/evaluator.py", line 595, in evaluate
[rank0]: resps = getattr(lm, reqtype)(cloned_reqs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/14thdd/users/xiaoliu/code/dLLM-cache/eval_model/LLaDA.py", line 795, in generate_until
[rank0]: out = generate(
[rank0]: ^^^^^^^^^
[rank0]: File "/data/14thdd/users/xiaoliu/code/dLLM-cache/utils/generate_function.py", line 115, in generate
[rank0]: logits = model(x, attention_mask=attention_mask).logits[
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/xiaoliu/.cache/huggingface/modules/transformers_modules/GSAI_hyphen_ML/LLaDA_hyphen_8B_hyphen_Instruct/08b83a6feb34df1a6011b80c3c00c7563e963b07/modeling_llada.py", line 1431, in forward
[rank0]: outputs = self.model.forward(
[rank0]: ^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/xiaoliu/.cache/huggingface/modules/transformers_modules/GSAI_hyphen_ML/LLaDA_hyphen_8B_hyphen_Instruct/08b83a6feb34df1a6011b80c3c00c7563e963b07/modeling_llada.py", line 1328, in forward
[rank0]: x, cache = block(x, attention_bias=attention_bias, layer_past=layer_past, use_cache=use_cache)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/xiaoliu/.cache/huggingface/modules/transformers_modules/GSAI_hyphen_ML/LLaDA_hyphen_8B_hyphen_Instruct/08b83a6feb34df1a6011b80c3c00c7563e963b07/modeling_llada.py", line 911, in forward
[rank0]: att, cache = self.attention(q, k, v, attention_bias, layer_past=layer_past, use_cache=use_cache)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/xiaoliu/.cache/huggingface/modules/transformers_modules/GSAI_hyphen_ML/LLaDA_hyphen_8B_hyphen_Instruct/08b83a6feb34df1a6011b80c3c00c7563e963b07/modeling_llada.py", line 711, in attention
[rank0]: att = self._scaled_dot_product_attention(
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/xiaoliu/.cache/huggingface/modules/transformers_modules/GSAI_hyphen_ML/LLaDA_hyphen_8B_hyphen_Instruct/08b83a6feb34df1a6011b80c3c00c7563e963b07/modeling_llada.py", line 653, in _scaled_dot_product_attention
[rank0]: return F.scaled_dot_product_attention(
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: RuntimeError: The size of tensor a (1322) must match the size of tensor b (1066) at non-singleton dimension 3
[rank2]: Traceback (most recent call last):
[rank2]: File "/data/14thdd/users/xiaoliu/code/dLLM-cache/evaluation_script.py", line 34, in <module>
[rank2]: cli_evaluate()
[rank2]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/lm_eval/__main__.py", line 10, in cli_evaluate
[rank2]: parser.execute(args)
[rank2]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/lm_eval/_cli/harness.py", line 60, in execute
[rank2]: args.func(args)
[rank2]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/lm_eval/_cli/run.py", line 379, in _execute
[rank2]: results = simple_evaluate(
[rank2]: ^^^^^^^^^^^^^^^^
[rank2]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/lm_eval/utils.py", line 498, in _wrapper
[rank2]: return fn(*args, **kwargs)
[rank2]: ^^^^^^^^^^^^^^^^^^^
[rank2]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/lm_eval/evaluator.py", line 366, in simple_evaluate
[rank2]: results = evaluate(
[rank2]: ^^^^^^^^^
[rank2]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/lm_eval/utils.py", line 498, in _wrapper
[rank2]: return fn(*args, **kwargs)
[rank2]: ^^^^^^^^^^^^^^^^^^^
[rank2]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/lm_eval/evaluator.py", line 595, in evaluate
[rank2]: resps = getattr(lm, reqtype)(cloned_reqs)
[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank2]: File "/data/14thdd/users/xiaoliu/code/dLLM-cache/eval_model/LLaDA.py", line 795, in generate_until
[rank2]: out = generate(
[rank2]: ^^^^^^^^^
[rank2]: File "/data/14thdd/users/xiaoliu/code/dLLM-cache/utils/generate_function.py", line 115, in generate
[rank2]: logits = model(x, attention_mask=attention_mask).logits[
[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank2]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
[rank2]: return self._call_impl(*args, **kwargs)
[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank2]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
[rank2]: return forward_call(*args, **kwargs)
[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank2]: File "/home/xiaoliu/.cache/huggingface/modules/transformers_modules/GSAI_hyphen_ML/LLaDA_hyphen_8B_hyphen_Instruct/08b83a6feb34df1a6011b80c3c00c7563e963b07/modeling_llada.py", line 1431, in forward
[rank2]: outputs = self.model.forward(
[rank2]: ^^^^^^^^^^^^^^^^^^^
[rank2]: File "/home/xiaoliu/.cache/huggingface/modules/transformers_modules/GSAI_hyphen_ML/LLaDA_hyphen_8B_hyphen_Instruct/08b83a6feb34df1a6011b80c3c00c7563e963b07/modeling_llada.py", line 1328, in forward
[rank2]: x, cache = block(x, attention_bias=attention_bias, layer_past=layer_past, use_cache=use_cache)
[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank2]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
[rank2]: return self._call_impl(*args, **kwargs)
[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank2]: File "/home/xiaoliu/.conda/envs/dllm_cache/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
[rank2]: return forward_call(*args, **kwargs)
[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank2]: File "/home/xiaoliu/.cache/huggingface/modules/transformers_modules/GSAI_hyphen_ML/LLaDA_hyphen_8B_hyphen_Instruct/08b83a6feb34df1a6011b80c3c00c7563e963b07/modeling_llada.py", line 911, in forward
[rank2]: att, cache = self.attention(q, k, v, attention_bias, layer_past=layer_past, use_cache=use_cache)
[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank2]: File "/home/xiaoliu/.cache/huggingface/modules/transformers_modules/GSAI_hyphen_ML/LLaDA_hyphen_8B_hyphen_Instruct/08b83a6feb34df1a6011b80c3c00c7563e963b07/modeling_llada.py", line 711, in attention
[rank2]: att = self._scaled_dot_product_attention(
[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank2]: File "/home/xiaoliu/.cache/huggingface/modules/transformers_modules/GSAI_hyphen_ML/LLaDA_hyphen_8B_hyphen_Instruct/08b83a6feb34df1a6011b80c3c00c7563e963b07/modeling_llada.py", line 653, in _scaled_dot_product_attention
[rank2]: return F.scaled_dot_product_attention(
[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank2]: RuntimeError: The size of tensor a (1254) must match the size of tensor b (998) at non-singleton dimension 3