From 1d7fa5d6035048e1c377807f158e987a4b738c04 Mon Sep 17 00:00:00 2001 From: zhuyuhua-v Date: Fri, 14 Nov 2025 10:08:08 +0800 Subject: [PATCH 1/4] use aiter triton kernel as triton mha fallback path Signed-off-by: zhuyuhua-v --- vllm/v1/attention/backends/mla/triton_mla.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/vllm/v1/attention/backends/mla/triton_mla.py b/vllm/v1/attention/backends/mla/triton_mla.py index 781f77e96319..959af0ed1e96 100644 --- a/vllm/v1/attention/backends/mla/triton_mla.py +++ b/vllm/v1/attention/backends/mla/triton_mla.py @@ -129,14 +129,20 @@ def _flash_attn_varlen_diff_headdims( q, k, v, softmax_scale=softmax_scale, **kwargs ) else: - return super()._flash_attn_varlen_diff_headdims( - q, - k, - v, - return_softmax_lse=return_softmax_lse, + from aiter.ops.triton.mha import flash_attn_varlen_func + result = flash_attn_varlen_func( + q=q, + k=k, + v=v, + return_lse=return_softmax_lse, softmax_scale=softmax_scale, **kwargs, ) + if type(result) is tuple and return_softmax_lse: + output, lse = result + lse = lse.T.contiguous() + return (output, lse) + return result def _forward_decode( self, From e54682cec69490d1ad25435b375e996519f16cb6 Mon Sep 17 00:00:00 2001 From: ZhiweiYan-96 Date: Wed, 19 Nov 2025 06:07:47 +0000 Subject: [PATCH 2/4] Update environment for fixing deepseek fp4 acc --- evaluation/deepseek_fp4/launch_deepseekr1_fp4_TP.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/evaluation/deepseek_fp4/launch_deepseekr1_fp4_TP.sh b/evaluation/deepseek_fp4/launch_deepseekr1_fp4_TP.sh index 26ef8cd4dce9..38fa4b2ae5a3 100644 --- a/evaluation/deepseek_fp4/launch_deepseekr1_fp4_TP.sh +++ b/evaluation/deepseek_fp4/launch_deepseekr1_fp4_TP.sh @@ -1,16 +1,16 @@ export VLLM_USE_V1=1 -export VLLM_USE_TRITON_FLASH_ATTN=0 +export VLLM_USE_TRITON_FLASH_ATTN=1 # use triton mha # export VLLM_LOGGING_LEVEL=DEBUG export VLLM_RPC_TIMEOUT=1800000 export VLLM_ROCM_USE_AITER=1 export VLLM_ROCM_USE_AITER_MHA=0 -export VLLM_ROCM_USE_AITER_MLA=1 +export VLLM_ROCM_USE_AITER_MLA=0 # use triton mha export VLLM_ROCM_USE_AITER_MOE=1 export VLLM_ROCM_USE_TRITON_ROPE=1 # add for acc export VLLM_DISABLE_COMPILE_CACHE=1 # FIXME: for now disable fp4 asm gemm because of running issue export VLLM_ROCM_USE_AITER_FP4_ASM_GEMM=0 -#export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=0 # for now disable +export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=0 # disable for acc export TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 export TRITON_HIP_USE_ASYNC_COPY=1 @@ -31,14 +31,14 @@ echo "running $model_path" # FIXME: for now use 0.8 for memory utilization vllm serve $model_path \ --host localhost \ - --port 9000 \ + --port 6789 \ --tensor-parallel-size 8 \ --max-num-batched-tokens 32768 \ --trust-remote-code \ --no-enable-prefix-caching \ --disable-log-requests \ --compilation-config '{"cudagraph_mode": "FULL_AND_PIECEWISE"}' \ - --gpu_memory_utilization 0.8 \ + --gpu_memory_utilization 0.7 \ --async-scheduling \ --load-format fastsafetensors \ --seed 123 2>&1 | tee log.server.log & From 670b9a468db209b72120e11e37254130dcb41174 Mon Sep 17 00:00:00 2001 From: ZhiweiYan-96 Date: Wed, 19 Nov 2025 07:51:47 +0000 Subject: [PATCH 3/4] [ds fp4] set block-size to 16 --- evaluation/deepseek_fp4/launch_deepseekr1_fp4_TP.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/evaluation/deepseek_fp4/launch_deepseekr1_fp4_TP.sh b/evaluation/deepseek_fp4/launch_deepseekr1_fp4_TP.sh index 38fa4b2ae5a3..88277621d5fa 100644 --- a/evaluation/deepseek_fp4/launch_deepseekr1_fp4_TP.sh +++ b/evaluation/deepseek_fp4/launch_deepseekr1_fp4_TP.sh @@ -31,17 +31,18 @@ echo "running $model_path" # FIXME: for now use 0.8 for memory utilization vllm serve $model_path \ --host localhost \ - --port 6789 \ + --port 9000 \ --tensor-parallel-size 8 \ --max-num-batched-tokens 32768 \ --trust-remote-code \ --no-enable-prefix-caching \ --disable-log-requests \ - --compilation-config '{"cudagraph_mode": "FULL_AND_PIECEWISE"}' \ + --enforce-eager \ --gpu_memory_utilization 0.7 \ --async-scheduling \ + --block-size 16 \ --load-format fastsafetensors \ --seed 123 2>&1 | tee log.server.log & - # --enforce-eager \ +# --compilation-config '{"cudagraph_mode": "FULL_AND_PIECEWISE"}' \ # --enable-expert-parallel \ From 82080ea5665ef5339d99ccfa611389e7f4c8a2df Mon Sep 17 00:00:00 2001 From: ZhiweiYan-96 Date: Thu, 20 Nov 2025 02:41:41 +0000 Subject: [PATCH 4/4] lint Signed-off-by: ZhiweiYan-96 --- vllm/v1/attention/backends/mla/triton_mla.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/v1/attention/backends/mla/triton_mla.py b/vllm/v1/attention/backends/mla/triton_mla.py index 959af0ed1e96..b2a1711613ad 100644 --- a/vllm/v1/attention/backends/mla/triton_mla.py +++ b/vllm/v1/attention/backends/mla/triton_mla.py @@ -130,7 +130,8 @@ def _flash_attn_varlen_diff_headdims( ) else: from aiter.ops.triton.mha import flash_attn_varlen_func - result = flash_attn_varlen_func( + + result = flash_attn_varlen_func( q=q, k=k, v=v,