From 047685953d4801f6d8880dcdc04f224378a2706f Mon Sep 17 00:00:00 2001
From: Rohan138
Date: Thu, 5 Mar 2026 01:19:05 -0600
Subject: [PATCH 1/2] vllm 0.17.0 updates

---
 .github/configs/amd-master.yaml             |  8 ++++----
 benchmarks/single_node/gptoss_fp4_mi300x.sh | 19 ++++++++++---------
 benchmarks/single_node/gptoss_fp4_mi325x.sh | 20 +++++++++++---------
 benchmarks/single_node/gptoss_fp4_mi355x.sh | 20 +++++++++++---------
 4 files changed, 36 insertions(+), 31 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 00fd01936..69322966b 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -390,7 +390,7 @@ minimaxm2.5-fp8-mi325x-vllm:
     - { tp: 4, conc-start: 4, conc-end: 64 }
 
 gptoss-fp4-mi300x-vllm:
-  image: vllm/vllm-openai-rocm:v0.16.0
+  image: vllm/vllm-openai-rocm:v0.17.0
   model: openai/gpt-oss-120b
   model-prefix: gptoss
   runner: mi300x
@@ -421,7 +421,7 @@ gptoss-fp4-mi300x-vllm:
     - { tp: 8, conc-start: 4, conc-end: 16 }
 
 gptoss-fp4-mi325x-vllm:
-  image: vllm/vllm-openai-rocm:v0.16.0
+  image: vllm/vllm-openai-rocm:v0.17.0
   model: openai/gpt-oss-120b
   model-prefix: gptoss
   runner: mi325x
@@ -452,8 +452,8 @@ gptoss-fp4-mi325x-vllm:
     - { tp: 8, conc-start: 4, conc-end: 16 }
 
 gptoss-fp4-mi355x-vllm:
-  image: vllm/vllm-openai-rocm:v0.16.0
-  model: openai/gpt-oss-120b
+  image: vllm/vllm-openai-rocm:v0.17.0
+  model: amd/gpt-oss-120b-w-mxfp4-a-fp8
   model-prefix: gptoss
   runner: mi355x
   precision: fp4
diff --git a/benchmarks/single_node/gptoss_fp4_mi300x.sh b/benchmarks/single_node/gptoss_fp4_mi300x.sh
index 7b64418e7..a1b701a89 100644
--- a/benchmarks/single_node/gptoss_fp4_mi300x.sh
+++ b/benchmarks/single_node/gptoss_fp4_mi300x.sh
@@ -33,23 +33,24 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
   export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
 fi
 
+export AMDGCN_USE_BUFFER_OPS=0
 export VLLM_ROCM_USE_AITER=1
-export VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION=1
-export VLLM_ROCM_USE_AITER_MHA=0
 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
+ATTN_BACKEND="--attention-backend ROCM_AITER_UNIFIED_ATTN"
+FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph_partition=True"
 
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
 
 set -x
 
 vllm serve $MODEL --port $PORT \
---tensor-parallel-size=$TP \
---gpu-memory-utilization 0.95 \
---max-model-len $MAX_MODEL_LEN \
---compilation-config '{"cudagraph_mode": "FULL_AND_PIECEWISE"}' \
---block-size=64 \
---no-enable-prefix-caching \
---disable-log-requests > $SERVER_LOG 2>&1 &
+    $ATTN_BACKEND $FUSE_ROPE_KVCACHE \
+    --tensor-parallel-size=$TP \
+    --gpu-memory-utilization 0.95 \
+    --max-model-len $MAX_MODEL_LEN \
+    --block-size=64 \
+    --no-enable-prefix-caching \
+    --disable-log-requests > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
diff --git a/benchmarks/single_node/gptoss_fp4_mi325x.sh b/benchmarks/single_node/gptoss_fp4_mi325x.sh
index c8edf0c15..a1b701a89 100644
--- a/benchmarks/single_node/gptoss_fp4_mi325x.sh
+++ b/benchmarks/single_node/gptoss_fp4_mi325x.sh
@@ -33,22 +33,24 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
   export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
 fi
 
+export AMDGCN_USE_BUFFER_OPS=0
 export VLLM_ROCM_USE_AITER=1
-export VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION=1
-export VLLM_ROCM_USE_AITER_MHA=0
+export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
+ATTN_BACKEND="--attention-backend ROCM_AITER_UNIFIED_ATTN"
+FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph_partition=True"
 
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
 
 set -x
 
 vllm serve $MODEL --port $PORT \
---tensor-parallel-size=$TP \
---gpu-memory-utilization 0.95 \
---max-model-len $MAX_MODEL_LEN \
---compilation-config '{"cudagraph_mode": "FULL_AND_PIECEWISE"}' \
---block-size=64 \
---no-enable-prefix-caching \
---disable-log-requests > $SERVER_LOG 2>&1 &
+    $ATTN_BACKEND $FUSE_ROPE_KVCACHE \
+    --tensor-parallel-size=$TP \
+    --gpu-memory-utilization 0.95 \
+    --max-model-len $MAX_MODEL_LEN \
+    --block-size=64 \
+    --no-enable-prefix-caching \
+    --disable-log-requests > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
diff --git a/benchmarks/single_node/gptoss_fp4_mi355x.sh b/benchmarks/single_node/gptoss_fp4_mi355x.sh
index 2012db23d..02e38f0d1 100644
--- a/benchmarks/single_node/gptoss_fp4_mi355x.sh
+++ b/benchmarks/single_node/gptoss_fp4_mi355x.sh
@@ -33,22 +33,24 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
   export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
 fi
 
+export AMDGCN_USE_BUFFER_OPS=0
 export VLLM_ROCM_USE_AITER=1
-export VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION=1
-export VLLM_ROCM_USE_AITER_MHA=0
+export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
+ATTN_BACKEND="--attention-backend ROCM_AITER_UNIFIED_ATTN"
+FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph_partition=True"
 
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
 
 set -x
 
 vllm serve $MODEL --port $PORT \
---tensor-parallel-size=$TP \
---gpu-memory-utilization 0.95 \
---max-model-len $MAX_MODEL_LEN \
---compilation-config '{"cudagraph_mode": "FULL_AND_PIECEWISE"}' \
---block-size=64 \
---no-enable-prefix-caching \
---disable-log-requests > $SERVER_LOG 2>&1 &
+    $ATTN_BACKEND $FUSE_ROPE_KVCACHE \
+    --tensor-parallel-size=$TP \
+    --gpu-memory-utilization 0.95 \
+    --max-model-len $MAX_MODEL_LEN \
+    --block-size=64 \
+    --no-enable-prefix-caching \
+    --disable-log-requests > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!

From 0e91a718b695f2a69a0edcbe33f83613bbef1292 Mon Sep 17 00:00:00 2001
From: Rohan138
Date: Thu, 5 Mar 2026 01:20:52 -0600
Subject: [PATCH 2/2] use aiter rope on mi355

---
 benchmarks/single_node/gptoss_fp4_mi355x.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/benchmarks/single_node/gptoss_fp4_mi355x.sh b/benchmarks/single_node/gptoss_fp4_mi355x.sh
index 02e38f0d1..ccf0807b3 100644
--- a/benchmarks/single_node/gptoss_fp4_mi355x.sh
+++ b/benchmarks/single_node/gptoss_fp4_mi355x.sh
@@ -35,6 +35,7 @@ fi
 
 export AMDGCN_USE_BUFFER_OPS=0
 export VLLM_ROCM_USE_AITER=1
+export VLLM_ROCM_USE_AITER_TRITON_ROPE=1
 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
 ATTN_BACKEND="--attention-backend ROCM_AITER_UNIFIED_ATTN"
 FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph_partition=True"