From 047685953d4801f6d8880dcdc04f224378a2706f Mon Sep 17 00:00:00 2001 From: Rohan138 Date: Thu, 5 Mar 2026 01:19:05 -0600 Subject: [PATCH 1/6] vllm 0.17.0 updates --- .github/configs/amd-master.yaml | 8 ++++---- benchmarks/single_node/gptoss_fp4_mi300x.sh | 19 ++++++++++--------- benchmarks/single_node/gptoss_fp4_mi325x.sh | 20 +++++++++++--------- benchmarks/single_node/gptoss_fp4_mi355x.sh | 20 +++++++++++--------- 4 files changed, 36 insertions(+), 31 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 00fd01936..69322966b 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -390,7 +390,7 @@ minimaxm2.5-fp8-mi325x-vllm: - { tp: 4, conc-start: 4, conc-end: 64 } gptoss-fp4-mi300x-vllm: - image: vllm/vllm-openai-rocm:v0.16.0 + image: vllm/vllm-openai-rocm:v0.17.0 model: openai/gpt-oss-120b model-prefix: gptoss runner: mi300x @@ -421,7 +421,7 @@ gptoss-fp4-mi300x-vllm: - { tp: 8, conc-start: 4, conc-end: 16 } gptoss-fp4-mi325x-vllm: - image: vllm/vllm-openai-rocm:v0.16.0 + image: vllm/vllm-openai-rocm:v0.17.0 model: openai/gpt-oss-120b model-prefix: gptoss runner: mi325x @@ -452,8 +452,8 @@ gptoss-fp4-mi325x-vllm: - { tp: 8, conc-start: 4, conc-end: 16 } gptoss-fp4-mi355x-vllm: - image: vllm/vllm-openai-rocm:v0.16.0 - model: openai/gpt-oss-120b + image: vllm/vllm-openai-rocm:v0.17.0 + model: amd/gpt-oss-120b-w-mxfp4-a-fp8 model-prefix: gptoss runner: mi355x precision: fp4 diff --git a/benchmarks/single_node/gptoss_fp4_mi300x.sh b/benchmarks/single_node/gptoss_fp4_mi300x.sh index 7b64418e7..a1b701a89 100644 --- a/benchmarks/single_node/gptoss_fp4_mi300x.sh +++ b/benchmarks/single_node/gptoss_fp4_mi300x.sh @@ -33,23 +33,24 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" fi +export AMDGCN_USE_BUFFER_OPS=0 export VLLM_ROCM_USE_AITER=1 -export VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION=1 -export VLLM_ROCM_USE_AITER_MHA=0
+export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 +ATTN_BACKEND="--attention-backend ROCM_AITER_UNIFIED_ATTN" +FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph_partition=True" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} set -x vllm serve $MODEL --port $PORT \ ---tensor-parallel-size=$TP \ ---gpu-memory-utilization 0.95 \ ---max-model-len $MAX_MODEL_LEN \ ---compilation-config '{"cudagraph_mode": "FULL_AND_PIECEWISE"}' \ ---block-size=64 \ ---no-enable-prefix-caching \ ---disable-log-requests > $SERVER_LOG 2>&1 & + $ATTN_BACKEND $FUSE_ROPE_KVCACHE \ + --tensor-parallel-size=$TP \ + --gpu-memory-utilization 0.95 \ + --max-model-len $MAX_MODEL_LEN \ + --block-size=64 \ + --no-enable-prefix-caching \ + --disable-log-requests > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/single_node/gptoss_fp4_mi325x.sh b/benchmarks/single_node/gptoss_fp4_mi325x.sh index c8edf0c15..a1b701a89 100644 --- a/benchmarks/single_node/gptoss_fp4_mi325x.sh +++ b/benchmarks/single_node/gptoss_fp4_mi325x.sh @@ -33,22 +33,24 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" fi +export AMDGCN_USE_BUFFER_OPS=0 export VLLM_ROCM_USE_AITER=1 -export VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION=1 -export VLLM_ROCM_USE_AITER_MHA=0 +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 +ATTN_BACKEND="--attention-backend ROCM_AITER_UNIFIED_ATTN" +FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph_partition=True" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} set -x vllm serve $MODEL --port $PORT \ ---tensor-parallel-size=$TP \ ---gpu-memory-utilization 0.95 \ ---max-model-len $MAX_MODEL_LEN \ ---compilation-config '{"cudagraph_mode": "FULL_AND_PIECEWISE"}' \ ---block-size=64 \ ---no-enable-prefix-caching \ ---disable-log-requests > $SERVER_LOG 2>&1 & + $ATTN_BACKEND $FUSE_ROPE_KVCACHE \ + --tensor-parallel-size=$TP \ + --gpu-memory-utilization 0.95 \ + --max-model-len $MAX_MODEL_LEN \ + 
--block-size=64 \ + --no-enable-prefix-caching \ + --disable-log-requests > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/single_node/gptoss_fp4_mi355x.sh b/benchmarks/single_node/gptoss_fp4_mi355x.sh index 2012db23d..02e38f0d1 100644 --- a/benchmarks/single_node/gptoss_fp4_mi355x.sh +++ b/benchmarks/single_node/gptoss_fp4_mi355x.sh @@ -33,22 +33,24 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" fi +export AMDGCN_USE_BUFFER_OPS=0 export VLLM_ROCM_USE_AITER=1 -export VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION=1 -export VLLM_ROCM_USE_AITER_MHA=0 +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 +ATTN_BACKEND="--attention-backend ROCM_AITER_UNIFIED_ATTN" +FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph_partition=True" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} set -x vllm serve $MODEL --port $PORT \ ---tensor-parallel-size=$TP \ ---gpu-memory-utilization 0.95 \ ---max-model-len $MAX_MODEL_LEN \ ---compilation-config '{"cudagraph_mode": "FULL_AND_PIECEWISE"}' \ ---block-size=64 \ ---no-enable-prefix-caching \ ---disable-log-requests > $SERVER_LOG 2>&1 & + $ATTN_BACKEND $FUSE_ROPE_KVCACHE \ + --tensor-parallel-size=$TP \ + --gpu-memory-utilization 0.95 \ + --max-model-len $MAX_MODEL_LEN \ + --block-size=64 \ + --no-enable-prefix-caching \ + --disable-log-requests > $SERVER_LOG 2>&1 & SERVER_PID=$! 
From 0e91a718b695f2a69a0edcbe33f83613bbef1292 Mon Sep 17 00:00:00 2001 From: Rohan138 Date: Thu, 5 Mar 2026 01:20:52 -0600 Subject: [PATCH 2/6] use aiter rope on mi355 --- benchmarks/single_node/gptoss_fp4_mi355x.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/single_node/gptoss_fp4_mi355x.sh b/benchmarks/single_node/gptoss_fp4_mi355x.sh index 02e38f0d1..ccf0807b3 100644 --- a/benchmarks/single_node/gptoss_fp4_mi355x.sh +++ b/benchmarks/single_node/gptoss_fp4_mi355x.sh @@ -35,6 +35,7 @@ fi export AMDGCN_USE_BUFFER_OPS=0 export VLLM_ROCM_USE_AITER=1 +export VLLM_ROCM_USE_AITER_TRITON_ROPE=1 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 ATTN_BACKEND="--attention-backend ROCM_AITER_UNIFIED_ATTN" FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph_partition=True" From 862e35de267aea2b90d431f6afc27dca43cc8116 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Sat, 7 Mar 2026 13:26:06 -0600 Subject: [PATCH 3/6] update perf changelog --- perf-changelog.yaml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 15d00da6d..2c8ff9a05 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -875,3 +875,15 @@ - "Server: reasoning-parser qwen3, tool-call-parser qwen3_coder, enable-flashinfer-allreduce-fusion, mem-fraction-static 0.8" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/855 +- config-keys: + - gptoss-fp4-mi300x-vllm + - gptoss-fp4-mi325x-vllm + - gptoss-fp4-mi355x-vllm + description: + - "Update AMD GPT-OSS vLLM image from v0.16.0 to v0.17.0 for MI300X, MI325X, and MI355X" + - "MI355X: Switch model to amd/gpt-oss-120b-w-mxfp4-a-fp8 (MXFP4 weights + FP8 activations)" + - "MI355X: Add VLLM_ROCM_USE_AITER_TRITON_ROPE=1 for AITER triton RoPE kernel" + - "Add AMDGCN_USE_BUFFER_OPS=0 and VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 env vars" + - "Switch to --attention-backend ROCM_AITER_UNIFIED_ATTN and add fuse_rope_kvcache compilation pass" + - "Remove 
deprecated VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION/VLLM_ROCM_USE_AITER_MHA env vars and compilation-config cudagraph_mode" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/867 From ba11fb138cabfcd42c2debfa790b45d3faade797 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Sat, 7 Mar 2026 13:45:00 -0600 Subject: [PATCH 4/6] remove disable log requests --- benchmarks/single_node/gptoss_fp4_mi300x.sh | 3 +-- benchmarks/single_node/gptoss_fp4_mi325x.sh | 3 +-- benchmarks/single_node/gptoss_fp4_mi355x.sh | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/benchmarks/single_node/gptoss_fp4_mi300x.sh b/benchmarks/single_node/gptoss_fp4_mi300x.sh index a1b701a89..14eef7d5c 100644 --- a/benchmarks/single_node/gptoss_fp4_mi300x.sh +++ b/benchmarks/single_node/gptoss_fp4_mi300x.sh @@ -49,8 +49,7 @@ vllm serve $MODEL --port $PORT \ --gpu-memory-utilization 0.95 \ --max-model-len $MAX_MODEL_LEN \ --block-size=64 \ - --no-enable-prefix-caching \ - --disable-log-requests > $SERVER_LOG 2>&1 & + --no-enable-prefix-caching > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/single_node/gptoss_fp4_mi325x.sh b/benchmarks/single_node/gptoss_fp4_mi325x.sh index a1b701a89..14eef7d5c 100644 --- a/benchmarks/single_node/gptoss_fp4_mi325x.sh +++ b/benchmarks/single_node/gptoss_fp4_mi325x.sh @@ -49,8 +49,7 @@ vllm serve $MODEL --port $PORT \ --gpu-memory-utilization 0.95 \ --max-model-len $MAX_MODEL_LEN \ --block-size=64 \ - --no-enable-prefix-caching \ - --disable-log-requests > $SERVER_LOG 2>&1 & + --no-enable-prefix-caching > $SERVER_LOG 2>&1 & SERVER_PID=$! 
diff --git a/benchmarks/single_node/gptoss_fp4_mi355x.sh b/benchmarks/single_node/gptoss_fp4_mi355x.sh index ccf0807b3..b7c9c3ddb 100644 --- a/benchmarks/single_node/gptoss_fp4_mi355x.sh +++ b/benchmarks/single_node/gptoss_fp4_mi355x.sh @@ -50,8 +50,7 @@ vllm serve $MODEL --port $PORT \ --gpu-memory-utilization 0.95 \ --max-model-len $MAX_MODEL_LEN \ --block-size=64 \ - --no-enable-prefix-caching \ - --disable-log-requests > $SERVER_LOG 2>&1 & + --no-enable-prefix-caching > $SERVER_LOG 2>&1 & SERVER_PID=$! From f3e8823fb8adf66781bc9a3e8f33c266b194bd6b Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 9 Mar 2026 08:01:58 -0500 Subject: [PATCH 5/6] add flock to enroot import mi355x --- runners/launch_mi355x-amds.sh | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh index f4f1e561f..5c88877b5 100644 --- a/runners/launch_mi355x-amds.sh +++ b/runners/launch_mi355x-amds.sh @@ -156,6 +156,7 @@ else PARTITION="compute" SQUASH_FILE="/var/lib/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" + LOCK_FILE="${SQUASH_FILE}.lock" set -x salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=128 --time=180 --no-shell --job-name="$RUNNER_NAME" @@ -163,16 +164,20 @@ else srun --jobid=$JOB_ID bash -c "docker stop \$(docker ps -a -q)" - if [[ "$FRAMEWORK" == "atom" ]]; then - srun --jobid=$JOB_ID bash -c "rm $SQUASH_FILE" - fi - - srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" - if ! srun --jobid=$JOB_ID bash -c "unsquashfs -l $SQUASH_FILE > /dev/null"; then - echo "unsquashfs failed, removing $SQUASH_FILE and re-importing..." 
- srun --jobid=$JOB_ID bash -c "rm -f $SQUASH_FILE" - srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" - fi + # Use flock to serialize concurrent imports to the same squash file + srun --jobid=$JOB_ID bash -c " + exec 9>\"$LOCK_FILE\" + flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; } + if [[ \"$FRAMEWORK\" == \"atom\" ]]; then + rm -f \"$SQUASH_FILE\" + fi + if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then + echo 'Squash file already exists and is valid, skipping import' + else + rm -f \"$SQUASH_FILE\" + enroot import -o \"$SQUASH_FILE\" docker://$IMAGE + fi + " srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ From 7a86cc0ce2dbf49aca8a0da1436d3583224c0383 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 9 Mar 2026 08:45:16 -0500 Subject: [PATCH 6/6] change cache path for vllm mi355x amds single node scenario --- runners/launch_mi355x-amds.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh index 5c88877b5..fc04f5bb3 100644 --- a/runners/launch_mi355x-amds.sh +++ b/runners/launch_mi355x-amds.sh @@ -179,6 +179,8 @@ else fi " + export VLLM_CACHE_ROOT="/it-share/gharunners/.cache/vllm" + srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \