Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -390,7 +390,7 @@ minimaxm2.5-fp8-mi325x-vllm:
- { tp: 4, conc-start: 4, conc-end: 64 }

gptoss-fp4-mi300x-vllm:
image: vllm/vllm-openai-rocm:v0.16.0
image: vllm/vllm-openai-rocm:v0.17.0
model: openai/gpt-oss-120b
model-prefix: gptoss
runner: mi300x
Expand Down Expand Up @@ -421,7 +421,7 @@ gptoss-fp4-mi300x-vllm:
- { tp: 8, conc-start: 4, conc-end: 16 }

gptoss-fp4-mi325x-vllm:
image: vllm/vllm-openai-rocm:v0.16.0
image: vllm/vllm-openai-rocm:v0.17.0
model: openai/gpt-oss-120b
model-prefix: gptoss
runner: mi325x
Expand Down Expand Up @@ -452,8 +452,8 @@ gptoss-fp4-mi325x-vllm:
- { tp: 8, conc-start: 4, conc-end: 16 }

gptoss-fp4-mi355x-vllm:
image: vllm/vllm-openai-rocm:v0.16.0
model: openai/gpt-oss-120b
image: vllm/vllm-openai-rocm:v0.17.0
model: amd/gpt-oss-120b-w-mxfp4-a-fp8
model-prefix: gptoss
runner: mi355x
precision: fp4
Expand Down
19 changes: 10 additions & 9 deletions benchmarks/single_node/gptoss_fp4_mi300x.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,23 +33,24 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
fi

export AMDGCN_USE_BUFFER_OPS=0
export VLLM_ROCM_USE_AITER=1
export VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION=1
export VLLM_ROCM_USE_AITER_MHA=0
export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
ATTN_BACKEND="--attention-backend ROCM_AITER_UNIFIED_ATTN"
FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph_partition=True"

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}

set -x
vllm serve $MODEL --port $PORT \
--tensor-parallel-size=$TP \
--gpu-memory-utilization 0.95 \
--max-model-len $MAX_MODEL_LEN \
--compilation-config '{"cudagraph_mode": "FULL_AND_PIECEWISE"}' \
--block-size=64 \
--no-enable-prefix-caching \
--disable-log-requests > $SERVER_LOG 2>&1 &
$ATTN_BACKEND $FUSE_ROPE_KVCACHE \
--tensor-parallel-size=$TP \
--gpu-memory-utilization 0.95 \
--max-model-len $MAX_MODEL_LEN \
--block-size=64 \
--no-enable-prefix-caching \
--disable-log-requests > $SERVER_LOG 2>&1 &

SERVER_PID=$!

Expand Down
20 changes: 11 additions & 9 deletions benchmarks/single_node/gptoss_fp4_mi325x.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,22 +33,24 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
fi

export AMDGCN_USE_BUFFER_OPS=0
export VLLM_ROCM_USE_AITER=1
export VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION=1
export VLLM_ROCM_USE_AITER_MHA=0
export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
ATTN_BACKEND="--attention-backend ROCM_AITER_UNIFIED_ATTN"
FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph_partition=True"

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}

set -x
vllm serve $MODEL --port $PORT \
--tensor-parallel-size=$TP \
--gpu-memory-utilization 0.95 \
--max-model-len $MAX_MODEL_LEN \
--compilation-config '{"cudagraph_mode": "FULL_AND_PIECEWISE"}' \
--block-size=64 \
--no-enable-prefix-caching \
--disable-log-requests > $SERVER_LOG 2>&1 &
$ATTN_BACKEND $FUSE_ROPE_KVCACHE \
--tensor-parallel-size=$TP \
--gpu-memory-utilization 0.95 \
--max-model-len $MAX_MODEL_LEN \
--block-size=64 \
--no-enable-prefix-caching \
--disable-log-requests > $SERVER_LOG 2>&1 &

SERVER_PID=$!

Expand Down
21 changes: 12 additions & 9 deletions benchmarks/single_node/gptoss_fp4_mi355x.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,22 +33,25 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
fi

export AMDGCN_USE_BUFFER_OPS=0
export VLLM_ROCM_USE_AITER=1
export VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION=1
export VLLM_ROCM_USE_AITER_MHA=0
export VLLM_ROCM_USE_AITER_TRITON_ROPE=1
export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
ATTN_BACKEND="--attention-backend ROCM_AITER_UNIFIED_ATTN"
FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph_partition=True"

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}

set -x
vllm serve $MODEL --port $PORT \
--tensor-parallel-size=$TP \
--gpu-memory-utilization 0.95 \
--max-model-len $MAX_MODEL_LEN \
--compilation-config '{"cudagraph_mode": "FULL_AND_PIECEWISE"}' \
--block-size=64 \
--no-enable-prefix-caching \
--disable-log-requests > $SERVER_LOG 2>&1 &
$ATTN_BACKEND $FUSE_ROPE_KVCACHE \
--tensor-parallel-size=$TP \
--gpu-memory-utilization 0.95 \
--max-model-len $MAX_MODEL_LEN \
--block-size=64 \
--no-enable-prefix-caching \
--disable-log-requests > $SERVER_LOG 2>&1 &
Comment on lines +36 to +54
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for this PR. Can you open a PR in the vLLM recipes repo to update it with these new flags? Let's ensure that the documentation is first-class so that the entire ML community can benefit from your hard work!

https://github.com/vllm-project/recipes/blob/main/OpenAI/GPT-OSS.md#mi355xgfx950

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.


SERVER_PID=$!

Expand Down
Loading