Skip to content
6 changes: 3 additions & 3 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3428,15 +3428,15 @@ minimaxm2.5-fp8-h200-vllm:
- isl: 1024
osl: 1024
search-space:
- { tp: 4, conc-start: 4, conc-end: 64 }
- { tp: 8, conc-start: 4, conc-end: 128 }
- isl: 1024
osl: 8192
search-space:
- { tp: 4, conc-start: 4, conc-end: 64 }
- { tp: 8, conc-start: 4, conc-end: 128 }
- isl: 8192
osl: 1024
search-space:
- { tp: 4, conc-start: 4, conc-end: 64 }
- { tp: 8, conc-start: 4, conc-end: 128 }

dsr1-fp4-gb200-dynamo-trt:
image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2
Expand Down
8 changes: 8 additions & 0 deletions benchmarks/single_node/minimaxm2.5_fp8_h200.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ source "$(dirname "$0")/../benchmark_lib.sh"
check_env_vars \
MODEL \
TP \
EP_SIZE \
CONC \
ISL \
OSL \
Expand All @@ -21,9 +22,16 @@ hf download "$MODEL"
SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}

# Build the optional expert-parallelism flag for vllm serve from EP_SIZE.
# $EP is expanded UNQUOTED in the vllm serve invocation below, so it must be
# either the flag itself or empty — never whitespace-only.
# NOTE(review): -ge 1 also enables expert parallelism for EP_SIZE=1, where a
# single expert group makes it a no-op; confirm whether -gt 1 was intended.
if [ "$EP_SIZE" -ge 1 ]; then
  EP="--enable-expert-parallel"
else
  # Empty string (not " "): a quoted "$EP" would otherwise pass a stray
  # single-space argument to vllm serve.
  EP=""
fi

set -x
vllm serve $MODEL --port $PORT \
--tensor-parallel-size=$TP \
$EP \
--gpu-memory-utilization 0.95 \
--max-model-len $MAX_MODEL_LEN \
--disable-log-requests \
Expand Down
7 changes: 6 additions & 1 deletion perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -931,4 +931,9 @@
- "Switch to --attention-backend ROCM_AITER_UNIFIED_ATTN and add fuse_rope_kvcache compilation pass"
- "Remove deprecated VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION/VLLM_ROCM_USE_AITER_MHA env vars and compilation-config cudagraph_mode"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/867


- config-keys:
- minimaxm2.5-fp8-h200-vllm
description:
- "Extend MiniMax M2.5 FP8 single-node config for H200 with vLLM v0.16.0 (TP8)"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/869