diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index fc837704c..e12a0cede 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -3428,15 +3428,15 @@ minimaxm2.5-fp8-h200-vllm: - isl: 1024 osl: 1024 search-space: - - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 128 } - isl: 1024 osl: 8192 search-space: - - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 128 } - isl: 8192 osl: 1024 search-space: - - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 128 } dsr1-fp4-gb200-dynamo-trt: image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 diff --git a/benchmarks/single_node/minimaxm2.5_fp8_h200.sh b/benchmarks/single_node/minimaxm2.5_fp8_h200.sh index 9c8f1b271..acac5d533 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_h200.sh +++ b/benchmarks/single_node/minimaxm2.5_fp8_h200.sh @@ -5,6 +5,7 @@ source "$(dirname "$0")/../benchmark_lib.sh" check_env_vars \ MODEL \ TP \ + EP_SIZE \ CONC \ ISL \ OSL \ @@ -21,9 +22,16 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +if [ "$EP_SIZE" -ge 1 ]; then + EP=" --enable-expert-parallel" +else + EP=" " +fi + set -x vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ +$EP \ --gpu-memory-utilization 0.95 \ --max-model-len $MAX_MODEL_LEN \ --disable-log-requests \ diff --git a/perf-changelog.yaml b/perf-changelog.yaml index c19ddbd1a..5c5d444a3 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -931,4 +931,9 @@ - "Switch to --attention-backend ROCM_AITER_UNIFIED_ATTN and add fuse_rope_kvcache compilation pass" - "Remove deprecated VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION/VLLM_ROCM_USE_AITER_MHA env vars and compilation-config cudagraph_mode" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/867 - + +- config-keys: + - minimaxm2.5-fp8-h200-vllm + description: + - "Extend MiniMax M2.5 FP8 single-node config for H200 with vLLM v0.16.0 (TP8)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/869