From 45ddaf01b7a70ebeeeb739cd9223d3c47a6f1cae Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Thu, 5 Mar 2026 11:33:19 -0800 Subject: [PATCH 1/7] extend configs --- .github/configs/nvidia-master.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 6885f36cb..2f2e60f1d 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -3429,14 +3429,17 @@ minimaxm2.5-fp8-h200-vllm: osl: 1024 search-space: - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 16 } - isl: 1024 osl: 8192 search-space: - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 16 } - isl: 8192 osl: 1024 search-space: - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 16 } dsr1-fp4-gb200-dynamo-trt: image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 From 53f7cc3c1bf4d10a46750f023655fd51776c5516 Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Thu, 5 Mar 2026 12:03:09 -0800 Subject: [PATCH 2/7] perf change --- perf-changelog.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index dbb3abc88..58db5e1ad 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -881,3 +881,9 @@ - "Expanding TP search space" - "Adding kv-cache-fp8" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/865 + +- config-keys: + - minimaxm2.5-fp8-h200-vllm + description: + - "Extend MiniMax M2.5 FP8 single-node config for H200 with vLLM v0.16.0 (TP8)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/869 From c035ae50f421ec68ccde008708d5fbb443b58b74 Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Thu, 5 Mar 2026 13:39:37 -0800 Subject: [PATCH 3/7] remove tp4 --- .github/configs/nvidia-master.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 2f2e60f1d..767dc05f8 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -3428,17 +3428,14 @@ minimaxm2.5-fp8-h200-vllm: - isl: 1024 osl: 1024 search-space: - - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 16 } - isl: 1024 osl: 8192 search-space: - - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 16 } - isl: 8192 osl: 1024 search-space: - - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 16 } dsr1-fp4-gb200-dynamo-trt: From 4db963a287673ab570ca65db5c00c70c695f20d1 Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Thu, 5 Mar 2026 15:00:46 -0800 Subject: [PATCH 4/7] enable ep --- benchmarks/single_node/minimaxm2.5_fp8_h200.sh | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/benchmarks/single_node/minimaxm2.5_fp8_h200.sh b/benchmarks/single_node/minimaxm2.5_fp8_h200.sh index 9c8f1b271..acac5d533 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_h200.sh +++ b/benchmarks/single_node/minimaxm2.5_fp8_h200.sh @@ -5,6 +5,7 @@ source "$(dirname "$0")/../benchmark_lib.sh" check_env_vars \ MODEL \ TP \ + EP_SIZE \ CONC \ ISL \ OSL \ @@ -21,9 +22,16 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +if [ "$EP_SIZE" -ge 1 ]; then + EP=" --enable-expert-parallel" +else + EP=" " +fi + set -x vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ +$EP \ --gpu-memory-utilization 0.95 \ --max-model-len $MAX_MODEL_LEN \ --disable-log-requests \ From eb48394be54d6e6ff626f877b5fab90f92edafa4 Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Thu, 5 Mar 2026 16:40:56 -0800 Subject: [PATCH 5/7] conc increase --- benchmarks/single_node/minimaxm2.5_fp8_h200.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/single_node/minimaxm2.5_fp8_h200.sh b/benchmarks/single_node/minimaxm2.5_fp8_h200.sh index acac5d533..9df465634 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_h200.sh +++ b/benchmarks/single_node/minimaxm2.5_fp8_h200.sh @@ -22,7 +22,7 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} -if [ "$EP_SIZE" -ge 1 ]; then +if [ "$EP_SIZE" -gt 1 ]; then EP=" --enable-expert-parallel" else EP=" " From 1fcdea247af9c786702bc5c1fc0c6fbfa502647e Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Thu, 5 Mar 2026 16:42:55 -0800 Subject: [PATCH 6/7] fix ep condition --- .github/configs/nvidia-master.yaml | 6 +++--- benchmarks/single_node/minimaxm2.5_fp8_h200.sh | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 767dc05f8..dc82e66af 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -3428,15 +3428,15 @@ minimaxm2.5-fp8-h200-vllm: - isl: 1024 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 16 } + - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 search-space: - - { tp: 8, conc-start: 4, conc-end: 16 } + - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 16 } + - { tp: 8, conc-start: 4, conc-end: 64 } dsr1-fp4-gb200-dynamo-trt: image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 diff --git a/benchmarks/single_node/minimaxm2.5_fp8_h200.sh b/benchmarks/single_node/minimaxm2.5_fp8_h200.sh index 9df465634..acac5d533 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_h200.sh +++ b/benchmarks/single_node/minimaxm2.5_fp8_h200.sh @@ -22,7 +22,7 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} -if [ "$EP_SIZE" -gt 1 ]; then +if [ "$EP_SIZE" -ge 1 ]; then EP=" --enable-expert-parallel" else EP=" " From e425178a9e902b911b3e60a389c67bb8acbce04a Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Fri, 6 Mar 2026 14:46:31 -0800 Subject: [PATCH 7/7] update: high conc --- .github/configs/nvidia-master.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index dc82e66af..2aaeca114 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -3428,15 +3428,15 @@ minimaxm2.5-fp8-h200-vllm: - isl: 1024 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 128 } - isl: 1024 osl: 8192 search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 128 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 128 } dsr1-fp4-gb200-dynamo-trt: image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2