Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1873,6 +1873,31 @@ kimik2.5-int4-h200-vllm:
search-space:
- { tp: 8, conc-start: 4, conc-end: 64 }

# Kimi K2.5 NVFP4 quantized checkpoint benchmarked on B200 GPUs, served by vLLM.
kimik2.5-fp4-b200-vllm:
image: vllm/vllm-openai:v0.16.0
model: nvidia/Kimi-K2.5-NVFP4
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks for the PR! overview LGTM!

@ankursingh-nv @kedarpotdar-nv one small thing: could you add documentation about the NVFP4 version of Kimi K2.5 (nvidia/Kimi-K2.5-NVFP4) to the vLLM recipes at https://github.com/vllm-project/recipes/blob/main/moonshotai/Kimi-K2.5.md ? Let's ensure the documentation is first class so that the entire ML community can benefit from your hard work!

+viz @faradawn

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

started PR here vllm-project/recipes#267

model-prefix: kimik2.5
runner: b200
precision: fp4
framework: vllm
multinode: false
# One entry per input/output sequence-length pair (isl = input seq len,
# osl = output seq len), each with its own parallelism/concurrency sweep.
seq-len-configs:
- isl: 1024
osl: 1024
# tp: tensor-parallel size (forwarded as --tensor-parallel-size);
# conc-start / conc-end: bounds of the request-concurrency sweep.
# ep: presumably the expert-parallel degree — TODO confirm against the harness.
search-space:
- { tp: 8, ep: 1, conc-start: 4, conc-end: 128 }
- { tp: 4, ep: 4, conc-start: 4, conc-end: 64 }
- isl: 1024
osl: 8192
search-space:
- { tp: 8, ep: 1, conc-start: 4, conc-end: 128 }
- { tp: 4, ep: 4, conc-start: 4, conc-end: 64 }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, ep: 1, conc-start: 4, conc-end: 128 }
- { tp: 4, ep: 4, conc-start: 4, conc-end: 64 }

dsr1-fp8-b200-sglang-mtp:
image: lmsysorg/sglang:v0.5.8-cu130-amd64
model: deepseek-ai/DeepSeek-R1-0528
Expand Down
66 changes: 66 additions & 0 deletions benchmarks/single_node/kimik2.5_fp4_b200.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#!/usr/bin/env bash
# Single-node serving benchmark: Kimi K2.5 NVFP4 on B200 GPUs via vLLM.
#
# Required env vars (validated by check_env_vars): MODEL, TP, CONC, ISL, OSL,
# MAX_MODEL_LEN, RANDOM_RANGE_RATIO, RESULT_FILENAME.
# Optional: PORT (default 8888), RUN_EVAL ("true" runs lm-eval after the
# throughput benchmark), SLURM_JOB_ID / SLURMD_NODENAME (Slurm logging only).
#
# Flow: download weights -> start vllm server in the background -> wait for
# readiness -> run serving benchmark -> optionally run lm-eval.

source "$(dirname "$0")/../benchmark_lib.sh"

check_env_vars \
  MODEL \
  TP \
  CONC \
  ISL \
  OSL \
  MAX_MODEL_LEN \
  RANDOM_RANGE_RATIO \
  RESULT_FILENAME

# ":-" guard so the reference is safe outside Slurm even under `set -u`.
if [[ -n "${SLURM_JOB_ID:-}" ]]; then
  echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown-node}"
fi

# Pre-fetch weights so server startup time excludes the download.
hf download "$MODEL"

nvidia-smi

export TORCH_CUDA_ARCH_LIST="10.0"  # Blackwell (B200) compute capability
export PYTHONNOUSERSITE=1           # ignore user site-packages in the container

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}

set -x
# All expansions are quoted so empty or whitespace-containing values fail
# loudly instead of silently mangling the argument list.
vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \
    --tensor-parallel-size "$TP" \
    --gpu-memory-utilization 0.90 \
    --max-model-len "$MAX_MODEL_LEN" \
    --max-num-seqs "$CONC" \
    --reasoning-parser kimi_k2 \
    --tool-call-parser kimi_k2 \
    --compilation_config.pass_config.fuse_allreduce_rms true \
    --trust-remote-code \
    --disable-log-requests > "$SERVER_LOG" 2>&1 &

SERVER_PID=$!
# NOTE(review): nothing kills $SERVER_PID on exit; presumably container
# teardown handles it — confirm, or add a `trap 'kill $SERVER_PID' EXIT`.

# Wait for server to be ready
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

# Benchmark-client dependencies (not needed by the server itself).
pip install -q datasets pandas

run_benchmark_serving \
    --model "$MODEL" \
    --port "$PORT" \
    --backend vllm \
    --input-len "$ISL" \
    --output-len "$OSL" \
    --random-range-ratio "$RANDOM_RANGE_RATIO" \
    --num-prompts $(( CONC * 10 )) \
    --max-concurrency "$CONC" \
    --result-filename "$RESULT_FILENAME" \
    --result-dir /workspace/ \
    --trust-remote-code

# After throughput, run evaluation only if RUN_EVAL is true
if [ "${RUN_EVAL:-}" = "true" ]; then
  run_eval --framework lm-eval --port "$PORT" --concurrent-requests "$CONC"
  append_lm_eval_summary
fi
set +x
7 changes: 7 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -875,3 +875,10 @@
- "Server: reasoning-parser qwen3, tool-call-parser qwen3_coder, enable-flashinfer-allreduce-fusion, mem-fraction-static 0.8"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/855

# PR 862: new Kimi K2.5 NVFP4 benchmark config for B200 + vLLM.
- config-keys:
- kimik2.5-fp4-b200-vllm
description:
- "Add Kimi K2.5 NVFP4 vLLM benchmark configuration for B200"
- "Model: nvidia/Kimi-K2.5-NVFP4 with --reasoning-parser kimi_k2 and --trust-remote-code"
- "Image: vllm/vllm-openai:v0.16.0"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/862