From c79c5d8d96c8a389e07d3d6a16072cbaae81c3da Mon Sep 17 00:00:00 2001
From: Matthias Wolf
Date: Thu, 18 Dec 2025 11:24:06 +0000
Subject: [PATCH] LLM benchmarking: replace vLLM with SGLang

---
 .../llm-benchmark-docker/files/compose.yaml | 11 +++++++----
 .../llm-benchmark-docker/files/config.json  | 12 +++++++-----
 .../files/scripts/benchmark.py              |  8 ++++++--
 .../files/scripts/startllm.py               | 17 ++++++++++++-----
 4 files changed, 32 insertions(+), 16 deletions(-)

diff --git a/cloud-infrastructure/compute-including-hpc/ai-infra-gpu/ai-infrastructure/llm-benchmark-docker/files/compose.yaml b/cloud-infrastructure/compute-including-hpc/ai-infra-gpu/ai-infrastructure/llm-benchmark-docker/files/compose.yaml
index 32b8c58b3..469cec9aa 100644
--- a/cloud-infrastructure/compute-including-hpc/ai-infra-gpu/ai-infrastructure/llm-benchmark-docker/files/compose.yaml
+++ b/cloud-infrastructure/compute-including-hpc/ai-infra-gpu/ai-infrastructure/llm-benchmark-docker/files/compose.yaml
@@ -1,17 +1,18 @@
 # Copyright (c) 2025 Oracle and/or its affiliates.
-version: "3"
 services:
   llm:
-    image: vllm/vllm-openai:v0.8.5.post1
+    image: lmsysorg/sglang:latest
     container_name: llm
     runtime: nvidia
+    ipc: host
     volumes:
       - "$HOME/.cache/huggingface:/huggingface"
       - "$PWD:/appli"
     ports:
-      - "127.0.0.1:8000:8000"
+      - "30000:30000"
     environment:
       "HF_HOME": "/huggingface"
+      "NCCL_IB_DISABLE": "1"
     working_dir: "/appli"
     entrypoint:
       - "/appli/scripts/startllm.py"
@@ -29,9 +30,11 @@ services:
     command:
       - "wait-for-it.sh"
       - "--timeout=300"
-      - "llm:8000"
+      - "llm:30000"
       - "--"
       - "/appli/scripts/benchmark.py"
+      - "--scenario"
+      - "gen_heavy"
   plot:
     build: plot
     container_name: plot
diff --git a/cloud-infrastructure/compute-including-hpc/ai-infra-gpu/ai-infrastructure/llm-benchmark-docker/files/config.json b/cloud-infrastructure/compute-including-hpc/ai-infra-gpu/ai-infrastructure/llm-benchmark-docker/files/config.json
index a54ba2ec3..18d906130 100644
--- a/cloud-infrastructure/compute-including-hpc/ai-infra-gpu/ai-infrastructure/llm-benchmark-docker/files/config.json
+++ b/cloud-infrastructure/compute-including-hpc/ai-infra-gpu/ai-infrastructure/llm-benchmark-docker/files/config.json
@@ -1,7 +1,9 @@
 {
-    "model": "meta-llama/Llama-3.1-8B-Instruct",
-    "gpu_memory_utilization": 0.98,
-    "tensor_parallel_size": 1,
-    "max_model_len": 8192,
-    "max_num_batched_tokens": 8192
+    "model": "openai/gpt-oss-120b",
+    "tensor-parallel-size": 2,
+    "genai-perf": {
+        "inputs": {
+            "reasoning_effort": "low"
+        }
+    }
 }
diff --git a/cloud-infrastructure/compute-including-hpc/ai-infra-gpu/ai-infrastructure/llm-benchmark-docker/files/scripts/benchmark.py b/cloud-infrastructure/compute-including-hpc/ai-infra-gpu/ai-infrastructure/llm-benchmark-docker/files/scripts/benchmark.py
index b16de3e24..d8983aef7 100755
--- a/cloud-infrastructure/compute-including-hpc/ai-infra-gpu/ai-infrastructure/llm-benchmark-docker/files/scripts/benchmark.py
+++ b/cloud-infrastructure/compute-including-hpc/ai-infra-gpu/ai-infrastructure/llm-benchmark-docker/files/scripts/benchmark.py
@@ -85,7 +85,7 @@ def get_shape():
 parser.add_argument(
     "--port",
     type=int,
-    default=8000,
+    default=30_000,
     help="Port to use for vLLM.",
 )
 parser.add_argument(
@@ -106,6 +106,10 @@ def get_shape():
 model_name = server_configuration["model"].rsplit("/", 1)[-1]
 model_stub = model_name.lower()
 
+extra_inputs = []
+for k, v in server_configuration.get("genai-perf", {}).get("inputs", {}).items():
+    extra_inputs.extend(["--extra-inputs", f"{k}:{v}"])
+
 benchmark_command = [
     "genai-perf",
     "profile",
@@ -151,7 +155,7 @@ def get_shape():
     f"--extra-inputs=min_tokens:{tokens_mean - tokens_stddev}",
 ] + [
     f"--{k}={v}" for k, v in scfg.items()
-]
+] + extra_inputs
 
 logging.warning("running: %s", " ".join(cmd))
 subprocess.check_call(cmd)
diff --git a/cloud-infrastructure/compute-including-hpc/ai-infra-gpu/ai-infrastructure/llm-benchmark-docker/files/scripts/startllm.py b/cloud-infrastructure/compute-including-hpc/ai-infra-gpu/ai-infrastructure/llm-benchmark-docker/files/scripts/startllm.py
index 86600ace1..8a7a10724 100755
--- a/cloud-infrastructure/compute-including-hpc/ai-infra-gpu/ai-infrastructure/llm-benchmark-docker/files/scripts/startllm.py
+++ b/cloud-infrastructure/compute-including-hpc/ai-infra-gpu/ai-infrastructure/llm-benchmark-docker/files/scripts/startllm.py
@@ -11,8 +11,8 @@
 parser.add_argument(
     "--port",
     type=int,
-    default=8000,
-    help="Port to use for vLLM.",
+    default=30000,
+    help="Port to use for sglang.",
 )
 parser.add_argument(
     "server_configuration",
@@ -26,12 +26,19 @@
 with args.server_configuration.open() as fd:
     configuration = json.load(fd)
 
+# Remove performance measurement parameters
+configuration.pop("genai-perf", None)
+
 model = configuration.pop("model")
 server_command = [
-    "vllm",
-    "serve",
-    "--disable-log-requests",
+    "python3",
+    "-m",
+    "sglang.launch_server",
+    "--log-requests-level",
+    "0",
+    "--host=0.0.0.0",
     f"--port={args.port}",
+    "--model-path",
     model,
 ] + [
     f"--{k}={v}" for k, v in configuration.items()
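
Note (not part of the patch): with the config.json above and the default port, startllm.py now assembles a launch command equivalent to the following sketch. Flag spellings are taken verbatim from this patch and have not been re-verified against a specific SGLang release:

    # model and tensor-parallel-size come from config.json; the genai-perf
    # block is popped before launch and never reaches the server command
    python3 -m sglang.launch_server --log-requests-level 0 --host=0.0.0.0 \
        --port=30000 --model-path openai/gpt-oss-120b --tensor-parallel-size=2

The stripped genai-perf settings are instead forwarded by benchmark.py to the load generator as "--extra-inputs reasoning_effort:low".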