# Multi-Turn Sweep - tp=[1, 2, 4, 8] users=[8, 16, 32, 64, 128, 256, 512] offload=["on", "off"] (#39)
---
# Multi-turn benchmark sweep: fans out a tp × users × offload matrix to a
# reusable benchmark template, then aggregates all per-cell artifacts.
# NOTE(review): indentation reconstructed from a flattened paste — verify
# against the repository copy of this workflow.
name: Multi-Turn Benchmark Sweep
run-name: "Multi-Turn Sweep - tp=${{ inputs.tp_values }} users=${{ inputs.user_values }} offload=${{ inputs.offload_values }}"

# `on` looks like a YAML 1.1 boolean to generic parsers; GitHub's loader
# handles it (suppress yamllint `truthy` here if linting).
on:
  # push:
  #   branches:
  #     - experimental/multi-turn-benchmark
  #   paths:
  #     - .github/workflows/multiturn-sweep.yml
  workflow_dispatch:
    inputs:
      # JSON-array inputs are kept as strings and expanded with fromJson()
      # in the matrix below.
      tp_values:
        description: 'TP sizes (JSON array)'
        required: true
        default: '[1, 2, 4, 8]'
        type: string
      user_values:
        description: 'Concurrent user counts (JSON array)'
        required: true
        default: '[8, 16, 32, 64, 128, 256, 512, 1024, 2048]'
        type: string
      offload_values:
        description: 'Offload modes (JSON array: on/off/noprefix)'
        required: true
        default: '["on", "off", "noprefix"]'
        type: string
      duration:
        description: 'Benchmark duration in seconds'
        required: true
        default: '300'
        type: string
      request_rate:
        description: 'Request rate per client (Poisson, req/s). 0 = no delay.'
        required: false
        default: '0'
        type: string
      total_cpu_dram_gb:
        description: 'Total CPU DRAM for KV offload (GB)'
        required: true
        default: '100'
        type: string
      image:
        description: 'Container image'
        required: true
        default: 'vllm/vllm-openai:v0.16.0'
        type: string
      model:
        description: 'Model name'
        required: true
        default: 'nvidia/Llama-3.3-70B-Instruct-FP4'
        type: string
      script_suffix:
        description: 'Suffix for benchmark script (e.g. "_lmcache" → multiturn_fp4_b200_lmcache.sh)'
        required: false
        default: ''
        type: string
      ref:
        description: 'Git ref (branch/sha) to checkout'
        required: false
        type: string

jobs:
  # ---------------------------------------------------------------------------
  # Matrix benchmark jobs — each cell calls the multiturn template
  # ---------------------------------------------------------------------------
  sweep:
    name: sweep /
    # Reusable-workflow call: `strategy`, `secrets`, and `with` live at the
    # job level (no `steps`).
    uses: ./.github/workflows/benchmark-multiturn-tmpl.yml
    strategy:
      fail-fast: false  # let remaining cells finish even if one fails
      matrix:
        tp: ${{ fromJson(inputs.tp_values) }}
        users: ${{ fromJson(inputs.user_values) }}
        offload: ${{ fromJson(inputs.offload_values) }}
    secrets: inherit
    with:
      runner: b200
      image: ${{ inputs.image }}
      model: ${{ inputs.model }}
      exp-name: "multiturn_tp${{ matrix.tp }}_users${{ matrix.users }}_offload${{ matrix.offload }}"
      # tp/users are numbers from fromJson(); quote so the template receives
      # strings regardless of its input type declaration.
      tp: "${{ matrix.tp }}"
      users: "${{ matrix.users }}"
      offload-mode: ${{ matrix.offload }}
      duration: ${{ inputs.duration }}
      request-rate: ${{ inputs.request_rate }}
      total-cpu-dram-gb: ${{ inputs.total_cpu_dram_gb }}
      script-suffix: ${{ inputs.script_suffix }}
      ref: ${{ inputs.ref }}

  # ---------------------------------------------------------------------------
  # Collect & aggregate results
  # ---------------------------------------------------------------------------
  collect:
    name: Collect results
    runs-on: ubuntu-latest
    needs: sweep
    if: always()  # aggregate whatever succeeded, even when some cells fail
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          token: ${{ secrets.REPO_PAT }}
          fetch-depth: 1
          ref: ${{ inputs.ref || github.ref }}
      # TODO(review): pin to a commit SHA for consistency with checkout/upload-artifact.
      - uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: Install dependencies
        run: pip install pandas matplotlib numpy
      - name: Download all artifacts
        # TODO(review): pin to a commit SHA for consistency with checkout/upload-artifact.
        uses: actions/download-artifact@v4
        with:
          pattern: 'multiturn_*'
          path: results/
      - name: Run aggregation
        run: |
          python experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py results/ aggregated/
      - name: Upload aggregated results
        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
        with:
          name: multiturn_aggregated
          path: aggregated/