Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 33 additions & 4 deletions tests/compile/test_startup.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,11 +61,40 @@ def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache):
counters.clear()
with compilation_counter.expect(
num_compiled_artifacts_loaded=3,
num_compiled_artifacts_saved=0,
# TODO: warm start should not save any artifacts
# https://github.com/vllm-project/vllm/issues/35708
num_compiled_artifacts_saved=1,
Comment on lines +64 to +66
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@eellison we fixed this, please rebase

):
_run_vllm(vllm_runner)
assert counters["aot_autograd"]["total"] == 30
assert counters["aot_autograd"]["autograd_cache_miss"] == 0
assert (
counters["aot_autograd"]["autograd_cache_hit"] == 0
) # No miss at aot_autograd level causing disk I/O.
assert counters["aot_autograd"]["autograd_cache_hit"] == 1

Comment on lines +71 to +72
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same here


def test_parallel_compile_pool(monkeypatch, vllm_runner):
    """Test that parallel compile pool is warmed up and quiesced by vLLM."""
    from torch._inductor.async_compile import (
        _pool_set,
        shutdown_compile_workers,
    )

    # Force a 4-process compile pool, and keep everything in-process so
    # the pool state is observable from this test.
    monkeypatch.setenv("VLLM_COMPILE_PROCESSES", "4")
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

    try:
        # The worker should set compile_threads, warm the pool up, and
        # then quiesce (not shut down) it before cudagraph capture.
        _run_vllm(vllm_runner)

        assert len(_pool_set) > 0, "Pool should exist after vLLM run"
        # quiesce() keeps the sidecar subprocess alive for reuse, but
        # sets SubprocPool.quiesce_waitcounter to a non-None value —
        # that is the observable marker we check here.
        for worker_pool in _pool_set:
            assert worker_pool.quiesce_waitcounter is not None, (
                "Pool should be quiesced after compilation"
            )
    finally:
        # Don't leak live compile workers into other tests in this session.
        shutdown_compile_workers()
Comment on lines +99 to +100
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does the other test in this file affect this test?

2 changes: 1 addition & 1 deletion vllm/env_override.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,8 @@ def _maybe_set_cuda_compatibility_path():
os.environ["PYTORCH_NVML_BASED_CUDA_CHECK"] = "1"

# see https://github.com/vllm-project/vllm/issues/10480
os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
# see https://github.com/vllm-project/vllm/issues/10619
os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
torch._inductor.config.compile_threads = 1

# ===================================================
Expand Down
7 changes: 7 additions & 0 deletions vllm/envs.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,7 @@
VLLM_DEBUG_DUMP_PATH: str | None = None
VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE: bool = True
VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING: bool = True
VLLM_COMPILE_PROCESSES: int | None = None
VLLM_USE_NCCL_SYMM_MEM: bool = False
VLLM_NCCL_INCLUDE_PATH: str | None = None
VLLM_USE_FBGEMM: bool = False
Expand Down Expand Up @@ -1545,6 +1546,12 @@ def _get_or_set_default() -> str:
"VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING": lambda: bool(
int(os.getenv("VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING", "1"))
),
# Number of parallel compile processes for torch.inductor.
# None (default) = auto-compute based on CPU/GPU count.
# Set to 1 to disable parallel compilation.
"VLLM_COMPILE_PROCESSES": lambda: (
int(v) if (v := os.getenv("VLLM_COMPILE_PROCESSES")) is not None else None
),
# Flag to enable NCCL symmetric memory allocation and registration
"VLLM_USE_NCCL_SYMM_MEM": lambda: bool(
int(os.getenv("VLLM_USE_NCCL_SYMM_MEM", "0"))
Expand Down
75 changes: 75 additions & 0 deletions vllm/v1/worker/gpu_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,12 +344,84 @@ def load_model(self) -> None:
)
self.model_runner.eep_eplb_suppressed = True

# Set up compile threads and warm the pool before the first
# torch.compile (which happens in determine_available_memory).
self._maybe_warm_up_compile_pool()

def update_config(self, overrides: dict[str, Any]) -> None:
self.model_runner.update_config(overrides)

def reload_weights(self, *args, **kwargs) -> None:
self.model_runner.reload_weights(*args, **kwargs)

@property
def _num_parallel_compile_processes(self) -> int:
    """Return the number of parallel compile processes if applicable,
    or 0 if parallel compilation is not in use."""
    compilation_config = self.vllm_config.compilation_config
    # Parallel compilation only applies when inductor is the backend.
    if compilation_config.mode == CompilationMode.NONE:
        return 0
    if compilation_config.backend not in ("inductor", ""):
        return 0

    # quiesce() is needed to stop the pool before cudagraph capture;
    # on PyTorch builds that lack it, fall back to single-threaded
    # compilation.
    from torch._inductor.compile_worker.subproc_pool import (
        SubprocPool,
    )

    if not hasattr(SubprocPool, "quiesce"):
        return 0

    # An explicit env setting always wins.
    if (configured := envs.VLLM_COMPILE_PROCESSES) is not None:
        return configured

    # Auto-compute parallel compile processes.
    # - Cap at 8: vLLM's graph splitting typically does not produce
    #   many inductor triton kernels
    # - Divide CPUs by GPU count: each GPU worker spawns its own
    #   compile pool, so we split the machine's CPUs
    # - Reserve 1 core per worker for the main thread which runs
    #   Dynamo tracing and graph lowering concurrently.
    if hasattr(os, "sched_getaffinity"):
        available_cpus = len(os.sched_getaffinity(0))
    else:
        available_cpus = os.cpu_count() or 1
    gpu_count = max(torch.cuda.device_count(), 1)
    per_gpu_cpus = available_cpus // gpu_count
    return max(1, min(8, per_gpu_cpus - 1))

def _maybe_warm_up_compile_pool(self) -> None:
    """Set up parallel compile threads and pre-warm the inductor
    compile worker pool. Must be called before first torch.compile."""
    num_procs = self._num_parallel_compile_processes
    # Even when parallel compilation is disabled, force compile_threads
    # to a safe value: env_override.py may have set a higher value at
    # import time that the current environment can't actually support
    # (e.g., old PyTorch).
    torch._inductor.config.compile_threads = max(1, num_procs)
    if num_procs > 1:
        logger.info("Using %d parallel compile processes", num_procs)
        from torch._inductor.async_compile import AsyncCompile

        # Spawn the worker pool now so the first compile doesn't pay
        # the startup cost.
        AsyncCompile.warm_pool()

def _maybe_quiesce_compile_pool(self) -> None:
    """Quiesce the compile worker pool before cudagraph capture.

    Uses quiesce() instead of shutdown() — it's instant vs 6+ seconds.
    Quiesce stops the internal ProcessPoolExecutor but keeps the
    sidecar subprocess alive (sleeping, 0% CPU) for potential reuse.
    """
    if self._num_parallel_compile_processes > 1:
        from torch._inductor.async_compile import _pool_set

        logger.info("Quiescing compile worker pools")
        for worker_pool in _pool_set:
            worker_pool.quiesce()

@torch.inference_mode()
def determine_available_memory(self) -> int:
"""Profiles the peak memory usage of the model to determine how much
Expand Down Expand Up @@ -593,6 +665,9 @@ def compile_or_warm_up_model(self) -> float:
# cuda graph capture.
kernel_warmup(self)

# Quiesce the compile worker pool before cudagraph capture.
self._maybe_quiesce_compile_pool()

cuda_graph_memory_bytes = 0
if not self.model_config.enforce_eager:
cuda_graph_memory_bytes = self.model_runner.capture_model()
Expand Down