Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 33 additions & 4 deletions tests/compile/test_startup.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,11 +61,40 @@ def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache):
counters.clear()
with compilation_counter.expect(
num_compiled_artifacts_loaded=3,
num_compiled_artifacts_saved=0,
# TODO: warm start should not save any artifacts
# https://github.com/vllm-project/vllm/issues/35708
num_compiled_artifacts_saved=1,
Comment on lines +64 to +66
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@eellison we fixed this, please rebase

):
_run_vllm(vllm_runner)
assert counters["aot_autograd"]["total"] == 30
assert counters["aot_autograd"]["autograd_cache_miss"] == 0
assert (
counters["aot_autograd"]["autograd_cache_hit"] == 0
) # No miss at aot_autograd level causing disk I/O.
assert counters["aot_autograd"]["autograd_cache_hit"] == 1

Comment on lines +71 to +72
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same here


def test_parallel_compile_pool(monkeypatch, vllm_runner):
    """Test that parallel compile pool is warmed up and quiesced by vLLM."""
    from torch._inductor.async_compile import (
        _pool_set,
        shutdown_compile_workers,
    )

    # Force a 4-process compile pool, and keep everything in-process so
    # the pool state is observable from this test.
    monkeypatch.setenv("VLLM_COMPILE_PROCESSES", "4")
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

    try:
        # The worker should set compile_threads, warm the pool up, and
        # then quiesce (not shut down) it before cudagraph capture.
        _run_vllm(vllm_runner)

        assert len(_pool_set) > 0, "Pool should exist after vLLM run"
        # quiesce() keeps the sidecar subprocess alive for reuse, but
        # sets SubprocPool.quiesce_waitcounter to a non-None value —
        # that is the observable marker we check here.
        for worker_pool in _pool_set:
            assert worker_pool.quiesce_waitcounter is not None, (
                "Pool should be quiesced after compilation"
            )
    finally:
        # Don't leak live compile workers into other tests in this session.
        shutdown_compile_workers()
Comment on lines +99 to +100
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does the other test in this file affect this test?

2 changes: 1 addition & 1 deletion vllm/env_override.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,8 @@ def _maybe_set_cuda_compatibility_path():
os.environ["PYTORCH_NVML_BASED_CUDA_CHECK"] = "1"

# see https://github.com/vllm-project/vllm/issues/10480
os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
# see https://github.com/vllm-project/vllm/issues/10619
os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
torch._inductor.config.compile_threads = 1

# ===================================================
Expand Down
7 changes: 7 additions & 0 deletions vllm/envs.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,7 @@
VLLM_DEBUG_DUMP_PATH: str | None = None
VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE: bool = True
VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING: bool = True
VLLM_COMPILE_PROCESSES: int | None = None
VLLM_USE_NCCL_SYMM_MEM: bool = False
VLLM_NCCL_INCLUDE_PATH: str | None = None
VLLM_USE_FBGEMM: bool = False
Expand Down Expand Up @@ -1545,6 +1546,12 @@ def _get_or_set_default() -> str:
"VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING": lambda: bool(
int(os.getenv("VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING", "1"))
),
# Number of parallel compile processes for torch.inductor.
# None (default) = auto-compute based on CPU/GPU count.
# Set to 1 to disable parallel compilation.
"VLLM_COMPILE_PROCESSES": lambda: (
int(v) if (v := os.getenv("VLLM_COMPILE_PROCESSES")) is not None else None
),
# Flag to enable NCCL symmetric memory allocation and registration
"VLLM_USE_NCCL_SYMM_MEM": lambda: bool(
int(os.getenv("VLLM_USE_NCCL_SYMM_MEM", "0"))
Expand Down
75 changes: 75 additions & 0 deletions vllm/v1/worker/gpu_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,12 +344,84 @@ def load_model(self) -> None:
)
self.model_runner.eep_eplb_suppressed = True

# Set up compile threads and warm the pool before the first
# torch.compile (which happens in determine_available_memory).
self._maybe_warm_up_compile_pool()

def update_config(self, overrides: dict[str, Any]) -> None:
self.model_runner.update_config(overrides)

def reload_weights(self, *args, **kwargs) -> None:
self.model_runner.reload_weights(*args, **kwargs)

@property
def _num_parallel_compile_processes(self) -> int:
    """Return the number of parallel compile processes if applicable,
    or 0 if parallel compilation is not in use."""
    compilation_config = self.vllm_config.compilation_config
    # Parallel compilation only applies when inductor is the backend.
    if compilation_config.mode == CompilationMode.NONE:
        return 0
    if compilation_config.backend not in ("inductor", ""):
        return 0

    # quiesce() is needed to stop the pool before cudagraph capture;
    # on PyTorch builds that lack it, fall back to single-threaded
    # compilation.
    from torch._inductor.compile_worker.subproc_pool import (
        SubprocPool,
    )

    if not hasattr(SubprocPool, "quiesce"):
        return 0

    # An explicit env setting always wins.
    if (configured := envs.VLLM_COMPILE_PROCESSES) is not None:
        return configured

    # Auto-compute parallel compile processes.
    # - Cap at 8: vLLM's graph splitting typically does not produce
    #   many inductor triton kernels
    # - Divide CPUs by GPU count: each GPU worker spawns its own
    #   compile pool, so we split the machine's CPUs
    # - Reserve 1 core per worker for the main thread which runs
    #   Dynamo tracing and graph lowering concurrently.
    if hasattr(os, "sched_getaffinity"):
        available_cpus = len(os.sched_getaffinity(0))
    else:
        available_cpus = os.cpu_count() or 1
    gpu_count = max(torch.cuda.device_count(), 1)
    per_gpu_cpus = available_cpus // gpu_count
    return max(1, min(8, per_gpu_cpus - 1))

def _maybe_warm_up_compile_pool(self) -> None:
    """Set up parallel compile threads and pre-warm the inductor
    compile worker pool. Must be called before first torch.compile."""
    num_procs = self._num_parallel_compile_processes
    # Even when parallel compilation is disabled, force compile_threads
    # to a safe value: env_override.py may have set a higher value at
    # import time that the current environment can't actually support
    # (e.g., old PyTorch).
    torch._inductor.config.compile_threads = max(1, num_procs)
    if num_procs > 1:
        logger.info("Using %d parallel compile processes", num_procs)
        from torch._inductor.async_compile import AsyncCompile

        # Spawn the worker pool now so the first compile doesn't pay
        # the startup cost.
        AsyncCompile.warm_pool()

def _maybe_quiesce_compile_pool(self) -> None:
    """Quiesce the compile worker pool before cudagraph capture.

    Uses quiesce() instead of shutdown() — it's instant vs 6+ seconds.
    Quiesce stops the internal ProcessPoolExecutor but keeps the
    sidecar subprocess alive (sleeping, 0% CPU) for potential reuse.
    """
    if self._num_parallel_compile_processes > 1:
        from torch._inductor.async_compile import _pool_set

        logger.info("Quiescing compile worker pools")
        for worker_pool in _pool_set:
            worker_pool.quiesce()

@torch.inference_mode()
def determine_available_memory(self) -> int:
"""Profiles the peak memory usage of the model to determine how much
Expand Down Expand Up @@ -593,6 +665,9 @@ def compile_or_warm_up_model(self) -> float:
# cuda graph capture.
kernel_warmup(self)

# Quiesce the compile worker pool before cudagraph capture.
self._maybe_quiesce_compile_pool()

cuda_graph_memory_bytes = 0
if not self.model_config.enforce_eager:
cuda_graph_memory_bytes = self.model_runner.capture_model()
Expand Down