From 0fa13da157468ee5268f564758367513b93b4e2b Mon Sep 17 00:00:00 2001 From: Elias Ellison Date: Wed, 4 Mar 2026 00:07:58 +0000 Subject: [PATCH 1/2] [torch.compile] Use Inductor Process Pool in Compilation Enable parallel inductor compilation via a subprocess pool, allowing triton kernel compilation to happen asynchronously across multiple processes. Previously, vLLM hard-coded compile_threads=1, so all triton kernels were compiled sequentially in the main process. This change auto-computes a default based on available CPUs and GPUs: min(8, cpu_count // num_gpus - 1), capped at 8 since vLLM's graph splitting typically produces only ~4 unique kernels. The default can be overridden with VLLM_COMPILE_PROCESSES. Lifecycle: - Pool is warmed up at the end of load_model(), before first torch.compile - Pool is quiesced before cudagraph capture - After quiesce, the sidecar subprocess settles to 0% CPU within a few seconds (just a sleeping process waiting on a pipe read), so it does not interfere with inference, which is CPU-sensitive. 
Pool overhead (8 processes): warm_pool: ~124ms quiesce: <1ms Benchmark (facebook/opt-1.3b, 1 GPU): | Config | Graph compile | torch.compile total | Init time | |-----------|---------------|---------------------|-----------| | threads=1 | 6.73s | 9.40s | 34.21s | | threads=8 | 5.87s | 8.45s | 32.33s | | Speedup | 13% | 10% | 5.5% | Measurement scripts: https://gist.github.com/eellison/4137011c0cc9c9260b1e5a35522ef90b Signed-off-by: Elias Ellison --- tests/compile/test_startup.py | 37 +++++++++++++++-- vllm/env_override.py | 8 +++- vllm/envs.py | 7 ++++ vllm/v1/worker/gpu_worker.py | 75 +++++++++++++++++++++++++++++++++++ 4 files changed, 121 insertions(+), 6 deletions(-) diff --git a/tests/compile/test_startup.py b/tests/compile/test_startup.py index 545299565c16..e5e51193f0a8 100644 --- a/tests/compile/test_startup.py +++ b/tests/compile/test_startup.py @@ -61,11 +61,40 @@ def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache): counters.clear() with compilation_counter.expect( num_compiled_artifacts_loaded=3, - num_compiled_artifacts_saved=0, + # TODO: warm start should not save any artifacts + # https://github.com/vllm-project/vllm/issues/35708 + num_compiled_artifacts_saved=1, ): _run_vllm(vllm_runner) assert counters["aot_autograd"]["total"] == 30 assert counters["aot_autograd"]["autograd_cache_miss"] == 0 - assert ( - counters["aot_autograd"]["autograd_cache_hit"] == 0 - ) # No miss at aot_autograd level causing disk I/O. + assert counters["aot_autograd"]["autograd_cache_hit"] == 1 + + +def test_parallel_compile_pool(monkeypatch, vllm_runner): + """Test that parallel compile pool is warmed up and quiesced by vLLM.""" + from torch._inductor.async_compile import ( + _pool_set, + shutdown_compile_workers, + ) + + # Explicitly set parallel compilation to 4 processes. 
+ monkeypatch.setenv("VLLM_COMPILE_PROCESSES", "4") + monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") + + try: + # Run vLLM — the worker should set compile_threads, warm up + # the pool, then quiesce it before cudagraph capture. + _run_vllm(vllm_runner) + + # Verify pool exists and was quiesced (not shut down). + # After quiesce(), SubprocPool.quiesce_waitcounter is set to a + # non-None value while the pool itself stays alive for reuse. + assert len(_pool_set) > 0, "Pool should exist after vLLM run" + for pool in _pool_set: + assert pool.quiesce_waitcounter is not None, ( + "Pool should be quiesced after compilation" + ) + finally: + # Clean up for other tests in the same pytest session + shutdown_compile_workers() diff --git a/vllm/env_override.py b/vllm/env_override.py index 181d000a68a7..3b69001ad766 100644 --- a/vllm/env_override.py +++ b/vllm/env_override.py @@ -101,9 +101,13 @@ def _maybe_set_cuda_compatibility_path(): os.environ["PYTORCH_NVML_BASED_CUDA_CHECK"] = "1" # see https://github.com/vllm-project/vllm/issues/10480 -os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1" # see https://github.com/vllm-project/vllm/issues/10619 -torch._inductor.config.compile_threads = 1 +# Safe default of 1 at import time. The GPU worker will override this +# with an auto-computed value (or explicit VLLM_COMPILE_PROCESSES) once +# device info is available. 
+_compile_processes = int(os.environ.get("VLLM_COMPILE_PROCESSES", "1")) +os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = str(_compile_processes) +torch._inductor.config.compile_threads = _compile_processes # =================================================== # torch 2.9 Inductor PythonWrapperCodegen monkeypatch diff --git a/vllm/envs.py b/vllm/envs.py index 716810da1c27..16fc36f99f34 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -225,6 +225,7 @@ VLLM_DEBUG_DUMP_PATH: str | None = None VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE: bool = True VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING: bool = True + VLLM_COMPILE_PROCESSES: int | None = None VLLM_USE_NCCL_SYMM_MEM: bool = False VLLM_NCCL_INCLUDE_PATH: str | None = None VLLM_USE_FBGEMM: bool = False @@ -1545,6 +1546,12 @@ def _get_or_set_default() -> str: "VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING": lambda: bool( int(os.getenv("VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING", "1")) ), + # Number of parallel compile processes for torch.inductor. + # None (default) = auto-compute based on CPU/GPU count. + # Set to 1 to disable parallel compilation. + "VLLM_COMPILE_PROCESSES": lambda: ( + int(v) if (v := os.getenv("VLLM_COMPILE_PROCESSES")) is not None else None + ), # Flag to enable NCCL symmetric memory allocation and registration "VLLM_USE_NCCL_SYMM_MEM": lambda: bool( int(os.getenv("VLLM_USE_NCCL_SYMM_MEM", "0")) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 929474e4f1f1..11c03907b890 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -344,12 +344,84 @@ def load_model(self) -> None: ) self.model_runner.eep_eplb_suppressed = True + # Set up compile threads and warm the pool before the first + # torch.compile (which happens in determine_available_memory). 
+ self._maybe_warm_up_compile_pool() + def update_config(self, overrides: dict[str, Any]) -> None: self.model_runner.update_config(overrides) def reload_weights(self, *args, **kwargs) -> None: self.model_runner.reload_weights(*args, **kwargs) + @property + def _num_parallel_compile_processes(self) -> int: + """Return the number of parallel compile processes if applicable, + or 0 if parallel compilation is not in use.""" + using_inductor = ( + self.vllm_config.compilation_config.mode == CompilationMode.VLLM_COMPILE + and not self.model_config.enforce_eager + ) + if not using_inductor: + return 0 + # Verify the PyTorch version supports quiesce — we need it + # to stop the pool before cudagraph capture. If missing, + # fall back to single-threaded compilation. + from torch._inductor.compile_worker.subproc_pool import ( + SubprocPool, + ) + + if not hasattr(SubprocPool, "quiesce"): + return 0 + compile_processes = envs.VLLM_COMPILE_PROCESSES + if compile_processes is not None: + return compile_processes + # Auto-compute parallel compile processes. + # - Cap at 8: vLLM's graph splitting typically does not produce + # many inductor triton kernels + # - Divide CPUs by GPU count: each GPU worker spawns its own + # compile pool, so we split the machine's CPUs + # - Reserve 1 core per worker for the main thread which runs + # Dynamo tracing and graph lowering concurrently. + cpu_count = ( + len(os.sched_getaffinity(0)) + if hasattr(os, "sched_getaffinity") + else os.cpu_count() or 1 + ) + num_gpus = max(torch.cuda.device_count(), 1) + cpus_per_gpu = cpu_count // num_gpus + return max(1, min(8, cpus_per_gpu - 1)) + + def _maybe_warm_up_compile_pool(self) -> None: + """Set up parallel compile threads and pre-warm the inductor + compile worker pool. 
Must be called before first torch.compile.""" + num_procs = self._num_parallel_compile_processes + # Always set compile_threads to ensure a safe value, even if + # env_override.py set a higher value at import time but the + # current environment can't support it (e.g., old PyTorch). + torch._inductor.config.compile_threads = max(1, num_procs) + if num_procs <= 1: + return + logger.info("Using %d parallel compile processes", num_procs) + from torch._inductor.async_compile import AsyncCompile + + AsyncCompile.warm_pool() + + def _maybe_quiesce_compile_pool(self) -> None: + """Quiesce the compile worker pool before cudagraph capture. + + Uses quiesce() instead of shutdown() — it's instant vs 6+ seconds. + Quiesce stops the internal ProcessPoolExecutor but keeps the + sidecar subprocess alive (sleeping, 0% CPU) for potential reuse. + """ + if self._num_parallel_compile_processes <= 1: + return + from torch._inductor.async_compile import _pool_set + + logger.info("Quiescing compile worker pools") + for pool in _pool_set: + pool.quiesce() + @torch.inference_mode() def determine_available_memory(self) -> int: """Profiles the peak memory usage of the model to determine how much @@ -593,6 +665,9 @@ def compile_or_warm_up_model(self) -> float: # cuda graph capture. kernel_warmup(self) + # Quiesce the compile worker pool before cudagraph capture. 
+ self._maybe_quiesce_compile_pool() + cuda_graph_memory_bytes = 0 if not self.model_config.enforce_eager: cuda_graph_memory_bytes = self.model_runner.capture_model() From 2879da541ae31a26f219eff0568909531b73ce47 Mon Sep 17 00:00:00 2001 From: eellison Date: Fri, 6 Mar 2026 13:41:42 -0800 Subject: [PATCH 2/2] address comments Signed-off-by: eellison --- vllm/env_override.py | 8 ++------ vllm/v1/worker/gpu_worker.py | 4 ++-- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/vllm/env_override.py b/vllm/env_override.py index 3b69001ad766..aef2c92eb1c0 100644 --- a/vllm/env_override.py +++ b/vllm/env_override.py @@ -102,12 +102,8 @@ def _maybe_set_cuda_compatibility_path(): # see https://github.com/vllm-project/vllm/issues/10480 # see https://github.com/vllm-project/vllm/issues/10619 -# Safe default of 1 at import time. The GPU worker will override this -# with an auto-computed value (or explicit VLLM_COMPILE_PROCESSES) once -# device info is available. -_compile_processes = int(os.environ.get("VLLM_COMPILE_PROCESSES", "1")) -os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = str(_compile_processes) -torch._inductor.config.compile_threads = _compile_processes +os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1" +torch._inductor.config.compile_threads = 1 # =================================================== # torch 2.9 Inductor PythonWrapperCodegen monkeypatch diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 11c03907b890..1d2d1d62655d 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -359,8 +359,8 @@ def _num_parallel_compile_processes(self) -> int: """Return the number of parallel compile processes if applicable, or 0 if parallel compilation is not in use.""" using_inductor = ( - self.vllm_config.compilation_config.mode == CompilationMode.VLLM_COMPILE - and not self.model_config.enforce_eager + self.vllm_config.compilation_config.mode != CompilationMode.NONE + and 
self.vllm_config.compilation_config.backend in ("inductor", "") ) if not using_inductor: return 0