-
-
Notifications
You must be signed in to change notification settings - Fork 14.2k
[torch.compile] Use Inductor Process Pool in Compilation #36028
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -61,11 +61,40 @@ def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache): | |
| counters.clear() | ||
| with compilation_counter.expect( | ||
| num_compiled_artifacts_loaded=3, | ||
| num_compiled_artifacts_saved=0, | ||
| # TODO: warm start should not save any artifacts | ||
| # https://github.com/vllm-project/vllm/issues/35708 | ||
| num_compiled_artifacts_saved=1, | ||
| ): | ||
| _run_vllm(vllm_runner) | ||
| assert counters["aot_autograd"]["total"] == 30 | ||
| assert counters["aot_autograd"]["autograd_cache_miss"] == 0 | ||
| assert ( | ||
| counters["aot_autograd"]["autograd_cache_hit"] == 0 | ||
| ) # No miss at aot_autograd level causing disk I/O. | ||
| assert counters["aot_autograd"]["autograd_cache_hit"] == 1 | ||
|
|
||
|
Comment on lines
+71
to
+72
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. same here |
||
|
|
||
| def test_parallel_compile_pool(monkeypatch, vllm_runner): | ||
| """Test that parallel compile pool is warmed up and quiesced by vLLM.""" | ||
| from torch._inductor.async_compile import ( | ||
| _pool_set, | ||
| shutdown_compile_workers, | ||
| ) | ||
|
|
||
| # Explicitly set parallel compilation to 4 processes. | ||
| monkeypatch.setenv("VLLM_COMPILE_PROCESSES", "4") | ||
| monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") | ||
|
|
||
| try: | ||
| # Run vLLM — the worker should set compile_threads, warm up | ||
| # the pool, then quiesce it before cudagraph capture. | ||
| _run_vllm(vllm_runner) | ||
|
|
||
| # Verify pool exists and was quiesced (not shut down). | ||
| # After quiesce(), SubprocPool.quiesce_waitcounter is set to a | ||
| # non-None value while the pool itself stays alive for reuse. | ||
| assert len(_pool_set) > 0, "Pool should exist after vLLM run" | ||
| for pool in _pool_set: | ||
| assert pool.quiesce_waitcounter is not None, ( | ||
| "Pool should be quiesced after compilation" | ||
| ) | ||
| finally: | ||
| # Clean up for other tests in the same pytest session | ||
| shutdown_compile_workers() | ||
|
Comment on lines
+99
to
+100
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Does the other test in this file affect this test? |
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -344,12 +344,84 @@ def load_model(self) -> None: | |
| ) | ||
| self.model_runner.eep_eplb_suppressed = True | ||
|
|
||
| # Set up compile threads and warm the pool before the first | ||
| # torch.compile (which happens in determine_available_memory). | ||
| self._maybe_warm_up_compile_pool() | ||
|
|
||
| def update_config(self, overrides: dict[str, Any]) -> None: | ||
| self.model_runner.update_config(overrides) | ||
|
|
||
| def reload_weights(self, *args, **kwargs) -> None: | ||
| self.model_runner.reload_weights(*args, **kwargs) | ||
|
|
||
| @property | ||
| def _num_parallel_compile_processes(self) -> int: | ||
| """Return the number of parallel compile processes if applicable, | ||
| or 0 if parallel compilation is not in use.""" | ||
| using_inductor = ( | ||
| self.vllm_config.compilation_config.mode != CompilationMode.NONE | ||
| and self.vllm_config.compilation_config.backend in ("inductor", "") | ||
| ) | ||
eellison marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| if not using_inductor: | ||
| return 0 | ||
| # Verify the PyTorch version supports quiesce — we need it | ||
| # to stop the pool before cudagraph capture. If missing, | ||
| # fall back to single-threaded compilation. | ||
| from torch._inductor.compile_worker.subproc_pool import ( | ||
| SubprocPool, | ||
| ) | ||
|
|
||
| if not hasattr(SubprocPool, "quiesce"): | ||
| return 0 | ||
|
Comment on lines
+367
to
+375
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why don't some PyTorch versions have this? If the answer is < 2.9, then we can just remove the extra check. |
||
| compile_processes = envs.VLLM_COMPILE_PROCESSES | ||
| if compile_processes is not None: | ||
| return compile_processes | ||
| # Auto-compute parallel compile processes. | ||
| # - Cap at 8: vLLM's graph splitting typically does not produce | ||
| # many inductor triton kernels | ||
| # - Divide CPUs by GPU count: each GPU worker spawns its own | ||
| # compile pool, so we split the machine's CPUs | ||
| # - Reserve 1 core per worker for the main thread which runs | ||
| # Dynamo tracing and graph lowering concurrently. | ||
| cpu_count = ( | ||
| len(os.sched_getaffinity(0)) | ||
| if hasattr(os, "sched_getaffinity") | ||
| else os.cpu_count() or 1 | ||
| ) | ||
| num_gpus = max(torch.cuda.device_count(), 1) | ||
| cpus_per_gpu = cpu_count // num_gpus | ||
| return max(1, min(8, cpus_per_gpu - 1)) | ||
|
|
||
| def _maybe_warm_up_compile_pool(self) -> None: | ||
| """Set up parallel compile threads and pre-warm the inductor | ||
| compile worker pool. Must be called before first torch.compile.""" | ||
| num_procs = self._num_parallel_compile_processes | ||
| # Always set compile_threads to ensure a safe value, even if | ||
| # env_override.py set a higher value at import time but the | ||
| # current environment can't support it (e.g., old PyTorch). | ||
| torch._inductor.config.compile_threads = max(1, num_procs) | ||
| if num_procs <= 1: | ||
| return | ||
| logger.info("Using %d parallel compile processes", num_procs) | ||
| from torch._inductor.async_compile import AsyncCompile | ||
|
|
||
| AsyncCompile.warm_pool() | ||
eellison marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| def _maybe_quiesce_compile_pool(self) -> None: | ||
| """Quiesce the compile worker pool before cudagraph capture. | ||
|
|
||
| Uses quiesce() instead of shutdown() — it's instant vs 6+ seconds. | ||
| Quiesce stops the internal ProcessPoolExecutor but keeps the | ||
| sidecar subprocess alive (sleeping, 0% CPU) for potential reuse. | ||
| """ | ||
| if self._num_parallel_compile_processes <= 1: | ||
| return | ||
| from torch._inductor.async_compile import _pool_set | ||
|
|
||
| logger.info("Quiescing compile worker pools") | ||
| for pool in _pool_set: | ||
| pool.quiesce() | ||
|
|
||
| @torch.inference_mode() | ||
| def determine_available_memory(self) -> int: | ||
| """Profiles the peak memory usage of the model to determine how much | ||
|
|
@@ -593,6 +665,9 @@ def compile_or_warm_up_model(self) -> float: | |
| # cuda graph capture. | ||
| kernel_warmup(self) | ||
|
|
||
| # Quiesce the compile worker pool before cudagraph capture. | ||
| self._maybe_quiesce_compile_pool() | ||
|
|
||
| cuda_graph_memory_bytes = 0 | ||
| if not self.model_config.enforce_eager: | ||
| cuda_graph_memory_bytes = self.model_runner.capture_model() | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@eellison we fixed this, please rebase