From 65d265fe4ae6554c1072b02ecba91ad72ad31bb2 Mon Sep 17 00:00:00 2001
From: Roger Wang <hey@rogerw.io>
Date: Mon, 9 Feb 2026 18:02:49 +0000
Subject: [PATCH 1/4] fix

Signed-off-by: Roger Wang <hey@rogerw.io>
---
 test_gc_behavior.py                   | 211 ++++++++++++++++++++++++++
 test_mem_leak.py                      | 115 ++++++++++++++
 test_mem_leak_mm.py                   | 120 +++++++++++++++
 test_request_gc.py                    | 181 ++++++++++++++++++++++
 tests/v1/core/test_async_scheduler.py |   2 +-
 vllm/v1/core/sched/scheduler.py       |   4 +-
 vllm/v1/request.py                    |  15 +-
 7 files changed, 638 insertions(+), 10 deletions(-)
 create mode 100644 test_gc_behavior.py
 create mode 100644 test_mem_leak.py
 create mode 100644 test_mem_leak_mm.py
 create mode 100644 test_request_gc.py

diff --git a/test_gc_behavior.py b/test_gc_behavior.py
new file mode 100644
index 000000000000..0b4a47631ee2
--- /dev/null
+++ b/test_gc_behavior.py
@@ -0,0 +1,211 @@
+"""
+Test to understand GC behavior with Request objects and gc.freeze().
+Measures how often gen0/gen1/gen2 collections happen and whether
+cyclic garbage from Request objects is collected.
+"""
+import gc
+import os
+import sys
+import weakref
+
+os.environ["CUDA_VISIBLE_DEVICES"] = ""
+
+from functools import partial
+
+from vllm.utils.hashing import sha256
+from vllm.v1.core.kv_cache_utils import (
+    get_request_block_hasher,
+    init_none_hash,
+)
+from vllm.pooling_params import PoolingParams
+from vllm.v1.request import Request
+
+
+def test_gc_collection_frequency():
+    """Check how often GC runs and whether it collects Request cycles."""
+
+    init_none_hash(sha256)
+    block_size = 16
+    block_hasher = get_request_block_hasher(block_size, sha256)
+    pooling_params = PoolingParams()
+
+    # Simulate gc.freeze() like EngineCore does
+    gc.collect(0)
+    gc.collect(1)
+    gc.collect(2)
+    gc.freeze()
+
+    print(f"GC thresholds: {gc.get_threshold()}")
+    print(f"After freeze - GC counts: {gc.get_count()}, frozen: {gc.get_freeze_count()}")
+
+    weak_refs = []
+    collected_counts = []
+
+    # Track GC events
+    gc_events = []
+    original_callbacks = gc.callbacks[:]
+
+    def gc_callback(phase, info):
+        if phase == "stop":
+            gc_events.append({
+                "generation": info["generation"],
+                "collected": info["collected"],
+                "uncollectable": info["uncollectable"],
+            })
+
+    gc.callbacks.append(gc_callback)
+
+    print(f"\nCreating 1000 Request objects with prefix caching (simulating serving)...")
+
+    for i in range(1000):
+        prompt_tokens = list(range(i * 100, i * 100 + 200))
+        req = Request(
+            request_id=f"req-{i}",
+            prompt_token_ids=prompt_tokens,
+            sampling_params=None,
+            pooling_params=pooling_params,
+            eos_token_id=None,
+            block_hasher=block_hasher,
+        )
+        weak_refs.append(weakref.ref(req))
+
+        # Simulate freeing the request (like scheduler does)
+        del req
+
+        if (i + 1) % 100 == 0:
+            alive = sum(1 for ref in weak_refs if ref() is not None)
+            gen0, gen1, gen2 = gc.get_count()
+            gen0_events = sum(1 for e in gc_events if e["generation"] == 0)
+            gen1_events = sum(1 for e in gc_events if e["generation"] == 1)
+            gen2_events = sum(1 for e in gc_events if e["generation"] == 2)
+            total_collected = sum(e["collected"] for e in gc_events)
+            print(f"  After {i+1} requests: alive={alive}, "
+                  f"gc_counts=({gen0},{gen1},{gen2}), "
+                  f"collections: gen0={gen0_events}, gen1={gen1_events}, gen2={gen2_events}, "
+                  f"total_collected={total_collected}")
+
+    alive_final = sum(1 for ref in weak_refs if ref() is not None)
+    print(f"\nFinal alive count: {alive_final}")
+    print(f"Total GC events: {len(gc_events)}")
+    for gen in range(3):
+        events = [e for e in gc_events if e["generation"] == gen]
+        total = sum(e["collected"] for e in events)
+        print(f"  Gen {gen}: {len(events)} collections, {total} objects collected")
+
+    gc.callbacks.remove(gc_callback)
+    gc.unfreeze()
+    gc.collect()
+
+
+def test_gc_with_fewer_tracked_objects():
+    """Simulate v0.11.1's optimization that creates fewer GC-tracked objects."""
+
+    init_none_hash(sha256)
+    block_size = 16
+    block_hasher = get_request_block_hasher(block_size, sha256)
+    pooling_params = PoolingParams()
+
+    gc.collect()
+    gc.freeze()
+
+    print(f"\n{'='*60}")
+    print("Test: Fewer GC-tracked objects (v0.11.1 behavior)")
+    print(f"{'='*60}")
+
+    weak_refs = []
+    gc_events = []
+
+    def gc_callback(phase, info):
+        if phase == "stop":
+            gc_events.append({
+                "generation": info["generation"],
+                "collected": info["collected"],
+            })
+
+    gc.callbacks.append(gc_callback)
+
+    for i in range(1000):
+        prompt_tokens = list(range(i * 100, i * 100 + 200))
+        req = Request(
+            request_id=f"req-{i}",
+            prompt_token_ids=prompt_tokens,
+            sampling_params=None,
+            pooling_params=pooling_params,
+            eos_token_id=None,
+            block_hasher=block_hasher,
+        )
+        weak_refs.append(weakref.ref(req))
+        del req
+
+        # Simulate v0.11.1's optimization: use tuples instead of lists
+        # This creates fewer GC-tracked objects per iteration
+        _ = ()  # empty tuple (not tracked by GC)
+
+    alive = sum(1 for ref in weak_refs if ref() is not None)
+    gen0_events = sum(1 for e in gc_events if e["generation"] == 0)
+    gen2_events = sum(1 for e in gc_events if e["generation"] == 2)
+    print(f"After 1000 requests: alive={alive}")
+    print(f"GC collections: gen0={gen0_events}, gen2={gen2_events}")
+
+    gc.callbacks.remove(gc_callback)
+    gc.unfreeze()
+    gc.collect()
+
+
+def test_gc_with_large_data():
+    """Test with larger data (simulating mm_features) to see memory impact."""
+    import resource
+
+    init_none_hash(sha256)
+    block_size = 16
+    block_hasher = get_request_block_hasher(block_size, sha256)
+    pooling_params = PoolingParams()
+
+    gc.collect()
+    gc.freeze()
+
+    print(f"\n{'='*60}")
+    print("Test: Request objects with large auxiliary data")
+    print(f"{'='*60}")
+
+    rss_before = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024
+
+    weak_refs = []
+
+    for batch in range(10):
+        for i in range(100):
+            prompt_tokens = list(range(200))
+            req = Request(
+                request_id=f"req-{batch}-{i}",
+                prompt_token_ids=prompt_tokens,
+                sampling_params=None,
+                pooling_params=pooling_params,
+                eos_token_id=None,
+                block_hasher=block_hasher,
+            )
+            # Simulate large mm_features data
+            req._large_data = bytearray(1024 * 1024)  # 1MB per request
+
+            weak_refs.append(weakref.ref(req))
+            del req
+
+        alive = sum(1 for ref in weak_refs if ref() is not None)
+        rss_now = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024
+        gen0, gen1, gen2 = gc.get_count()
+        print(f"  Batch {batch+1}: alive={alive}, "
+              f"RSS={rss_now:.1f}MB (delta={rss_now-rss_before:+.1f}MB), "
+              f"gc_counts=({gen0},{gen1},{gen2})")
+
+    gc.unfreeze()
+    gc.collect()
+
+    alive_after = sum(1 for ref in weak_refs if ref() is not None)
+    rss_after = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024
+    print(f"\nAfter gc.collect(): alive={alive_after}, "
+          f"RSS={rss_after:.1f}MB (delta={rss_after-rss_before:+.1f}MB)")
+
+
+if __name__ == "__main__":
+    test_gc_collection_frequency()
+    test_gc_with_fewer_tracked_objects()
+    test_gc_with_large_data()
diff --git a/test_mem_leak.py b/test_mem_leak.py
new file mode 100644
index 000000000000..aa8cbe69242e
--- /dev/null
+++ b/test_mem_leak.py
@@ -0,0 +1,115 @@
+"""
+Minimal reproduction script for CPU memory leak with prefix caching.
+Tests whether CPU RSS memory grows unboundedly during inference with
+prefix caching enabled vs disabled.
+"""
+import gc
+import os
+import resource
+import tracemalloc
+
+# Use a single GPU
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+
+from vllm import LLM, SamplingParams
+
+
+def get_rss_mb():
+    """Get current RSS memory in MB."""
+    return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024
+
+
+def get_tracemalloc_mb():
+    """Get current traced memory in MB."""
+    current, peak = tracemalloc.get_traced_memory()
+    return current / (1024 * 1024), peak / (1024 * 1024)
+
+
+def run_test(enable_prefix_caching: bool, num_rounds: int = 5,
+             prompts_per_round: int = 200):
+    """Run inference test and monitor memory."""
+    print(f"\n{'='*60}")
+    print(f"Testing with prefix_caching={'enabled' if enable_prefix_caching else 'disabled'}")
+    print(f"{'='*60}")
+
+    tracemalloc.start()
+
+    # Use a small model
+    llm = LLM(
+        model="Qwen/Qwen2.5-VL-3B-Instruct",
+        max_model_len=4096,
+        gpu_memory_utilization=0.5,
+        enable_prefix_caching=enable_prefix_caching,
+        limit_mm_per_prompt={"video": 0},
+        enforce_eager=True,
+    )
+
+    sampling_params = SamplingParams(temperature=0.8, max_tokens=64)
+
+    rss_before = get_rss_mb()
+    traced_before = tracemalloc.get_traced_memory()[0] / (1024 * 1024)
+    print(f"Initial RSS: {rss_before:.1f} MB, Traced: {traced_before:.1f} MB")
+
+    for round_idx in range(num_rounds):
+        # Create prompts that share common prefixes (to exercise prefix caching)
+        # Use varying suffixes to create different block hashes
+        common_prefix = "You are a helpful assistant. " * 50
+        prompts = [
+            f"{common_prefix} Question {round_idx * prompts_per_round + i}: "
+            f"What is {i * 7 + round_idx}? Answer briefly."
+            for i in range(prompts_per_round)
+        ]
+
+        outputs = llm.generate(prompts, sampling_params)
+
+        # Force garbage collection
+        del outputs
+        gc.collect()
+
+        rss_after = get_rss_mb()
+        traced_current, traced_peak = get_tracemalloc_mb()
+        print(
+            f"Round {round_idx + 1}: RSS={rss_after:.1f} MB "
+            f"(delta={rss_after - rss_before:+.1f} MB), "
+            f"Traced={traced_current:.1f} MB (peak={traced_peak:.1f} MB)"
+        )
+
+    # Take a snapshot of the top memory consumers
+    snapshot = tracemalloc.take_snapshot()
+    top_stats = snapshot.statistics('lineno')
+    print(f"\nTop 15 memory consumers:")
+    for stat in top_stats[:15]:
+        print(f"  {stat}")
+
+    # Also check by filename
+    top_stats_file = snapshot.statistics('filename')
+    print(f"\nTop 10 memory consumers by file:")
+    for stat in top_stats_file[:10]:
+        print(f"  {stat}")
+
+    tracemalloc.stop()
+
+    final_rss = get_rss_mb()
+    print(f"\nFinal RSS: {final_rss:.1f} MB (total delta: {final_rss - rss_before:+.1f} MB)")
+
+    del llm
+    gc.collect()
+
+    return final_rss - rss_before
+
+
+if __name__ == "__main__":
+    # Test with prefix caching enabled
+    delta_with_caching = run_test(enable_prefix_caching=True)
+
+    gc.collect()
+
+    # Test with prefix caching disabled
+    delta_without_caching = run_test(enable_prefix_caching=False)
+
+    print(f"\n{'='*60}")
+    print(f"SUMMARY")
+    print(f"{'='*60}")
+    print(f"RSS growth with prefix caching:    {delta_with_caching:+.1f} MB")
+    print(f"RSS growth without prefix caching: {delta_without_caching:+.1f} MB")
+    print(f"Difference: {delta_with_caching - delta_without_caching:+.1f} MB")
diff --git a/test_mem_leak_mm.py b/test_mem_leak_mm.py
new file mode 100644
index 000000000000..b8ee944ed693
--- /dev/null
+++ b/test_mem_leak_mm.py
@@ -0,0 +1,120 @@
+"""
+Reproduction script for CPU memory leak with prefix caching + multimodal inputs.
+Tests whether CPU RSS memory grows unboundedly during VLM inference.
+"""
+import gc
+import os
+import resource
+import tracemalloc
+
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+
+from vllm import LLM, SamplingParams
+
+
+def get_rss_mb():
+    """Get current RSS memory in MB."""
+    return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024
+
+
+def run_test(enable_prefix_caching: bool, num_rounds: int = 5,
+             prompts_per_round: int = 100):
+    """Run inference test with multimodal inputs and monitor memory."""
+    print(f"\n{'='*60}")
+    print(f"Testing with prefix_caching={'enabled' if enable_prefix_caching else 'disabled'}")
+    print(f"{'='*60}")
+
+    tracemalloc.start()
+
+    llm = LLM(
+        model="Qwen/Qwen2.5-VL-3B-Instruct",
+        max_model_len=4096,
+        gpu_memory_utilization=0.5,
+        enable_prefix_caching=enable_prefix_caching,
+        limit_mm_per_prompt={"image": 2, "video": 0},
+        enforce_eager=True,
+    )
+
+    sampling_params = SamplingParams(temperature=0.8, max_tokens=32)
+
+    rss_before = get_rss_mb()
+    traced_before = tracemalloc.get_traced_memory()[0] / (1024 * 1024)
+    print(f"Initial RSS: {rss_before:.1f} MB, Traced: {traced_before:.1f} MB")
+
+    # Use different image URLs to simulate variety in multimodal inputs
+    image_urls = [
+        "https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/PNG_transparency_demonstration_1.png/300px-PNG_transparency_demonstration_1.png",
+        "https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Cat03.jpg/1200px-Cat03.jpg",
+        "https://upload.wikimedia.org/wikipedia/commons/thumb/4/4d/Cat_November_2010-1a.jpg/1200px-Cat_November_2010-1a.jpg",
+        "https://upload.wikimedia.org/wikipedia/commons/thumb/b/b6/Image_created_with_a_mobile_phone.png/1200px-Image_created_with_a_mobile_phone.png",
+    ]
+
+    for round_idx in range(num_rounds):
+        # Create multimodal prompts with images
+        prompts = []
+        for i in range(prompts_per_round):
+            img_url = image_urls[i % len(image_urls)]
+            prompt = {
+                "prompt": f"<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe this image briefly. Question {round_idx * prompts_per_round + i}.<|im_end|>\n<|im_start|>assistant\n",
+                "multi_modal_data": {
+                    "image": img_url,
+                },
+            }
+            prompts.append(prompt)
+
+        outputs = llm.generate(prompts, sampling_params)
+
+        del outputs
+        gc.collect()
+
+        rss_after = get_rss_mb()
+        traced_current, traced_peak = get_tracemalloc_mb()
+        print(
+            f"Round {round_idx + 1}: RSS={rss_after:.1f} MB "
+            f"(delta={rss_after - rss_before:+.1f} MB), "
+            f"Traced={traced_current:.1f} MB (peak={traced_peak:.1f} MB)"
+        )
+
+    # Take a snapshot of the top memory consumers
+    snapshot = tracemalloc.take_snapshot()
+    top_stats = snapshot.statistics('lineno')
+    print(f"\nTop 20 memory consumers:")
+    for stat in top_stats[:20]:
+        print(f"  {stat}")
+
+    top_stats_file = snapshot.statistics('filename')
+    print(f"\nTop 10 memory consumers by file:")
+    for stat in top_stats_file[:10]:
+        print(f"  {stat}")
+
+    tracemalloc.stop()
+
+    final_rss = get_rss_mb()
+    print(f"\nFinal RSS: {final_rss:.1f} MB (total delta: {final_rss - rss_before:+.1f} MB)")
+
+    del llm
+    gc.collect()
+
+    return final_rss - rss_before
+
+
+def get_tracemalloc_mb():
+    current, peak = tracemalloc.get_traced_memory()
+    return current / (1024 * 1024), peak / (1024 * 1024)
+
+
+if __name__ == "__main__":
+    # Test with prefix caching enabled
+    delta_with_caching = run_test(enable_prefix_caching=True)
+
+    gc.collect()
+
+    # Test with prefix caching disabled
+    delta_without_caching = run_test(enable_prefix_caching=False)
+
+    print(f"\n{'='*60}")
+    print(f"SUMMARY")
+    print(f"{'='*60}")
+    print(f"RSS growth with prefix caching:    {delta_with_caching:+.1f} MB")
+    print(f"RSS growth without prefix caching: {delta_without_caching:+.1f} MB")
+    print(f"Difference: {delta_with_caching - delta_without_caching:+.1f} MB")
diff --git a/test_request_gc.py b/test_request_gc.py
new file mode 100644
index 000000000000..b8959302f469
--- /dev/null
+++ b/test_request_gc.py
@@ -0,0 +1,181 @@
+"""
+Test to verify that the fix for the Request reference cycle works.
+Request objects should be freed immediately by reference counting
+when prefix caching is enabled (no cyclic GC needed).
+"""
+import gc
+import os
+import sys
+import weakref
+
+os.environ["CUDA_VISIBLE_DEVICES"] = ""
+
+from vllm.utils.hashing import sha256
+from vllm.v1.core.kv_cache_utils import (
+    get_request_block_hasher,
+    init_none_hash,
+)
+from vllm.pooling_params import PoolingParams
+from vllm.v1.request import Request
+
+
+def test_request_gc_with_prefix_caching():
+    """Test that Request objects are properly freed WITHOUT needing cyclic GC."""
+
+    init_none_hash(sha256)
+    block_size = 16
+    block_hasher = get_request_block_hasher(block_size, sha256)
+
+    weak_refs = []
+
+    # Disable automatic GC to test reference counting alone
+    gc.disable()
+
+    pooling_params = PoolingParams()
+    for i in range(100):
+        prompt_tokens = list(range(i * 100, i * 100 + 200))
+        req = Request(
+            request_id=f"req-{i}",
+            prompt_token_ids=prompt_tokens,
+            sampling_params=None,
+            pooling_params=pooling_params,
+            eos_token_id=None,
+            block_hasher=block_hasher,
+        )
+        weak_refs.append(weakref.ref(req))
+        del req
+
+    alive_before = sum(1 for ref in weak_refs if ref() is not None)
+    print(f"With prefix caching:")
+    print(f"  Alive Request objects before gc.collect(): {alive_before}")
+
+    collected = gc.collect()
+    print(f"  GC collected {collected} objects")
+
+    alive_after = sum(1 for ref in weak_refs if ref() is not None)
+    print(f"  Alive Request objects after gc.collect(): {alive_after}")
+
+    gc.enable()
+
+    if alive_before > 0:
+        print(f"\n  FAIL: {alive_before} Request objects were NOT freed "
+              "without explicit gc.collect()!")
+        print("  Reference cycle still exists.")
+        return False
+    else:
+        print(f"\n  PASS: All Request objects freed immediately by "
+              "reference counting (no cycle).")
+        return True
+
+
+def test_request_gc_without_prefix_caching():
+    """Test that Request objects without prefix caching don't have cycles."""
+
+    weak_refs = []
+    gc.disable()
+
+    pooling_params = PoolingParams()
+    for i in range(100):
+        prompt_tokens = list(range(i * 100, i * 100 + 200))
+        req = Request(
+            request_id=f"req-{i}",
+            prompt_token_ids=prompt_tokens,
+            sampling_params=None,
+            pooling_params=pooling_params,
+            eos_token_id=None,
+            block_hasher=None,
+        )
+        weak_refs.append(weakref.ref(req))
+        del req
+
+    alive_before = sum(1 for ref in weak_refs if ref() is not None)
+    print(f"\nWithout prefix caching:")
+    print(f"  Alive Request objects before gc.collect(): {alive_before}")
+
+    collected = gc.collect()
+    alive_after = sum(1 for ref in weak_refs if ref() is not None)
+    print(f"  GC collected {collected} objects")
+    print(f"  Alive Request objects after gc.collect(): {alive_after}")
+
+    gc.enable()
+
+    if alive_before == 0:
+        print(f"\n  PASS: All Request objects freed immediately.")
+        return True
+    else:
+        print(f"\n  FAIL: {alive_before} Request objects leaked.")
+        return False
+
+
+def test_memory_growth_with_freeze():
+    """Test that there's no memory growth even with gc.freeze()."""
+    import resource
+
+    init_none_hash(sha256)
+    block_size = 16
+    block_hasher = get_request_block_hasher(block_size, sha256)
+
+    gc.collect()
+    gc.freeze()
+
+    print(f"\n{'='*60}")
+    print("Test: Memory stability with gc.freeze() + prefix caching")
+    print(f"{'='*60}")
+
+    rss_before = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024
+    weak_refs = []
+
+    pooling_params = PoolingParams()
+    for round_idx in range(10):
+        for i in range(500):
+            prompt_tokens = list(range(500))
+            req = Request(
+                request_id=f"req-{round_idx}-{i}",
+                prompt_token_ids=prompt_tokens,
+                sampling_params=None,
+                pooling_params=pooling_params,
+                eos_token_id=None,
+                block_hasher=block_hasher,
+            )
+            weak_refs.append(weakref.ref(req))
+            del req
+
+        alive = sum(1 for ref in weak_refs if ref() is not None)
+        rss_after = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024
+        print(f"  Round {round_idx + 1}: alive={alive}, "
+              f"RSS={rss_after:.1f} MB (delta={rss_after - rss_before:+.1f} MB)")
+
+    gc.unfreeze()
+    gc.collect()
+
+    alive_final = sum(1 for ref in weak_refs if ref() is not None)
+    print(f"\n  Final alive: {alive_final}")
+    if alive_final == 0:
+        print("  PASS: All objects freed.")
+        return True
+    else:
+        print(f"  FAIL: {alive_final} objects still alive.")
+        return False
+
+
+if __name__ == "__main__":
+    results = []
+    results.append(("prefix caching GC", test_request_gc_with_prefix_caching()))
+    results.append(("no prefix caching GC", test_request_gc_without_prefix_caching()))
+    results.append(("memory growth with freeze", test_memory_growth_with_freeze()))
+
+    print(f"\n{'='*60}")
+    print("SUMMARY")
+    print(f"{'='*60}")
+    all_pass = True
+    for name, passed in results:
+        status = "PASS" if passed else "FAIL"
+        print(f"  {status}: {name}")
+        if not passed:
+            all_pass = False
+
+    if all_pass:
+        print("\nAll tests passed! The reference cycle fix is working.")
+    else:
+        print("\nSome tests failed!")
+    sys.exit(0 if all_pass else 1)
diff --git a/tests/v1/core/test_async_scheduler.py b/tests/v1/core/test_async_scheduler.py
index e0645ed43015..8c09ec7b2b6a 100644
--- a/tests/v1/core/test_async_scheduler.py
+++ b/tests/v1/core/test_async_scheduler.py
@@ -236,7 +236,7 @@ def test_prefix_caching_for_multi_turn():
         req._all_token_ids = req.prompt_token_ids.copy()
         req.all_token_ids = ConstantList(req._all_token_ids)
         req.block_hashes = []
-        req.block_hashes = req.get_hash_new_full_blocks()
+        req.block_hashes = req._block_hasher(req)
 
     # Schedule the next-turn requests.
     for req in next_turn_requests:
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
index 90ca584410df..b807d051ccfa 100644
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -984,8 +984,8 @@ def _update_request_as_session(
         session.prompt_token_ids.extend(update.prompt_token_ids or ())
         # Update block hashes for the new tokens
         # (mirrors Request.append_output_token_ids)
-        if session.get_hash_new_full_blocks is not None:
-            session.block_hashes.extend(session.get_hash_new_full_blocks())
+        if session._block_hasher is not None:
+            session.block_hashes.extend(session._block_hasher(session))
         session.num_prompt_tokens = len(session.prompt_token_ids)
         session.arrival_time = update.arrival_time
         session.sampling_params = update.sampling_params
diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index 3b829875f390..09f42edfed3f 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -6,7 +6,6 @@
 from collections import deque
 from collections.abc import Callable, Mapping
 from dataclasses import dataclass
-from functools import partial
 from typing import TYPE_CHECKING, Any
 
 import torch
@@ -164,10 +163,12 @@ def __init__(
         self.num_external_computed_tokens = 0
 
         self.block_hashes: list[BlockHash] = []
-        self.get_hash_new_full_blocks: Callable[[], list[BlockHash]] | None = None
-        if block_hasher is not None:
-            self.get_hash_new_full_blocks = partial(block_hasher, self)
-            self.block_hashes = self.get_hash_new_full_blocks()
+        # Store the block hasher without binding self to avoid creating a
+        # reference cycle (Request -> partial -> Request) that prevents
+        # immediate garbage collection via reference counting.
+        self._block_hasher: Callable[["Request"], list[BlockHash]] | None = block_hasher
+        if self._block_hasher is not None:
+            self.block_hashes = self._block_hasher(self)
 
         self.skip_reading_prefix_cache = self.get_skip_reading_prefix_cache()
 
@@ -212,8 +213,8 @@ def append_output_token_ids(
             self._output_token_ids.extend(token_ids)
             self._all_token_ids.extend(token_ids)
 
-        if self.get_hash_new_full_blocks is not None:
-            self.block_hashes.extend(self.get_hash_new_full_blocks())
+        if self._block_hasher is not None:
+            self.block_hashes.extend(self._block_hasher(self))
 
     @property
     def use_structured_output(self) -> bool:

From bf230610ca1bb81f2c9bff863c961a004a708b99 Mon Sep 17 00:00:00 2001
From: Roger Wang <hey@rogerw.io>
Date: Mon, 9 Feb 2026 20:09:52 +0000
Subject: [PATCH 2/4] add

Signed-off-by: Roger Wang <hey@rogerw.io>
---
 test_mm_memory_leak.py | 309 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 309 insertions(+)
 create mode 100644 test_mm_memory_leak.py

diff --git a/test_mm_memory_leak.py b/test_mm_memory_leak.py
new file mode 100644
index 000000000000..d866359cffcf
--- /dev/null
+++ b/test_mm_memory_leak.py
@@ -0,0 +1,309 @@
+#!/usr/bin/env python3
+"""
+Reproduction script for CPU memory leak with multimodal models (issue #28726).
+
+Uses the EXACT reproduction from the issue report:
+  - vllm serve Qwen2.5-VL-3B-Instruct --max-model-len 25000
+  - vllm bench serve with lmarena-ai/VisionArena-Chat dataset (1000 prompts)
+
+Measures the ENGINE CORE subprocess RSS specifically (the EngineCore runs in
+a child process in V1 architecture, and that's where Request objects live).
+
+The leak is caused by a reference cycle in Request objects:
+    Request -> partial(block_hasher, self) -> Request
+
+Usage:
+    python test_mm_memory_leak.py          # run on current branch
+    python test_mm_memory_leak.py --rounds 3  # fewer rounds
+
+Requirements:
+    - GPU with enough VRAM for Qwen2.5-VL-3B-Instruct (~8GB)
+    - Model cached at ~/.cache/huggingface/hub/
+    - Internet access (first run downloads VisionArena-Chat dataset)
+"""
+
+import argparse
+import json
+import os
+import re
+import subprocess
+import sys
+import tempfile
+import time
+import urllib.error
+import urllib.request
+from pathlib import Path
+
+os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0")
+
+MODEL = "Qwen/Qwen2.5-VL-3B-Instruct"
+PORT = 29345
+HOST = f"http://localhost:{PORT}"
+
+
+def get_process_tree_rss(root_pid: int) -> dict[int, float]:
+    """Get RSS for every process in the tree. Returns {pid: rss_gb}."""
+    result = {}
+    pids = _get_all_descendant_pids(root_pid)
+    pids.add(root_pid)
+    for pid in pids:
+        try:
+            with open(f"/proc/{pid}/status") as f:
+                rss_kb = 0
+                cmdline = ""
+                for line in f:
+                    if line.startswith("VmRSS:"):
+                        rss_kb = int(line.split()[1])
+                        break
+            # Also get command name
+            try:
+                cmdline = Path(f"/proc/{pid}/cmdline").read_text().replace("\0", " ")[:100]
+            except Exception:
+                cmdline = "?"
+            result[pid] = (rss_kb / (1024 * 1024), cmdline)
+        except (FileNotFoundError, ProcessLookupError, PermissionError):
+            pass
+    return result
+
+
+def get_tree_total_rss_gb(root_pid: int) -> float:
+    tree = get_process_tree_rss(root_pid)
+    return sum(rss for rss, _ in tree.values())
+
+
+def get_engine_core_rss_gb(root_pid: int) -> float:
+    """Find and return RSS of the EngineCore subprocess."""
+    tree = get_process_tree_rss(root_pid)
+    for pid, (rss_gb, cmdline) in tree.items():
+        if pid != root_pid and "python" in cmdline.lower() and rss_gb > 0.1:
+            # The engine core subprocess is the largest Python child process
+            return rss_gb
+    # Fallback: return total tree RSS
+    return sum(rss for rss, _ in tree.values())
+
+
+def _get_all_descendant_pids(pid: int) -> set[int]:
+    children = set()
+    try:
+        result = subprocess.run(
+            ["pgrep", "-P", str(pid)], capture_output=True, text=True,
+        )
+        for line in result.stdout.strip().splitlines():
+            if line.strip():
+                child = int(line.strip())
+                children.add(child)
+                children.update(_get_all_descendant_pids(child))
+    except Exception:
+        pass
+    return children
+
+
+def wait_for_server(timeout: int = 300) -> bool:
+    start = time.time()
+    while time.time() - start < timeout:
+        try:
+            req = urllib.request.Request(f"{HOST}/health")
+            with urllib.request.urlopen(req, timeout=5) as resp:
+                if resp.status == 200:
+                    return True
+        except (urllib.error.URLError, ConnectionRefusedError, OSError):
+            pass
+        time.sleep(3)
+    return False
+
+
+def run_bench(num_prompts: int) -> tuple[float, bool]:
+    """Run vllm bench serve with VisionArena-Chat. Returns (elapsed, success)."""
+    cmd = [
+        sys.executable, "-c",
+        "import sys; "
+        f"sys.argv = ['vllm', 'bench', 'serve', "
+        f"'--backend', 'openai-chat', "
+        f"'--model', '{MODEL}', "
+        f"'--endpoint', '/v1/chat/completions', "
+        f"'--dataset-name', 'hf', "
+        f"'--dataset-path', 'lmarena-ai/VisionArena-Chat', "
+        f"'--hf-split', 'train', "
+        f"'--num-prompts', '{num_prompts}', "
+        f"'--port', '{PORT}']; "
+        "from vllm.entrypoints.cli.main import main; main()",
+    ]
+    t0 = time.time()
+    result = subprocess.run(cmd, capture_output=True, text=True, timeout=1200)
+    elapsed = time.time() - t0
+    success = result.returncode == 0
+    if not success:
+        err = (result.stderr or "")[-500:] + (result.stdout or "")[-500:]
+        print(f"  Bench output: {err}")
+    return elapsed, success
+
+
+def print_process_tree(root_pid: int, label: str):
+    """Print RSS breakdown of all processes in the tree."""
+    tree = get_process_tree_rss(root_pid)
+    print(f"\n  Process tree ({label}):")
+    for pid, (rss_gb, cmdline) in sorted(tree.items()):
+        short_cmd = cmdline[:60]
+        print(f"    PID {pid:>7}: {rss_gb:6.2f} GB  {short_cmd}")
+    total = sum(rss for rss, _ in tree.values())
+    print(f"    {'TOTAL':>11}: {total:6.2f} GB")
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--rounds", type=int, default=5)
+    parser.add_argument("--prompts", type=int, default=1000)
+    args = parser.parse_args()
+
+    num_rounds = args.rounds
+    prompts_per_round = args.prompts
+
+    print("=" * 70)
+    print(" CPU Memory Leak Reproduction - Multimodal + Prefix Caching")
+    print(" (using lmarena-ai/VisionArena-Chat dataset)")
+    print("=" * 70)
+
+    repo_dir = os.path.dirname(os.path.abspath(__file__))
+    branch = subprocess.run(
+        ["git", "rev-parse", "--abbrev-ref", "HEAD"],
+        capture_output=True, text=True, cwd=repo_dir,
+    ).stdout.strip()
+    commit = subprocess.run(
+        ["git", "rev-parse", "--short", "HEAD"],
+        capture_output=True, text=True, cwd=repo_dir,
+    ).stdout.strip()
+    print(f"Branch:      {branch} ({commit})")
+    print(f"Model:       {MODEL}")
+    print(f"Config:      {num_rounds} rounds x {prompts_per_round} prompts")
+    print(f"             prefix caching ON, max-model-len 25000")
+    print()
+
+    # --- Start vLLM server ---
+    log_file = tempfile.NamedTemporaryFile(
+        mode="w", prefix="vllm_server_", suffix=".log", delete=False,
+    )
+    log_path = log_file.name
+    print(f"Starting vLLM server (log: {log_path})...")
+
+    server_cmd = [
+        sys.executable, "-m", "vllm.entrypoints.openai.api_server",
+        "--model", MODEL,
+        "--port", str(PORT),
+        "--max-model-len", "25000",
+        "--gpu-memory-utilization", "0.95",
+        "--limit-mm-per-prompt", json.dumps({"image": 1, "video": 0}),
+        "--mm-processor-kwargs", json.dumps({
+            "min_pixels": 28 * 28,
+            "max_pixels": 1280 * 28 * 28,
+        }),
+    ]
+
+    server_proc = subprocess.Popen(
+        server_cmd, stdout=log_file, stderr=subprocess.STDOUT,
+    )
+
+    try:
+        if not wait_for_server(timeout=300):
+            print("ERROR: Server failed to start within timeout.")
+            server_proc.kill()
+            server_proc.wait(timeout=10)
+            log_file.close()
+            print("Server log (last 3000 chars):")
+            print(Path(log_path).read_text()[-3000:])
+            return 1
+
+        server_pid = server_proc.pid
+        rss_idle = get_tree_total_rss_gb(server_pid)
+        ec_idle = get_engine_core_rss_gb(server_pid)
+        print(f"Server ready (PID {server_pid}).")
+        print_process_tree(server_pid, "idle")
+
+        # --- Run benchmark rounds ---
+        print(f"\n{'Round':<7} {'Reqs':<8} {'Total(GB)':<11} {'EC(GB)':<9} "
+              f"{'EC delta':<10} {'EC round':<10} {'Time'}")
+        print("-" * 70)
+        print(f"{'idle':<7} {0:<8} {rss_idle:<11.2f} {ec_idle:<9.2f} "
+              f"{'---':<10} {'---':<10}")
+
+        ec_history = [ec_idle]
+        rss_history = [rss_idle]
+
+        for round_num in range(1, num_rounds + 1):
+            elapsed, success = run_bench(prompts_per_round)
+
+            if not success:
+                print(f"  Round {round_num} bench had errors, continuing...")
+
+            time.sleep(3)
+
+            rss_now = get_tree_total_rss_gb(server_pid)
+            ec_now = get_engine_core_rss_gb(server_pid)
+            ec_delta = ec_now - ec_idle
+            ec_round = ec_now - ec_history[-1]
+            ec_history.append(ec_now)
+            rss_history.append(rss_now)
+
+            total_reqs = round_num * prompts_per_round
+            print(f"{round_num:<7} {total_reqs:<8} {rss_now:<11.2f} "
+                  f"{ec_now:<9.2f} "
+                  f"{'+' if ec_delta >= 0 else ''}{ec_delta:<9.2f} "
+                  f"{'+' if ec_round >= 0 else ''}{ec_round:<9.2f} "
+                  f"{elapsed:.0f}s"
+                  f"{' (FAIL)' if not success else ''}")
+
+        # Print final tree.
+        print_process_tree(server_pid, "final")
+
+        # --- Summary ---
+        print()
+        print("=" * 70)
+        ec_growth = ec_history[-1] - ec_history[1]
+        ec_avg = ec_growth / max(num_rounds - 1, 1)
+        total_growth = rss_history[-1] - rss_history[1]
+        total_avg = total_growth / max(num_rounds - 1, 1)
+
+        if ec_avg > 0.3 or total_avg > 0.5:
+            verdict = "\033[91mLEAK DETECTED\033[0m"
+        else:
+            verdict = "\033[92mMEMORY STABLE\033[0m"
+
+        print(f"Result: {verdict}")
+        print(f"  Branch:               {branch} ({commit})")
+        print(f"  Total RSS idle:       {rss_idle:.2f} GB")
+        print(f"  Total RSS after R1:   {rss_history[1]:.2f} GB  (warmup)")
+        print(f"  Total RSS after R{num_rounds}:   {rss_history[-1]:.2f} GB")
+        print(f"  Total growth R2-{num_rounds}:    {total_growth:+.2f} GB "
+              f"(avg {total_avg:+.2f} GB/round)")
+        print()
+        print(f"  EngineCore RSS idle:  {ec_idle:.2f} GB")
+        print(f"  EngineCore after R1:  {ec_history[1]:.2f} GB  (warmup)")
+        print(f"  EngineCore after R{num_rounds}:  {ec_history[-1]:.2f} GB")
+        print(f"  EngineCore growth:    {ec_growth:+.2f} GB "
+              f"(avg {ec_avg:+.2f} GB/round)")
+
+        if ec_avg > 0.3 or total_avg > 0.5:
+            print()
+            print("  The EngineCore subprocess RSS keeps growing.")
+            print("  This confirms the CPU memory leak from the Request")
+            print("  reference cycle (partial(block_hasher, self)).")
+        else:
+            print()
+            print("  Memory is stable after warmup - no leak.")
+
+        print("=" * 70)
+        return 0
+
+    finally:
+        print("\nShutting down server...")
+        server_proc.terminate()
+        try:
+            server_proc.wait(timeout=15)
+        except subprocess.TimeoutExpired:
+            server_proc.kill()
+            server_proc.wait(timeout=5)
+        log_file.close()
+        print("Done.")
+
+
+if __name__ == "__main__":
+    sys.exit(main())

From 502c29b70d422a8beb2e8d0d9efe7a1b4a483f1a Mon Sep 17 00:00:00 2001
From: Roger Wang <hey@rogerw.io>
Date: Mon, 9 Feb 2026 20:10:18 +0000
Subject: [PATCH 3/4] delete

Signed-off-by: Roger Wang <hey@rogerw.io>
---
 test_gc_behavior.py | 211 --------------------------------------------
 test_mem_leak.py    | 115 ------------------------
 test_mem_leak_mm.py | 120 -------------------------
 test_request_gc.py  | 181 -------------------------------------
 4 files changed, 627 deletions(-)
 delete mode 100644 test_gc_behavior.py
 delete mode 100644 test_mem_leak.py
 delete mode 100644 test_mem_leak_mm.py
 delete mode 100644 test_request_gc.py

diff --git a/test_gc_behavior.py b/test_gc_behavior.py
deleted file mode 100644
index 0b4a47631ee2..000000000000
--- a/test_gc_behavior.py
+++ /dev/null
@@ -1,211 +0,0 @@
-"""
-Test to understand GC behavior with Request objects and gc.freeze().
-Measures how often gen0/gen1/gen2 collections happen and whether
-cyclic garbage from Request objects is collected.
-"""
-import gc
-import os
-import sys
-import weakref
-
-os.environ["CUDA_VISIBLE_DEVICES"] = ""
-
-from functools import partial
-
-from vllm.utils.hashing import sha256
-from vllm.v1.core.kv_cache_utils import (
-    get_request_block_hasher,
-    init_none_hash,
-)
-from vllm.pooling_params import PoolingParams
-from vllm.v1.request import Request
-
-
-def test_gc_collection_frequency():
-    """Check how often GC runs and whether it collects Request cycles."""
-
-    init_none_hash(sha256)
-    block_size = 16
-    block_hasher = get_request_block_hasher(block_size, sha256)
-    pooling_params = PoolingParams()
-
-    # Simulate gc.freeze() like EngineCore does
-    gc.collect(0)
-    gc.collect(1)
-    gc.collect(2)
-    gc.freeze()
-
-    print(f"GC thresholds: {gc.get_threshold()}")
-    print(f"After freeze - GC counts: {gc.get_count()}, frozen: {gc.get_freeze_count()}")
-
-    weak_refs = []
-    collected_counts = []
-
-    # Track GC events
-    gc_events = []
-    original_callbacks = gc.callbacks[:]
-
-    def gc_callback(phase, info):
-        if phase == "stop":
-            gc_events.append({
-                "generation": info["generation"],
-                "collected": info["collected"],
-                "uncollectable": info["uncollectable"],
-            })
-
-    gc.callbacks.append(gc_callback)
-
-    print(f"\nCreating 1000 Request objects with prefix caching (simulating serving)...")
-
-    for i in range(1000):
-        prompt_tokens = list(range(i * 100, i * 100 + 200))
-        req = Request(
-            request_id=f"req-{i}",
-            prompt_token_ids=prompt_tokens,
-            sampling_params=None,
-            pooling_params=pooling_params,
-            eos_token_id=None,
-            block_hasher=block_hasher,
-        )
-        weak_refs.append(weakref.ref(req))
-
-        # Simulate freeing the request (like scheduler does)
-        del req
-
-        if (i + 1) % 100 == 0:
-            alive = sum(1 for ref in weak_refs if ref() is not None)
-            gen0, gen1, gen2 = gc.get_count()
-            gen0_events = sum(1 for e in gc_events if e["generation"] == 0)
-            gen1_events = sum(1 for e in gc_events if e["generation"] == 1)
-            gen2_events = sum(1 for e in gc_events if e["generation"] == 2)
-            total_collected = sum(e["collected"] for e in gc_events)
-            print(f"  After {i+1} requests: alive={alive}, "
-                  f"gc_counts=({gen0},{gen1},{gen2}), "
-                  f"collections: gen0={gen0_events}, gen1={gen1_events}, gen2={gen2_events}, "
-                  f"total_collected={total_collected}")
-
-    alive_final = sum(1 for ref in weak_refs if ref() is not None)
-    print(f"\nFinal alive count: {alive_final}")
-    print(f"Total GC events: {len(gc_events)}")
-    for gen in range(3):
-        events = [e for e in gc_events if e["generation"] == gen]
-        total = sum(e["collected"] for e in events)
-        print(f"  Gen {gen}: {len(events)} collections, {total} objects collected")
-
-    gc.callbacks.remove(gc_callback)
-    gc.unfreeze()
-    gc.collect()
-
-
-def test_gc_with_fewer_tracked_objects():
-    """Simulate v0.11.1's optimization that creates fewer GC-tracked objects."""
-
-    init_none_hash(sha256)
-    block_size = 16
-    block_hasher = get_request_block_hasher(block_size, sha256)
-    pooling_params = PoolingParams()
-
-    gc.collect()
-    gc.freeze()
-
-    print(f"\n{'='*60}")
-    print("Test: Fewer GC-tracked objects (v0.11.1 behavior)")
-    print(f"{'='*60}")
-
-    weak_refs = []
-    gc_events = []
-
-    def gc_callback(phase, info):
-        if phase == "stop":
-            gc_events.append({
-                "generation": info["generation"],
-                "collected": info["collected"],
-            })
-
-    gc.callbacks.append(gc_callback)
-
-    for i in range(1000):
-        prompt_tokens = list(range(i * 100, i * 100 + 200))
-        req = Request(
-            request_id=f"req-{i}",
-            prompt_token_ids=prompt_tokens,
-            sampling_params=None,
-            pooling_params=pooling_params,
-            eos_token_id=None,
-            block_hasher=block_hasher,
-        )
-        weak_refs.append(weakref.ref(req))
-        del req
-
-        # Simulate v0.11.1's optimization: use tuples instead of lists
-        # This creates fewer GC-tracked objects per iteration
-        _ = ()  # empty tuple (not tracked by GC)
-
-    alive = sum(1 for ref in weak_refs if ref() is not None)
-    gen0_events = sum(1 for e in gc_events if e["generation"] == 0)
-    gen2_events = sum(1 for e in gc_events if e["generation"] == 2)
-    print(f"After 1000 requests: alive={alive}")
-    print(f"GC collections: gen0={gen0_events}, gen2={gen2_events}")
-
-    gc.callbacks.remove(gc_callback)
-    gc.unfreeze()
-    gc.collect()
-
-
-def test_gc_with_large_data():
-    """Test with larger data (simulating mm_features) to see memory impact."""
-    import resource
-
-    init_none_hash(sha256)
-    block_size = 16
-    block_hasher = get_request_block_hasher(block_size, sha256)
-    pooling_params = PoolingParams()
-
-    gc.collect()
-    gc.freeze()
-
-    print(f"\n{'='*60}")
-    print("Test: Request objects with large auxiliary data")
-    print(f"{'='*60}")
-
-    rss_before = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024
-
-    weak_refs = []
-
-    for batch in range(10):
-        for i in range(100):
-            prompt_tokens = list(range(200))
-            req = Request(
-                request_id=f"req-{batch}-{i}",
-                prompt_token_ids=prompt_tokens,
-                sampling_params=None,
-                pooling_params=pooling_params,
-                eos_token_id=None,
-                block_hasher=block_hasher,
-            )
-            # Simulate large mm_features data
-            req._large_data = bytearray(1024 * 1024)  # 1MB per request
-
-            weak_refs.append(weakref.ref(req))
-            del req
-
-        alive = sum(1 for ref in weak_refs if ref() is not None)
-        rss_now = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024
-        gen0, gen1, gen2 = gc.get_count()
-        print(f"  Batch {batch+1}: alive={alive}, "
-              f"RSS={rss_now:.1f}MB (delta={rss_now-rss_before:+.1f}MB), "
-              f"gc_counts=({gen0},{gen1},{gen2})")
-
-    gc.unfreeze()
-    gc.collect()
-
-    alive_after = sum(1 for ref in weak_refs if ref() is not None)
-    rss_after = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024
-    print(f"\nAfter gc.collect(): alive={alive_after}, "
-          f"RSS={rss_after:.1f}MB (delta={rss_after-rss_before:+.1f}MB)")
-
-
-if __name__ == "__main__":
-    test_gc_collection_frequency()
-    test_gc_with_fewer_tracked_objects()
-    test_gc_with_large_data()
diff --git a/test_mem_leak.py b/test_mem_leak.py
deleted file mode 100644
index aa8cbe69242e..000000000000
--- a/test_mem_leak.py
+++ /dev/null
@@ -1,115 +0,0 @@
-"""
-Minimal reproduction script for CPU memory leak with prefix caching.
-Tests whether CPU RSS memory grows unboundedly during inference with
-prefix caching enabled vs disabled.
-"""
-import gc
-import os
-import resource
-import tracemalloc
-
-# Use a single GPU
-os.environ["CUDA_VISIBLE_DEVICES"] = "0"
-
-from vllm import LLM, SamplingParams
-
-
-def get_rss_mb():
-    """Get current RSS memory in MB."""
-    return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024
-
-
-def get_tracemalloc_mb():
-    """Get current traced memory in MB."""
-    current, peak = tracemalloc.get_traced_memory()
-    return current / (1024 * 1024), peak / (1024 * 1024)
-
-
-def run_test(enable_prefix_caching: bool, num_rounds: int = 5,
-             prompts_per_round: int = 200):
-    """Run inference test and monitor memory."""
-    print(f"\n{'='*60}")
-    print(f"Testing with prefix_caching={'enabled' if enable_prefix_caching else 'disabled'}")
-    print(f"{'='*60}")
-
-    tracemalloc.start()
-
-    # Use a small model
-    llm = LLM(
-        model="Qwen/Qwen2.5-VL-3B-Instruct",
-        max_model_len=4096,
-        gpu_memory_utilization=0.5,
-        enable_prefix_caching=enable_prefix_caching,
-        limit_mm_per_prompt={"video": 0},
-        enforce_eager=True,
-    )
-
-    sampling_params = SamplingParams(temperature=0.8, max_tokens=64)
-
-    rss_before = get_rss_mb()
-    traced_before = tracemalloc.get_traced_memory()[0] / (1024 * 1024)
-    print(f"Initial RSS: {rss_before:.1f} MB, Traced: {traced_before:.1f} MB")
-
-    for round_idx in range(num_rounds):
-        # Create prompts that share common prefixes (to exercise prefix caching)
-        # Use varying suffixes to create different block hashes
-        common_prefix = "You are a helpful assistant. " * 50
-        prompts = [
-            f"{common_prefix} Question {round_idx * prompts_per_round + i}: "
-            f"What is {i * 7 + round_idx}? Answer briefly."
-            for i in range(prompts_per_round)
-        ]
-
-        outputs = llm.generate(prompts, sampling_params)
-
-        # Force garbage collection
-        del outputs
-        gc.collect()
-
-        rss_after = get_rss_mb()
-        traced_current, traced_peak = get_tracemalloc_mb()
-        print(
-            f"Round {round_idx + 1}: RSS={rss_after:.1f} MB "
-            f"(delta={rss_after - rss_before:+.1f} MB), "
-            f"Traced={traced_current:.1f} MB (peak={traced_peak:.1f} MB)"
-        )
-
-    # Take a snapshot of the top memory consumers
-    snapshot = tracemalloc.take_snapshot()
-    top_stats = snapshot.statistics('lineno')
-    print(f"\nTop 15 memory consumers:")
-    for stat in top_stats[:15]:
-        print(f"  {stat}")
-
-    # Also check by filename
-    top_stats_file = snapshot.statistics('filename')
-    print(f"\nTop 10 memory consumers by file:")
-    for stat in top_stats_file[:10]:
-        print(f"  {stat}")
-
-    tracemalloc.stop()
-
-    final_rss = get_rss_mb()
-    print(f"\nFinal RSS: {final_rss:.1f} MB (total delta: {final_rss - rss_before:+.1f} MB)")
-
-    del llm
-    gc.collect()
-
-    return final_rss - rss_before
-
-
-if __name__ == "__main__":
-    # Test with prefix caching enabled
-    delta_with_caching = run_test(enable_prefix_caching=True)
-
-    gc.collect()
-
-    # Test with prefix caching disabled
-    delta_without_caching = run_test(enable_prefix_caching=False)
-
-    print(f"\n{'='*60}")
-    print(f"SUMMARY")
-    print(f"{'='*60}")
-    print(f"RSS growth with prefix caching:    {delta_with_caching:+.1f} MB")
-    print(f"RSS growth without prefix caching: {delta_without_caching:+.1f} MB")
-    print(f"Difference: {delta_with_caching - delta_without_caching:+.1f} MB")
diff --git a/test_mem_leak_mm.py b/test_mem_leak_mm.py
deleted file mode 100644
index b8ee944ed693..000000000000
--- a/test_mem_leak_mm.py
+++ /dev/null
@@ -1,120 +0,0 @@
-"""
-Reproduction script for CPU memory leak with prefix caching + multimodal inputs.
-Tests whether CPU RSS memory grows unboundedly during VLM inference.
-"""
-import gc
-import os
-import resource
-import tracemalloc
-
-os.environ["CUDA_VISIBLE_DEVICES"] = "0"
-
-from vllm import LLM, SamplingParams
-
-
-def get_rss_mb():
-    """Get current RSS memory in MB."""
-    return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024
-
-
-def run_test(enable_prefix_caching: bool, num_rounds: int = 5,
-             prompts_per_round: int = 100):
-    """Run inference test with multimodal inputs and monitor memory."""
-    print(f"\n{'='*60}")
-    print(f"Testing with prefix_caching={'enabled' if enable_prefix_caching else 'disabled'}")
-    print(f"{'='*60}")
-
-    tracemalloc.start()
-
-    llm = LLM(
-        model="Qwen/Qwen2.5-VL-3B-Instruct",
-        max_model_len=4096,
-        gpu_memory_utilization=0.5,
-        enable_prefix_caching=enable_prefix_caching,
-        limit_mm_per_prompt={"image": 2, "video": 0},
-        enforce_eager=True,
-    )
-
-    sampling_params = SamplingParams(temperature=0.8, max_tokens=32)
-
-    rss_before = get_rss_mb()
-    traced_before = tracemalloc.get_traced_memory()[0] / (1024 * 1024)
-    print(f"Initial RSS: {rss_before:.1f} MB, Traced: {traced_before:.1f} MB")
-
-    # Use different image URLs to simulate variety in multimodal inputs
-    image_urls = [
-        "https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/PNG_transparency_demonstration_1.png/300px-PNG_transparency_demonstration_1.png",
-        "https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Cat03.jpg/1200px-Cat03.jpg",
-        "https://upload.wikimedia.org/wikipedia/commons/thumb/4/4d/Cat_November_2010-1a.jpg/1200px-Cat_November_2010-1a.jpg",
-        "https://upload.wikimedia.org/wikipedia/commons/thumb/b/b6/Image_created_with_a_mobile_phone.png/1200px-Image_created_with_a_mobile_phone.png",
-    ]
-
-    for round_idx in range(num_rounds):
-        # Create multimodal prompts with images
-        prompts = []
-        for i in range(prompts_per_round):
-            img_url = image_urls[i % len(image_urls)]
-            prompt = {
-                "prompt": f"<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe this image briefly. Question {round_idx * prompts_per_round + i}.<|im_end|>\n<|im_start|>assistant\n",
-                "multi_modal_data": {
-                    "image": img_url,
-                },
-            }
-            prompts.append(prompt)
-
-        outputs = llm.generate(prompts, sampling_params)
-
-        del outputs
-        gc.collect()
-
-        rss_after = get_rss_mb()
-        traced_current, traced_peak = get_tracemalloc_mb()
-        print(
-            f"Round {round_idx + 1}: RSS={rss_after:.1f} MB "
-            f"(delta={rss_after - rss_before:+.1f} MB), "
-            f"Traced={traced_current:.1f} MB (peak={traced_peak:.1f} MB)"
-        )
-
-    # Take a snapshot of the top memory consumers
-    snapshot = tracemalloc.take_snapshot()
-    top_stats = snapshot.statistics('lineno')
-    print(f"\nTop 20 memory consumers:")
-    for stat in top_stats[:20]:
-        print(f"  {stat}")
-
-    top_stats_file = snapshot.statistics('filename')
-    print(f"\nTop 10 memory consumers by file:")
-    for stat in top_stats_file[:10]:
-        print(f"  {stat}")
-
-    tracemalloc.stop()
-
-    final_rss = get_rss_mb()
-    print(f"\nFinal RSS: {final_rss:.1f} MB (total delta: {final_rss - rss_before:+.1f} MB)")
-
-    del llm
-    gc.collect()
-
-    return final_rss - rss_before
-
-
-def get_tracemalloc_mb():
-    current, peak = tracemalloc.get_traced_memory()
-    return current / (1024 * 1024), peak / (1024 * 1024)
-
-
-if __name__ == "__main__":
-    # Test with prefix caching enabled
-    delta_with_caching = run_test(enable_prefix_caching=True)
-
-    gc.collect()
-
-    # Test with prefix caching disabled
-    delta_without_caching = run_test(enable_prefix_caching=False)
-
-    print(f"\n{'='*60}")
-    print(f"SUMMARY")
-    print(f"{'='*60}")
-    print(f"RSS growth with prefix caching:    {delta_with_caching:+.1f} MB")
-    print(f"RSS growth without prefix caching: {delta_without_caching:+.1f} MB")
-    print(f"Difference: {delta_with_caching - delta_without_caching:+.1f} MB")
diff --git a/test_request_gc.py b/test_request_gc.py
deleted file mode 100644
index b8959302f469..000000000000
--- a/test_request_gc.py
+++ /dev/null
@@ -1,181 +0,0 @@
-"""
-Test to verify that the fix for the Request reference cycle works.
-Request objects should be freed immediately by reference counting
-when prefix caching is enabled (no cyclic GC needed).
-"""
-import gc
-import os
-import sys
-import weakref
-
-os.environ["CUDA_VISIBLE_DEVICES"] = ""
-
-from vllm.utils.hashing import sha256
-from vllm.v1.core.kv_cache_utils import (
-    get_request_block_hasher,
-    init_none_hash,
-)
-from vllm.pooling_params import PoolingParams
-from vllm.v1.request import Request
-
-
-def test_request_gc_with_prefix_caching():
-    """Test that Request objects are properly freed WITHOUT needing cyclic GC."""
-
-    init_none_hash(sha256)
-    block_size = 16
-    block_hasher = get_request_block_hasher(block_size, sha256)
-
-    weak_refs = []
-
-    # Disable automatic GC to test reference counting alone
-    gc.disable()
-
-    pooling_params = PoolingParams()
-    for i in range(100):
-        prompt_tokens = list(range(i * 100, i * 100 + 200))
-        req = Request(
-            request_id=f"req-{i}",
-            prompt_token_ids=prompt_tokens,
-            sampling_params=None,
-            pooling_params=pooling_params,
-            eos_token_id=None,
-            block_hasher=block_hasher,
-        )
-        weak_refs.append(weakref.ref(req))
-        del req
-
-    alive_before = sum(1 for ref in weak_refs if ref() is not None)
-    print(f"With prefix caching:")
-    print(f"  Alive Request objects before gc.collect(): {alive_before}")
-
-    collected = gc.collect()
-    print(f"  GC collected {collected} objects")
-
-    alive_after = sum(1 for ref in weak_refs if ref() is not None)
-    print(f"  Alive Request objects after gc.collect(): {alive_after}")
-
-    gc.enable()
-
-    if alive_before > 0:
-        print(f"\n  FAIL: {alive_before} Request objects were NOT freed "
-              "without explicit gc.collect()!")
-        print("  Reference cycle still exists.")
-        return False
-    else:
-        print(f"\n  PASS: All Request objects freed immediately by "
-              "reference counting (no cycle).")
-        return True
-
-
-def test_request_gc_without_prefix_caching():
-    """Test that Request objects without prefix caching don't have cycles."""
-
-    weak_refs = []
-    gc.disable()
-
-    pooling_params = PoolingParams()
-    for i in range(100):
-        prompt_tokens = list(range(i * 100, i * 100 + 200))
-        req = Request(
-            request_id=f"req-{i}",
-            prompt_token_ids=prompt_tokens,
-            sampling_params=None,
-            pooling_params=pooling_params,
-            eos_token_id=None,
-            block_hasher=None,
-        )
-        weak_refs.append(weakref.ref(req))
-        del req
-
-    alive_before = sum(1 for ref in weak_refs if ref() is not None)
-    print(f"\nWithout prefix caching:")
-    print(f"  Alive Request objects before gc.collect(): {alive_before}")
-
-    collected = gc.collect()
-    alive_after = sum(1 for ref in weak_refs if ref() is not None)
-    print(f"  GC collected {collected} objects")
-    print(f"  Alive Request objects after gc.collect(): {alive_after}")
-
-    gc.enable()
-
-    if alive_before == 0:
-        print(f"\n  PASS: All Request objects freed immediately.")
-        return True
-    else:
-        print(f"\n  FAIL: {alive_before} Request objects leaked.")
-        return False
-
-
-def test_memory_growth_with_freeze():
-    """Test that there's no memory growth even with gc.freeze()."""
-    import resource
-
-    init_none_hash(sha256)
-    block_size = 16
-    block_hasher = get_request_block_hasher(block_size, sha256)
-
-    gc.collect()
-    gc.freeze()
-
-    print(f"\n{'='*60}")
-    print("Test: Memory stability with gc.freeze() + prefix caching")
-    print(f"{'='*60}")
-
-    rss_before = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024
-    weak_refs = []
-
-    pooling_params = PoolingParams()
-    for round_idx in range(10):
-        for i in range(500):
-            prompt_tokens = list(range(500))
-            req = Request(
-                request_id=f"req-{round_idx}-{i}",
-                prompt_token_ids=prompt_tokens,
-                sampling_params=None,
-                pooling_params=pooling_params,
-                eos_token_id=None,
-                block_hasher=block_hasher,
-            )
-            weak_refs.append(weakref.ref(req))
-            del req
-
-        alive = sum(1 for ref in weak_refs if ref() is not None)
-        rss_after = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024
-        print(f"  Round {round_idx + 1}: alive={alive}, "
-              f"RSS={rss_after:.1f} MB (delta={rss_after - rss_before:+.1f} MB)")
-
-    gc.unfreeze()
-    gc.collect()
-
-    alive_final = sum(1 for ref in weak_refs if ref() is not None)
-    print(f"\n  Final alive: {alive_final}")
-    if alive_final == 0:
-        print("  PASS: All objects freed.")
-        return True
-    else:
-        print(f"  FAIL: {alive_final} objects still alive.")
-        return False
-
-
-if __name__ == "__main__":
-    results = []
-    results.append(("prefix caching GC", test_request_gc_with_prefix_caching()))
-    results.append(("no prefix caching GC", test_request_gc_without_prefix_caching()))
-    results.append(("memory growth with freeze", test_memory_growth_with_freeze()))
-
-    print(f"\n{'='*60}")
-    print("SUMMARY")
-    print(f"{'='*60}")
-    all_pass = True
-    for name, passed in results:
-        status = "PASS" if passed else "FAIL"
-        print(f"  {status}: {name}")
-        if not passed:
-            all_pass = False
-
-    if all_pass:
-        print("\nAll tests passed! The reference cycle fix is working.")
-    else:
-        print("\nSome tests failed!")
-    sys.exit(0 if all_pass else 1)

From 3873b02fd9468ed2dc523b9cccff81d581e51e1c Mon Sep 17 00:00:00 2001
From: Roger Wang <hey@rogerw.io>
Date: Mon, 9 Feb 2026 21:30:13 +0000
Subject: [PATCH 4/4] add analysis

Signed-off-by: Roger Wang <hey@rogerw.io>
---
 analysis.md | 205 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 205 insertions(+)
 create mode 100644 analysis.md

diff --git a/analysis.md b/analysis.md
new file mode 100644
index 000000000000..d571bbdec04e
--- /dev/null
+++ b/analysis.md
@@ -0,0 +1,205 @@
+# CPU Memory Leak Analysis (GitHub Issue #28726)
+
+## Summary
+
+vLLM suffers from continuous CPU memory growth when serving multimodal (VLM)
+models with prefix caching enabled (the default). The EngineCore subprocess
+RSS grows by ~1.5 GB per 1000 requests and never stabilizes, eventually
+causing OOM. The issue appeared between v0.11.0 and v0.11.1.
+
+**Root cause**: A reference cycle in `Request` objects prevents Python's
+reference counting from freeing them. A GC optimization introduced in v0.11.1
+reduces how often the cyclic garbage collector runs, causing these cyclic
+`Request` objects (each holding megabytes of multimodal feature data) to
+accumulate far faster than the GC can reclaim them.
+
+## The Reference Cycle
+
+In `vllm/v1/request.py`, when prefix caching is enabled, each `Request` binds
+itself into a `functools.partial`:
+
+```python
+# vllm/v1/request.py:167-170 (main branch)
+self.get_hash_new_full_blocks: Callable[[], list[BlockHash]] | None = None
+if block_hasher is not None:
+    self.get_hash_new_full_blocks = partial(block_hasher, self)  # <-- cycle
+    self.block_hashes = self.get_hash_new_full_blocks()
+```
+
+This creates a **reference cycle**:
+
+```
+Request ──(self.get_hash_new_full_blocks)──> partial object
+   ^                                              │
+   └──────────(partial stores self as arg)────────┘
+```
+
+### Why this matters
+
+Python uses two garbage collection mechanisms:
+
+1. **Reference counting** (immediate): When an object's reference count drops
+   to zero, it is freed instantly. This is the fast path.
+
+2. **Cyclic garbage collector** (deferred): Periodically scans for groups of
+   objects that reference each other but are unreachable from the rest of the
+   program. This is the slow path, and it runs based on heuristic thresholds.
+
+The reference cycle means that when the scheduler finishes a request and does
+`del self.requests[request_id]`, the `Request` object's reference count
+**does not drop to zero** -- the `partial` still holds a reference. The
+`partial`'s count doesn't drop to zero either -- the `Request` still holds
+it. Both objects are unreachable from the program, but neither can be freed
+by reference counting. They become **cyclic garbage**, waiting for the cyclic
+GC to detect and collect them.
+
+### Why it only affects prefix caching
+
+When prefix caching is **disabled**, `block_hasher` is `None`, so the
+`partial` is never created. There is no cycle. `Request` objects are freed
+immediately by reference counting when the scheduler removes them. This is
+why `--no-enable-prefix-caching` prevents the leak.
+
+### Why it only affects multimodal models visibly
+
+Each `Request` object holds a `mm_features: list[MultiModalFeatureSpec]`
+field. For vision-language models, this contains the **processed image
+feature tensors** -- several megabytes per image. A text-only request has
+empty `mm_features` and is only a few kilobytes. When cyclic garbage
+accumulates:
+
+- **Text-only**: 100 leaked Request objects ~ a few MB (invisible)
+- **VLM with images**: 100 leaked Request objects ~ hundreds of MB to GBs
+  (causes OOM)
+
+## Why It Became a Problem in v0.11.1
+
+The reference cycle existed since prefix caching was introduced. In v0.11.0,
+it was harmless because the cyclic GC ran frequently enough to clean it up.
+Two changes in v0.11.1 broke this equilibrium:
+
+### Change 1: Fewer GC-tracked objects per request (primary cause)
+
+**Commit `acaa2c0a4`** -- *"Reuse empty block lists whenever possible in
+KVCacheBlocks to mitigate GC costs"*
+
+This optimization replaced empty `list` objects (`[]`) with empty `tuple`
+objects (`()`) in `KVCacheBlocks`. Empty tuples are **not tracked by the
+cyclic GC** (CPython optimization), while empty lists are. This means each
+request cycle creates fewer GC-tracked objects.
+
+Python's cyclic GC uses a generational scheme with thresholds (default:
+`(700, 10, 10)`):
+
+- **Generation 0** collection triggers when 700+ new tracked objects
+  accumulate since the last gen-0 collection.
+- **Generation 1** triggers every 10 gen-0 collections.
+- **Generation 2** triggers every 10 gen-1 collections (every 100 gen-0's).
+
+With fewer tracked objects created per request, it takes longer for the
+generation-0 threshold (700 objects) to be reached. This means:
+- Gen-0 collections happen less often
+- Gen-1 and gen-2 collections happen much less often
+- Cyclic garbage from `Request` objects accumulates longer before being swept
+
+In v0.11.0, the extra `list` objects from `KVCacheBlocks` kept the GC
+running frequently. Gen-2 collections (which sweep long-lived cyclic
+garbage) ran often enough that the leaked `Request` memory stabilized.
+In v0.11.1, gen-2 collections became too infrequent, and memory grew
+without bound.
+
+### Change 2: Earlier gc.freeze() (secondary contributor)
+
+**Commit `b30372cbd`** -- *"Move gc.freeze logic from EngineCoreProc to
+EngineCore for better coverage"*
+
+`gc.freeze()` moves all currently tracked objects into a permanent
+generation that the GC never scans. This was moved from the end of
+`EngineCoreProc.__init__()` to the end of `EngineCore.__init__()`,
+freezing objects earlier. While this doesn't directly prevent collection
+of new `Request` objects, the different freeze timing subtly changes the
+GC's generation accounting, further reducing the frequency of collections
+on unfrozen objects.
+
+## Reproduction Results
+
+Using `Qwen/Qwen2.5-VL-3B-Instruct` with the `lmarena-ai/VisionArena-Chat`
+dataset (real user-uploaded images), 1000 prompts per round, prefix caching
+enabled:
+
+### main branch (leak present)
+
+```
+Round   Reqs     Total(GB)   EC(GB)    EC delta   EC round   Time
+----------------------------------------------------------------------
+idle    0        3.63        3.63      ---        ---
+1       1000     10.97       10.97     +7.33      +7.33      66s
+2       2000     14.34       14.34     +10.71     +3.37      57s
+3       3000     15.94       15.94     +12.31     +1.60      55s
+4       4000     16.91       16.91     +13.28     +0.97      58s
+5       5000     17.38       17.38     +13.74     +0.47      58s
+
+EngineCore final RSS: 14.70 GB (started at 2.40 GB)
+Growth rate: +1.60 GB/round average -- NEVER STABILIZES
+```
+
+### fix-cpu-leak branch (leak fixed)
+
+```
+Round   Reqs     Total(GB)   EC(GB)    EC delta   EC round   Time
+----------------------------------------------------------------------
+idle    0        3.63        3.63      ---        ---
+1       1000     9.86        9.86      +6.22      +6.22      67s
+2       2000     10.50       10.50     +6.86      +0.64      56s
+3       3000     10.55       10.55     +6.91      +0.05      57s
+4       4000     10.55       10.55     +6.92      +0.01      56s
+5       5000     10.64       10.64     +7.01      +0.09      56s
+
+EngineCore final RSS: 7.51 GB (started at 2.41 GB)
+Growth rate: +0.20 GB/round average -- STABLE after round 1
+```
+
+The fix reduces EngineCore memory by **half** (7.51 GB vs 14.70 GB) after
+5000 multimodal requests.
+
+## The Fix
+
+**Break the reference cycle** by storing `block_hasher` directly without
+`partial`, and passing `self` explicitly at call sites:
+
+```python
+# BEFORE (creates cycle):
+self.get_hash_new_full_blocks = partial(block_hasher, self)
+self.block_hashes = self.get_hash_new_full_blocks()
+
+# AFTER (no cycle):
+self._block_hasher = block_hasher
+self.block_hashes = self._block_hasher(self)
+```
+
+Without the cycle, `Request` objects are freed **immediately** by reference
+counting when the scheduler removes them -- no cyclic GC needed. This
+eliminates the leak regardless of GC frequency or `gc.freeze()` behavior.
+
+### Files changed
+
+- **`vllm/v1/request.py`** -- Store `_block_hasher` instead of
+  `partial(block_hasher, self)`. Update `append_output_token_ids()` to call
+  `self._block_hasher(self)`.
+- **`vllm/v1/core/sched/scheduler.py`** -- Update session block hash call
+  site from `session.get_hash_new_full_blocks()` to
+  `session._block_hasher(session)`.
+- **`tests/v1/core/test_async_scheduler.py`** -- Update test call site.
+
+### Verification (unit test)
+
+With cyclic GC disabled (`gc.disable()`), create 100 Request objects with
+prefix caching and delete all external references:
+
+| | main (cycle) | fix (no cycle) |
+|---|---|---|
+| Objects alive after `del` | **100** (all leaked) | **0** (all freed) |
+| Freed by `gc.collect()` | 100 | 0 (nothing to collect) |
+
+All 137 existing tests pass (86 scheduler + 43 kv_cache_utils + 8 async
+scheduler).