Upgrade vLLM to v0.16.0 #279
base: main
```diff
@@ -152,7 +152,7 @@ def wrapper(self: VLLM, *args: tuple, **kwargs: dict) -> Callable:

         self.llm = LLM(self.model_name, **self.model_kwargs)
         if self.model_limit_tokens == "default":
-            self.model_limit_tokens = self.llm.llm_engine.get_model_config().max_model_len
+            self.model_limit_tokens = self.llm.llm_engine.model_config.max_model_len
         return method(self, *args, **kwargs)

     return wrapper
```
```diff
@@ -306,7 +306,7 @@ def _batch_compute_log_probs(
             prefix + continuation for prefix, continuation in zip(batch_prefix_ids, batch_continuation_ids)
         ]

-        max_length = self.llm.llm_engine.get_model_config().max_seq_len_to_capture
+        max_length = self.llm.llm_engine.model_config.max_model_len
         stride = stride or max_length // 2
         if not (0 < stride < max_length):
             msg = f"stride must be in (0, {max_length}), but got {stride}"
```
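Both hunks above replace the old config accessors (`get_model_config()` and `max_seq_len_to_capture`) with `llm_engine.model_config.max_model_len`. If code had to run against both old and new vLLM releases, a guarded accessor along the following lines could be used; the fallback branch is an assumption about the older API, not something this PR adds.

```python
# Sketch only (not part of this PR): read the model's maximum context length
# in a way that tolerates both vLLM APIs. The fallback to get_model_config()
# is an assumption for pre-v0.16 releases.
from vllm import LLM


def get_max_model_len(llm: LLM) -> int:
    engine = llm.llm_engine
    try:
        model_config = engine.model_config  # attribute access, as used in this PR
    except AttributeError:
        model_config = engine.get_model_config()  # older vLLM exposes a method
    return model_config.max_model_len
```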
```diff
@@ -315,7 +315,7 @@ def _batch_compute_log_probs(

         from vllm import RequestOutput, SamplingParams
         from vllm.inputs import TokensPrompt
-        from vllm.sequence import Logprob
+        from vllm.logprobs import Logprob

         sampling_params = SamplingParams(temperature=0.0, max_tokens=1, prompt_logprobs=1)
```
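The hunk above only changes the import path for `Logprob` (moved from `vllm.sequence` to `vllm.logprobs` in vLLM v0.16.0). If both versions needed to be supported, a guarded import could be used; this is a hedged compatibility sketch, and the fallback is an assumption rather than part of the PR.

```python
# Sketch only: import Logprob from its new home, falling back to the
# pre-v0.16 location when running against an older vLLM.
try:
    from vllm.logprobs import Logprob  # vLLM >= 0.16, as used in this PR
except ImportError:
    from vllm.sequence import Logprob  # older vLLM releases
```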
[One changed file is omitted here: large diffs are not rendered by default.]
```diff
@@ -29,34 +29,14 @@ def chat_lm() -> VLLM:
         model_kwargs={
             "seed": 42,
             "gpu_memory_utilization": 0.1,
+            "max_model_len": 2048,
             "enforce_eager": True,
             "disable_custom_all_reduce": True,
         },
         tokenizer_kwargs={"use_fast": False},
     )
     yield llm
-    from vllm.distributed.parallel_state import cleanup_dist_env_and_memory
-
-    cleanup_dist_env_and_memory()
-
-
-@pytest.fixture(scope="module")
-def chat_lm_with_system_message() -> VLLM:
-    llm = VLLM(
-        model="sbintuitions/tiny-lm-chat",
-        model_kwargs={
-            "seed": 42,
-            "gpu_memory_utilization": 0.1,
-            "enforce_eager": True,
-            "disable_custom_all_reduce": True,
-        },
-        tokenizer_kwargs={"use_fast": False},
-        system_message="You are a helpful assistant.",
-    )
-    yield llm
-    from vllm.distributed.parallel_state import cleanup_dist_env_and_memory
-
-    cleanup_dist_env_and_memory()
+    llm.cleanup_resources()


 @pytest.mark.skipif(not is_vllm_enabled(), reason="vllm library is not installed")
```

Review comment from the PR author on the removed `chat_lm_with_system_message` fixture: "I don't know why this function was originally placed here, but since it was causing errors, I moved it to the other test file (it reappears later in this diff)."
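The fixtures above and below now tear down through the wrapper's `llm.cleanup_resources()` instead of importing `cleanup_dist_env_and_memory` in every fixture. The helper's body is not part of this diff; a minimal, purely hypothetical sketch, assuming it wraps vLLM's distributed cleanup, might look like this.

```python
# Hypothetical sketch: what a fixture-level cleanup helper might do.
# The real VLLM.cleanup_resources() implementation is not shown in this diff;
# this assumes it wraps vLLM's cleanup_dist_env_and_memory().
import gc


def cleanup_vllm(wrapper) -> None:
    from vllm.distributed.parallel_state import cleanup_dist_env_and_memory

    # Drop the engine handle first so its GPU memory can be reclaimed,
    # then tear down vLLM's distributed state.
    if hasattr(wrapper, "llm"):
        del wrapper.llm
    gc.collect()
    cleanup_dist_env_and_memory()
```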
```diff
@@ -14,22 +14,20 @@


 @pytest.fixture(scope="module")
-def chat_lm() -> Generator[VLLM, None, None]:
+def chat_lm() -> VLLM:
     llm = VLLM(
         model="sbintuitions/tiny-lm-chat",
         model_kwargs={
             "seed": 42,
             "gpu_memory_utilization": 0.1,
+            "max_model_len": 2048,
             "enforce_eager": True,
-            "dtype": "float32",
             "disable_custom_all_reduce": True,
         },
         tokenizer_kwargs={"use_fast": False},
     )
     yield llm
-    from vllm.distributed.parallel_state import cleanup_dist_env_and_memory
-
-    cleanup_dist_env_and_memory()
+    llm.cleanup_resources()


 @pytest.fixture(scope="module")
```
```diff
@@ -39,14 +37,29 @@ def chat_lm_qwen() -> Generator[VLLM, None, None]:
         model_kwargs={
             "seed": 42,
             "gpu_memory_utilization": 0.1,
+            "max_model_len": 2048,
         },
     )
     yield llm
-    from vllm.distributed.parallel_state import cleanup_dist_env_and_memory
-
-    cleanup_dist_env_and_memory()
+    llm.cleanup_resources()
+
+
+@pytest.fixture(scope="module")
+def chat_lm_with_system_message() -> VLLM:
+    llm = VLLM(
+        model="sbintuitions/tiny-lm-chat",
+        model_kwargs={
+            "seed": 42,
+            "gpu_memory_utilization": 0.1,
+            "max_model_len": 2048,
+            "enforce_eager": True,
+            "disable_custom_all_reduce": True,
+        },
+        tokenizer_kwargs={"use_fast": False},
+        system_message="You are a helpful assistant.",
+    )
+    yield llm
+    llm.cleanup_resources()


 @pytest.fixture(scope="module")
```
```diff
@@ -57,17 +70,15 @@ def chat_lm_for_tool_calling() -> Generator[VLLM, None, None]:
         model_kwargs={
             "seed": 42,
             "gpu_memory_utilization": 0.1,
+            "max_model_len": 2048,
             "enforce_eager": True,
-            "dtype": "float32",
             "disable_custom_all_reduce": True,
         },
         tokenizer_kwargs={"use_fast": False},
         tool_parser=tool_parser,
     )
     yield llm
-    from vllm.distributed.parallel_state import cleanup_dist_env_and_memory
-
-    cleanup_dist_env_and_memory()
+    llm.cleanup_resources()


 @pytest.fixture(scope="module")
```
```diff
@@ -77,24 +88,38 @@ def hf_lm(model_name: str = "sbintuitions/tiny-lm-chat") -> HuggingFaceLM:
     )


+@pytest.fixture(scope="module")
+def hf_lm_qwen(model_name: str = "Qwen/Qwen3-0.6B-Base") -> HuggingFaceLM:
+    return HuggingFaceLM(
+        model=model_name, model_kwargs={"torch_dtype": "float32"}, default_gen_kwargs={"temperature": 0.0}
+    )
+
+
 @pytest.mark.skipif(not is_vllm_enabled(), reason="vllm library is not installed")
-@pytest.mark.parametrize("chat_lm_name", ["chat_lm", "chat_lm_qwen"])
+@pytest.mark.parametrize(
+    ("chat_lm_name", "hf_lm_name"),
+    [
+        ("chat_lm", "hf_lm"),
+        ("chat_lm_qwen", "hf_lm_qwen"),
+    ],
+)
 def test_batch_compute_log_probs_approximates_hf_lm(
     request: pytest.FixtureRequest,
     chat_lm_name: str,
-    hf_lm: HuggingFaceLM,
+    hf_lm_name: str,
 ) -> None:
     chat_lm = request.getfixturevalue(chat_lm_name)
+    hf_lm = request.getfixturevalue(hf_lm_name)
     prefix_list = ["それは正しい日本語ですか?"]
     text_list = ["これは正しい日本語です。"]

     vllm_log_probs = chat_lm.compute_log_probs(text_list)
     hf_log_probs = hf_lm.compute_log_probs(text_list)
-    assert vllm_log_probs == pytest.approx(hf_log_probs, abs=1e-2)
+    assert vllm_log_probs == pytest.approx(hf_log_probs, abs=0.5)

     vllm_log_probs = chat_lm.compute_log_probs(text_list, prefix_list=prefix_list)
     hf_log_probs = hf_lm.compute_log_probs(text_list, prefix_list=prefix_list)
-    assert vllm_log_probs == pytest.approx(hf_log_probs, abs=1e-2)
+    assert vllm_log_probs == pytest.approx(hf_log_probs, abs=0.5)


 @pytest.mark.skipif(not is_vllm_enabled(), reason="vllm library is not installed")
```

Review comment from the PR author on the widened tolerance: "It seems that differences of this magnitude can occur depending on the seed (or the environment), so I widened the acceptable tolerance range."
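For reference, a toy illustration of what the widened `abs=0.5` tolerance means; the numbers below are made up, not taken from the test.

```python
import pytest

# Toy numbers only: pytest.approx(..., abs=0.5) accepts element-wise
# absolute differences of up to 0.5 between the two lists.
vllm_log_probs = [-12.31, -8.07]
hf_log_probs = [-12.10, -7.85]
assert vllm_log_probs == pytest.approx(hf_log_probs, abs=0.5)
```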
Review comment: vllm-project/vllm#26427