From c467d3f2add8e040c6acef3a19b930a9b218fa69 Mon Sep 17 00:00:00 2001
From: Ousama Ben Younes
Date: Sat, 11 Apr 2026 18:33:44 +0000
Subject: [PATCH] fix: fall back to CPU when CUDA is requested but unavailable
 (#216)

PromptCompressor defaulted to device_map="cuda" even when torch was
built without CUDA support, producing "AssertionError: Torch not
compiled with CUDA enabled" on Windows / CPU-only machines.

The fix transparently falls back to "cpu" (with a RuntimeWarning) when
"cuda" is requested but torch.cuda.is_available() is False, so the
default still works out of the box on CPU-only installs.

Generated by Claude Code
Vibe coded by ousamabenyounes

Co-Authored-By: Claude
---
 llmlingua/prompt_compressor.py |  10 ++
 tests/test_issue_216.py        | 173 +++++++++++++++++++++++++++++++++
 2 files changed, 183 insertions(+)
 create mode 100644 tests/test_issue_216.py

diff --git a/llmlingua/prompt_compressor.py b/llmlingua/prompt_compressor.py
index 84e390e..e9d7f8b 100644
--- a/llmlingua/prompt_compressor.py
+++ b/llmlingua/prompt_compressor.py
@@ -6,6 +6,7 @@
 import json
 import re
 import string
+import warnings
 from collections import defaultdict
 from typing import List, Union
 
@@ -133,6 +134,15 @@ def load_model(
             if any("ForTokenClassification" in ar for ar in config.architectures)
             else AutoModelForCausalLM
         )
+        if "cuda" in device_map and not torch.cuda.is_available():
+            warnings.warn(
+                f"device_map='{device_map}' was requested but CUDA is not available "
+                f"(torch.cuda.is_available() is False). Falling back to 'cpu'. "
+                f"To silence this warning, pass device_map='cpu' explicitly.",
+                RuntimeWarning,
+                stacklevel=2,
+            )
+            device_map = "cpu"
         self.device = (
             device_map
             if any(key in device_map for key in ["cuda", "cpu", "mps"])
diff --git a/tests/test_issue_216.py b/tests/test_issue_216.py
new file mode 100644
index 0000000..739cd2e
--- /dev/null
+++ b/tests/test_issue_216.py
@@ -0,0 +1,173 @@
+# Copyright (c) 2023 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+
+"""Regression test for https://github.com/microsoft/LLMLingua/issues/216
+
+PromptCompressor defaulted to device_map="cuda" even when torch was built
+without CUDA support, producing "AssertionError: Torch not compiled with CUDA
+enabled" on Windows / CPU-only machines. The fix transparently falls back to
+"cpu" (with a RuntimeWarning) when "cuda" is requested but unavailable.
+
+These tests exercise the fallback branch directly using monkeypatching so they
+run in a fraction of a second and do not depend on a real CUDA runtime.
+"""
+
+import unittest
+import warnings
+from unittest import mock
+
+import torch
+
+import llmlingua.prompt_compressor as prompt_compressor_module
+from llmlingua import PromptCompressor
+
+
+class _FakeConfig:
+    architectures = ["BertForTokenClassification"]
+    pad_token_id = 0
+    max_position_embeddings = 512
+
+
+class _FakeTokenizer:
+    pad_token_id = 0
+    eos_token_id = 0
+    padding_side = "right"
+    special_tokens_map = {"pad_token": "[PAD]"}
+
+    def add_special_tokens(self, mapping):
+        return 0
+
+
+class _FakeModel:
+    def resize_token_embeddings(self, new_len):
+        return None
+
+    def to(self, device):
+        return self
+
+    def eval(self):
+        return self
+
+
+class Issue216CudaFallbackTest(unittest.TestCase):
+    """Verify that PromptCompressor falls back to CPU when CUDA is unavailable."""
+
+    def _patched_load_model(
+        self,
+        cuda_available: bool,
+        requested_device_map: str,
+    ):
+        captured = {}
+
+        def fake_from_pretrained(*args, **kwargs):
+            name = args[0] if args else kwargs.get("model_name", "")
+            if "Config" in fake_from_pretrained._cls_name:
+                return _FakeConfig()
+            if "Tokenizer" in fake_from_pretrained._cls_name:
+                return _FakeTokenizer()
+            captured["device_map"] = kwargs.get("device_map")
+            return _FakeModel()
+
+        fake_from_pretrained._cls_name = ""
+
+        def build_fake(cls_name):
+            def _factory(*args, **kwargs):
+                if cls_name == "Config":
+                    return _FakeConfig()
+                if cls_name == "Tokenizer":
+                    return _FakeTokenizer()
+                captured["device_map"] = kwargs.get("device_map")
+                return _FakeModel()
+
+            return mock.Mock(from_pretrained=_factory)
+
+        with mock.patch.object(
+            torch.cuda, "is_available", return_value=cuda_available
+        ), mock.patch.object(
+            prompt_compressor_module, "AutoConfig", build_fake("Config")
+        ), mock.patch.object(
+            prompt_compressor_module, "AutoTokenizer", build_fake("Tokenizer")
+        ), mock.patch.object(
+            prompt_compressor_module,
+            "AutoModelForTokenClassification",
+            build_fake("Model"),
+        ), mock.patch.object(
+            prompt_compressor_module, "AutoModelForCausalLM", build_fake("Model")
+        ):
+            compressor = PromptCompressor.__new__(PromptCompressor)
+            compressor.load_model(
+                "microsoft/llmlingua-2-xlm-roberta-large-meetingbank",
+                device_map=requested_device_map,
+            )
+        return compressor, captured
+
+    def test_cuda_requested_but_unavailable_falls_back_to_cpu(self):
+        with warnings.catch_warnings(record=True) as captured_warnings:
+            warnings.simplefilter("always")
+            compressor, passed_kwargs = self._patched_load_model(
+                cuda_available=False, requested_device_map="cuda"
+            )
+
+        self.assertEqual(compressor.device, "cpu")
+        self.assertEqual(passed_kwargs["device_map"], "cpu")
+
+        runtime_warnings = [
+            w
+            for w in captured_warnings
+            if issubclass(w.category, RuntimeWarning)
+            and "CUDA is not available" in str(w.message)
+        ]
+        self.assertEqual(
+            len(runtime_warnings),
+            1,
+            f"expected exactly one CUDA fallback RuntimeWarning, got "
+            f"{len(runtime_warnings)}: {[str(w.message) for w in captured_warnings]}",
+        )
+
+    def test_cpu_requested_explicitly_is_untouched(self):
+        with warnings.catch_warnings(record=True) as captured_warnings:
+            warnings.simplefilter("always")
+            compressor, passed_kwargs = self._patched_load_model(
+                cuda_available=False, requested_device_map="cpu"
+            )
+
+        self.assertEqual(compressor.device, "cpu")
+        self.assertEqual(passed_kwargs["device_map"], "cpu")
+
+        runtime_warnings = [
+            w
+            for w in captured_warnings
+            if issubclass(w.category, RuntimeWarning)
+            and "CUDA" in str(w.message)
+        ]
+        self.assertEqual(
+            len(runtime_warnings),
+            0,
+            "explicit device_map='cpu' should not trigger any CUDA warning",
+        )
+
+    def test_cuda_requested_and_available_stays_on_cuda(self):
+        with warnings.catch_warnings(record=True) as captured_warnings:
+            warnings.simplefilter("always")
+            compressor, passed_kwargs = self._patched_load_model(
+                cuda_available=True, requested_device_map="cuda"
+            )
+
+        self.assertEqual(compressor.device, "cuda")
+        self.assertEqual(passed_kwargs["device_map"], "cuda")
+
+        runtime_warnings = [
+            w
+            for w in captured_warnings
+            if issubclass(w.category, RuntimeWarning)
+            and "CUDA is not available" in str(w.message)
+        ]
+        self.assertEqual(
+            len(runtime_warnings),
+            0,
+            "no fallback warning should be raised when CUDA is actually available",
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()