nikopueringer · nikopueringer · Mar 27, 2026 · Mar 26, 2026 · Mar 26, 2026 · Mar 26, 2026
diff --git a/CorridorKeyModule/inference_engine.py b/CorridorKeyModule/inference_engine.py
@@ -16,6 +16,11 @@
 from .core import color_utils as cu
 from .core.model_transformer import GreenFormer
 
+# Persist torch.compile autotune cache across runs (default is /tmp which
+# gets wiped on reboot — saves 10-20 min re-autotuning on ROCm, ~30s on CUDA)
+_inductor_cache = os.path.join(os.path.expanduser("~"), ".cache", "corridorkey", "inductor")
+os.environ.setdefault("TORCHINDUCTOR_CACHE_DIR", _inductor_cache)
+
 logger = logging.getLogger(__name__)
 
 
@@ -50,10 +55,16 @@ def __init__(
 
         self.model_precision = model_precision
 
+        self._is_rocm = hasattr(torch.version, "hip") and torch.version.hip
         self.model = self._load_model()
 
-        # We only tested compilation on Windows and Linux. For other platforms compilation is disabled as a precaution.
-        if sys.platform == "linux" or sys.platform == "win32":
+        # torch.compile is tested on CUDA (Windows + Linux) and ROCm (Linux).
+        # ROCm on Windows hangs during Triton kernel compilation — skip it.
+        # CORRIDORKEY_SKIP_COMPILE=1 forces eager mode (useful for testing).
+        skip_compile = (self._is_rocm and sys.platform == "win32") or os.environ.get("CORRIDORKEY_SKIP_COMPILE") == "1"
+        if skip_compile:
+            logger.info("Skipping torch.compile (eager mode)")
+        elif sys.platform == "linux" or sys.platform == "win32":
             self._compile()
 
     def _load_model(self) -> GreenFormer:
@@ -116,20 +127,43 @@ def _load_model(self) -> GreenFormer:
         return model
 
     def _compile(self):
+        if self._is_rocm:
+            # "default" avoids the heavy autotuning that OOM-kills 16GB cards
+            # at 2048x2048. Still compiles Triton kernels, just skips the
+            # exhaustive benchmarking. HIP graphs are also avoided (segfault
+            # on large graphs — pytorch/pytorch#155720).
+            compile_mode = "default"
+        else:
+            compile_mode = "max-autotune"
+
         try:
-            compiled_model = torch.compile(self.model, mode="max-autotune")
-            # Trigger compilation with a dummy input
+            if self._is_rocm:
+                logger.info(
+                    "Compiling model (mode=%s) — this may take 10-20 minutes on first run (ROCm). "
+                    "Compiled kernels are cached for future runs.",
+                    compile_mode,
+                )
+            else:
+                logger.info("Compiling model (mode=%s)...", compile_mode)
+            compiled_model = torch.compile(self.model, mode=compile_mode)
+            # Trigger compilation with a dummy input (the actual compile
+            # happens here, not in the torch.compile() call above)
             dummy_input = torch.zeros(
                 1, 4, self.img_size, self.img_size, dtype=self.model_precision, device=self.device
             )
             with torch.inference_mode():
                 compiled_model(dummy_input)
+            del dummy_input
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
             self.model = compiled_model
+            logger.info("Model compiled successfully (mode=%s)", compile_mode)
 
         except Exception as e:
             logger.info(f"Compilation error: {e}")
             logger.warning("Model compilation failed. Falling back to eager mode.")
-            torch.cuda.empty_cache()
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
 
     def _preprocess_input(
         self, image_batch: torch.Tensor, mask_batch_linear: torch.Tensor, input_is_linear: bool

diff --git a/README.md b/README.md
@@ -35,7 +35,8 @@ This project was designed and built on a Linux workstation (Puget Systems PC) eq
 
 The most recent build should work on computers with 6-8 gig of VRAM, and it can run on most M1+ Mac systems with unified memory. Yes, it might even work on your old Macbook pro. Let us know on the Discord!
 
-*   **Windows Users:** To run GPU acceleration natively on Windows, your system MUST have NVIDIA drivers that support **CUDA 12.8 or higher** installed. If your drivers only support older CUDA versions, the installer will likely fallback to the CPU.
+*   **Windows Users (NVIDIA):** To run GPU acceleration natively on Windows, your system MUST have NVIDIA drivers that support **CUDA 12.8 or higher** installed. If your drivers only support older CUDA versions, the installer will likely fall back to the CPU.
+*   **AMD GPU Users (ROCm):** AMD Radeon RX 7000 series (RDNA3) and RX 9000 series (RDNA4) are supported via ROCm on **Linux**. Windows ROCm support is experimental (torch.compile is not yet functional). See the [AMD ROCm Setup](#amd-rocm-setup) section below.
 *   **GVM (Optional):** Requires approximately **80 GB of VRAM** and utilizes massive Stable Video Diffusion models.
 *   **VideoMaMa (Optional):** Natively requires a massive chunk of VRAM as well (originally 80GB+). While the community has tweaked the architecture to run at less than 24GB, those extreme memory optimizations have not yet been fully implemented in this repository.
 *   **BiRefNet (Optional):** Lightweight AlphaHint generator option.
@@ -72,6 +73,7 @@ This project uses **[uv](https://docs.astral.sh/uv/)** to manage Python and all
     uv sync --extra cuda     # CUDA GPU acceleration (Linux/Windows)
     uv sync --extra mlx      # Apple Silicon MLX acceleration
     ```
+    For **AMD ROCm** setup, see the [AMD ROCm Setup](#amd-rocm-setup) section below.
 4.  **Download the Models:**
     *   **CorridorKey v1.0 Model (~300MB):** Downloads automatically on first run. If no `.pth` file is found in `CorridorKeyModule/checkpoints/`, the engine fetches it from [CorridorKey's HuggingFace](https://huggingface.co/nikopueringer/CorridorKey_v1.0) and saves it as `CorridorKey.pth`. No manual download needed.
     *   **GVM Weights (Optional):** [HuggingFace: geyongtao/gvm](https://huggingface.co/geyongtao/gvm)
@@ -220,6 +222,79 @@ uv run python corridorkey_cli.py wizard --win_path "/path/to/clips"
 
 **Use native MLX instead of PyTorch MPS:** MLX avoids PyTorch's MPS layer entirely and typically runs faster on Apple Silicon. See the [Backend Selection](#backend-selection) section below for setup steps.
 
+### AMD ROCm Setup
+
+CorridorKey supports AMD GPUs via PyTorch's ROCm/HIP backend. ROCm builds of PyTorch expose the GPU through the same `torch.cuda.*` API (backed by HIP instead of CUDA), so the inference code runs unchanged.
+
+**Supported GPUs (ROCm 7.2+):**
+- RX 7900 XTX (24GB) / XT (20GB) / GRE (16GB) — RDNA3, gfx1100
+- RX 7800 XT (16GB) / 7700 XT (12GB) — RDNA3, gfx1101
+- RX 9070 XT / 9070 (16GB) — RDNA4, gfx1201
+
+**VRAM requirements:** CorridorKey inference at 2048x2048 uses ~10GB on NVIDIA but ~18GB on AMD due to HIP allocator overhead. The RX 7900 XTX (24GB) and RX 7900 XT (20GB) run at full resolution. Cards with 16GB (RX 7800 XT, 9070 XT) work on Windows (which uses system RAM as overflow) but may OOM on Linux — see notes below.
+
+**Linux native (recommended):**
+```bash
+uv sync --extra rocm
+
+# Verify
+uv run python -c "import torch; print(torch.cuda.is_available(), torch.cuda.get_device_name(0))"
+```
+
+**WSL2 (Windows Subsystem for Linux):**
+
+Requires AMD Adrenalin 26.1.1+ driver on Windows. Install ROCm inside WSL2, then use AMD's WSL-specific torch wheels:
+
+```bash
+# 1. Install ROCm for WSL (Ubuntu 24.04)
+sudo apt update
+wget https://repo.radeon.com/amdgpu-install/7.2/ubuntu/noble/amdgpu-install_7.2.70200-1_all.deb
+sudo apt install ./amdgpu-install_7.2.70200-1_all.deb
+amdgpu-install -y --usecase=wsl,rocm --no-dkms
+
+# 2. Verify GPU is visible
+rocminfo  # should show your AMD GPU
+
+# 3. Install AMD's WSL torch wheels (Python 3.12)
+pip3 install \
+  https://repo.radeon.com/rocm/manylinux/rocm-rel-7.2/torch-2.9.1%2Brocm7.2.0.lw.git7e1940d4-cp312-cp312-linux_x86_64.whl \
+  https://repo.radeon.com/rocm/manylinux/rocm-rel-7.2/torchvision-0.24.0%2Brocm7.2.0.gitb919bd0c-cp312-cp312-linux_x86_64.whl \
+  https://repo.radeon.com/rocm/manylinux/rocm-rel-7.2/triton-3.5.1%2Brocm7.2.0.gita272dfa8-cp312-cp312-linux_x86_64.whl
+
+# 4. Fix WSL runtime library conflict (required)
+location=$(pip3 show torch | grep Location | awk -F ": " '{print $2}')
+rm -f ${location}/torch/lib/libhsa-runtime64.so*
+
+# 5. Install CorridorKey deps AFTER torch (so pip doesn't overwrite ROCm torch)
+pip3 install -e .
+```
+
+**Windows native (experimental):**
+
+Windows ROCm requires Python 3.12 and AMD Adrenalin 25.3.1+ driver. `torch.compile` does not work on Windows ROCm — inference runs in eager mode (significantly slower than Linux).
+
+```powershell
+py -3.12 -m pip install https://repo.radeon.com/rocm/windows/rocm-rel-7.2/rocm-7.2.0.dev0-py3-none-win_amd64.whl
+py -3.12 -m pip install --no-cache-dir https://repo.radeon.com/rocm/windows/rocm-rel-7.2/torch-2.9.1+rocmsdk20260116-cp312-cp312-win_amd64.whl https://repo.radeon.com/rocm/windows/rocm-rel-7.2/torchvision-0.24.1+rocmsdk20260116-cp312-cp312-win_amd64.whl
+```
+
+**What CorridorKey does automatically on ROCm:**
+- Sets `TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1` so SDPA dispatches to flash attention kernels on RDNA3 (without this, attention falls back to a slow path that materializes the full O(n²) attention matrix)
+- Sets `MIOPEN_FIND_MODE=2` for faster convolution kernel selection (reduces warmup from 5-8 minutes to seconds) and `MIOPEN_LOG_LEVEL=4` to hide MIOpen info/debug output while keeping warnings and errors
+- Uses `torch.compile(mode="default")` on Linux to avoid OOM during kernel autotuning on 16GB cards
+- Skips `torch.compile` entirely on Windows ROCm where Triton compilation hangs
+- Auto-detects ROCm via `/opt/rocm` (Linux), `HIP_PATH` (Windows), `HIP_VISIBLE_DEVICES` (any platform), or `CORRIDORKEY_ROCM=1` env var (explicit opt-in)
+
+**First-run note:** The first inference run on a new AMD GPU triggers Triton kernel compilation (10-20 minutes). The compiled kernels are cached in `~/.cache/corridorkey/inductor/` (or in `TORCHINDUCTOR_CACHE_DIR` if you have set it), so this only happens once per GPU architecture. Subsequent runs start instantly.
+
+**16GB cards on Linux:** CorridorKey at 2048x2048 needs ~18GB. Windows handles this transparently via shared GPU memory (system RAM overflow). On Linux, the GPU has a hard VRAM limit. If you hit OOM on a 16GB card, install `pytorch-rocm-gtt` to enable GTT (system RAM as GPU overflow) — CorridorKey detects and uses it automatically:
+```bash
+pip install pytorch-rocm-gtt
+```
+GTT memory is accessed over PCIe (~10-20x slower than VRAM), so expect slower frame times on 16GB cards vs 20-24GB cards.
+
+**WSL2 limitation:** WSL2 cannot use GTT or shared memory — it has a hard VRAM limit. 16GB cards will OOM in WSL2 at 2048x2048. Use Windows native instead, or a card with 20GB+ VRAM.
+
 ## Backend Selection
 
 CorridorKey supports two inference backends:

diff --git a/corridorkey_cli.py b/corridorkey_cli.py
@@ -25,11 +25,24 @@
 from rich.console import Console
 from rich.logging import RichHandler
 from rich.panel import Panel
-from rich.progress import BarColumn, MofNCompleteColumn, Progress, SpinnerColumn, TaskID, TextColumn, TimeElapsedColumn
+from rich.progress import (
+    BarColumn,
+    MofNCompleteColumn,
+    Progress,
+    SpinnerColumn,
+    TaskID,
+    TextColumn,
+    TimeElapsedColumn,
+)
 from rich.prompt import Confirm, IntPrompt, Prompt
 from rich.table import Table
 
-from clip_manager import (
+# Set ROCm env vars before any module imports torch.
+from device_utils import setup_rocm_env
+
+setup_rocm_env()
+
+from clip_manager import (  # noqa: E402
     LINUX_MOUNT_ROOT,
     ClipEntry,
     InferenceSettings,
@@ -43,8 +56,8 @@
     run_videomama,
     scan_clips,
 )
-from CorridorKeyModule.backend import resolve_backend
-from device_utils import resolve_device
+from CorridorKeyModule.backend import resolve_backend  # noqa: E402
+from device_utils import resolve_device  # noqa: E402
 
 logger = logging.getLogger(__name__)
 console = Console()

diff --git a/device_utils.py b/device_utils.py
@@ -3,16 +3,57 @@
 import logging
 import os
 
-import torch
-
 logger = logging.getLogger(__name__)
 
 DEVICE_ENV_VAR = "CORRIDORKEY_DEVICE"
 VALID_DEVICES = ("auto", "cuda", "mps", "cpu")
 
 
+def is_rocm_system() -> bool:
+    """Detect if the system has AMD ROCm available (without importing torch).
+
+    Checks: /opt/rocm (Linux), HIP_PATH env var (Windows), HIP_VISIBLE_DEVICES
+    (any platform), CORRIDORKEY_ROCM=1 (explicit opt-in for cases where
+    auto-detection fails, e.g. pip-installed ROCm on Windows).
+    """
+    return (
+        os.path.exists("/opt/rocm")
+        or os.environ.get("HIP_PATH") is not None
+        or os.environ.get("HIP_VISIBLE_DEVICES") is not None
+        or os.environ.get("CORRIDORKEY_ROCM") == "1"
+    )
+
+
+def setup_rocm_env() -> None:
+    """Set ROCm environment variables and apply optional patches.
+
+    Must be called before importing torch so that env vars are visible to
+    PyTorch's initialization. This module intentionally avoids importing
+    torch at module level to make that possible. Safe to call on non-ROCm
+    systems (no-op).
+    """
+    if not is_rocm_system():
+        return
+    os.environ.setdefault("TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL", "1")
+    os.environ.setdefault("MIOPEN_FIND_MODE", "2")
+    # Level 4 = suppress info/debug but keep warnings and errors visible
+    os.environ.setdefault("MIOPEN_LOG_LEVEL", "4")
+    # Enable GTT (system RAM as GPU overflow) on Linux for 16GB cards.
+    # pytorch-rocm-gtt must be installed separately: pip install pytorch-rocm-gtt
+    try:
+        import pytorch_rocm_gtt
+
+        pytorch_rocm_gtt.patch()
+    except ImportError:
+        pass  # not installed — expected on most systems
+    except Exception:
+        logger.warning("pytorch-rocm-gtt is installed but patch() failed", exc_info=True)
+
+
 def detect_best_device() -> str:
     """Auto-detect best available device: CUDA > MPS > CPU."""
+    import torch
+
     if torch.cuda.is_available():
         device = "cuda"
     elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
@@ -36,6 +77,8 @@ def resolve_device(requested: str | None = None) -> str:
     Raises:
         RuntimeError: If the requested backend is unavailable.
     """
+    import torch
+
     # CLI arg takes priority, then env var, then auto
     device = requested
     if device is None or device == "auto":
@@ -67,8 +110,10 @@ def resolve_device(requested: str | None = None) -> str:
     return device
 
 
-def clear_device_cache(device: torch.device | str) -> None:
+def clear_device_cache(device) -> None:
     """Clear GPU memory cache if applicable (no-op for CPU)."""
+    import torch
+
     device_type = device.type if isinstance(device, torch.device) else device
     if device_type == "cuda":
         torch.cuda.empty_cache()

diff --git a/pyproject.toml b/pyproject.toml
@@ -50,6 +50,11 @@ cuda = [
 mlx = [
     "corridorkey-mlx ; python_version >= '3.11'",
 ]
+rocm = [
+    "torch==2.8.0",
+    "torchvision==0.23.0",
+    "pytorch-triton-rocm==3.4.0 ; platform_machine == 'x86_64' and sys_platform == 'linux'",
+]
 
 [dependency-groups]
 dev = ["pytest", "pytest-cov", "ruff", "hypothesis"]
@@ -111,6 +116,7 @@ conflicts = [
     [
         { extra = "cuda" },
         { extra = "mlx" },
+        { extra = "rocm" },
     ],
 ]
 
@@ -120,9 +126,22 @@ url = "https://download.pytorch.org/whl/cu128" # CUDA 12.6 doesn't support RTX 5
 explicit = true
 extra = "cuda"
 
+[[tool.uv.index]]
+name = "pytorch-rocm"
+url = "https://download.pytorch.org/whl/rocm6.3"
+explicit = true
+extra = "rocm"
+
 [tool.uv.sources]
 # Use Hiera fix in order to utilize the FlashAttention Kernel
 timm = { git = "https://github.com/Raiden129/pytorch-image-models-fix", branch = "fix/hiera-flash-attention-global-4d" }
-torch = { index = "pytorch", extra = "cuda" }
-torchvision = { index = "pytorch", extra = "cuda" }
+torch = [
+    { index = "pytorch", extra = "cuda" },
+    { index = "pytorch-rocm", extra = "rocm" },
+]
+torchvision = [
+    { index = "pytorch", extra = "cuda" },
+    { index = "pytorch-rocm", extra = "rocm" },
+]
+pytorch-triton-rocm = { index = "pytorch-rocm", extra = "rocm" }
 corridorkey-mlx = { git = "https://github.com/nikopueringer/corridorkey-mlx.git", extra = "mlx" }
diff --git a/tests/test_device_utils.py b/tests/test_device_utils.py
@@ -119,7 +119,7 @@ def test_mps_no_backend_raises(self, monkeypatch):
         _patch_gpu(monkeypatch, cuda=False, mps=False)
         # Replace torch.backends with an object that lacks "mps" entirely
         fake_backends = type("Backends", (), {})()
-        monkeypatch.setattr("device_utils.torch.backends", fake_backends)
+        monkeypatch.setattr(torch, "backends", fake_backends)
         with pytest.raises(RuntimeError, match="no MPS support"):
             resolve_device("mps")
 

diff --git a/tests/test_pyproject_structure.py b/tests/test_pyproject_structure.py
@@ -86,14 +86,26 @@ class TestUvSources:
     def test_torch_source_has_cuda_extra(self, pyproject: dict) -> None:
         sources = pyproject["tool"]["uv"]["sources"]
         torch_src = sources["torch"]
-        assert torch_src.get("extra") == "cuda"
-        assert "marker" not in torch_src, "torch source should not have platform markers"
+        cuda_entry = next(s for s in torch_src if s.get("extra") == "cuda")
+        assert cuda_entry["index"] == "pytorch"
 
     def test_torchvision_source_has_cuda_extra(self, pyproject: dict) -> None:
         sources = pyproject["tool"]["uv"]["sources"]
         tv_src = sources["torchvision"]
-        assert tv_src.get("extra") == "cuda"
-        assert "marker" not in tv_src, "torchvision source should not have platform markers"
+        cuda_entry = next(s for s in tv_src if s.get("extra") == "cuda")
+        assert cuda_entry["index"] == "pytorch"
+
+    def test_torch_source_has_rocm_extra(self, pyproject: dict) -> None:
+        sources = pyproject["tool"]["uv"]["sources"]
+        torch_src = sources["torch"]
+        rocm_entry = next(s for s in torch_src if s.get("extra") == "rocm")
+        assert rocm_entry["index"] == "pytorch-rocm"
+
+    def test_torchvision_source_has_rocm_extra(self, pyproject: dict) -> None:
+        sources = pyproject["tool"]["uv"]["sources"]
+        tv_src = sources["torchvision"]
+        rocm_entry = next(s for s in tv_src if s.get("extra") == "rocm")
+        assert rocm_entry["index"] == "pytorch-rocm"
 
 
 # ---------------------------------------------------------------------------
@@ -110,7 +122,9 @@ def test_cuda_mlx_conflict_declared(self, pyproject: dict) -> None:
         extras_in_groups = [
             {entry["extra"] for entry in group} for group in conflicts if all("extra" in entry for entry in group)
         ]
-        assert {"cuda", "mlx"} in extras_in_groups, "Expected a conflict group containing both 'cuda' and 'mlx' extras"
+        assert {"cuda", "mlx", "rocm"} in extras_in_groups, (
+            "Expected a conflict group containing 'cuda', 'mlx', and 'rocm' extras"
+        )
 
 
 # ---------------------------------------------------------------------------