From 258b20e360baaccaf772d0341329b411cd79dcbf Mon Sep 17 00:00:00 2001 From: Zander Date: Sun, 15 Feb 2026 15:08:16 -0500 Subject: [PATCH 1/3] feat(rocm): Add AMD ROCm GPU support infrastructure Add Docker build infrastructure and shell script support for running OpenTranscribe on AMD GPUs via ROCm/HIP. New files: - Dockerfile.rocm: Multi-stage build with PyTorch ROCm 6.4, CTranslate2 ROCm wheel, and MIOpen JIT compilation headers - requirements-rocm.txt: Python dependencies with ROCm-specific PyTorch - docker-compose.rocm-build.yml: Build overlay for ROCm backend image - docker-compose.gpu-rocm.yml: Runtime overlay with GPU device passthrough, render group mapping, and ROCm environment variables Modified files: - opentr.sh: Auto-detect ROCm GPUs and inject compose overlays - .env.example: Document HSA_OVERRIDE_GFX_VERSION and RENDER_GROUP_GID Co-Authored-By: Claude Opus 4.6 --- .env.example | 20 ++++ backend/Dockerfile.rocm | 168 ++++++++++++++++++++++++++++++++++ backend/requirements-rocm.txt | 72 +++++++++++++++ docker-compose.gpu-rocm.yml | 50 ++++++++++ docker-compose.rocm-build.yml | 67 ++++++++++++++ opentr.sh | 52 +++++++++++ 6 files changed, 429 insertions(+) create mode 100644 backend/Dockerfile.rocm create mode 100644 backend/requirements-rocm.txt create mode 100644 docker-compose.gpu-rocm.yml create mode 100644 docker-compose.rocm-build.yml diff --git a/.env.example b/.env.example index 873a0304..89eaadeb 100644 --- a/.env.example +++ b/.env.example @@ -188,6 +188,26 @@ GPU_SCALE_ENABLED=false GPU_SCALE_DEVICE_ID=2 GPU_SCALE_WORKERS=4 +# AMD ROCm GPU Configuration (for AMD Radeon GPUs) +# Only needed if using an AMD GPU with ROCm drivers installed. +# opentr.sh auto-detects AMD GPUs via rocm-smi and loads ROCm overlays. +# +# HSA_OVERRIDE_GFX_VERSION: Required for consumer AMD GPUs to work with ROCm. 
+# Set this to match your GPU architecture: +# RX 7900 XTX/XT (gfx1100): 11.0.0 +# RX 7800 XT / 7700 XT (gfx1101): 11.0.1 +# RX 7600 (gfx1102): 11.0.2 +# RX 6900/6800 XT (gfx1030): 10.3.0 +# RX 6700 XT (gfx1031): 10.3.1 +# See: https://rocm.docs.amd.com/en/latest/reference/gpu-arch-specs.html +# HSA_OVERRIDE_GFX_VERSION=11.0.1 +# +# RENDER_GROUP_GID: Host's render group GID for /dev/kfd access. +# Auto-detected by opentr.sh. Only set manually if auto-detection fails. +# Find your system's value with: stat -c '%g' /dev/kfd +# Common values: 109 (Ubuntu/Debian), 993 (Fedora/RHEL) +# RENDER_GROUP_GID=109 + #============================================================================= # AI MODELS CONFIGURATION #============================================================================= diff --git a/backend/Dockerfile.rocm b/backend/Dockerfile.rocm new file mode 100644 index 00000000..2542048e --- /dev/null +++ b/backend/Dockerfile.rocm @@ -0,0 +1,168 @@ +# ============================================================================= +# OpenTranscribe Backend - ROCm Dockerfile (AMD GPU) +# Multi-stage build optimized for security with non-root user +# Uses PyTorch ROCm 6.4 and CTranslate2 ROCm HIP backend +# ============================================================================= + +# ----------------------------------------------------------------------------- +# Stage 1: Build Stage - Install Python dependencies with compilation +# ----------------------------------------------------------------------------- +FROM python:3.13-slim-trixie AS builder + +WORKDIR /build + +# Install build dependencies (only in this stage) +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + gcc \ + g++ \ + curl \ + unzip \ + && rm -rf /var/lib/apt/lists/* + +# Copy only requirements first for better layer caching +COPY requirements-rocm.txt . 
+ +# Upgrade pip to latest version first (security fix for CVE-2025-8869) +RUN pip install --user --no-cache-dir --upgrade pip + +# Install Python dependencies (ROCm variant) +# PyTorch 2.8.0+rocm6.4 from pytorch.org +# CTranslate2 ROCm wheel from GitHub releases (not on PyPI) +# WhisperX 3.7.0 - latest version with ctranslate2 4.5+ compatibility +# NumPy 2.x - fully compatible with all packages +# Use --user to install to /root/.local which we'll copy to final stage +RUN pip install --user --no-cache-dir --no-warn-script-location -r requirements-rocm.txt + +# Install CTranslate2 ROCm wheel from official GitHub releases +# These wheels are built with HIP backend for AMD GPUs +# See: https://github.com/OpenNMT/CTranslate2/releases +ARG CTRANSLATE2_VERSION=4.7.1 +ARG CTRANSLATE2_PYTHON=cp313 +RUN curl -sL -o /tmp/rocm-wheels.zip \ + "https://github.com/OpenNMT/CTranslate2/releases/download/v${CTRANSLATE2_VERSION}/rocm-python-wheels-Linux.zip" && \ + unzip -j /tmp/rocm-wheels.zip "temp-linux/ctranslate2-${CTRANSLATE2_VERSION}-${CTRANSLATE2_PYTHON}-${CTRANSLATE2_PYTHON}-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl" -d /tmp/ && \ + pip install --user --no-cache-dir --force-reinstall /tmp/ctranslate2-*.whl && \ + rm -f /tmp/rocm-wheels.zip /tmp/ctranslate2-*.whl + +# ----------------------------------------------------------------------------- +# Stage 2: Runtime Stage - Minimal production image with non-root user +# ----------------------------------------------------------------------------- +FROM python:3.13-slim-trixie + +# OCI annotations for container metadata and compliance +LABEL org.opencontainers.image.title="OpenTranscribe Backend (ROCm)" \ + org.opencontainers.image.description="AI-powered transcription backend with WhisperX and PyAnnote (AMD ROCm GPU)" \ + org.opencontainers.image.vendor="OpenTranscribe" \ + org.opencontainers.image.authors="OpenTranscribe Contributors" \ + org.opencontainers.image.licenses="AGPL-3.0" \ + 
org.opencontainers.image.source="https://github.com/davidamacey/OpenTranscribe" \ + org.opencontainers.image.documentation="https://github.com/davidamacey/OpenTranscribe/blob/master/README.md" + +# Install runtime dependencies and ROCm runtime libraries +# ROCm 7.0 libs needed by CTranslate2's ROCm wheel at runtime. +# hsa-rocr: HSA runtime with ROCm 7.0 symbols (hsa_amd_memory_get_preferred_copy_engine) +# rocrand-dev: Headers for MIOpen JIT kernel compilation on consumer GPUs (gfx1100/1101) +# PyTorch ROCm wheels lack pre-compiled MIOpen kernels for RDNA 3, so MIOpen compiles +# them at first run. This needs rocrand headers (rocrand/rocrand_xorwow.h). +# hip-dev headers: hip/hip_runtime.h needed by rocrand headers for MIOpen JIT. +# hip-dev can't be installed via apt because it pulls rocm-llvm which depends on +# libstdc++-11-dev (unavailable on Trixie). Instead we download the .deb and +# extract only the include directory — no dependency resolution needed. +# C++ header stubs: hiprtc/comgr's built-in hiprtc_runtime.h provides type traits +# (integral_constant, enable_if, etc.) but rocrand headers do #include <utility> and <type_traits>, +# which are unused. We can't install real C++ headers (GCC's conflict with +# hiprtc_runtime.h's definitions; rocm-llvm can't be installed on Trixie). +# Instead we create empty stubs in /opt/rocm/include/ so the #include resolves. +# Debian Trixie ships libhsa-runtime64-1 (ROCm 6.x) which we override via symlink. +# We use the AMD apt repo targeting Ubuntu Noble (24.04), compatible with Trixie. 
+ARG ROCM_VERSION=7.0 +RUN apt-get update && apt-get install -y --no-install-recommends \ + curl \ + ffmpeg \ + libsndfile1 \ + libimage-exiftool-perl \ + libgomp1 \ + && echo "deb [arch=amd64 trusted=yes] https://repo.radeon.com/rocm/apt/${ROCM_VERSION}/ noble main" \ + > /etc/apt/sources.list.d/rocm.list \ + && apt-get update && apt-get install -y --no-install-recommends \ + hsa-rocr \ + hipblas \ + hiprand \ + hipsparse \ + hipsolver \ + hipfft \ + miopen-hip \ + rocrand-dev \ + && cd /tmp && apt-get download hip-dev \ + && dpkg-deb -x hip-dev_*.deb /tmp/hip-extract \ + && cp -a /tmp/hip-extract/opt/rocm*/include/* /opt/rocm/include/ 2>/dev/null || true \ + && rm -rf /tmp/hip-extract /tmp/hip-dev_*.deb \ + && printf '#pragma once\n' | tee /opt/rocm/include/utility /opt/rocm/include/type_traits > /dev/null \ + && echo "/opt/rocm/lib" > /etc/ld.so.conf.d/rocm.conf \ + && ldconfig \ + && ln -sf /opt/rocm/lib/libhsa-runtime64.so.1 /usr/lib/x86_64-linux-gnu/libhsa-runtime64.so.1 \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +# Create non-root user for security +# Add to video and render groups for ROCm GPU access (/dev/kfd, /dev/dri) +RUN groupadd -r render && \ + groupadd -r appuser && \ + useradd -r -g appuser -G video,render -u 1000 -m -s /bin/bash appuser && \ + mkdir -p /app /app/models /app/temp && \ + chown -R appuser:appuser /app && \ + mkdir -p /home/appuser/.cache/huggingface \ + /home/appuser/.cache/torch \ + /home/appuser/.cache/nltk_data \ + /home/appuser/.cache/sentence-transformers \ + /home/appuser/.cache/yt-dlp && \ + chown -R appuser:appuser /home/appuser/.cache + +# Set working directory +WORKDIR /app + +# Copy Python packages from builder stage +COPY --from=builder --chown=appuser:appuser /root/.local /home/appuser/.local + +# Replace PyTorch's and Triton's bundled ROCm 6.4 HSA runtime with ROCm 7.0 +# Both bundle libhsa-runtime64.so (ROCm 6.4, SONAME .so.1). 
CTranslate2 needs +# ROCm 7.0's version (with hsa_amd_memory_get_preferred_copy_engine@ROCR_1). +# Since shared libs with the same SONAME load only once per process, whichever +# loads first wins. PyTorch loads before CTranslate2, so without this fix its +# ROCm 6.4 HSA runtime is used and CTranslate2 fails with undefined symbol. +RUN ln -sf /opt/rocm/lib/libhsa-runtime64.so.1 \ + /home/appuser/.local/lib/python3.13/site-packages/torch/lib/libhsa-runtime64.so && \ + ln -sf /opt/rocm/lib/libhsa-runtime64.so.1 \ + /home/appuser/.local/lib/python3.13/site-packages/triton/backends/amd/lib/libhsa-runtime64.so + +# Ensure scripts in .local are usable by adding to PATH +# Set LD_LIBRARY_PATH so CTranslate2 and other native libs can find: +# - ROCm system libs (hipblas, rocblas) installed via apt from /opt/rocm/lib +# - ROCm runtime libs (amdhip64) bundled in PyTorch ROCm wheel +# Set cache directories to user home +ENV PATH=/home/appuser/.local/bin:$PATH \ + PYTHONUNBUFFERED=1 \ + PYTHONDONTWRITEBYTECODE=1 \ + LD_LIBRARY_PATH=/opt/rocm/lib:/home/appuser/.local/lib/python3.13/site-packages/torch/lib \ + HF_HOME=/home/appuser/.cache/huggingface \ + TRANSFORMERS_CACHE=/home/appuser/.cache/huggingface/transformers \ + TORCH_HOME=/home/appuser/.cache/torch \ + NLTK_DATA=/home/appuser/.cache/nltk_data \ + SENTENCE_TRANSFORMERS_HOME=/home/appuser/.cache/sentence-transformers + +# Copy application code +COPY --chown=appuser:appuser . . 
+ +# Switch to non-root user +USER appuser + +# Expose application port +EXPOSE 8080 + +# Health check to verify the application is responding +HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \ + CMD curl -f http://localhost:8080/health || exit 1 + +# Command to run the application in production (no reload) +CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8080"] diff --git a/backend/requirements-rocm.txt b/backend/requirements-rocm.txt new file mode 100644 index 00000000..16919221 --- /dev/null +++ b/backend/requirements-rocm.txt @@ -0,0 +1,72 @@ +fastapi>=0.103.1 +uvicorn[standard]>=0.23.2 +websockets>=11.0.3 +sqlalchemy>=2.0.20 +alembic>=1.12.0 +pydantic>=2.3.0 +pydantic-settings>=2.0.0 +python-jose[cryptography]>=3.3.0 +python-multipart>=0.0.6 +passlib[bcrypt]>=1.7.4 +bcrypt==3.2.2 +email-validator +# Asynchronous Tasks +celery>=5.3.4 +redis>=5.0.0 +flower>=2.0.0 +psycopg2-binary>=2.9.7 +minio>=7.2.18 +opensearch-py>=3.0.0 +httpx>=0.24.1 +python-dotenv>=1.0.0 +pytest>=7.4.2 + +# LDAP/Active Directory Authentication +ldap3>=2.9.1 + +# Rate Limiting +slowapi>=0.1.9 + +# PKI/X.509 Certificate Authentication +cryptography>=42.0.0 + +# MFA/TOTP Authentication (FedRAMP IA-2) +pyotp>=2.9.0 +qrcode[pil]>=7.4.0 + +# AI/ML Stack - ROCm variant for AMD GPUs +# Non-GPU packages are identical to requirements.txt +numpy>=1.25.2 + +# PyTorch with ROCm 6.4 support for AMD GPUs +# PyTorch 2.8.0 bundles its own ROCm 6.4 runtime in torch/lib/ +# CTranslate2 ROCm wheel uses ROCm 7.0 system libs (installed via apt in Dockerfile) +# Both ship libhsa-runtime64.so.1; the Dockerfile symlinks ROCm 7.0's copy over PyTorch's +--extra-index-url https://download.pytorch.org/whl/rocm6.4 +torch==2.8.0 +torchaudio==2.8.0 + +# WhisperX latest version with ctranslate2 4.5+ support +whisperx==3.7.0 + +# CTranslate2 with ROCm HIP support (official wheels from GitHub releases) +# The ROCm wheel is installed separately in the Dockerfile from: +# 
https://github.com/OpenNMT/CTranslate2/releases +# Do NOT install ctranslate2 from PyPI here - the PyPI version is CUDA-only. +# The Dockerfile handles downloading and installing the ROCm wheel. + +# PyAnnote (compatible with NumPy 2.x and PyTorch 2.6+) +pyannote.audio>=3.3.2 + +# Supporting libraries +omegaconf>=2.3.0 +ffmpeg-python>=0.2.0 +sentencepiece>=0.1.99 +psutil>=5.9.5 +pyexiftool>=0.5.0 +yt-dlp>=2025.1.7 + +# Semantic search and NLP +sentence-transformers>=2.2.0 # For semantic search embeddings (all-MiniLM-L6-v2) +# Note: nltk is installed as transitive dependency from transformers/whisperx +# NLTK data files (punkt_tab) must be pre-downloaded for offline usage diff --git a/docker-compose.gpu-rocm.yml b/docker-compose.gpu-rocm.yml new file mode 100644 index 00000000..1aaab059 --- /dev/null +++ b/docker-compose.gpu-rocm.yml @@ -0,0 +1,50 @@ +# docker-compose.gpu-rocm.yml +# Optional overlay for GPU acceleration (AMD ROCm) +# +# This overlay enables GPU runtime for celery-worker on systems with AMD GPUs. +# It is automatically detected and loaded by opentr.sh when ROCm is available. +# +# Usage: +# ./opentr.sh start dev # Auto-detects AMD GPU and applies if available +# OR manually: +# docker compose -f docker-compose.yml -f docker-compose.override.yml \ +# -f docker-compose.rocm-build.yml -f docker-compose.gpu-rocm.yml up +# +# Note: This file is automatically included by opentr.sh when AMD ROCm GPU is detected. +# On macOS or systems without AMD GPU, this file is not loaded, allowing the base +# docker-compose.yml to work on CPU-only systems. 
+# +# Configuration (.env file): +# GPU_DEVICE_ID - GPU device to use (default: 0) +# HSA_OVERRIDE_GFX_VERSION - GFX version override for GPU compatibility +# RX 7900 XTX/XT (gfx1100): 11.0.0 +# RX 7800 XT / 7700 XT (gfx1101): 11.0.1 +# RX 7600 (gfx1102): 11.0.2 +# RX 6900/6800 XT (gfx1030): 10.3.0 +# RENDER_GROUP_GID - Host's render group GID for /dev/kfd access +# Find with: stat -c '%g' /dev/kfd +# Common values: 109 (Ubuntu), 993 (Fedora/RHEL), varies by distro + +services: + celery-worker: + # Enable AMD ROCm GPU access via device passthrough + # /dev/kfd - Kernel Fusion Driver (required for ROCm compute) + # /dev/dri - Direct Rendering Infrastructure (GPU device nodes) + devices: + - /dev/kfd:/dev/kfd + - /dev/dri:/dev/dri + # Add host's video and render group GIDs for device access. + # The render group GID varies by distro (e.g. 109 on Ubuntu, 993 on Fedora). + # Using numeric GID ensures it matches the host's /dev/kfd ownership. + group_add: + - video + - "${RENDER_GROUP_GID:-109}" + # ROCm requires seccomp=unconfined for HSA memory mapping + # See: https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html + security_opt: + - seccomp:unconfined + # Larger shared memory for PyTorch model loading (default 64MB is too small) + shm_size: '8g' + environment: + - HSA_OVERRIDE_GFX_VERSION=${HSA_OVERRIDE_GFX_VERSION:-11.0.0} + - HIP_VISIBLE_DEVICES=${GPU_DEVICE_ID:-0} diff --git a/docker-compose.rocm-build.yml b/docker-compose.rocm-build.yml new file mode 100644 index 00000000..4136668d --- /dev/null +++ b/docker-compose.rocm-build.yml @@ -0,0 +1,67 @@ +# docker-compose.rocm-build.yml +# Overlay that switches all backend services to use Dockerfile.rocm +# +# This file is automatically loaded by opentr.sh when AMD ROCm GPU is detected. 
+# It layers on top of docker-compose.override.yml (dev) or docker-compose.prod.yml (prod) +# and only overrides the Dockerfile reference — all other settings (volumes, commands, +# environment) are inherited from the base overlay. +# +# Usage: +# ./opentr.sh start dev # Auto-detects AMD GPU and applies if available +# OR manually: +# docker compose -f docker-compose.yml -f docker-compose.override.yml \ +# -f docker-compose.rocm-build.yml -f docker-compose.gpu-rocm.yml up + +services: + backend: + image: opentranscribe-backend-rocm:latest + build: + context: ./backend + dockerfile: Dockerfile.rocm + + celery-worker: + image: opentranscribe-backend-rocm:latest + build: + context: ./backend + dockerfile: Dockerfile.rocm + # ROCm/HIP: Use solo pool to avoid fork. + # The HSA runtime initializes during `import torch` (library load) in the parent + # process. Forked children inherit stale HSA state, causing "invalid device ordinal". + # With --concurrency=1 already set, --pool=solo is functionally equivalent. 
+ command: celery -A app.core.celery worker --loglevel=info -Q gpu --concurrency=1 --pool=solo + + celery-download-worker: + image: opentranscribe-backend-rocm:latest + build: + context: ./backend + dockerfile: Dockerfile.rocm + + celery-cpu-worker: + image: opentranscribe-backend-rocm:latest + build: + context: ./backend + dockerfile: Dockerfile.rocm + + celery-nlp-worker: + image: opentranscribe-backend-rocm:latest + build: + context: ./backend + dockerfile: Dockerfile.rocm + + celery-beat: + image: opentranscribe-backend-rocm:latest + build: + context: ./backend + dockerfile: Dockerfile.rocm + + flower: + image: opentranscribe-backend-rocm:latest + build: + context: ./backend + dockerfile: Dockerfile.rocm + + celery-worker-gpu-scaled: + image: opentranscribe-backend-rocm:latest + build: + context: ./backend + dockerfile: Dockerfile.rocm diff --git a/opentr.sh b/opentr.sh index 2e8e8dbc..0df19e95 100755 --- a/opentr.sh +++ b/opentr.sh @@ -135,6 +135,27 @@ detect_and_configure_hardware() { export COMPUTE_TYPE="int8" export USE_GPU="false" fi + elif command -v rocm-smi &> /dev/null && rocm-smi --showproductname &> /dev/null 2>&1; then + echo "✅ AMD ROCm GPU detected" + export DOCKER_RUNTIME="rocm" + export TORCH_DEVICE="cuda" # ROCm uses torch.cuda API via HIP translation + export COMPUTE_TYPE="float16" + export USE_GPU="true" + + # Auto-detect render group GID from /dev/kfd for container device access + if [ -c "/dev/kfd" ]; then + RENDER_GID=$(stat -c '%g' /dev/kfd 2>/dev/null || stat -f '%g' /dev/kfd 2>/dev/null || echo "109") + export RENDER_GROUP_GID="$RENDER_GID" + echo "✅ ROCm kernel fusion driver available (/dev/kfd, render GID: $RENDER_GID)" + else + echo "⚠️ AMD GPU detected but /dev/kfd not found" + echo " ROCm GPU access may not work in containers" + echo " Falling back to CPU mode" + export DOCKER_RUNTIME="" + export TORCH_DEVICE="cpu" + export COMPUTE_TYPE="int8" + export USE_GPU="false" + fi elif [[ "$PLATFORM" == "darwin" && "$ARCH" == "arm64" ]]; 
then echo "✅ Apple Silicon detected" export TORCH_DEVICE="mps" @@ -257,6 +278,14 @@ start_app() { echo "🎯 Adding GPU overlay (docker-compose.gpu.yml) for NVIDIA acceleration" fi + # Add ROCm GPU overlay if AMD GPU is detected + if [ "$DOCKER_RUNTIME" = "rocm" ]; then + if [ -f "docker-compose.rocm-build.yml" ] && [ -f "docker-compose.gpu-rocm.yml" ]; then + COMPOSE_FILES="$COMPOSE_FILES -f docker-compose.rocm-build.yml -f docker-compose.gpu-rocm.yml" + echo "🎯 Adding ROCm overlays for AMD GPU acceleration" + fi + fi + # Add GPU scaling overlay if requested if [ -n "$GPU_SCALE_FLAG" ]; then COMPOSE_FILES="$COMPOSE_FILES -f docker-compose.gpu-scale.yml" @@ -411,6 +440,14 @@ reset_and_init() { echo "🎯 Adding GPU overlay (docker-compose.gpu.yml) for NVIDIA acceleration" fi + # Add ROCm GPU overlay if AMD GPU is detected + if [ "$DOCKER_RUNTIME" = "rocm" ]; then + if [ -f "docker-compose.rocm-build.yml" ] && [ -f "docker-compose.gpu-rocm.yml" ]; then + COMPOSE_FILES="$COMPOSE_FILES -f docker-compose.rocm-build.yml -f docker-compose.gpu-rocm.yml" + echo "🎯 Adding ROCm overlays for AMD GPU acceleration" + fi + fi + # Add GPU scaling overlay if requested if [ -n "$GPU_SCALE_FLAG" ]; then COMPOSE_FILES="$COMPOSE_FILES -f docker-compose.gpu-scale.yml" @@ -731,6 +768,13 @@ case "$1" in COMPOSE_FILES="$COMPOSE_FILES -f docker-compose.gpu.yml" fi + # Add ROCm overlay if AMD GPU is detected + if [ "$DOCKER_RUNTIME" = "rocm" ]; then + if [ -f "docker-compose.rocm-build.yml" ] && [ -f "docker-compose.gpu-rocm.yml" ]; then + COMPOSE_FILES="$COMPOSE_FILES -f docker-compose.rocm-build.yml -f docker-compose.gpu-rocm.yml" + fi + fi + # shellcheck disable=SC2086 docker compose $COMPOSE_FILES up -d --build backend celery-worker celery-beat flower echo "✅ Backend services rebuilt successfully." 
@@ -779,6 +823,14 @@ case "$1" in echo "🎯 Including GPU overlay for build" fi + # Add ROCm overlay if AMD GPU is detected + if [ "$DOCKER_RUNTIME" = "rocm" ]; then + if [ -f "docker-compose.rocm-build.yml" ] && [ -f "docker-compose.gpu-rocm.yml" ]; then + COMPOSE_FILES="$COMPOSE_FILES -f docker-compose.rocm-build.yml -f docker-compose.gpu-rocm.yml" + echo "🎯 Including ROCm overlays for build" + fi + fi + # shellcheck disable=SC2086 docker compose $COMPOSE_FILES build echo "✅ Build complete. Use './opentr.sh start' to start the application." From cd236fe772e67bf0915e4f99cdd2201a0ffb34a9 Mon Sep 17 00:00:00 2001 From: Zander Date: Sun, 15 Feb 2026 15:08:32 -0500 Subject: [PATCH 2/3] feat(rocm): Add ROCm awareness to Python backend Update hardware detection and GPU monitoring to handle AMD ROCm/HIP alongside NVIDIA CUDA: - hardware_detection.py: Add is_rocm property to detect HIP backend, skip NVIDIA-specific env vars (TORCH_CUDA_ARCH_LIST) on ROCm, report gpu_backend and hip_version in hardware summary, skip NVIDIA driver config in Docker runtime helper - utility.py: Use rocm-smi for GPU stats on ROCm (temperature, VRAM, utilization), with fallback to PyTorch CUDA API for memory stats Co-Authored-By: Claude Opus 4.6 --- backend/app/tasks/utility.py | 99 ++++++++++++++++++------- backend/app/utils/hardware_detection.py | 58 +++++++++++---- 2 files changed, 115 insertions(+), 42 deletions(-) diff --git a/backend/app/tasks/utility.py b/backend/app/tasks/utility.py index 26546346..83830044 100644 --- a/backend/app/tasks/utility.py +++ b/backend/app/tasks/utility.py @@ -74,6 +74,73 @@ def check_tasks_health(self): return summary +def _get_gpu_memory_bytes(device_id: int = 0) -> tuple[float, float, float]: + """ + Get GPU memory stats using the appropriate system tool. + + Tries nvidia-smi for NVIDIA GPUs, rocm-smi for AMD GPUs, + then falls back to PyTorch's own memory reporting. 
+ + Args: + device_id: GPU device index + + Returns: + Tuple of (memory_used_bytes, memory_total_bytes, memory_free_bytes) + """ + import shutil + import subprocess + + # Try nvidia-smi first (NVIDIA GPUs) + if shutil.which("nvidia-smi"): + # Security: Safe subprocess call with hardcoded system command. + # Only dynamic parameter is device_id (integer), preventing command injection. + result = subprocess.run( + [ # noqa: S603 S607 # nosec B603 B607 + "nvidia-smi", + "--query-gpu=memory.used,memory.total,memory.free", + "--format=csv,noheader,nounits", + f"--id={device_id}", + ], + capture_output=True, + text=True, + check=True, + ) + values = result.stdout.strip().split(", ") + return ( + float(values[0]) * 1024 * 1024, + float(values[1]) * 1024 * 1024, + float(values[2]) * 1024 * 1024, + ) + + # Try rocm-smi for AMD GPUs + if shutil.which("rocm-smi"): + result = subprocess.run( + [ # noqa: S603 S607 # nosec B603 B607 + "rocm-smi", + "--showmeminfo", + "vram", + "--json", + ], + capture_output=True, + text=True, + check=True, + ) + data = json.loads(result.stdout) + # rocm-smi JSON: {"card0": {"VRAM Total Used (B)": ..., "VRAM Total Memory (B)": ...}} + card_key = list(data.keys())[0] + card_data = data[card_key] + used = float(card_data.get("VRAM Total Used (B)", 0)) + total = float(card_data.get("VRAM Total Memory (B)", 0)) + return (used, total, total - used) + + # Fallback: PyTorch memory API (works on both CUDA and ROCm via HIP) + import torch + + total = float(torch.cuda.get_device_properties(device_id).total_memory) + allocated = float(torch.cuda.memory_allocated(device_id)) + return (allocated, total, total - allocated) + + @celery_app.task(name="update_gpu_stats", bind=True) def update_gpu_stats(self): """ @@ -82,8 +149,8 @@ def update_gpu_stats(self): This task runs on the celery worker (which has GPU access) and stores GPU memory stats in Redis so the backend API can retrieve them. 
- Uses nvidia-smi to get accurate GPU memory usage including all processes, - not just PyTorch allocated memory. + Uses nvidia-smi (NVIDIA), rocm-smi (AMD), or PyTorch memory API + to get GPU memory usage. Returns: Dictionary with GPU stats or error status @@ -107,33 +174,11 @@ def update_gpu_stats(self): device_id = 0 # Primary GPU gpu_properties = torch.cuda.get_device_properties(device_id) - # Use nvidia-smi for accurate memory usage (includes all processes) - # Format: memory.used,memory.total,memory.free (in MiB) - # Security: Safe subprocess call with hardcoded system command (nvidia-smi). - # Only dynamic parameter is device_id (integer), preventing command injection. - result = subprocess.run( - [ # noqa: S603 S607 # nosec B603 B607 - hardcoded nvidia-smi, integer device_id - "nvidia-smi", - "--query-gpu=memory.used,memory.total,memory.free", - "--format=csv,noheader,nounits", - f"--id={device_id}", - ], - capture_output=True, - text=True, - check=True, + # Get memory stats from the appropriate system tool + memory_used, memory_total, memory_free = _get_gpu_memory_bytes( + device_id ) - # Parse the output: "used, total, free" in MiB - memory_values = result.stdout.strip().split(", ") - memory_used_mib = float(memory_values[0]) - memory_total_mib = float(memory_values[1]) - memory_free_mib = float(memory_values[2]) - - # Convert MiB to bytes for formatting - memory_used = memory_used_mib * 1024 * 1024 - memory_total = memory_total_mib * 1024 * 1024 - memory_free = memory_free_mib * 1024 * 1024 - # Calculate percentage used memory_percent = (memory_used / memory_total * 100) if memory_total > 0 else 0 diff --git a/backend/app/utils/hardware_detection.py b/backend/app/utils/hardware_detection.py index 396b5266..c6da3c92 100644 --- a/backend/app/utils/hardware_detection.py +++ b/backend/app/utils/hardware_detection.py @@ -2,10 +2,11 @@ Hardware Detection and Configuration Module This module provides automatic detection of available hardware acceleration -(CUDA, 
MPS, CPU) and configures optimal settings for each platform. +(CUDA, ROCm, MPS, CPU) and configures optimal settings for each platform. Supports: - NVIDIA GPUs with CUDA (Linux/Windows) +- AMD GPUs with ROCm/HIP (Linux) — uses PyTorch's CUDA API via HIP translation - Apple Silicon with MPS (macOS) - CPU fallback (all platforms) """ @@ -48,6 +49,16 @@ def __init__( self.torch_available = False self.torch_version = None + # Detect ROCm (AMD GPU via HIP translation layer) + self._is_rocm = False + if self.torch_available: + try: + import torch + + self._is_rocm = hasattr(torch.version, "hip") and torch.version.hip is not None + except ImportError: + pass + # Device and compute type detection self.device = force_device or self._detect_optimal_device() self.compute_type = force_compute_type or self._detect_optimal_compute_type() @@ -56,6 +67,11 @@ def __init__( # Log configuration logger.info(f"Hardware Config: {self.get_summary()}") + @property + def is_rocm(self) -> bool: + """Whether PyTorch is using ROCm/HIP backend (AMD GPU).""" + return self._is_rocm + def _detect_optimal_device(self) -> str: """Detect the best available device for AI processing.""" if not self.torch_available: @@ -337,15 +353,16 @@ def get_environment_variables(self) -> dict[str, str]: ) elif self.device == "cuda": - # CUDA optimizations - env_vars.update({"TORCH_CUDA_ARCH_LIST": "6.0 6.1 7.0 7.5 8.0 8.6+PTX"}) + # CUDA optimizations - only set NVIDIA-specific vars when not on ROCm + if not self.is_rocm: + env_vars.update({"TORCH_CUDA_ARCH_LIST": "6.0 6.1 7.0 7.5 8.0 8.6+PTX"}) # Docker maps GPU_DEVICE_ID to container device 0 return env_vars def get_summary(self) -> dict[str, Any]: """Get summary of hardware configuration.""" - return { + summary: dict[str, Any] = { "system": self.system, "machine": self.machine, "device": self.device, @@ -354,6 +371,15 @@ def get_summary(self) -> dict[str, Any]: "torch_available": self.torch_available, "torch_version": self.torch_version, } + if 
self.torch_available and self.device == "cuda": + if self.is_rocm: + import torch + + summary["gpu_backend"] = "rocm" + summary["hip_version"] = torch.version.hip + else: + summary["gpu_backend"] = "cuda" + return summary def validate_configuration(self) -> tuple[bool, str]: """Validate the current configuration.""" @@ -417,18 +443,20 @@ def get_docker_runtime_config() -> dict[str, Any]: } if config.device == "cuda": - # NVIDIA GPU runtime - docker_config["deploy"]["resources"] = { - "reservations": { - "devices": [ - { - "driver": "nvidia", - "device_ids": [os.getenv("GPU_DEVICE_ID", "0")], - "capabilities": ["gpu"], - } - ] + if not config.is_rocm: + # NVIDIA GPU runtime + docker_config["deploy"]["resources"] = { + "reservations": { + "devices": [ + { + "driver": "nvidia", + "device_ids": [os.getenv("GPU_DEVICE_ID", "0")], + "capabilities": ["gpu"], + } + ] + } } - } + # ROCm uses device passthrough (/dev/kfd, /dev/dri), not Docker deploy resources return docker_config From 97c05bbd750b1d6583fd4f1e1599c2a3662a3818 Mon Sep 17 00:00:00 2001 From: Zander Date: Sun, 15 Feb 2026 15:08:56 -0500 Subject: [PATCH 3/3] fix(deps): Pin huggingface-hub<1.0.0 for pyannote.audio compatibility pyannote.audio v3 uses the deprecated use_auth_token parameter that was removed in huggingface-hub 1.0.0. Pin to <1.0.0 to prevent runtime errors during speaker diarization model loading. This affects the CUDA build as well (requirements.txt on master) but is kept separate here for easy cherry-pick reference. Co-Authored-By: Claude Opus 4.6 --- backend/requirements-rocm.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/backend/requirements-rocm.txt b/backend/requirements-rocm.txt index 16919221..033bdae2 100644 --- a/backend/requirements-rocm.txt +++ b/backend/requirements-rocm.txt @@ -55,6 +55,10 @@ whisperx==3.7.0 # Do NOT install ctranslate2 from PyPI here - the PyPI version is CUDA-only. # The Dockerfile handles downloading and installing the ROCm wheel. 
+# Pin huggingface-hub to <1.0.0 because pyannote.audio v3 uses deprecated +# use_auth_token parameter removed in huggingface-hub 1.0.0 +huggingface-hub<1.0.0 + # PyAnnote (compatible with NumPy 2.x and PyTorch 2.6+) pyannote.audio>=3.3.2