From 258b20e360baaccaf772d0341329b411cd79dcbf Mon Sep 17 00:00:00 2001 From: Zander Date: Sun, 15 Feb 2026 15:08:16 -0500 Subject: [PATCH 1/3] feat(rocm): Add AMD ROCm GPU support infrastructure Add Docker build infrastructure and shell script support for running OpenTranscribe on AMD GPUs via ROCm/HIP. New files: - Dockerfile.rocm: Multi-stage build with PyTorch ROCm 6.4, CTranslate2 ROCm wheel, and MIOpen JIT compilation headers - requirements-rocm.txt: Python dependencies with ROCm-specific PyTorch - docker-compose.rocm-build.yml: Build overlay for ROCm backend image - docker-compose.gpu-rocm.yml: Runtime overlay with GPU device passthrough, render group mapping, and ROCm environment variables Modified files: - opentr.sh: Auto-detect ROCm GPUs and inject compose overlays - .env.example: Document HSA_OVERRIDE_GFX_VERSION and RENDER_GROUP_GID Co-Authored-By: Claude Opus 4.6 --- .env.example | 20 ++++ backend/Dockerfile.rocm | 168 ++++++++++++++++++++++++++++++++++ backend/requirements-rocm.txt | 72 +++++++++++++++ docker-compose.gpu-rocm.yml | 50 ++++++++++ docker-compose.rocm-build.yml | 67 ++++++++++++++ opentr.sh | 52 +++++++++++ 6 files changed, 429 insertions(+) create mode 100644 backend/Dockerfile.rocm create mode 100644 backend/requirements-rocm.txt create mode 100644 docker-compose.gpu-rocm.yml create mode 100644 docker-compose.rocm-build.yml diff --git a/.env.example b/.env.example index 873a0304..89eaadeb 100644 --- a/.env.example +++ b/.env.example @@ -188,6 +188,26 @@ GPU_SCALE_ENABLED=false GPU_SCALE_DEVICE_ID=2 GPU_SCALE_WORKERS=4 +# AMD ROCm GPU Configuration (for AMD Radeon GPUs) +# Only needed if using an AMD GPU with ROCm drivers installed. +# opentr.sh auto-detects AMD GPUs via rocm-smi and loads ROCm overlays. +# +# HSA_OVERRIDE_GFX_VERSION: Required for consumer AMD GPUs to work with ROCm. 
+# Set this to match your GPU architecture: +# RX 7900 XTX/XT (gfx1100): 11.0.0 +# RX 7800 XT / 7700 XT (gfx1101): 11.0.1 +# RX 7600 (gfx1102): 11.0.2 +# RX 6900/6800 XT (gfx1030): 10.3.0 +# RX 6700 XT (gfx1031): 10.3.1 +# See: https://rocm.docs.amd.com/en/latest/reference/gpu-arch-specs.html +# HSA_OVERRIDE_GFX_VERSION=11.0.1 +# +# RENDER_GROUP_GID: Host's render group GID for /dev/kfd access. +# Auto-detected by opentr.sh. Only set manually if auto-detection fails. +# Find your system's value with: stat -c '%g' /dev/kfd +# Common values: 109 (Ubuntu/Debian), 993 (Fedora/RHEL) +# RENDER_GROUP_GID=109 + #============================================================================= # AI MODELS CONFIGURATION #============================================================================= diff --git a/backend/Dockerfile.rocm b/backend/Dockerfile.rocm new file mode 100644 index 00000000..2542048e --- /dev/null +++ b/backend/Dockerfile.rocm @@ -0,0 +1,168 @@ +# ============================================================================= +# OpenTranscribe Backend - ROCm Dockerfile (AMD GPU) +# Multi-stage build optimized for security with non-root user +# Uses PyTorch ROCm 6.4 and CTranslate2 ROCm HIP backend +# ============================================================================= + +# ----------------------------------------------------------------------------- +# Stage 1: Build Stage - Install Python dependencies with compilation +# ----------------------------------------------------------------------------- +FROM python:3.13-slim-trixie AS builder + +WORKDIR /build + +# Install build dependencies (only in this stage) +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + gcc \ + g++ \ + curl \ + unzip \ + && rm -rf /var/lib/apt/lists/* + +# Copy only requirements first for better layer caching +COPY requirements-rocm.txt . 
+ +# Upgrade pip to latest version first (security fix for CVE-2025-8869) +RUN pip install --user --no-cache-dir --upgrade pip + +# Install Python dependencies (ROCm variant) +# PyTorch 2.8.0+rocm6.4 from pytorch.org +# CTranslate2 ROCm wheel from GitHub releases (not on PyPI) +# WhisperX 3.7.0 - latest version with ctranslate2 4.5+ compatibility +# NumPy 2.x - fully compatible with all packages +# Use --user to install to /root/.local which we'll copy to final stage +RUN pip install --user --no-cache-dir --no-warn-script-location -r requirements-rocm.txt + +# Install CTranslate2 ROCm wheel from official GitHub releases +# These wheels are built with HIP backend for AMD GPUs +# See: https://github.com/OpenNMT/CTranslate2/releases +ARG CTRANSLATE2_VERSION=4.7.1 +ARG CTRANSLATE2_PYTHON=cp313 +RUN curl -sL -o /tmp/rocm-wheels.zip \ + "https://github.com/OpenNMT/CTranslate2/releases/download/v${CTRANSLATE2_VERSION}/rocm-python-wheels-Linux.zip" && \ + unzip -j /tmp/rocm-wheels.zip "temp-linux/ctranslate2-${CTRANSLATE2_VERSION}-${CTRANSLATE2_PYTHON}-${CTRANSLATE2_PYTHON}-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl" -d /tmp/ && \ + pip install --user --no-cache-dir --force-reinstall /tmp/ctranslate2-*.whl && \ + rm -f /tmp/rocm-wheels.zip /tmp/ctranslate2-*.whl + +# ----------------------------------------------------------------------------- +# Stage 2: Runtime Stage - Minimal production image with non-root user +# ----------------------------------------------------------------------------- +FROM python:3.13-slim-trixie + +# OCI annotations for container metadata and compliance +LABEL org.opencontainers.image.title="OpenTranscribe Backend (ROCm)" \ + org.opencontainers.image.description="AI-powered transcription backend with WhisperX and PyAnnote (AMD ROCm GPU)" \ + org.opencontainers.image.vendor="OpenTranscribe" \ + org.opencontainers.image.authors="OpenTranscribe Contributors" \ + org.opencontainers.image.licenses="AGPL-3.0" \ + 
org.opencontainers.image.source="https://github.com/davidamacey/OpenTranscribe" \ + org.opencontainers.image.documentation="https://github.com/davidamacey/OpenTranscribe/blob/master/README.md" + +# Install runtime dependencies and ROCm runtime libraries +# ROCm 7.0 libs needed by CTranslate2's ROCm wheel at runtime. +# hsa-rocr: HSA runtime with ROCm 7.0 symbols (hsa_amd_memory_get_preferred_copy_engine) +# rocrand-dev: Headers for MIOpen JIT kernel compilation on consumer GPUs (gfx1100/1101) +# PyTorch ROCm wheels lack pre-compiled MIOpen kernels for RDNA 3, so MIOpen compiles +# them at first run. This needs rocrand headers (rocrand/rocrand_xorwow.h). +# hip-dev headers: hip/hip_runtime.h needed by rocrand headers for MIOpen JIT. +# hip-dev can't be installed via apt because it pulls rocm-llvm which depends on +# libstdc++-11-dev (unavailable on Trixie). Instead we download the .deb and +# extract only the include directory — no dependency resolution needed. +# C++ header stubs: hiprtc/comgr's built-in hiprtc_runtime.h provides type traits +# (integral_constant, enable_if, etc.) but rocrand headers do #include <utility> and <type_traits>, +# which are unused. We can't install real C++ headers (GCC's conflict with +# hiprtc_runtime.h's definitions; rocm-llvm can't be installed on Trixie). +# Instead we create empty stubs in /opt/rocm/include/ so the #include resolves. +# Debian Trixie ships libhsa-runtime64-1 (ROCm 6.x) which we override via symlink. +# We use the AMD apt repo targeting Ubuntu Noble (24.04), compatible with Trixie. 
+ARG ROCM_VERSION=7.0 +RUN apt-get update && apt-get install -y --no-install-recommends \ + curl \ + ffmpeg \ + libsndfile1 \ + libimage-exiftool-perl \ + libgomp1 \ + && echo "deb [arch=amd64 trusted=yes] https://repo.radeon.com/rocm/apt/${ROCM_VERSION}/ noble main" \ + > /etc/apt/sources.list.d/rocm.list \ + && apt-get update && apt-get install -y --no-install-recommends \ + hsa-rocr \ + hipblas \ + hiprand \ + hipsparse \ + hipsolver \ + hipfft \ + miopen-hip \ + rocrand-dev \ + && cd /tmp && apt-get download hip-dev \ + && dpkg-deb -x hip-dev_*.deb /tmp/hip-extract \ + && cp -a /tmp/hip-extract/opt/rocm*/include/* /opt/rocm/include/ 2>/dev/null || true \ + && rm -rf /tmp/hip-extract /tmp/hip-dev_*.deb \ + && printf '#pragma once\n' | tee /opt/rocm/include/utility /opt/rocm/include/type_traits > /dev/null \ + && echo "/opt/rocm/lib" > /etc/ld.so.conf.d/rocm.conf \ + && ldconfig \ + && ln -sf /opt/rocm/lib/libhsa-runtime64.so.1 /usr/lib/x86_64-linux-gnu/libhsa-runtime64.so.1 \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +# Create non-root user for security +# Add to video and render groups for ROCm GPU access (/dev/kfd, /dev/dri) +RUN groupadd -r render && \ + groupadd -r appuser && \ + useradd -r -g appuser -G video,render -u 1000 -m -s /bin/bash appuser && \ + mkdir -p /app /app/models /app/temp && \ + chown -R appuser:appuser /app && \ + mkdir -p /home/appuser/.cache/huggingface \ + /home/appuser/.cache/torch \ + /home/appuser/.cache/nltk_data \ + /home/appuser/.cache/sentence-transformers \ + /home/appuser/.cache/yt-dlp && \ + chown -R appuser:appuser /home/appuser/.cache + +# Set working directory +WORKDIR /app + +# Copy Python packages from builder stage +COPY --from=builder --chown=appuser:appuser /root/.local /home/appuser/.local + +# Replace PyTorch's and Triton's bundled ROCm 6.4 HSA runtime with ROCm 7.0 +# Both bundle libhsa-runtime64.so (ROCm 6.4, SONAME .so.1). 
CTranslate2 needs +# ROCm 7.0's version (with hsa_amd_memory_get_preferred_copy_engine@ROCR_1). +# Since shared libs with the same SONAME load only once per process, whichever +# loads first wins. PyTorch loads before CTranslate2, so without this fix its +# ROCm 6.4 HSA runtime is used and CTranslate2 fails with undefined symbol. +RUN ln -sf /opt/rocm/lib/libhsa-runtime64.so.1 \ + /home/appuser/.local/lib/python3.13/site-packages/torch/lib/libhsa-runtime64.so && \ + ln -sf /opt/rocm/lib/libhsa-runtime64.so.1 \ + /home/appuser/.local/lib/python3.13/site-packages/triton/backends/amd/lib/libhsa-runtime64.so + +# Ensure scripts in .local are usable by adding to PATH +# Set LD_LIBRARY_PATH so CTranslate2 and other native libs can find: +# - ROCm system libs (hipblas, rocblas) installed via apt from /opt/rocm/lib +# - ROCm runtime libs (amdhip64) bundled in PyTorch ROCm wheel +# Set cache directories to user home +ENV PATH=/home/appuser/.local/bin:$PATH \ + PYTHONUNBUFFERED=1 \ + PYTHONDONTWRITEBYTECODE=1 \ + LD_LIBRARY_PATH=/opt/rocm/lib:/home/appuser/.local/lib/python3.13/site-packages/torch/lib \ + HF_HOME=/home/appuser/.cache/huggingface \ + TRANSFORMERS_CACHE=/home/appuser/.cache/huggingface/transformers \ + TORCH_HOME=/home/appuser/.cache/torch \ + NLTK_DATA=/home/appuser/.cache/nltk_data \ + SENTENCE_TRANSFORMERS_HOME=/home/appuser/.cache/sentence-transformers + +# Copy application code +COPY --chown=appuser:appuser . . 
+ +# Switch to non-root user +USER appuser + +# Expose application port +EXPOSE 8080 + +# Health check to verify the application is responding +HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \ + CMD curl -f http://localhost:8080/health || exit 1 + +# Command to run the application in production (no reload) +CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8080"] diff --git a/backend/requirements-rocm.txt b/backend/requirements-rocm.txt new file mode 100644 index 00000000..16919221 --- /dev/null +++ b/backend/requirements-rocm.txt @@ -0,0 +1,72 @@ +fastapi>=0.103.1 +uvicorn[standard]>=0.23.2 +websockets>=11.0.3 +sqlalchemy>=2.0.20 +alembic>=1.12.0 +pydantic>=2.3.0 +pydantic-settings>=2.0.0 +python-jose[cryptography]>=3.3.0 +python-multipart>=0.0.6 +passlib[bcrypt]>=1.7.4 +bcrypt==3.2.2 +email-validator +# Asynchronous Tasks +celery>=5.3.4 +redis>=5.0.0 +flower>=2.0.0 +psycopg2-binary>=2.9.7 +minio>=7.2.18 +opensearch-py>=3.0.0 +httpx>=0.24.1 +python-dotenv>=1.0.0 +pytest>=7.4.2 + +# LDAP/Active Directory Authentication +ldap3>=2.9.1 + +# Rate Limiting +slowapi>=0.1.9 + +# PKI/X.509 Certificate Authentication +cryptography>=42.0.0 + +# MFA/TOTP Authentication (FedRAMP IA-2) +pyotp>=2.9.0 +qrcode[pil]>=7.4.0 + +# AI/ML Stack - ROCm variant for AMD GPUs +# Non-GPU packages are identical to requirements.txt +numpy>=1.25.2 + +# PyTorch with ROCm 6.4 support for AMD GPUs +# PyTorch 2.8.0 bundles its own ROCm 6.4 runtime in torch/lib/ +# CTranslate2 ROCm wheel uses ROCm 7.0 system libs (installed via apt in Dockerfile) +# Both ship libhsa-runtime64.so.1; the Dockerfile symlinks ROCm 7.0's copy over PyTorch's +--extra-index-url https://download.pytorch.org/whl/rocm6.4 +torch==2.8.0 +torchaudio==2.8.0 + +# WhisperX latest version with ctranslate2 4.5+ support +whisperx==3.7.0 + +# CTranslate2 with ROCm HIP support (official wheels from GitHub releases) +# The ROCm wheel is installed separately in the Dockerfile from: +# 
https://github.com/OpenNMT/CTranslate2/releases +# Do NOT install ctranslate2 from PyPI here - the PyPI version is CUDA-only. +# The Dockerfile handles downloading and installing the ROCm wheel. + +# PyAnnote (compatible with NumPy 2.x and PyTorch 2.6+) +pyannote.audio>=3.3.2 + +# Supporting libraries +omegaconf>=2.3.0 +ffmpeg-python>=0.2.0 +sentencepiece>=0.1.99 +psutil>=5.9.5 +pyexiftool>=0.5.0 +yt-dlp>=2025.1.7 + +# Semantic search and NLP +sentence-transformers>=2.2.0 # For semantic search embeddings (all-MiniLM-L6-v2) +# Note: nltk is installed as transitive dependency from transformers/whisperx +# NLTK data files (punkt_tab) must be pre-downloaded for offline usage diff --git a/docker-compose.gpu-rocm.yml b/docker-compose.gpu-rocm.yml new file mode 100644 index 00000000..1aaab059 --- /dev/null +++ b/docker-compose.gpu-rocm.yml @@ -0,0 +1,50 @@ +# docker-compose.gpu-rocm.yml +# Optional overlay for GPU acceleration (AMD ROCm) +# +# This overlay enables GPU runtime for celery-worker on systems with AMD GPUs. +# It is automatically detected and loaded by opentr.sh when ROCm is available. +# +# Usage: +# ./opentr.sh start dev # Auto-detects AMD GPU and applies if available +# OR manually: +# docker compose -f docker-compose.yml -f docker-compose.override.yml \ +# -f docker-compose.rocm-build.yml -f docker-compose.gpu-rocm.yml up +# +# Note: This file is automatically included by opentr.sh when AMD ROCm GPU is detected. +# On macOS or systems without AMD GPU, this file is not loaded, allowing the base +# docker-compose.yml to work on CPU-only systems. 
+# +# Configuration (.env file): +# GPU_DEVICE_ID - GPU device to use (default: 0) +# HSA_OVERRIDE_GFX_VERSION - GFX version override for GPU compatibility +# RX 7900 XTX/XT (gfx1100): 11.0.0 +# RX 7800 XT / 7700 XT (gfx1101): 11.0.1 +# RX 7600 (gfx1102): 11.0.2 +# RX 6900/6800 XT (gfx1030): 10.3.0 +# RENDER_GROUP_GID - Host's render group GID for /dev/kfd access +# Find with: stat -c '%g' /dev/kfd +# Common values: 109 (Ubuntu), 993 (Fedora/RHEL), varies by distro + +services: + celery-worker: + # Enable AMD ROCm GPU access via device passthrough + # /dev/kfd - Kernel Fusion Driver (required for ROCm compute) + # /dev/dri - Direct Rendering Infrastructure (GPU device nodes) + devices: + - /dev/kfd:/dev/kfd + - /dev/dri:/dev/dri + # Add host's video and render group GIDs for device access. + # The render group GID varies by distro (e.g. 109 on Ubuntu, 993 on Fedora). + # Using numeric GID ensures it matches the host's /dev/kfd ownership. + group_add: + - video + - "${RENDER_GROUP_GID:-109}" + # ROCm requires seccomp=unconfined for HSA memory mapping + # See: https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html + security_opt: + - seccomp:unconfined + # Larger shared memory for PyTorch model loading (default 64MB is too small) + shm_size: '8g' + environment: + - HSA_OVERRIDE_GFX_VERSION=${HSA_OVERRIDE_GFX_VERSION:-11.0.0} + - HIP_VISIBLE_DEVICES=${GPU_DEVICE_ID:-0} diff --git a/docker-compose.rocm-build.yml b/docker-compose.rocm-build.yml new file mode 100644 index 00000000..4136668d --- /dev/null +++ b/docker-compose.rocm-build.yml @@ -0,0 +1,67 @@ +# docker-compose.rocm-build.yml +# Overlay that switches all backend services to use Dockerfile.rocm +# +# This file is automatically loaded by opentr.sh when AMD ROCm GPU is detected. 
+# It layers on top of docker-compose.override.yml (dev) or docker-compose.prod.yml (prod) +# and only overrides the Dockerfile reference — all other settings (volumes, commands, +# environment) are inherited from the base overlay. +# +# Usage: +# ./opentr.sh start dev # Auto-detects AMD GPU and applies if available +# OR manually: +# docker compose -f docker-compose.yml -f docker-compose.override.yml \ +# -f docker-compose.rocm-build.yml -f docker-compose.gpu-rocm.yml up + +services: + backend: + image: opentranscribe-backend-rocm:latest + build: + context: ./backend + dockerfile: Dockerfile.rocm + + celery-worker: + image: opentranscribe-backend-rocm:latest + build: + context: ./backend + dockerfile: Dockerfile.rocm + # ROCm/HIP: Use solo pool to avoid fork. + # The HSA runtime initializes during `import torch` (library load) in the parent + # process. Forked children inherit stale HSA state, causing "invalid device ordinal". + # With --concurrency=1 already set, --pool=solo is functionally equivalent. 
+ command: celery -A app.core.celery worker --loglevel=info -Q gpu --concurrency=1 --pool=solo + + celery-download-worker: + image: opentranscribe-backend-rocm:latest + build: + context: ./backend + dockerfile: Dockerfile.rocm + + celery-cpu-worker: + image: opentranscribe-backend-rocm:latest + build: + context: ./backend + dockerfile: Dockerfile.rocm + + celery-nlp-worker: + image: opentranscribe-backend-rocm:latest + build: + context: ./backend + dockerfile: Dockerfile.rocm + + celery-beat: + image: opentranscribe-backend-rocm:latest + build: + context: ./backend + dockerfile: Dockerfile.rocm + + flower: + image: opentranscribe-backend-rocm:latest + build: + context: ./backend + dockerfile: Dockerfile.rocm + + celery-worker-gpu-scaled: + image: opentranscribe-backend-rocm:latest + build: + context: ./backend + dockerfile: Dockerfile.rocm diff --git a/opentr.sh b/opentr.sh index 2e8e8dbc..0df19e95 100755 --- a/opentr.sh +++ b/opentr.sh @@ -135,6 +135,27 @@ detect_and_configure_hardware() { export COMPUTE_TYPE="int8" export USE_GPU="false" fi + elif command -v rocm-smi &> /dev/null && rocm-smi --showproductname &> /dev/null 2>&1; then + echo "✅ AMD ROCm GPU detected" + export DOCKER_RUNTIME="rocm" + export TORCH_DEVICE="cuda" # ROCm uses torch.cuda API via HIP translation + export COMPUTE_TYPE="float16" + export USE_GPU="true" + + # Auto-detect render group GID from /dev/kfd for container device access + if [ -c "/dev/kfd" ]; then + RENDER_GID=$(stat -c '%g' /dev/kfd 2>/dev/null || stat -f '%g' /dev/kfd 2>/dev/null || echo "109") + export RENDER_GROUP_GID="$RENDER_GID" + echo "✅ ROCm kernel fusion driver available (/dev/kfd, render GID: $RENDER_GID)" + else + echo "⚠️ AMD GPU detected but /dev/kfd not found" + echo " ROCm GPU access may not work in containers" + echo " Falling back to CPU mode" + export DOCKER_RUNTIME="" + export TORCH_DEVICE="cpu" + export COMPUTE_TYPE="int8" + export USE_GPU="false" + fi elif [[ "$PLATFORM" == "darwin" && "$ARCH" == "arm64" ]]; 
then echo "✅ Apple Silicon detected" export TORCH_DEVICE="mps" @@ -257,6 +278,14 @@ start_app() { echo "🎯 Adding GPU overlay (docker-compose.gpu.yml) for NVIDIA acceleration" fi + # Add ROCm GPU overlay if AMD GPU is detected + if [ "$DOCKER_RUNTIME" = "rocm" ]; then + if [ -f "docker-compose.rocm-build.yml" ] && [ -f "docker-compose.gpu-rocm.yml" ]; then + COMPOSE_FILES="$COMPOSE_FILES -f docker-compose.rocm-build.yml -f docker-compose.gpu-rocm.yml" + echo "🎯 Adding ROCm overlays for AMD GPU acceleration" + fi + fi + # Add GPU scaling overlay if requested if [ -n "$GPU_SCALE_FLAG" ]; then COMPOSE_FILES="$COMPOSE_FILES -f docker-compose.gpu-scale.yml" @@ -411,6 +440,14 @@ reset_and_init() { echo "🎯 Adding GPU overlay (docker-compose.gpu.yml) for NVIDIA acceleration" fi + # Add ROCm GPU overlay if AMD GPU is detected + if [ "$DOCKER_RUNTIME" = "rocm" ]; then + if [ -f "docker-compose.rocm-build.yml" ] && [ -f "docker-compose.gpu-rocm.yml" ]; then + COMPOSE_FILES="$COMPOSE_FILES -f docker-compose.rocm-build.yml -f docker-compose.gpu-rocm.yml" + echo "🎯 Adding ROCm overlays for AMD GPU acceleration" + fi + fi + # Add GPU scaling overlay if requested if [ -n "$GPU_SCALE_FLAG" ]; then COMPOSE_FILES="$COMPOSE_FILES -f docker-compose.gpu-scale.yml" @@ -731,6 +768,13 @@ case "$1" in COMPOSE_FILES="$COMPOSE_FILES -f docker-compose.gpu.yml" fi + # Add ROCm overlay if AMD GPU is detected + if [ "$DOCKER_RUNTIME" = "rocm" ]; then + if [ -f "docker-compose.rocm-build.yml" ] && [ -f "docker-compose.gpu-rocm.yml" ]; then + COMPOSE_FILES="$COMPOSE_FILES -f docker-compose.rocm-build.yml -f docker-compose.gpu-rocm.yml" + fi + fi + # shellcheck disable=SC2086 docker compose $COMPOSE_FILES up -d --build backend celery-worker celery-beat flower echo "✅ Backend services rebuilt successfully." 
@@ -779,6 +823,14 @@ case "$1" in echo "🎯 Including GPU overlay for build" fi + # Add ROCm overlay if AMD GPU is detected + if [ "$DOCKER_RUNTIME" = "rocm" ]; then + if [ -f "docker-compose.rocm-build.yml" ] && [ -f "docker-compose.gpu-rocm.yml" ]; then + COMPOSE_FILES="$COMPOSE_FILES -f docker-compose.rocm-build.yml -f docker-compose.gpu-rocm.yml" + echo "🎯 Including ROCm overlays for build" + fi + fi + # shellcheck disable=SC2086 docker compose $COMPOSE_FILES build echo "✅ Build complete. Use './opentr.sh start' to start the application." From cd236fe772e67bf0915e4f99cdd2201a0ffb34a9 Mon Sep 17 00:00:00 2001 From: Zander Date: Sun, 15 Feb 2026 15:08:32 -0500 Subject: [PATCH 2/3] feat(rocm): Add ROCm awareness to Python backend Update hardware detection and GPU monitoring to handle AMD ROCm/HIP alongside NVIDIA CUDA: - hardware_detection.py: Add is_rocm property to detect HIP backend, skip NVIDIA-specific env vars (TORCH_CUDA_ARCH_LIST) on ROCm, report gpu_backend and hip_version in hardware summary, skip NVIDIA driver config in Docker runtime helper - utility.py: Use rocm-smi for GPU stats on ROCm (temperature, VRAM, utilization), with fallback to PyTorch CUDA API for memory stats Co-Authored-By: Claude Opus 4.6 --- backend/app/tasks/utility.py | 99 ++++++++++++++++++------- backend/app/utils/hardware_detection.py | 58 +++++++++++---- 2 files changed, 115 insertions(+), 42 deletions(-) diff --git a/backend/app/tasks/utility.py b/backend/app/tasks/utility.py index 26546346..83830044 100644 --- a/backend/app/tasks/utility.py +++ b/backend/app/tasks/utility.py @@ -74,6 +74,73 @@ def check_tasks_health(self): return summary +def _get_gpu_memory_bytes(device_id: int = 0) -> tuple[float, float, float]: + """ + Get GPU memory stats using the appropriate system tool. + + Tries nvidia-smi for NVIDIA GPUs, rocm-smi for AMD GPUs, + then falls back to PyTorch's own memory reporting. 
+ + Args: + device_id: GPU device index + + Returns: + Tuple of (memory_used_bytes, memory_total_bytes, memory_free_bytes) + """ + import shutil + import subprocess + + # Try nvidia-smi first (NVIDIA GPUs) + if shutil.which("nvidia-smi"): + # Security: Safe subprocess call with hardcoded system command. + # Only dynamic parameter is device_id (integer), preventing command injection. + result = subprocess.run( + [ # noqa: S603 S607 # nosec B603 B607 + "nvidia-smi", + "--query-gpu=memory.used,memory.total,memory.free", + "--format=csv,noheader,nounits", + f"--id={device_id}", + ], + capture_output=True, + text=True, + check=True, + ) + values = result.stdout.strip().split(", ") + return ( + float(values[0]) * 1024 * 1024, + float(values[1]) * 1024 * 1024, + float(values[2]) * 1024 * 1024, + ) + + # Try rocm-smi for AMD GPUs + if shutil.which("rocm-smi"): + result = subprocess.run( + [ # noqa: S603 S607 # nosec B603 B607 + "rocm-smi", + "--showmeminfo", + "vram", + "--json", + ], + capture_output=True, + text=True, + check=True, + ) + data = json.loads(result.stdout) + # rocm-smi JSON: {"card0": {"VRAM Total Used (B)": ..., "VRAM Total Memory (B)": ...}} + card_key = list(data.keys())[0] + card_data = data[card_key] + used = float(card_data.get("VRAM Total Used (B)", 0)) + total = float(card_data.get("VRAM Total Memory (B)", 0)) + return (used, total, total - used) + + # Fallback: PyTorch memory API (works on both CUDA and ROCm via HIP) + import torch + + total = float(torch.cuda.get_device_properties(device_id).total_memory) + allocated = float(torch.cuda.memory_allocated(device_id)) + return (allocated, total, total - allocated) + + @celery_app.task(name="update_gpu_stats", bind=True) def update_gpu_stats(self): """ @@ -82,8 +149,8 @@ def update_gpu_stats(self): This task runs on the celery worker (which has GPU access) and stores GPU memory stats in Redis so the backend API can retrieve them. 
- Uses nvidia-smi to get accurate GPU memory usage including all processes, - not just PyTorch allocated memory. + Uses nvidia-smi (NVIDIA), rocm-smi (AMD), or PyTorch memory API + to get GPU memory usage. Returns: Dictionary with GPU stats or error status @@ -107,33 +174,11 @@ def update_gpu_stats(self): device_id = 0 # Primary GPU gpu_properties = torch.cuda.get_device_properties(device_id) - # Use nvidia-smi for accurate memory usage (includes all processes) - # Format: memory.used,memory.total,memory.free (in MiB) - # Security: Safe subprocess call with hardcoded system command (nvidia-smi). - # Only dynamic parameter is device_id (integer), preventing command injection. - result = subprocess.run( - [ # noqa: S603 S607 # nosec B603 B607 - hardcoded nvidia-smi, integer device_id - "nvidia-smi", - "--query-gpu=memory.used,memory.total,memory.free", - "--format=csv,noheader,nounits", - f"--id={device_id}", - ], - capture_output=True, - text=True, - check=True, + # Get memory stats from the appropriate system tool + memory_used, memory_total, memory_free = _get_gpu_memory_bytes( + device_id ) - # Parse the output: "used, total, free" in MiB - memory_values = result.stdout.strip().split(", ") - memory_used_mib = float(memory_values[0]) - memory_total_mib = float(memory_values[1]) - memory_free_mib = float(memory_values[2]) - - # Convert MiB to bytes for formatting - memory_used = memory_used_mib * 1024 * 1024 - memory_total = memory_total_mib * 1024 * 1024 - memory_free = memory_free_mib * 1024 * 1024 - # Calculate percentage used memory_percent = (memory_used / memory_total * 100) if memory_total > 0 else 0 diff --git a/backend/app/utils/hardware_detection.py b/backend/app/utils/hardware_detection.py index 396b5266..c6da3c92 100644 --- a/backend/app/utils/hardware_detection.py +++ b/backend/app/utils/hardware_detection.py @@ -2,10 +2,11 @@ Hardware Detection and Configuration Module This module provides automatic detection of available hardware acceleration -(CUDA, 
MPS, CPU) and configures optimal settings for each platform. +(CUDA, ROCm, MPS, CPU) and configures optimal settings for each platform. Supports: - NVIDIA GPUs with CUDA (Linux/Windows) +- AMD GPUs with ROCm/HIP (Linux) — uses PyTorch's CUDA API via HIP translation - Apple Silicon with MPS (macOS) - CPU fallback (all platforms) """ @@ -48,6 +49,16 @@ def __init__( self.torch_available = False self.torch_version = None + # Detect ROCm (AMD GPU via HIP translation layer) + self._is_rocm = False + if self.torch_available: + try: + import torch + + self._is_rocm = hasattr(torch.version, "hip") and torch.version.hip is not None + except ImportError: + pass + # Device and compute type detection self.device = force_device or self._detect_optimal_device() self.compute_type = force_compute_type or self._detect_optimal_compute_type() @@ -56,6 +67,11 @@ def __init__( # Log configuration logger.info(f"Hardware Config: {self.get_summary()}") + @property + def is_rocm(self) -> bool: + """Whether PyTorch is using ROCm/HIP backend (AMD GPU).""" + return self._is_rocm + def _detect_optimal_device(self) -> str: """Detect the best available device for AI processing.""" if not self.torch_available: @@ -337,15 +353,16 @@ def get_environment_variables(self) -> dict[str, str]: ) elif self.device == "cuda": - # CUDA optimizations - env_vars.update({"TORCH_CUDA_ARCH_LIST": "6.0 6.1 7.0 7.5 8.0 8.6+PTX"}) + # CUDA optimizations - only set NVIDIA-specific vars when not on ROCm + if not self.is_rocm: + env_vars.update({"TORCH_CUDA_ARCH_LIST": "6.0 6.1 7.0 7.5 8.0 8.6+PTX"}) # Docker maps GPU_DEVICE_ID to container device 0 return env_vars def get_summary(self) -> dict[str, Any]: """Get summary of hardware configuration.""" - return { + summary: dict[str, Any] = { "system": self.system, "machine": self.machine, "device": self.device, @@ -354,6 +371,15 @@ def get_summary(self) -> dict[str, Any]: "torch_available": self.torch_available, "torch_version": self.torch_version, } + if 
self.torch_available and self.device == "cuda": + if self.is_rocm: + import torch + + summary["gpu_backend"] = "rocm" + summary["hip_version"] = torch.version.hip + else: + summary["gpu_backend"] = "cuda" + return summary def validate_configuration(self) -> tuple[bool, str]: """Validate the current configuration.""" @@ -417,18 +443,20 @@ def get_docker_runtime_config() -> dict[str, Any]: } if config.device == "cuda": - # NVIDIA GPU runtime - docker_config["deploy"]["resources"] = { - "reservations": { - "devices": [ - { - "driver": "nvidia", - "device_ids": [os.getenv("GPU_DEVICE_ID", "0")], - "capabilities": ["gpu"], - } - ] + if not config.is_rocm: + # NVIDIA GPU runtime + docker_config["deploy"]["resources"] = { + "reservations": { + "devices": [ + { + "driver": "nvidia", + "device_ids": [os.getenv("GPU_DEVICE_ID", "0")], + "capabilities": ["gpu"], + } + ] + } } - } + # ROCm uses device passthrough (/dev/kfd, /dev/dri), not Docker deploy resources return docker_config From 97c05bbd750b1d6583fd4f1e1599c2a3662a3818 Mon Sep 17 00:00:00 2001 From: Zander Date: Sun, 15 Feb 2026 15:08:56 -0500 Subject: [PATCH 3/3] fix(deps): Pin huggingface-hub<1.0.0 for pyannote.audio compatibility pyannote.audio v3 uses the deprecated use_auth_token parameter that was removed in huggingface-hub 1.0.0. Pin to <1.0.0 to prevent runtime errors during speaker diarization model loading. This affects the CUDA build as well (requirements.txt on master) but is kept separate here for easy cherry-pick reference. Co-Authored-By: Claude Opus 4.6 --- backend/requirements-rocm.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/backend/requirements-rocm.txt b/backend/requirements-rocm.txt index 16919221..033bdae2 100644 --- a/backend/requirements-rocm.txt +++ b/backend/requirements-rocm.txt @@ -55,6 +55,10 @@ whisperx==3.7.0 # Do NOT install ctranslate2 from PyPI here - the PyPI version is CUDA-only. # The Dockerfile handles downloading and installing the ROCm wheel. 
+# Pin huggingface-hub to <1.0.0 because pyannote.audio v3 uses deprecated +# use_auth_token parameter removed in huggingface-hub 1.0.0 +huggingface-hub<1.0.0 + # PyAnnote (compatible with NumPy 2.x and PyTorch 2.6+) pyannote.audio>=3.3.2