Skip to content

Commit e6b50b3

Browse files
committed
feat(gpu): add Blackwell/DGX Spark GPU support (PR #154)
1 parent 02053ae commit e6b50b3

8 files changed

Lines changed: 715 additions & 31 deletions

File tree

backend/Dockerfile.blackwell

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
# =============================================================================
# OpenTranscribe Backend - Blackwell / DGX Spark Dockerfile
#
# Specialized image for NVIDIA Blackwell architecture (SM_121 / GB10) GPUs.
# Based on NVIDIA's PyTorch container which bundles the correct CUDA 12.x
# toolkit and ARM64-native torch builds.
#
# Key differences from Dockerfile.prod:
#   - Uses NVIDIA PyTorch base image (nvcr.io/nvidia/pytorch) instead of
#     python:3.13-slim to get ARM64 CUDA support
#   - Applies SM_121 -> SM_90 compatibility patches via scripts/blackwell_patches.py
#   - Pins huggingface_hub to avoid deprecated API calls in NVIDIA stack
#   - Sets CUDA_FORCE_PTX_JIT=1 for forward-compatible PTX execution
#
# Usage:
#   docker build -t opentranscribe-backend-blackwell -f Dockerfile.blackwell .
#   (Then reference in docker-compose.blackwell.yml)
# =============================================================================

FROM nvcr.io/nvidia/pytorch:25.01-py3

LABEL org.opencontainers.image.title="OpenTranscribe Backend (Blackwell)" \
      org.opencontainers.image.description="AI-powered transcription backend for NVIDIA Blackwell/DGX Spark" \
      org.opencontainers.image.vendor="OpenTranscribe" \
      org.opencontainers.image.authors="OpenTranscribe Contributors" \
      org.opencontainers.image.licenses="AGPL-3.0" \
      org.opencontainers.image.source="https://github.com/davidamacey/OpenTranscribe"

WORKDIR /app

# ---------------------------------------------------------------------------
# Blackwell SM_121 compatibility environment variables
# ---------------------------------------------------------------------------
# CUDA_FORCE_PTX_JIT:   Forces PTX JIT compilation for forward compatibility
#                       when the installed CUDA toolkit doesn't know SM_121
#                       natively.
# TORCH_CUDA_ARCH_LIST: Tells PyTorch extensions to compile for SM_90
#                       (Hopper), which is binary-compatible with Blackwell
#                       via PTX fallback.
# NOTE: DEBIAN_FRONTEND is deliberately NOT exported here. Baking it into the
# runtime environment changes apt behavior in debug shells and derived images;
# it is set inline on the apt-get RUN below instead.
ENV CUDA_FORCE_PTX_JIT=1 \
    TORCH_CUDA_ARCH_LIST="9.0" \
    PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONWARNINGS=ignore::UserWarning:pyannote.audio.core.io

# Use bash with pipefail for every subsequent RUN so a failure on the left
# side of a pipe (e.g. `pip freeze | grep` below) aborts the build instead of
# being silently masked by /bin/sh (hadolint DL4006).
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# ---------------------------------------------------------------------------
# System dependencies (runtime only)
# ---------------------------------------------------------------------------
# Packages sorted alphabetically; apt caches purged in the SAME layer so the
# downloaded package lists never persist in the image.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        curl \
        ffmpeg \
        libgomp1 \
        libimage-exiftool-perl \
        libsndfile1 \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# ---------------------------------------------------------------------------
# Create directories for runtime and caches
# ---------------------------------------------------------------------------
# Note: The NVIDIA base image uses 'user' (UID 1000), not 'appuser'.
# Cache volume mounts in docker-compose.blackwell.yml use /home/user/ paths.
# NOTE(review): no USER directive is set, so the container runs as root.
# Confirm whether switching to UID 1000 is compatible with GPU device access
# and the compose volumes' ownership before adding one.
RUN mkdir -p /app/models /app/temp \
    /home/user/.cache/huggingface \
    /home/user/.cache/torch \
    /home/user/.cache/nltk_data \
    /home/user/.cache/sentence-transformers \
    /home/user/.cache/yt-dlp

# ---------------------------------------------------------------------------
# Save NVIDIA-stack torch packages before pip installs can overwrite them
# ---------------------------------------------------------------------------
# The NVIDIA base image bundles a custom torch/torchaudio/torchvision built
# against their CUDA toolkit. pip installs below might pull in PyPI versions
# that lack the NVIDIA-specific patches. We save and restore.
# If grep matches nothing it exits non-zero and (with pipefail) fails the
# build on purpose: that would mean the base image is not what we expect.
RUN pip freeze | grep -iE '^(torch|torchaudio|torchvision)==' > /tmp/nvidia_torch_versions.txt

# ---------------------------------------------------------------------------
# Base Python tools
# ---------------------------------------------------------------------------
# hadolint ignore=DL3013
RUN pip install --no-cache-dir --upgrade pip setuptools wheel

# ---------------------------------------------------------------------------
# Install application dependencies
# ---------------------------------------------------------------------------
# requirements.txt is copied alone (before the full source tree) so this
# expensive layer stays cached until the dependency list itself changes.
COPY requirements.txt .
RUN pip install --no-cache-dir --no-warn-script-location -r requirements.txt

# ---------------------------------------------------------------------------
# Restore NVIDIA torch stack (overwrite any PyPI versions pulled by deps)
# ---------------------------------------------------------------------------
# BUGFIX: the previous form — pip install "$(cat file | tr '\n' ' ')" — quoted
# the command substitution, collapsing every pinned spec into a SINGLE
# argument ("torch==X torchaudio==Y ...") that pip's requirement parser
# rejects. xargs passes each "pkg==version" line as its own argument.
# hadolint ignore=DL3013
RUN xargs --no-run-if-empty --arg-file=/tmp/nvidia_torch_versions.txt \
    pip install --no-cache-dir --force-reinstall

# ---------------------------------------------------------------------------
# Pin huggingface_hub for NVIDIA base image compatibility
# ---------------------------------------------------------------------------
# The NVIDIA PyTorch 25.01 base bundles libraries that still use the
# deprecated `use_auth_token` parameter removed in huggingface_hub>=0.24.
# This pin is Blackwell-specific and intentionally NOT in requirements.txt.
RUN pip install --no-cache-dir "huggingface_hub==0.23.5"

# ---------------------------------------------------------------------------
# Install WhisperX (with --no-deps to preserve NVIDIA torch stack)
# ---------------------------------------------------------------------------
RUN pip install --no-cache-dir --no-deps "whisperx==3.8.1"

# ---------------------------------------------------------------------------
# Install PyAnnote fork with GPU optimizations
# ---------------------------------------------------------------------------
# Custom fork with vectorized chunking, adaptive batch size, TF32, CUDA
# streams, and memory-safe batch indexing.
# NOTE(review): the git ref is a branch, not a commit SHA, so this layer is
# not reproducible — consider pinning to a commit hash.
RUN pip install --no-cache-dir --no-deps \
    "pyannote.audio @ git+https://github.com/davidamacey/pyannote-audio.git@gpu-optimizations"

# ---------------------------------------------------------------------------
# Apply Blackwell SM_121 -> SM_90 compatibility patches
# ---------------------------------------------------------------------------
# Patches torch, torchaudio, and pyannote for SM_121 compatibility.
# See scripts/blackwell_patches.py for details.
COPY scripts/blackwell_patches.py scripts/blackwell_patches.py
RUN python scripts/blackwell_patches.py

# ---------------------------------------------------------------------------
# Set cache environment variables
# ---------------------------------------------------------------------------
# These point at the directories created above, which docker-compose mounts
# as persistent volumes.
ENV HF_HOME=/home/user/.cache/huggingface \
    TORCH_HOME=/home/user/.cache/torch \
    NLTK_DATA=/home/user/.cache/nltk_data \
    SENTENCE_TRANSFORMERS_HOME=/home/user/.cache/sentence-transformers

# ---------------------------------------------------------------------------
# Copy application code
# ---------------------------------------------------------------------------
# Kept last so routine source edits never invalidate the dependency layers.
COPY . .

EXPOSE 8080

HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \
    CMD curl -fsS http://localhost:8080/health || exit 1

CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8080"]

backend/app/tasks/utility.py

Lines changed: 105 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,59 @@
1616
logger = logging.getLogger(__name__)
1717

1818

19+
def _safe_float(value: str | None) -> float | None:
20+
"""Parse a numeric string to float, returning None for non-numeric values.
21+
22+
Handles nvidia-smi reporting '[N/A]' or 'N/A' on systems with unified
23+
memory (e.g. NVIDIA DGX Spark / Blackwell GB10).
24+
25+
Args:
26+
value: String to parse, or None.
27+
28+
Returns:
29+
Parsed float, or None if the value is not a valid number.
30+
"""
31+
if value is None:
32+
return None
33+
value = value.strip()
34+
if not value or value.upper() in ("[N/A]", "N/A", "N/A%"):
35+
return None
36+
try:
37+
return float(value)
38+
except (ValueError, TypeError):
39+
return None
40+
41+
42+
def _query_gpu_memory_torch(device_id: int) -> tuple[float, float] | None:
43+
"""Query GPU memory via torch.cuda.mem_get_info() as a fallback.
44+
45+
This is needed for systems with unified CPU+GPU memory (e.g. DGX Spark)
46+
where nvidia-smi reports memory stats as '[N/A]'.
47+
48+
Args:
49+
device_id: CUDA device index.
50+
51+
Returns:
52+
Tuple of (free_bytes, total_bytes), or None if torch is unavailable.
53+
"""
54+
try:
55+
import torch
56+
57+
if not torch.cuda.is_available():
58+
return None
59+
free, total = torch.cuda.mem_get_info(device_id)
60+
return (float(free), float(total))
61+
except Exception as e:
62+
logger.debug(f"torch.cuda.mem_get_info({device_id}) failed: {e}")
63+
return None
64+
65+
1966
def _query_single_gpu(device_id: int, subprocess_mod, format_bytes) -> dict | None:
20-
"""Query nvidia-smi for one GPU device and return a parsed stats dict.
67+
"""Query GPU stats for one device, with unified-memory fallback.
68+
69+
First tries nvidia-smi for full stats. If memory values come back as
70+
'[N/A]' (unified memory systems like DGX Spark), falls back to
71+
torch.cuda.mem_get_info() for memory data.
2172
2273
Args:
2374
device_id: NVIDIA device index to query.
@@ -31,7 +82,8 @@ def _query_single_gpu(device_id: int, subprocess_mod, format_bytes) -> dict | No
3182
result = subprocess_mod.run( # noqa: S603 # nosec B603 B607
3283
[ # noqa: S607
3384
"nvidia-smi",
34-
"--query-gpu=name,memory.used,memory.total,memory.free,utilization.gpu,temperature.gpu",
85+
"--query-gpu=name,memory.used,memory.total,memory.free,"
86+
"utilization.gpu,temperature.gpu",
3587
"--format=csv,noheader,nounits",
3688
f"--id={device_id}",
3789
],
@@ -41,15 +93,56 @@ def _query_single_gpu(device_id: int, subprocess_mod, format_bytes) -> dict | No
4193
)
4294
parts = result.stdout.strip().split(", ")
4395
gpu_name = parts[0]
44-
memory_used_mib = float(parts[1])
45-
memory_total_mib = float(parts[2])
46-
memory_free_mib = float(parts[3])
47-
utilization_percent = int(parts[4]) if len(parts) > 4 else None
48-
temperature_celsius = int(parts[5]) if len(parts) > 5 else None
49-
50-
memory_used = memory_used_mib * 1024 * 1024
51-
memory_total = memory_total_mib * 1024 * 1024
52-
memory_free = memory_free_mib * 1024 * 1024
96+
memory_used_mib = _safe_float(parts[1])
97+
memory_total_mib = _safe_float(parts[2])
98+
memory_free_mib = _safe_float(parts[3])
99+
utilization_percent = (
100+
int(parts[4]) if len(parts) > 4 and _safe_float(parts[4]) is not None else None
101+
)
102+
temperature_celsius = (
103+
int(parts[5]) if len(parts) > 5 and _safe_float(parts[5]) is not None else None
104+
)
105+
106+
memory_source = "nvidia-smi"
107+
108+
# If nvidia-smi returned N/A for memory (unified memory systems),
109+
# fall back to torch.cuda.mem_get_info()
110+
if memory_total_mib is None or memory_used_mib is None:
111+
torch_mem = _query_gpu_memory_torch(device_id)
112+
if torch_mem is not None:
113+
free_bytes, total_bytes = torch_mem
114+
used_bytes = total_bytes - free_bytes
115+
memory_total = total_bytes
116+
memory_used = used_bytes
117+
memory_free = free_bytes
118+
memory_source = "torch.cuda.mem_get_info"
119+
else:
120+
# Both nvidia-smi and torch failed — report what we can
121+
return {
122+
"available": True,
123+
"device_id": device_id,
124+
"name": gpu_name,
125+
"memory_total": "N/A (unified memory)",
126+
"memory_used": "N/A",
127+
"memory_free": "N/A",
128+
"memory_percent": "N/A",
129+
"utilization_percent": f"{utilization_percent}%"
130+
if utilization_percent is not None
131+
else "N/A",
132+
"temperature_celsius": temperature_celsius,
133+
"memory_source": "unavailable",
134+
"memory_note": "Unified memory system — install PyTorch "
135+
"for memory stats via torch.cuda.mem_get_info()",
136+
}
137+
else:
138+
memory_used = memory_used_mib * 1024 * 1024
139+
memory_total = memory_total_mib * 1024 * 1024
140+
memory_free = (
141+
memory_free_mib * 1024 * 1024
142+
if memory_free_mib is not None
143+
else memory_total - memory_used
144+
)
145+
53146
memory_percent = (memory_used / memory_total * 100) if memory_total > 0 else 0
54147

55148
return {
@@ -64,6 +157,7 @@ def _query_single_gpu(device_id: int, subprocess_mod, format_bytes) -> dict | No
64157
if utilization_percent is not None
65158
else "N/A",
66159
"temperature_celsius": temperature_celsius,
160+
"memory_source": memory_source,
67161
}
68162
except Exception as e:
69163
logger.warning(f"nvidia-smi query for device {device_id} failed: {e}")

0 commit comments

Comments
 (0)