-
Notifications
You must be signed in to change notification settings - Fork 16
Expand file tree
/
Copy pathdocker-compose.gpu-scale.yml
More file actions
74 lines (72 loc) · 3.06 KB
/
docker-compose.gpu-scale.yml
File metadata and controls
74 lines (72 loc) · 3.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# docker-compose.gpu-scale.yml
# Optional overlay for multi-GPU worker scaling
#
# This overlay enables parallel GPU workers on a dedicated GPU device,
# significantly increasing transcription throughput for users with multiple GPUs.
#
# Usage:
# ./opentr.sh start dev --gpu-scale
# OR
# docker compose -f docker-compose.yml -f docker-compose.override.yml -f docker-compose.gpu-scale.yml up
#
# Note: This overlay extends the base celery-worker-gpu-scaled definition from docker-compose.yml
# and adds GPU-specific configuration. Image/build/volumes come from environment overlays
# (override.yml for dev, prod.yml for production, offline.yml for airgapped).
#
# Configuration:
# Set these variables in your .env file:
# - GPU_SCALE_DEVICE_ID=2 # GPU device ID for scaled workers (default: 2)
# - GPU_SCALE_WORKERS=4 # Number of parallel workers (default: 4)
# - GPU_SCALE_DEFAULT_WORKER=0 # Set to 1 to keep default worker (dual-GPU mode)
#
# Single-GPU Mode (default): Replaces default worker with scaled workers on one GPU.
# GPU_SCALE_DEFAULT_WORKER=0 (default)
#
# Dual-GPU Mode: Keeps default worker on GPU_DEVICE_ID alongside scaled workers.
# GPU_SCALE_DEFAULT_WORKER=1
#
# Example Dual-GPU Setup (5 total workers):
# GPU 0: NVIDIA RTX A6000 (49GB) - Scaled workers (4 parallel in single container)
# GPU 1: RTX 3080 Ti (12GB) - Default single worker (GPU_DEVICE_ID=1)
# GPU 2: NVIDIA RTX A6000 (49GB) - Running LLM model (vLLM, external)
services:
  # Default worker (base definition in docker-compose.yml).
  # Single-GPU mode (GPU_SCALE_DEFAULT_WORKER=0, the default): scale 0 removes it
  # so the scaled workers own the GPU. Dual-GPU mode (GPU_SCALE_DEFAULT_WORKER=1):
  # scale 1 keeps it alive on GPU_DEVICE_ID alongside the scaled workers.
  celery-worker:
    scale: ${GPU_SCALE_DEFAULT_WORKER:-0}

  # Scaled GPU Worker - GPU-specific settings only.
  # Base definition (env, tmpfs, depends_on) in docker-compose.yml;
  # image/build/volumes from override.yml (dev) or prod.yml/offline.yml.
  celery-worker-gpu-scaled:
    scale: 1  # Enable this service (base has scale: 0)
    # One container running GPU_SCALE_WORKERS parallel Celery workers on the
    # dedicated "gpu" queue. The folded scalar (>) joins these lines with
    # spaces into a single command string.
    command: >
      celery -A app.core.celery worker
      -Q gpu
      --pool=${GPU_SCALE_POOL:-threads}
      --concurrency=${GPU_SCALE_WORKERS:-4}
      --max-tasks-per-child=${GPU_SCALE_MAX_TASKS:-500}
      -n gpu-scaled@%h
      -E
      --loglevel=info
    environment:
      # CUDA_VISIBLE_DEVICES=0 because Docker's device reservation (deploy.resources)
      # already maps the host GPU (GPU_SCALE_DEVICE_ID) to device index 0 inside the
      # container. Setting this to the host device ID (e.g., 2) would fail because
      # the container only sees one GPU at index 0.
      - CUDA_VISIBLE_DEVICES=0
      # Tell TranscriptionConfig how many concurrent tasks share GPU weights
      - GPU_CONCURRENT_REQUESTS=${GPU_SCALE_WORKERS:-4}
      - PRELOAD_GPU_MODELS=true
    # Ping this container's own Celery node by its explicit name. "$$" escapes
    # "$" from Compose interpolation so HOSTNAME is expanded by the shell inside
    # the container (matching the %h in the worker's -n flag).
    healthcheck:
      test: ["CMD-SHELL", "celery -A app.core.celery inspect ping -d gpu-scaled@$$HOSTNAME"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s
    # GPU configuration for multi-GPU scaling: reserve exactly one host GPU
    # (GPU_SCALE_DEVICE_ID, default 2) for this service.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ["${GPU_SCALE_DEVICE_ID:-2}"]
              capabilities: [gpu]