-
Notifications
You must be signed in to change notification settings - Fork 16
Expand file tree
/
Copy pathdocker-compose.gpu-scale.yml
More file actions
74 lines (72 loc) · 3.06 KB
/
docker-compose.gpu-scale.yml
File metadata and controls
74 lines (72 loc) · 3.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# docker-compose.gpu-scale.yml
# Optional overlay for multi-GPU worker scaling
#
# This overlay enables parallel GPU workers on a dedicated GPU device,
# significantly increasing transcription throughput for users with multiple GPUs.
#
# Usage:
# ./opentr.sh start dev --gpu-scale
# OR
# docker compose -f docker-compose.yml -f docker-compose.override.yml -f docker-compose.gpu-scale.yml up
#
# Note: This overlay extends the base celery-worker-gpu-scaled definition from docker-compose.yml
# and adds GPU-specific configuration. Image/build/volumes come from environment overlays
# (override.yml for dev, prod.yml for production, offline.yml for airgapped).
#
# Configuration:
# Set these variables in your .env file:
# - GPU_SCALE_DEVICE_ID=2 # GPU device ID for scaled workers (default: 2)
# - GPU_SCALE_WORKERS=4 # Number of parallel workers (default: 4)
# - GPU_SCALE_DEFAULT_WORKER=0 # Set to 1 to keep default worker (dual-GPU mode)
#
# Single-GPU Mode (default): Replaces default worker with scaled workers on one GPU.
# GPU_SCALE_DEFAULT_WORKER=0 (default)
#
# Dual-GPU Mode: Keeps default worker on GPU_DEVICE_ID alongside scaled workers.
# GPU_SCALE_DEFAULT_WORKER=1
#
# Example Dual-GPU Setup (5 total workers):
# GPU 0: NVIDIA RTX A6000 (49GB) - Scaled workers (4 parallel in single container)
# GPU 1: RTX 3080 Ti (12GB) - Default single worker (GPU_DEVICE_ID=1)
# GPU 2: NVIDIA RTX A6000 (49GB) - Running LLM model (vLLM, external)
services:
  # Default worker (base definition in docker-compose.yml).
  # Single-GPU mode (GPU_SCALE_DEFAULT_WORKER=0, the default): scale 0 removes it
  # so the scaled workers own the GPU. Dual-GPU mode (GPU_SCALE_DEFAULT_WORKER=1):
  # scale 1 keeps it alive on GPU_DEVICE_ID alongside the scaled workers.
  celery-worker:
    scale: ${GPU_SCALE_DEFAULT_WORKER:-0}

  # Scaled GPU Worker - GPU-specific settings only.
  # Base definition (env, tmpfs, depends_on) in docker-compose.yml;
  # image/build/volumes from override.yml (dev) or prod.yml/offline.yml.
  celery-worker-gpu-scaled:
    scale: 1  # Enable this service (base has scale: 0)
    # One container running GPU_SCALE_WORKERS parallel Celery workers on the
    # dedicated "gpu" queue. The folded scalar (>) joins these lines with
    # spaces into a single command string.
    command: >
      celery -A app.core.celery worker
      -Q gpu
      --pool=${GPU_SCALE_POOL:-threads}
      --concurrency=${GPU_SCALE_WORKERS:-4}
      --max-tasks-per-child=${GPU_SCALE_MAX_TASKS:-500}
      -n gpu-scaled@%h
      -E
      --loglevel=info
    environment:
      # CUDA_VISIBLE_DEVICES=0 because Docker's device reservation (deploy.resources)
      # already maps the host GPU (GPU_SCALE_DEVICE_ID) to device index 0 inside the
      # container. Setting this to the host device ID (e.g., 2) would fail because
      # the container only sees one GPU at index 0.
      - CUDA_VISIBLE_DEVICES=0
      # Tell TranscriptionConfig how many concurrent tasks share GPU weights
      - GPU_CONCURRENT_REQUESTS=${GPU_SCALE_WORKERS:-4}
      - PRELOAD_GPU_MODELS=true
    # Ping this container's own Celery node by its explicit name. "$$" escapes
    # "$" from Compose interpolation so HOSTNAME is expanded by the shell inside
    # the container (matching the %h in the worker's -n flag).
    healthcheck:
      test: ["CMD-SHELL", "celery -A app.core.celery inspect ping -d gpu-scaled@$$HOSTNAME"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s
    # GPU configuration for multi-GPU scaling: reserve exactly one host GPU
    # (GPU_SCALE_DEVICE_ID, default 2) for this service.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ["${GPU_SCALE_DEVICE_ID:-2}"]
              capabilities: [gpu]