# docker-compose.yml — Kenpath/svara-tts-inference (main branch)
services:
  # Single-container deployment: the API server with vLLM embedded in-process.
  svara-tts-api:
    build:
      context: .
      dockerfile: Dockerfile
    image: svara-tts-api:latest
    container_name: svara-tts-api

    # GPU configuration: reserve every NVIDIA GPU visible to the Docker host.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]

    # Port mappings
    # The published port follows API_PORT so that overriding it in .env keeps
    # the host mapping, the in-container listener and the healthcheck aligned.
    # With API_PORT unset this is exactly "8080:8080".
    # NOTE(review): assumes the application binds to API_PORT — confirm.
    ports:
      - "${API_PORT:-8080}:${API_PORT:-8080}"  # FastAPI API server (vLLM embedded)

    # Environment variables (override in .env file).
    # Each entry uses ${NAME:-default}: the value from the shell/.env wins,
    # otherwise the default after ":-" is used. An empty default (":-}")
    # passes the variable through as an empty string.
    environment:
      # vLLM Configuration
      - VLLM_MODEL=${VLLM_MODEL:-kenpath/svara-tts-v1}
      - VLLM_GPU_MEMORY_UTILIZATION=${VLLM_GPU_MEMORY_UTILIZATION:-0.9}
      - VLLM_MAX_MODEL_LEN=${VLLM_MAX_MODEL_LEN:-4096}
      - VLLM_TENSOR_PARALLEL_SIZE=${VLLM_TENSOR_PARALLEL_SIZE:-1}
      - VLLM_QUANTIZATION=${VLLM_QUANTIZATION:-}
      - VLLM_ENFORCE_EAGER=${VLLM_ENFORCE_EAGER:-false}
      - VLLM_ATTENTION_BACKEND=${VLLM_ATTENTION_BACKEND:-}
      - VLLM_KV_CACHE_DTYPE=${VLLM_KV_CACHE_DTYPE:-auto}
      - VLLM_DTYPE=${VLLM_DTYPE:-auto}

      # API Configuration
      - API_PORT=${API_PORT:-8080}
      - API_HOST=${API_HOST:-0.0.0.0}
      - SNAC_DEVICE=${SNAC_DEVICE:-cpu}
      - SNAC_WINDOW_SIZE=${SNAC_WINDOW_SIZE:-28}
      - SNAC_COMPILE=${SNAC_COMPILE:-true}

      # Logging
      - LOG_LEVEL=${LOG_LEVEL:-INFO}

      # Hugging Face Token (optional, for gated models)
      - HF_TOKEN=${HF_TOKEN:-}

    # Volume mounts
    volumes:
      # Cache Hugging Face models to avoid re-downloading
      - huggingface_cache:/root/.cache/huggingface
      # Optional: Mount local code for development
      # - ./tts_engine:/app/tts_engine
      # - ./api:/app/api

    # Restart policy: restart automatically unless explicitly stopped.
    restart: unless-stopped

    # Health check
    # Probes the /health endpoint on the same port the API is configured to
    # use. Failures during the 120s start period do not count toward retries.
    # NOTE(review): requires curl inside the image — confirm in the Dockerfile.
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:${API_PORT:-8080}/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 120s

    # Logging configuration: rotate JSON logs at 10 MB, keeping 3 files.
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "3"

# Named volumes
volumes:
  # Mounted by svara-tts-api at /root/.cache/huggingface so the Hugging Face
  # cache survives container re-creation.
  huggingface_cache:
    driver: local