# .env.example — 79 lines (59 loc) · 2.75 KB
# ============================================================================
# Svara TTS API - Environment Configuration
# ============================================================================
# Copy this file to .env and customize the values for your deployment
# ============================================================================
# vLLM Configuration (embedded engine)
# ============================================================================
# Model repository ID from Hugging Face
# Default: kenpath/svara-tts-v1
VLLM_MODEL=kenpath/svara-tts-v1
# GPU memory utilization (0.0-1.0)
# Lower values leave more memory for other processes
# Recommended: 0.85-0.95 for dedicated GPU
VLLM_GPU_MEMORY_UTILIZATION=0.9
# Maximum model context length (in tokens)
# Lower this value if you run out of GPU memory
VLLM_MAX_MODEL_LEN=4096
# Number of GPUs to use for tensor parallelism
# Set to 1 for single GPU, increase for multi-GPU setups
VLLM_TENSOR_PARALLEL_SIZE=1
# Quantization method: fp8, awq, gptq, etc. Leave empty to disable quantization.
VLLM_QUANTIZATION=
# Use eager mode (disable CUDA graphs). Set to true for debugging.
VLLM_ENFORCE_EAGER=false
# Data type for model weights (auto, float16, bfloat16, float32)
VLLM_DTYPE=auto
# Attention backend (empty = auto, or FLASH_ATTN, FLASHINFER, TRITON_ATTN)
# VLLM_ATTENTION_BACKEND=
# KV cache data type (auto, fp8, fp8_e5m2, fp8_e4m3)
# fp8 reduces KV cache memory ~2x, enabling more concurrent requests
VLLM_KV_CACHE_DTYPE=auto
# ============================================================================
# FastAPI Server Configuration
# ============================================================================
# FastAPI server host and port
API_HOST=0.0.0.0
API_PORT=8080
# ============================================================================
# TTS Engine Configuration
# ============================================================================
# Device for SNAC decoder: cuda, mps, cpu, or leave empty for auto-detect
SNAC_DEVICE=cpu
# SNAC mapper window size (must be multiple of 7)
# 28 = 4 frames (default, fast TTFB)
# 56 = 8 frames (fewer decode calls, better throughput, slower TTFB)
SNAC_WINDOW_SIZE=28
# Compile SNAC model with torch.compile (true = faster decode after warmup)
SNAC_COMPILE=true
# ============================================================================
# Hugging Face Configuration
# ============================================================================
# Hugging Face API token (optional)
# Required for gated models or private repositories
HF_TOKEN=
# ============================================================================
# Logging Configuration
# ============================================================================
# Log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
LOG_LEVEL=INFO