-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsetup.sh
More file actions
executable file
·142 lines (123 loc) · 4.25 KB
/
setup.sh
File metadata and controls
executable file
·142 lines (123 loc) · 4.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#!/bin/bash
# Orpheus TTS Setup Script for RTX 4090
# Direct system installation (no Docker)
# Optimized for FP8 with TensorRT-LLM
#
# Strict mode: -e aborts on unhandled errors, -u errors on unset
# variables, and pipefail stops a failing pipeline stage from being
# masked by a succeeding final stage (e.g. `cmd | head -1`).
set -euo pipefail
echo "============================================================"
echo "Orpheus TTS Setup for RTX 4090 (FP8 TensorRT-LLM)"
echo "============================================================"
echo ""
# --- CUDA toolchain check ------------------------------------------
# Verify nvcc is on PATH and report the toolkit release number.
echo "Checking CUDA installation..."
if ! command -v nvcc > /dev/null 2>&1; then
  echo "Error: CUDA not found. Please install CUDA 12.x"
  echo "Visit: https://developer.nvidia.com/cuda-downloads"
  exit 1
fi
# Extract e.g. "12.1" from the "... release 12.1, V12.1.105" banner line.
CUDA_VERSION=$(nvcc --version | sed -n 's/.*release \([^,]*\),.*/\1/p')
echo "✓ CUDA Version: $CUDA_VERSION"
# Check GPU: report name/memory and whether the part supports FP8
# (Ada Lovelace / Hopper, i.e. compute capability >= 8.9).
echo ""
echo "Checking GPU..."
if command -v nvidia-smi &> /dev/null; then
  GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader | head -1)
  GPU_MEMORY=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader | head -1)
  # Older drivers do not support the compute_cap query. The original
  # redirected head's stderr instead of nvidia-smi's, so errors still
  # leaked — and because head itself always exits 0, the "unknown"
  # fallback could never fire. Silence nvidia-smi and default explicitly.
  COMPUTE_CAP=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | head -1 || true)
  COMPUTE_CAP=${COMPUTE_CAP:-unknown}
  echo "✓ GPU: $GPU_NAME"
  echo "✓ Memory: $GPU_MEMORY"
  # Check for 4090 / Ada Lovelace for FP8 support
  if [[ "$GPU_NAME" == *"4090"* ]] || [[ "$GPU_NAME" == *"4080"* ]] || [[ "$GPU_NAME" == *"Ada"* ]]; then
    echo "✓ FP8 Support: YES (Ada Lovelace)"
  elif [[ "$GPU_NAME" == *"H100"* ]] || [[ "$GPU_NAME" == *"H200"* ]]; then
    echo "✓ FP8 Support: YES (Hopper)"
  else
    # Surface the (previously dead) compute capability so the operator
    # can judge FP8 support without re-querying.
    echo "⚠ FP8 Support: Check compute capability >= 8.9 (detected: $COMPUTE_CAP)"
  fi
else
  echo "Warning: nvidia-smi not found"
fi
# Create (or reuse) a local Python virtual environment and upgrade the
# packaging toolchain inside it.
echo ""
echo "Creating virtual environment..."
# Fail early with a clear message if python3 itself is missing, instead
# of letting `python3 -m venv` die with a raw "command not found".
if ! command -v python3 &> /dev/null; then
  echo "Error: python3 not found. Please install Python 3.x" >&2
  exit 1
fi
if [ ! -d "venv" ]; then
  python3 -m venv venv
fi
# shellcheck disable=SC1091 -- venv/bin/activate is created just above
source venv/bin/activate
# Upgrade pip
pip install --upgrade pip wheel setuptools
# Install PyTorch with CUDA
echo ""
echo "Installing PyTorch with CUDA 12.1..."
# NOTE(review): the cu121 wheel index is hard-coded while the earlier
# CUDA check accepts any toolkit version — confirm it matches the
# detected $CUDA_VERSION before shipping.
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
# Verify PyTorch CUDA
# Smoke test: import torch from the activated venv and report whether
# the CUDA runtime is visible to it.
python3 -c "import torch; print(f'PyTorch: {torch.__version__}, CUDA: {torch.cuda.is_available()}')"
# Install TensorRT-LLM from NVIDIA
echo ""
echo "Installing TensorRT-LLM..."
echo "This may take a few minutes..."
# --pre is required: TensorRT-LLM publishes pre-release wheels on
# NVIDIA's package index (pypi.nvidia.com).
pip install tensorrt_llm -U --pre --extra-index-url https://pypi.nvidia.com
# Verify TensorRT-LLM
# The import may legitimately fail until the full install settles; the
# trailing || keeps the script (running under set -e) from aborting on
# that expected case.
python3 -c "import tensorrt_llm; print(f'TensorRT-LLM: {tensorrt_llm.__version__}')" || echo "TensorRT-LLM verification will complete after full install"
# Install the SNAC audio codec, the project's pinned requirements, and
# create the working directory layout.
echo ""
echo "Installing SNAC audio codec..."
pip install snac
# Install other requirements
echo ""
echo "Installing other requirements..."
# Fail with an actionable message instead of a raw pip error when the
# script is launched from outside the repository root.
if [ ! -f requirements.txt ]; then
  echo "Error: requirements.txt not found — run this script from the repo root" >&2
  exit 1
fi
pip install -r requirements.txt
# Create directories
echo ""
echo "Creating directories..."
mkdir -p models checkpoints engines logs
# Download Orpheus TTS model
# Fetches the full model snapshot into ./models/orpheus-tts as real files
# (no symlinks into the HF cache), so the engine builder can read them
# directly.
# NOTE(review): local_dir_use_symlinks is deprecated in recent
# huggingface_hub releases (ignored with a warning) — confirm the hub
# version pinned in requirements.txt before removing it.
echo ""
echo "Downloading Orpheus TTS model..."
python3 -c "
from huggingface_hub import snapshot_download
print('Downloading Orpheus TTS model (this may take a while)...')
snapshot_download(
repo_id='canopylabs/orpheus-tts-0.1-finetune-prod',
local_dir='./models/orpheus-tts',
local_dir_use_symlinks=False
)
print('✓ Orpheus TTS model downloaded!')
"
# Download SNAC decoder
# Same pattern for the 24 kHz SNAC audio decoder used at inference time.
echo ""
echo "Downloading SNAC audio decoder..."
python3 -c "
from huggingface_hub import snapshot_download
print('Downloading SNAC 24kHz decoder...')
snapshot_download(
repo_id='hubertsiuzdak/snac_24khz',
local_dir='./models/snac_24khz',
local_dir_use_symlinks=False
)
print('✓ SNAC decoder downloaded!')
"
echo ""
echo "============================================================"
echo "SETUP COMPLETE!"
echo "============================================================"
echo ""
echo "Next steps:"
echo ""
echo "1. Build the TensorRT-LLM engine (FP8 quantized):"
echo " source venv/bin/activate"
echo " python build_engine.py --config config/model_config.yaml"
echo ""
echo "2. Start the streaming server:"
echo " python streaming_server.py --config config/model_config.yaml"
echo ""
echo "3. Test with client:"
echo " python client.py --text 'Hello, world!' --output test.wav"
echo ""
echo "4. Run benchmarks:"
echo " python benchmark.py --mode all"
echo ""
echo "Expected performance on RTX 4090:"
echo " - TTFB: < 150ms"
echo " - Concurrent streams: 8-12 real-time"
echo " - RTF: 30-60x faster than real-time"
echo ""