42 changes: 42 additions & 0 deletions README.md
@@ -192,6 +192,48 @@ For GPUs with more than 40GB of GPU memory, **e.g., H100, please use the unquant

Interactive inference via the terminal is available at `turbodiffusion/serve/`. This allows multi-turn video generation without reloading the model.

### Memory Optimization: Pre-caching T5 Embeddings

The umT5-XXL text encoder alone requires ~11GB of VRAM, which can cause OOM on 32GB GPUs once the DiT models are loaded alongside it. To avoid this, pre-cache the text embeddings in a separate pass:

**Memory Comparison:**
| Approach | Peak VRAM | Notes |
|----------|-----------|-------|
| Standard (T5 + DiT) | 30GB+ | May OOM on 32GB GPUs |
| Cached embeddings | ~18GB | T5 never loaded during inference |

**Step 1: Cache the embedding (loads T5, encodes prompt, saves to file, unloads T5)**
```bash
python scripts/cache_t5.py \
--prompt "slow head turn, cinematic" \
--output cached_embeddings.pt
```
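
Conceptually, the caching step is one encoder forward pass followed by a `torch.save`. Below is a hedged sketch of that pass; the Hugging Face `google/umt5-xxl` checkpoint, the 512-token context, and the saved dict layout are assumptions, and the bundled `scripts/cache_t5.py` may differ:
```python
# Sketch only -- assumes the HF umT5-XXL checkpoint; the real script may differ.
import torch
from transformers import AutoTokenizer, UMT5EncoderModel

prompt = "slow head turn, cinematic"
tokenizer = AutoTokenizer.from_pretrained("google/umt5-xxl")
encoder = UMT5EncoderModel.from_pretrained(
    "google/umt5-xxl", torch_dtype=torch.bfloat16
).cuda()

with torch.no_grad():
    tokens = tokenizer(
        prompt, return_tensors="pt", padding="max_length",
        max_length=512, truncation=True,
    ).to("cuda")
    embedding = encoder(**tokens).last_hidden_state  # roughly [1, 512, 4096]

# Save the small embedding, then drop the ~11GB encoder before loading the DiT
torch.save({prompt: embedding.cpu()}, "cached_embeddings.pt")
del encoder
torch.cuda.empty_cache()
```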

**Step 2: Run inference with cached embedding (T5 never loaded)**
```bash
python turbodiffusion/inference/wan2.2_i2v_infer.py \
--cached_embedding cached_embeddings.pt \
--skip_t5 \
--model Wan2.2-A14B \
--low_noise_model_path checkpoints/TurboWan2.2-I2V-A14B-low-720P-quant.pth \
--high_noise_model_path checkpoints/TurboWan2.2-I2V-A14B-high-720P-quant.pth \
--image_path your_image.jpg \
--prompt "slow head turn, cinematic" \
--quant_linear --attention_type sagesla --ode
```

You can cache multiple prompts at once:
```bash
# Create a prompts file
echo "slow head turn, cinematic" > prompts.txt
echo "walking forward, dramatic lighting" >> prompts.txt

# Cache all prompts
python scripts/cache_t5.py --prompts_file prompts.txt --output my_prompts.pt
```
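
On the inference side, consuming the cache then reduces to a dictionary lookup instead of a T5 forward pass. A minimal sketch, assuming the file maps prompt strings to tensors (check `wan2.2_i2v_infer.py` for the actual layout):
```python
import torch

# Assumed layout: {prompt string: embedding tensor}; the real format may differ.
cache = torch.load("my_prompts.pt", map_location="cpu")
embedding = cache["slow head turn, cinematic"].to("cuda")
# `embedding` is then fed to the DiT sampler in place of a live T5 encoding.
```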

The cached file is only ~4MB per prompt (roughly one 512×4096 fp16 tensor), compared to the ~11GB T5 encoder it replaces at inference time.

## Evaluation

310 changes: 310 additions & 0 deletions install.sh
@@ -0,0 +1,310 @@
#!/bin/bash
# TurboDiffusion Installation Script
# For RTX 5090 (Blackwell) with CUDA 13.0
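#
# Usage: bash install.sh [--clean]
#   --clean  remove previous build artifacts before reinstalling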

set -e # Exit on error

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"

echo "=============================================="
echo "TurboDiffusion Installation Script"
echo "=============================================="
echo ""

# =============================================================================
# Check for Miniconda
# =============================================================================
check_conda() {
    if command -v conda &> /dev/null; then
        echo "✅ Conda found: $(conda --version)"
        # Hook conda into this non-interactive shell so `conda activate` works later
        eval "$(conda shell.bash hook)"
        return 0
    fi

    # Check common install locations
    for conda_path in ~/miniconda3/bin/conda ~/anaconda3/bin/conda /opt/conda/bin/conda; do
        if [ -f "$conda_path" ]; then
            echo "✅ Found conda at: $conda_path"
            eval "$($conda_path shell.bash hook)"
            return 0
        fi
    done

    return 1
}

install_miniconda() {
    echo ""
    echo "❌ Conda/Miniconda not found!"
    echo ""
    echo "Please install Miniconda first:"
    echo ""
    echo " # Download Miniconda (Linux x86_64)"
    echo " wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh"
    echo ""
    echo " # Install (follow prompts)"
    echo " bash Miniconda3-latest-Linux-x86_64.sh"
    echo ""
    echo " # Restart shell or run:"
    echo " source ~/.bashrc"
    echo ""
    echo " # Then re-run this script"
    echo ""

    read -p "Would you like to download and install Miniconda now? [y/N] " -n 1 -r
    echo
    if [[ $REPLY =~ ^[Yy]$ ]]; then
        echo "Downloading Miniconda..."
        wget -q --show-progress https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh

        echo "Installing Miniconda to ~/miniconda3..."
        bash /tmp/miniconda.sh -b -p ~/miniconda3

        echo "Initializing conda..."
        ~/miniconda3/bin/conda init bash
        eval "$(~/miniconda3/bin/conda shell.bash hook)"

        rm /tmp/miniconda.sh
        echo "✅ Miniconda installed!"
        return 0
    else
        exit 1
    fi
}

if ! check_conda; then
    install_miniconda
fi

# Source conda for current shell
if [ -f ~/miniconda3/etc/profile.d/conda.sh ]; then
    source ~/miniconda3/etc/profile.d/conda.sh
elif [ -f ~/anaconda3/etc/profile.d/conda.sh ]; then
    source ~/anaconda3/etc/profile.d/conda.sh
fi

# =============================================================================
# Check for CUDA
# =============================================================================
echo ""
echo "Checking CUDA..."

if ! command -v nvcc &> /dev/null; then
    echo "⚠️ nvcc not found in PATH"
    # Check common locations
    for cuda_path in /usr/local/cuda-13.0 /usr/local/cuda-12.9 /usr/local/cuda; do
        if [ -f "$cuda_path/bin/nvcc" ]; then
            echo " Found CUDA at: $cuda_path"
            export PATH="$cuda_path/bin:$PATH"
            export LD_LIBRARY_PATH="$cuda_path/lib64:$LD_LIBRARY_PATH"
            break
        fi
    done
fi

if command -v nvcc &> /dev/null; then
    CUDA_VERSION=$(nvcc --version | grep "release" | sed 's/.*release \([0-9]*\.[0-9]*\).*/\1/')
    echo "✅ CUDA version: $CUDA_VERSION"
else
    echo "❌ CUDA not found. Please install CUDA 13.0 for RTX 5090 support."
    exit 1
fi

# Check GPU
if command -v nvidia-smi &> /dev/null; then
    GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | head -1)
    GPU_MEMORY=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader 2>/dev/null | head -1)
    echo "✅ GPU: $GPU_NAME ($GPU_MEMORY)"
fi

# =============================================================================
# Create/Activate Conda Environment
# =============================================================================
ENV_NAME="turbodiffusion"

echo ""
echo "Setting up conda environment: $ENV_NAME"

if conda env list | grep -q "^$ENV_NAME "; then
    echo " Environment '$ENV_NAME' already exists"
    read -p " Recreate environment? [y/N] " -n 1 -r
    echo
    if [[ $REPLY =~ ^[Yy]$ ]]; then
        echo " Removing existing environment..."
        conda env remove -n $ENV_NAME -y
        echo " Creating fresh environment..."
        conda create -n $ENV_NAME python=3.12 -y
    fi
else
    echo " Creating new environment with Python 3.12..."
    conda create -n $ENV_NAME python=3.12 -y
fi

echo " Activating environment..."
conda activate $ENV_NAME

echo "✅ Python: $(python --version)"

# =============================================================================
# Install PyTorch with CUDA 13.0 (Nightly for Blackwell support)
# =============================================================================
echo ""
echo "Installing PyTorch with CUDA 13.0 support..."
echo " (Nightly build required for RTX 5090/Blackwell)"

pip3 install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu130

# Verify PyTorch installation
python -c "import torch; print(f'✅ PyTorch {torch.__version__}'); print(f' CUDA available: {torch.cuda.is_available()}'); print(f' CUDA version: {torch.version.cuda}')" || {
echo "❌ PyTorch installation failed"
exit 1
}

# =============================================================================
# Install Dependencies
# =============================================================================
echo ""
echo "Installing dependencies..."

pip install psutil

# =============================================================================
# Initialize Git Submodules (CUTLASS)
# =============================================================================
echo ""
echo "Initializing git submodules (CUTLASS)..."

if [ -d ".git" ]; then
    git submodule update --init --recursive
    echo "✅ Submodules initialized"
else
    echo "⚠️ Not a git repository, checking if CUTLASS exists..."
    if [ ! -f "turbodiffusion/ops/cutlass/include/cutlass/cutlass.h" ]; then
        echo "❌ CUTLASS not found. Please clone with: git clone --recursive <repo>"
        exit 1
    fi
fi

# Verify CUTLASS headers
if [ ! -f "turbodiffusion/ops/cutlass/include/cutlass/cutlass.h" ]; then
    echo "❌ CUTLASS headers not found after submodule init"
    exit 1
fi
echo "✅ CUTLASS headers verified"

# =============================================================================
# Build and Install TurboDiffusion
# =============================================================================
echo ""
echo "Building TurboDiffusion..."
echo " Compiling CUDA kernels for: sm_80, sm_89, sm_90, sm_120a (Blackwell)"
echo " This may take several minutes..."
echo ""

# Clean previous builds if requested
if [ "$1" == "--clean" ]; then
echo "Cleaning previous builds..."
rm -rf build/ dist/ *.egg-info/
find . -name "*.so" -path "*/turbodiffusion/*" -delete 2>/dev/null || true
fi

pip install -e . --no-build-isolation 2>&1 | tee build.log
# `tee` hides pip's exit status from `set -e`; check the first pipeline stage explicitly
if [ "${PIPESTATUS[0]}" -ne 0 ]; then
    echo "❌ Build failed - see build.log for details"
    exit 1
fi

# =============================================================================
# Create Module Symlinks (for inference scripts)
# =============================================================================
echo ""
echo "Creating module symlinks..."

# The inference scripts import from top-level (e.g., 'from imaginaire.utils.io')
# but modules are inside turbodiffusion/. Create symlinks at repo root.
cd "$SCRIPT_DIR"

for module in imaginaire rcm ops SLA; do
    if [ -d "turbodiffusion/$module" ]; then
        if [ ! -L "$module" ]; then
            ln -sf "turbodiffusion/$module" "$module"
            echo " Created symlink: $module -> turbodiffusion/$module"
        else
            echo " Symlink exists: $module"
        fi
    fi
done

# Verify symlinks work
python -c "
import sys
sys.path.insert(0, '.')
from imaginaire.utils.io import save_image_or_video
from rcm.datasets.utils import VIDEO_RES_SIZE_INFO
from ops import FastLayerNorm, FastRMSNorm, Int8Linear
from SLA import SparseLinearAttention, SageSparseLinearAttention
print('✅ All module imports working')
" || echo "⚠️ Some imports failed - check symlinks"

# =============================================================================
# Install SpargeAttn (Sparse Attention for efficiency)
# =============================================================================
echo ""
echo "Installing SpargeAttn..."

# Get GPU compute capability
GPU_ARCH=$(python -c "import torch; cc = torch.cuda.get_device_capability(); print(f'{cc[0]}.{cc[1]}')" 2>/dev/null || echo "8.0")
echo " Detected GPU compute capability: $GPU_ARCH"

# Clone, patch for Blackwell (sm_120) if needed, and install
SPARGE_TMP="/tmp/SpargeAttn_build_$$"
rm -rf "$SPARGE_TMP"
git clone --depth 1 https://github.com/thu-ml/SpargeAttn.git "$SPARGE_TMP"

# Add sm_120 (Blackwell) support if not already present
if grep -q '"12.0"' "$SPARGE_TMP/setup.py"; then
    echo " SpargeAttn already supports sm_120"
else
    echo " Patching SpargeAttn for Blackwell (sm_120) support..."
    sed -i 's/SUPPORTED_ARCHS = {"8.0", "8.6", "8.7", "8.9", "9.0"}/SUPPORTED_ARCHS = {"8.0", "8.6", "8.7", "8.9", "9.0", "12.0"}/' "$SPARGE_TMP/setup.py"
fi

cd "$SPARGE_TMP"
# Install non-editable: the temp clone is removed below, and an editable (-e)
# install would leave the package pointing at a deleted directory
TORCH_CUDA_ARCH_LIST="$GPU_ARCH" pip install . --no-build-isolation
cd "$SCRIPT_DIR"
rm -rf "$SPARGE_TMP"

# =============================================================================
# Verify Installation
# =============================================================================
echo ""
echo "Verifying installation..."

python -c "
import torch
import turbo_diffusion_ops
print('✅ turbo_diffusion_ops loaded')
print(' Available ops:', [x for x in dir(turbo_diffusion_ops) if not x.startswith('_')])

try:
    import spas_sage_attn
    print('✅ SpargeAttn (spas_sage_attn) loaded')
except ImportError:
    print('⚠️ SpargeAttn not available (optional)')

print()
print('GPU Info:')
if torch.cuda.is_available():
    print(f' Device: {torch.cuda.get_device_name(0)}')
    print(f' VRAM: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB')
    print(f' Compute Capability: {torch.cuda.get_device_capability(0)}')
"

echo ""
echo "=============================================="
echo "✅ Installation complete!"
echo "=============================================="
echo ""
echo "Usage:"
echo " conda activate $ENV_NAME"
echo " python -c 'import turbodiffusion'"
echo ""
echo "To run the TUI server:"
echo " python -m turbodiffusion.tui_serve"
echo ""