diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 055a0cbb..adcfbcc5 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.3.0 +current_version = 0.3.1 commit = True tag = True tag_name = v{new_version} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index d00df528..1312ca37 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -200,33 +200,37 @@ jobs: run: | python -c "import torch; print(f'CUDA available in build: {torch.cuda.is_available()}'); print(f'CUDA version: {torch.version.cuda}')" - - name: Build CUDA server binary + - name: Build CUDA server binary (onedir) shell: bash working-directory: backend run: python build_binary.py --cuda - - name: Split binary for GitHub Releases + - name: Package into server core + CUDA libs archives shell: bash run: | - python scripts/split_binary.py \ - backend/dist/voicebox-server-cuda.exe \ - --output release-assets/ + python scripts/package_cuda.py \ + backend/dist/voicebox-server-cuda/ \ + --output release-assets/ \ + --cuda-libs-version cu126-v1 \ + --torch-compat ">=2.6.0,<2.11.0" - - name: Upload split parts to GitHub Release + - name: Upload archives to GitHub Release if: startsWith(github.ref, 'refs/tags/') - uses: softprops/action-gh-release@v1 + uses: softprops/action-gh-release@v2 with: files: | - release-assets/voicebox-server-cuda.part*.exe - release-assets/voicebox-server-cuda.sha256 - release-assets/voicebox-server-cuda.manifest + release-assets/voicebox-server-cuda.tar.gz + release-assets/voicebox-server-cuda.tar.gz.sha256 + release-assets/cuda-libs-cu126-v1.tar.gz + release-assets/cuda-libs-cu126-v1.tar.gz.sha256 + release-assets/cuda-libs.json draft: true env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Upload binary as workflow artifact + - name: Upload onedir as workflow artifact uses: actions/upload-artifact@v4 with: name: voicebox-server-cuda-windows - path: 
backend/dist/voicebox-server-cuda.exe + path: backend/dist/voicebox-server-cuda/ retention-days: 7 diff --git a/.gitignore b/.gitignore index e9755251..130a7aa1 100644 --- a/.gitignore +++ b/.gitignore @@ -53,6 +53,12 @@ tauri/src-tauri/gen/Assets.car tauri/src-tauri/gen/voicebox.icns tauri/src-tauri/gen/partial.plist +# PyInstaller +*.spec + +# Windows artifacts +nul + # Temporary tmp/ temp/ diff --git a/app/package.json b/app/package.json index 55a182e5..0ed030fb 100644 --- a/app/package.json +++ b/app/package.json @@ -1,6 +1,6 @@ { "name": "@voicebox/app", - "version": "0.3.0", + "version": "0.3.1", "private": true, "type": "module", "scripts": { diff --git a/backend/__init__.py b/backend/__init__.py index 108ff87a..1a7dfe5d 100644 --- a/backend/__init__.py +++ b/backend/__init__.py @@ -1,3 +1,3 @@ # Backend package -__version__ = "0.3.0" +__version__ = "0.3.1" diff --git a/backend/build_binary.py b/backend/build_binary.py index 0574894f..c2b4b510 100644 --- a/backend/build_binary.py +++ b/backend/build_binary.py @@ -34,9 +34,15 @@ def build_server(cuda=False): binary_name = "voicebox-server-cuda" if cuda else "voicebox-server" # PyInstaller arguments + # CUDA builds use --onedir so we can split the output into two archives: + # 1. Server core (~200-400MB) — versioned with the app + # 2. CUDA libs (~2GB) — versioned independently (only redownloaded on + # CUDA toolkit / torch major version changes) + # CPU builds remain --onefile for simplicity. + pack_mode = "--onedir" if cuda else "--onefile" args = [ "server.py", # Use server.py as entry point instead of main.py - "--onefile", + pack_mode, "--name", binary_name, ] diff --git a/backend/services/cuda.py b/backend/services/cuda.py index 7e4d9602..6e5420f3 100644 --- a/backend/services/cuda.py +++ b/backend/services/cuda.py @@ -1,16 +1,22 @@ """ -CUDA backend binary download, assembly, and verification. +CUDA backend download, assembly, and verification. 
-Downloads split parts of the CUDA-enabled voicebox-server binary from -GitHub Releases, reassembles them, verifies integrity via SHA-256, -and places the binary in the app's data directory for use on next -backend restart. +Downloads two archives from GitHub Releases: + 1. Server core (voicebox-server-cuda.tar.gz) — the exe + non-NVIDIA deps, + versioned with the app. + 2. CUDA libs (cuda-libs-{version}.tar.gz) — NVIDIA runtime libraries, + versioned independently (only redownloaded on CUDA toolkit bump). + +Both archives are extracted into {data_dir}/backends/cuda/ which forms the +complete PyInstaller --onedir directory structure that torch expects. """ import hashlib +import json import logging import os import sys +import tarfile from pathlib import Path from typing import Optional @@ -24,6 +30,10 @@ PROGRESS_KEY = "cuda-backend" +# The current expected CUDA libs version. Bump this when we change the +# CUDA toolkit version or torch's CUDA dependency changes (e.g. cu126 -> cu128). +CUDA_LIBS_VERSION = "cu126-v1" + def get_backends_dir() -> Path: """Directory where downloaded backend binaries are stored.""" @@ -32,21 +42,46 @@ def get_backends_dir() -> Path: return d -def get_cuda_binary_name() -> str: - """Platform-specific CUDA binary filename.""" +def get_cuda_dir() -> Path: + """Directory where the CUDA backend (onedir) is extracted.""" + d = get_backends_dir() / "cuda" + d.mkdir(parents=True, exist_ok=True) + return d + + +def get_cuda_exe_name() -> str: + """Platform-specific CUDA executable filename.""" if sys.platform == "win32": return "voicebox-server-cuda.exe" return "voicebox-server-cuda" def get_cuda_binary_path() -> Optional[Path]: - """Return path to CUDA binary if it exists.""" - p = get_backends_dir() / get_cuda_binary_name() + """Return path to the CUDA executable if it exists inside the onedir.""" + p = get_cuda_dir() / get_cuda_exe_name() if p.exists(): return p return None +def get_cuda_libs_manifest_path() -> Path: + """Path to the 
cuda-libs.json manifest inside the CUDA dir.""" + return get_cuda_dir() / "cuda-libs.json" + + +def get_installed_cuda_libs_version() -> Optional[str]: + """Read the installed CUDA libs version from cuda-libs.json, or None.""" + manifest_path = get_cuda_libs_manifest_path() + if not manifest_path.exists(): + return None + try: + data = json.loads(manifest_path.read_text()) + return data.get("version") + except Exception as e: + logger.warning(f"Could not read cuda-libs.json: {e}") + return None + + def is_cuda_active() -> bool: """Check if the current process is the CUDA binary. @@ -60,25 +95,151 @@ def get_cuda_status() -> dict: progress_manager = get_progress_manager() cuda_path = get_cuda_binary_path() progress = progress_manager.get_progress(PROGRESS_KEY) + cuda_libs_version = get_installed_cuda_libs_version() return { "available": cuda_path is not None, "active": is_cuda_active(), "binary_path": str(cuda_path) if cuda_path else None, + "cuda_libs_version": cuda_libs_version, "downloading": progress is not None and progress.get("status") == "downloading", "download_progress": progress, } +def _needs_server_download(version: Optional[str] = None) -> bool: + """Check if the server core archive needs to be (re)downloaded.""" + cuda_path = get_cuda_binary_path() + if not cuda_path: + return True + # Check if the binary version matches the expected app version + installed = get_cuda_binary_version() + expected = version or __version__ + if expected.startswith("v"): + expected = expected[1:] + return installed != expected + + +def _needs_cuda_libs_download() -> bool: + """Check if the CUDA libs archive needs to be (re)downloaded.""" + installed = get_installed_cuda_libs_version() + if installed is None: + return True + return installed != CUDA_LIBS_VERSION + + +async def _download_and_extract_archive( + client, + url: str, + sha256_url: Optional[str], + dest_dir: Path, + label: str, + progress_offset: int, + total_size: int, +): + """Download a .tar.gz archive and 
extract it into dest_dir. + + Args: + client: httpx.AsyncClient + url: URL of the .tar.gz archive + sha256_url: URL of the .sha256 checksum file (optional) + dest_dir: Directory to extract into + label: Human-readable label for progress updates + progress_offset: Byte offset for progress reporting (when downloading + multiple archives sequentially) + total_size: Total bytes across all downloads (for progress bar) + """ + progress = get_progress_manager() + temp_path = dest_dir / f".download-{label.replace(' ', '-')}.tmp" + + # Clean up leftover partial download + if temp_path.exists(): + temp_path.unlink() + + # Fetch expected checksum (fail-fast: never extract an unverified archive) + expected_sha = None + if sha256_url: + try: + sha_resp = await client.get(sha256_url) + sha_resp.raise_for_status() + expected_sha = sha_resp.text.strip().split()[0] + logger.info(f"{label}: expected SHA-256: {expected_sha[:16]}...") + except Exception as e: + raise RuntimeError(f"{label}: failed to fetch checksum from {sha256_url}") from e + + # Stream download, verify, and extract — always clean up temp file + downloaded = 0 + try: + async with client.stream("GET", url) as response: + response.raise_for_status() + with open(temp_path, "wb") as f: + async for chunk in response.aiter_bytes(chunk_size=1024 * 1024): + f.write(chunk) + downloaded += len(chunk) + progress.update_progress( + PROGRESS_KEY, + current=progress_offset + downloaded, + total=total_size, + filename=f"Downloading {label}", + status="downloading", + ) + + # Verify integrity + if expected_sha: + progress.update_progress( + PROGRESS_KEY, + current=progress_offset + downloaded, + total=total_size, + filename=f"Verifying {label}...", + status="downloading", + ) + sha256 = hashlib.sha256() + with open(temp_path, "rb") as f: + while True: + data = f.read(1024 * 1024) + if not data: + break + sha256.update(data) + actual = sha256.hexdigest() + if actual != expected_sha: + raise ValueError( + f"{label} integrity check 
failed: expected {expected_sha[:16]}..., got {actual[:16]}..." + ) + logger.info(f"{label}: integrity verified") + + # Extract (use data filter for path traversal protection on Python 3.12+) + progress.update_progress( + PROGRESS_KEY, + current=progress_offset + downloaded, + total=total_size, + filename=f"Extracting {label}...", + status="downloading", + ) + with tarfile.open(temp_path, "r:gz") as tar: + if sys.version_info >= (3, 12): + tar.extractall(path=dest_dir, filter="data") + else: + tar.extractall(path=dest_dir) + + logger.info(f"{label}: extracted to {dest_dir}") + finally: + if temp_path.exists(): + temp_path.unlink() + return downloaded + + async def download_cuda_binary(version: Optional[str] = None): - """Download the CUDA backend binary from GitHub Releases. + """Download the CUDA backend (server core + CUDA libs if needed). + + Downloads both archives from GitHub Releases, extracts them into + {data_dir}/backends/cuda/, and writes the cuda-libs.json manifest. - Downloads split parts listed in a manifest file, concatenates them, - and verifies the SHA-256 checksum for integrity. Atomic write - (temp file -> rename). + Only downloads what's needed: + - Server core: always redownloaded (versioned with app) + - CUDA libs: only if missing or version mismatch Args: - version: Version tag (e.g. "v0.2.0"). Defaults to current app version. + version: Version tag (e.g. "v0.3.0"). Defaults to current app version. 
""" import httpx @@ -86,114 +247,91 @@ async def download_cuda_binary(version: Optional[str] = None): version = f"v{__version__}" progress = get_progress_manager() - binary_name = get_cuda_binary_name() - dest_dir = get_backends_dir() - final_path = dest_dir / binary_name - temp_path = dest_dir / f"{binary_name}.download" + cuda_dir = get_cuda_dir() - # Clean up any leftover partial download - if temp_path.exists(): - temp_path.unlink() + need_server = _needs_server_download(version) + need_libs = _needs_cuda_libs_download() - logger.info(f"Starting CUDA backend download for {version}") + if not need_server and not need_libs: + logger.info("CUDA backend is up to date, nothing to download") + return + + logger.info( + f"Starting CUDA backend download for {version} " + f"(server={'yes' if need_server else 'cached'}, " + f"libs={'yes' if need_libs else 'cached'})" + ) progress.update_progress( - PROGRESS_KEY, current=0, total=0, - filename="Fetching manifest...", status="downloading", + PROGRESS_KEY, + current=0, + total=0, + filename="Preparing download...", + status="downloading", ) base_url = f"{GITHUB_RELEASES_URL}/{version}" - stem = Path(binary_name).stem # voicebox-server-cuda + server_archive = "voicebox-server-cuda.tar.gz" + libs_archive = f"cuda-libs-{CUDA_LIBS_VERSION}.tar.gz" try: async with httpx.AsyncClient(follow_redirects=True, timeout=30.0) as client: - # Fetch the manifest (list of split part filenames) - manifest_url = f"{base_url}/{stem}.manifest" - manifest_resp = await client.get(manifest_url) - manifest_resp.raise_for_status() - parts = [p.strip() for p in manifest_resp.text.strip().splitlines() if p.strip()] - - if not parts: - raise ValueError("Empty manifest — no split parts found") - - logger.info(f"Found {len(parts)} split parts to download") - - # Fetch expected checksum (optional — for integrity verification) - expected_sha = None - try: - sha_url = f"{base_url}/{stem}.sha256" - sha_resp = await client.get(sha_url) - if 
sha_resp.status_code == 200: - # Format: "sha256hex filename\n" - expected_sha = sha_resp.text.strip().split()[0] - logger.info(f"Expected SHA-256: {expected_sha[:16]}...") - except Exception as e: - logger.warning(f"Could not fetch checksum file — skipping verification: {e}") - - # Get total size across all parts by issuing HEAD requests + # Estimate total download size total_size = 0 - for part_name in parts: + if need_server: try: - head_resp = await client.head(f"{base_url}/{part_name}") - content_length = int(head_resp.headers.get("content-length", 0)) - total_size += content_length + head = await client.head(f"{base_url}/{server_archive}") + total_size += int(head.headers.get("content-length", 0)) + except Exception: + pass + if need_libs: + try: + head = await client.head(f"{base_url}/{libs_archive}") + total_size += int(head.headers.get("content-length", 0)) except Exception: pass - logger.info(f"Total download size: {total_size / 1024 / 1024:.1f} MB") - # Download and concatenate parts - total_downloaded = 0 - with open(temp_path, "wb") as f: - for i, part_name in enumerate(parts): - part_url = f"{base_url}/{part_name}" - logger.info(f"Downloading part {i + 1}/{len(parts)}: {part_name}") - - async with client.stream("GET", part_url) as response: - response.raise_for_status() - async for chunk in response.aiter_bytes(chunk_size=1024 * 1024): - f.write(chunk) - total_downloaded += len(chunk) - progress.update_progress( - PROGRESS_KEY, current=total_downloaded, total=total_size, - filename=f"Downloading CUDA backend ({i + 1}/{len(parts)})", - status="downloading", - ) - - # Verify integrity if checksum was available - if expected_sha: - progress.update_progress( - PROGRESS_KEY, current=total_downloaded, total=total_downloaded, - filename="Verifying integrity...", status="downloading", - ) - sha256 = hashlib.sha256() - with open(temp_path, "rb") as f: - while True: - chunk = f.read(1024 * 1024) - if not chunk: - break - sha256.update(chunk) + 
logger.info(f"Total download size: {total_size / 1024 / 1024:.1f} MB") - actual = sha256.hexdigest() - if actual != expected_sha: - raise ValueError( - f"Integrity check failed: expected {expected_sha[:16]}..., " - f"got {actual[:16]}..." + offset = 0 + + # Download server core + if need_server: + server_downloaded = await _download_and_extract_archive( + client, + url=f"{base_url}/{server_archive}", + sha256_url=f"{base_url}/{server_archive}.sha256", + dest_dir=cuda_dir, + label="CUDA server", + progress_offset=offset, + total_size=total_size, + ) + offset += server_downloaded + + # Make executable on Unix + exe_path = cuda_dir / get_cuda_exe_name() + if sys.platform != "win32" and exe_path.exists(): + exe_path.chmod(0o755) + + # Download CUDA libs + if need_libs: + await _download_and_extract_archive( + client, + url=f"{base_url}/{libs_archive}", + sha256_url=f"{base_url}/{libs_archive}.sha256", + dest_dir=cuda_dir, + label="CUDA libraries", + progress_offset=offset, + total_size=total_size, ) - logger.info(f"Integrity verified: {actual[:16]}...") - - # Atomic move into place (replace handles existing target on all platforms) - temp_path.replace(final_path) - # Make executable on Unix - if sys.platform != "win32": - final_path.chmod(0o755) + # Write local cuda-libs.json manifest + manifest = {"version": CUDA_LIBS_VERSION} + get_cuda_libs_manifest_path().write_text(json.dumps(manifest, indent=2) + "\n") - logger.info(f"CUDA backend downloaded to {final_path}") + logger.info(f"CUDA backend ready at {cuda_dir}") progress.mark_complete(PROGRESS_KEY) except Exception as e: - # Clean up on failure - if temp_path.exists(): - temp_path.unlink() logger.error(f"CUDA backend download failed: {e}") progress.mark_error(PROGRESS_KEY, str(e)) raise @@ -202,15 +340,19 @@ async def download_cuda_binary(version: Optional[str] = None): def get_cuda_binary_version() -> Optional[str]: """Get the version of the installed CUDA binary, or None if not installed.""" import subprocess + 
cuda_path = get_cuda_binary_path() if not cuda_path: return None try: result = subprocess.run( [str(cuda_path), "--version"], - capture_output=True, text=True, timeout=30, + capture_output=True, + text=True, + timeout=30, + cwd=str(cuda_path.parent), # Run from the onedir directory ) - # Output format: "voicebox-server 0.2.0" + # Output format: "voicebox-server 0.3.0" for line in result.stdout.strip().splitlines(): if "voicebox-server" in line: return line.split()[-1] @@ -222,26 +364,29 @@ def get_cuda_binary_version() -> Optional[str]: async def check_and_update_cuda_binary(): """Check if the CUDA binary is outdated and auto-download if so. - Called on server startup. If a CUDA binary exists but its version - doesn't match the current app version, triggers a background download - of the updated CUDA binary. The download progress is visible to the - frontend via the existing SSE progress endpoint. + Called on server startup. Checks both server version and CUDA libs + version. Downloads only what's needed. """ cuda_path = get_cuda_binary_path() if not cuda_path: return # No CUDA binary installed, nothing to update - cuda_version = get_cuda_binary_version() - current_version = __version__ + need_server = _needs_server_download() + need_libs = _needs_cuda_libs_download() - if cuda_version == current_version: - logger.info(f"CUDA binary is up to date (v{current_version})") + if not need_server and not need_libs: + logger.info(f"CUDA binary is up to date (server=v{__version__}, libs={get_installed_cuda_libs_version()})") return - logger.info( - f"CUDA binary version mismatch: binary=v{cuda_version}, app=v{current_version}. " - f"Auto-downloading updated CUDA backend..." 
- ) + reasons = [] + if need_server: + cuda_version = get_cuda_binary_version() + reasons.append(f"server v{cuda_version} != v{__version__}") + if need_libs: + installed_libs = get_installed_cuda_libs_version() + reasons.append(f"libs {installed_libs} != {CUDA_LIBS_VERSION}") + + logger.info(f"CUDA backend needs update ({', '.join(reasons)}). Auto-downloading...") try: await download_cuda_binary() @@ -250,10 +395,12 @@ async def check_and_update_cuda_binary(): async def delete_cuda_binary() -> bool: - """Delete the downloaded CUDA binary. Returns True if deleted.""" - path = get_cuda_binary_path() - if path and path.exists(): - path.unlink() - logger.info(f"Deleted CUDA binary: {path}") + """Delete the downloaded CUDA backend directory. Returns True if deleted.""" + import shutil + + cuda_dir = get_cuda_dir() + if cuda_dir.exists() and any(cuda_dir.iterdir()): + shutil.rmtree(cuda_dir) + logger.info(f"Deleted CUDA backend directory: {cuda_dir}") return True return False diff --git a/docs/content/docs/developer/building.mdx b/docs/content/docs/developer/building.mdx index 0b3593ad..5ea6c747 100644 --- a/docs/content/docs/developer/building.mdx +++ b/docs/content/docs/developer/building.mdx @@ -159,12 +159,14 @@ Tauri looks for `voicebox-server-${PLATFORM}` in `src-tauri/binaries/` and bundl The `build-cuda-windows` job runs separately: -1. Install PyTorch with CUDA 12.1 -2. Build with `build_binary.py --cuda` -3. Split binary with `scripts/split_binary.py` -4. Upload parts as release artifacts - -This binary is downloaded on-demand by users who enable CUDA in settings. +1. Install PyTorch with CUDA 12.6 +2. Build with `build_binary.py --cuda` (produces `--onedir` output) +3. Package with `scripts/package_cuda.py` into two archives: + - `voicebox-server-cuda.tar.gz` — server core (~945 MB) + - `cuda-libs-cu126-v1.tar.gz` — NVIDIA runtime libraries (~1.7 GB, cached independently) +4. 
Upload archives as release artifacts + +This binary is downloaded on-demand by users who enable CUDA in settings. The CUDA libs archive is only re-downloaded when the CUDA toolkit version changes, not on every app update. ## Troubleshooting diff --git a/docs/content/docs/developer/tts-engines.mdx b/docs/content/docs/developer/tts-engines.mdx index 90135a37..dc749b02 100644 --- a/docs/content/docs/developer/tts-engines.mdx +++ b/docs/content/docs/developer/tts-engines.mdx @@ -80,6 +80,15 @@ grep -r 'token=True\|token=os.getenv' . # Float64/Float32 assumptions — librosa returns float64, many models assume float32 grep -r "torch.from_numpy\|\.double()\|float64" . + +# @torch.jit.script — calls inspect.getsource(), crashes in frozen builds +grep -r "@torch.jit.script\|torch.jit.script" . + +# torchaudio.load — requires torchcodec in torchaudio 2.10+, use soundfile.read() instead +grep -r "torchaudio.load\|torchaudio.save" . + +# Gated HuggingFace repos — models that hardcode gated repos as tokenizer/config sources +grep -r "from_pretrained\|tokenizer_name\|AutoTokenizer" . | grep -i "llama\|meta-llama\|gated" ``` ### 0.3 Install and Trace in a Throwaway Venv @@ -270,6 +279,8 @@ In `app/src/lib/hooks/useGenerationForm.ts`: - Add engine-to-model-name mapping - Update payload construction for engine-specific fields +**Watch out for model naming inconsistencies.** The HuggingFace repo name, the model size label, and the API model name don't always follow predictable patterns. For example, TADA's 3B model is named `tada-3b-ml` (not `tada-3b`), because it's a multilingual variant. Always check the actual repo names and build the frontend model name mapping from those, not from assumptions like `{engine}-{size}`. + ### 3.5 Model Management In `app/src/components/ServerSettings/ModelManagement.tsx`: @@ -391,6 +402,7 @@ These are actual production failures from shipping new engines. 
Every one of the | Chatterbox | `FileNotFoundError` for watermark model | `perth` ships pretrained model files (`hparams.yaml`, `.pth.tar`) that PyInstaller doesn't bundle by default | `--collect-all perth` | | All engines | `importlib.metadata` failures | Frozen binary doesn't include package metadata for `huggingface-hub`, `transformers`, etc. | `--copy-metadata` for each affected package | | All engines | Download progress bars stuck at 0% | `huggingface_hub` silently disables tqdm progress bars based on logger level in frozen builds — our progress tracker never receives byte updates | Force-enable tqdm's internal counter in `HFProgressTracker` | +| TADA | `inspect.getsource` error in DAC's `Snake1d` | `@torch.jit.script` calls `inspect.getsource()` which fails without `.py` source files | Wrote a lightweight shim (`dac_shim.py`) reimplementing `Snake1d` without `@torch.jit.script`, registered fake `dac.*` modules in `sys.modules` | | All engines | `NameError: name 'obj' is not defined` on macOS | Python 3.12.0 has a [CPython bug](https://github.com/pyinstaller/pyinstaller/issues/7992) that corrupts bytecode when PyInstaller rewrites code objects | Upgrade to Python 3.12.13+ | | All engines | `resource_tracker` subprocess crash | `multiprocessing` in frozen binaries needs `freeze_support()` called before anything else | Added to `server.py` entry point | @@ -480,6 +492,90 @@ def _get_device(self): return "cpu" # Skip MPS ``` +### Gated HuggingFace repos as hardcoded config sources + +Some models hardcode a gated HuggingFace repo as their tokenizer or config source (e.g., TADA hardcodes `"meta-llama/Llama-3.2-1B"` in both its `AlignerConfig` and `TadaConfig`). This silently fails without HF authentication. 
+ +**Fix:** Download from an ungated mirror and patch the config objects directly: + +```python +# Download tokenizer from ungated mirror +UNGATED_TOKENIZER = "unsloth/Llama-3.2-1B" +tokenizer_path = snapshot_download(UNGATED_TOKENIZER, token=None) + +# Patch the model config to use the local path instead of the gated repo +config = ModelConfig.from_pretrained(model_path) +config.tokenizer_name = tokenizer_path +model = ModelClass.from_pretrained(model_path, config=config) +``` + +**Do NOT monkey-patch `AutoTokenizer.from_pretrained`** — it's a classmethod, and replacing it corrupts the descriptor, which breaks other engines that use different tokenizers (e.g., Qwen uses a Qwen tokenizer via `AutoTokenizer`). Always patch at the config level, not the class method level. + +### `torchaudio.load()` requires `torchcodec` in 2.10+ + +As of `torchaudio>=2.10`, `torchaudio.load()` requires the `torchcodec` package for audio I/O. If your engine or backend code uses `torchaudio.load()`, replace it with `soundfile`: + +```python +# Before (breaks without torchcodec): +import torchaudio +waveform, sr = torchaudio.load("audio.wav") + +# After: +import soundfile as sf +import torch +data, sr = sf.read("audio.wav", dtype="float32") +waveform = torch.from_numpy(data).unsqueeze(0) +``` + +Note: `torchaudio.functional.resample()` and other pure-PyTorch math functions work fine without `torchcodec` — only the I/O functions are affected. + +### `@torch.jit.script` breaks in frozen builds + +`torch.jit.script` calls `inspect.getsource()` to parse the decorated function's source code. In a PyInstaller binary, `.py` source files aren't available, so this crashes at import time. + +**Fix:** Remove or avoid `@torch.jit.script` decorators. If the decorated function comes from an upstream dependency, write a shim that reimplements the function without the decorator (see "Toxic dependency chains" below). 
+ +### Toxic dependency chains — the shim pattern + +Sometimes a model library depends on a package with a massive, hostile transitive dependency tree, but only uses a tiny piece of it. When the dependency chain is unbuildable or would pull in dozens of unwanted packages, the right move is to write a lightweight shim. + +**Example:** TADA depends on `descript-audio-codec` (DAC), which pulls in `descript-audiotools` -> `onnx`, `tensorboard`, `protobuf`, `matplotlib`, `pystoi`, etc. The `onnx` package fails to build from source on macOS. But TADA only uses `Snake1d` from DAC — a 7-line PyTorch module. + +**Solution:** Create a shim at `backend/utils/dac_shim.py` that registers fake modules in `sys.modules`: + +```python +import sys +import types +import torch +from torch import nn + +def snake(x, alpha): + """Snake activation — reimplemented without @torch.jit.script.""" + return x + (1.0 / (alpha + 1e-9)) * torch.sin(alpha * x).pow(2) + +class Snake1d(nn.Module): + def __init__(self, channels): + super().__init__() + self.alpha = nn.Parameter(torch.ones(1, channels, 1)) + def forward(self, x): + return snake(x, self.alpha) + +# Register fake dac.* modules so "from dac.nn.layers import Snake1d" works +_nn = types.ModuleType("dac.nn") +_layers = types.ModuleType("dac.nn.layers") +_layers.Snake1d = Snake1d +_nn.layers = _layers + +for name, mod in [("dac", types.ModuleType("dac")), + ("dac.nn", _nn), ("dac.nn.layers", _layers)]: + sys.modules[name] = mod +``` + +**Key rules for shims:** +- Import the shim **before** importing the model library (so it finds the fake modules first) +- Do NOT use `@torch.jit.script` in the shim (see above) +- Only reimplement what the model actually uses — check the import chain carefully + ## Upcoming Engines Based on the current model landscape, these are candidates for future integration: @@ -490,7 +586,6 @@ Based on the current model landscape, these are candidates for future integratio | **Fish Speech** | 50+ | Medium | Word-level 
control via inline text | Ready | | **Kokoro-82M** | English | 82M | CPU realtime, Apache 2.0 | Ready | | **XTTS-v2** | 17+ | Medium | Zero-shot cloning | Ready | -| **HumeAI TADA** | EN (1B), Multi (3B) | Medium | 700s+ coherent audio, synced transcripts | Shipped | | **MOSS-TTS** | Multilingual | Medium | Text-to-voice design, multi-speaker dialogue | Needs vetting | | **Pocket TTS** | English | ~100M | CPU-first, >1× realtime | Needs vetting | @@ -508,6 +603,10 @@ Use this as a gate between phases. Do not proceed to the next phase until every - [ ] Searched for `torch.load` calls missing `map_location` - [ ] Searched for `torch.from_numpy` without `.float()` cast - [ ] Searched for `token=True` or `token=os.getenv("HF_TOKEN")` in HuggingFace calls +- [ ] Searched for `@torch.jit.script` / `torch.jit.script` (crashes in frozen builds) +- [ ] Searched for `torchaudio.load` / `torchaudio.save` (requires `torchcodec` in 2.10+) +- [ ] Searched for hardcoded gated HuggingFace repo names (e.g., `meta-llama/*`) +- [ ] Evaluated whether any dependency is used minimally enough to shim instead of install - [ ] Tested model loading and generation on CPU in a throwaway venv - [ ] Tested with a clean HuggingFace cache (no pre-downloaded models) - [ ] Produced a written dependency audit documenting all findings diff --git a/docs/plans/CUDA_LIBS_ADDON.md b/docs/plans/CUDA_LIBS_ADDON.md new file mode 100644 index 00000000..28cbe1b6 --- /dev/null +++ b/docs/plans/CUDA_LIBS_ADDON.md @@ -0,0 +1,173 @@ +# CUDA Libs as a Bolt-On Addon + +## Problem + +Every time we bump `__version__` (even for a UI tweak or bugfix), the exact-match version check in both `main.rs:222` and `cuda.py:237` invalidates the user's ~2.4GB CUDA binary, forcing a full redownload. The CUDA binary is the entire server rebuilt with NVIDIA libs included -- there's no separation between app logic and the CUDA runtime. 
+ +## Why This Is Hard With `--onefile` + +The core tension is PyInstaller `--onefile` mode (`build_binary.py:39`). In onefile mode, everything -- Python code, all dependencies, torch, the NVIDIA `.dll`/`.so` files -- gets packed into a single self-extracting archive. There's no concept of "swap out one part." The binary IS the server. + +## Options + +### Option A: Switch to `--onedir` for the CUDA Build (Recommended) + +Instead of `--onefile`, build the CUDA variant as a directory (a folder with the exe + all the shared libs alongside it). Then split the distribution into two archives: + +1. **`voicebox-server-cuda` executable + non-NVIDIA deps** (~200-400MB) -- versioned with the app, redownloaded on every app update. +2. **`cuda-libs-cu126.tar.gz`** (~2GB) -- the `nvidia.*` packages (cublas, cudnn, cuda_runtime, etc.), versioned independently (e.g., `cuda-libs-cu126-v1`). Only redownloaded when we bump the CUDA toolkit version or torch's CUDA dependency changes. + +#### How it would work at runtime + +- Tauri downloads the server binary archive and extracts it to `{data_dir}/backends/cuda/` +- On first CUDA setup (or when cuda-libs version bumps), downloads and extracts the libs archive into the same directory +- The CUDA server exe finds the `.dll`/`.so` files next to it (standard PyInstaller onedir behavior) +- Version check becomes two checks: server version + cuda-libs version + +#### Independent versioning + +Add a `cuda-libs.json` manifest: + +```json +{"version": "cu126-v1", "torch_compat": ">=2.6.0,<2.8.0"} +``` + +The server checks this on startup. The Tauri side checks it before launching. Only bump `cu126-v1` -> `cu126-v2` when we actually change the CUDA toolkit or torch major version. + +#### Build pipeline changes + +The CI `build-cuda-windows` job would build with `--onedir`, then separate the output into two archives. The CUDA libs archive could be built less frequently (only when torch/CUDA version changes) and stored as a pinned release asset. 
+ +#### Download experience + +- First-time CUDA setup: ~2.4GB total (same as today) +- Subsequent app updates: ~200-400MB for the server, CUDA libs stay cached +- CUDA toolkit bump: ~2GB for just the libs + +#### Pros + +- PyInstaller `--onedir` natively produces this structure -- NVIDIA DLLs end up as discrete files in the output directory +- The separation is natural: PyInstaller puts torch's NVIDIA deps in predictable paths (`nvidia/cublas/lib/`, etc.) +- CUDA libs are highly stable -- only rebundle when changing CUDA toolkit version (e.g., cu126 -> cu128) or major torch version +- Server updates become ~200-400MB instead of ~2.4GB +- No library path hacking needed -- torch finds NVIDIA DLLs because they're in the same directory tree + +#### Cons + +- Onedir means a folder with hundreds of files instead of a single exe -- more complex to manage, extract, and clean up +- Need to modify download/assembly logic in `cuda.py` to handle two separate archives +- The Tauri side (`main.rs`) needs to point at an exe inside a directory rather than a standalone binary +- Users who manually manage the file may find the folder structure confusing + +#### TTS engine compatibility + +No issues. The TTS engines are pure Python + torch. They don't care whether NVIDIA libs are inside the binary or sitting next to it -- torch's dynamic loader finds them either way. + +--- + +### Option B: Keep `--onefile` but Externalize CUDA Libs via Library Path + +Keep the server as a single `--onefile` binary (with NVIDIA packages excluded, same as the CPU build). Ship the CUDA libs as a separate download that gets extracted to `{data_dir}/backends/cuda-libs/`. Before launching, set the library search path to include that directory. + +**Important caveat:** The CPU torch wheel (`whl/cpu`) doesn't have CUDA kernels compiled in -- it's a fundamentally different build. So the binary would need to be built with CUDA-compiled torch but with the NVIDIA runtime libraries excluded. 
The runtime libs (cublas, cudnn, etc.) would be provided externally. + +#### How it would work + +- Build ONE "CUDA-ready" server binary with CUDA-compiled torch but NVIDIA runtime packages excluded +- Ship `cuda-libs-cu126-v1.tar.gz` separately (~2GB of `.dll`/`.so` files) +- When launching, Tauri sets `PATH` (Windows) or `LD_LIBRARY_PATH` (Linux) to include the cuda-libs directory + +#### Pros + +- Single server binary for both CPU and CUDA users -- simplifies build pipeline enormously +- True bolt-on CUDA libs with fully independent versioning +- Server updates are always small (~150MB for the onefile binary) + +#### Cons + +- **Fragile on Windows.** PyInstaller `--onefile` extracts to a temp directory at runtime and the internal torch may not find externally-placed NVIDIA libs. DLL resolution on Windows is notoriously unreliable in this scenario. +- `os.add_dll_directory()` only affects `LoadLibraryEx` with `LOAD_LIBRARY_SEARCH_USER_DIRS` flag -- not all DLL loads go through this path +- PyInstaller's onefile bootloader may configure DLL search paths before Python code runs +- Could work on Linux but is fragile on Windows + +--- + +### Option C: Hybrid -- `--onefile` Server + Dynamic CUDA Lib Loading at Runtime + +Build the server as `--onefile` with CUDA-compiled torch but with NVIDIA packages excluded. At startup, before torch initializes CUDA, explicitly load the NVIDIA shared libraries using `ctypes.CDLL` or `os.add_dll_directory()`. 
+
+In `server.py`, before any torch imports:
+
+```python
+cuda_libs_dir = os.environ.get("VOICEBOX_CUDA_LIBS")
+if cuda_libs_dir and os.path.isdir(cuda_libs_dir):
+    if sys.platform == "win32":
+        os.add_dll_directory(cuda_libs_dir)
+        os.environ["PATH"] = cuda_libs_dir + os.pathsep + os.environ.get("PATH", "")
+    else:
+        os.environ["LD_LIBRARY_PATH"] = cuda_libs_dir + ":" + os.environ.get("LD_LIBRARY_PATH", "")
+```
+
+#### Pros
+
+- Single server binary, true bolt-on CUDA libs
+- Clean separation of concerns
+- Independent versioning
+
+#### Cons
+
+- Needs careful testing with each torch version -- CUDA initialization happens deep in C++ extension layer
+- On Windows, `os.add_dll_directory()` may not cover all DLL load paths; on Linux, mutating `LD_LIBRARY_PATH` from inside the already-running process has no effect (the dynamic loader reads it once at exec), so the `.so` files would have to be preloaded explicitly via `ctypes.CDLL` instead
+- PyInstaller's onefile bootloader may have already configured DLL search paths before Python code runs
+- Most complex to get right and maintain
+
+## Recommendation
+
+**Option A (`--onedir` with split archives)** is the most reliable path:
+
+1. **It actually works.** `--onedir` puts all files on disk as regular files. Torch finds NVIDIA DLLs because they're in the same directory tree, exactly as they would be in a normal pip install.
+2. **Natural separation.** PyInstaller's `--onedir` output already separates the NVIDIA `.dll`/`.so` files into `nvidia/` subdirectories. We can split the output directory into "core" and "nvidia-libs" archives after building.
+3. **Independent versioning is straightforward.** A `cuda-libs.json` manifest controls when redownloads are needed.
+4. **Build pipeline simplification.** Build CUDA libs archive less frequently, store as a pinned release asset.
+
+The main cost is managing a directory instead of a single file, but we already have sophisticated download/assembly infrastructure in `cuda.py` with manifests and split parts. Extending that to handle two archives is incremental work.
+
+## Tauri Compatibility (Validated)
+
+Tauri handles PyInstaller `--onedir` with no issues.
The key insight is that we're **not** using a static sidecar for CUDA -- we're downloading and extracting at runtime (the existing `cuda.py` + `main.rs` flow). For runtime-launched processes, Tauri's `tauri::shell::Command` supports arbitrary directories natively. + +### The critical change in `main.rs` + +The only Tauri-side change needed is adding `.current_dir()` when spawning the CUDA backend: + +```rust +let cuda_dir = data_dir.join("backends/cuda"); +let exe_path = cuda_dir.join("voicebox-server-cuda.exe"); + +let mut cmd = app.shell().command(exe_path.to_str().unwrap()); +cmd = cmd.current_dir(&cuda_dir); // PyInstaller finds all DLLs relative to exe +cmd = cmd.args(["--data-dir", &data_dir_str, "--port", &port_str, "--parent-pid", &parent_pid_str]); +``` + +`.current_dir()` tells the PyInstaller bootloader that everything (DLLs, `nvidia/cublas/lib/`, `_internal/`, torch extensions, etc.) lives relative to the exe. Torch finds the NVIDIA libs exactly as it does in a normal `pip install` or dev environment -- no `LD_LIBRARY_PATH` hacks, no `os.add_dll_directory` gymnastics. + +### Community evidence + +- Multiple Tauri users run this exact pattern: Nuitka folders (exe + pythonXX.dll + supporting files), multi-file .NET apps, and PyInstaller onedir backends (GitHub issues #5719, discussion #5206). +- The shell plugin explicitly supports `cwd` in both Rust and JS APIs. +- No reports of torch/CUDA-specific breakage -- the onedir layout is identical to what PyInstaller produces in normal usage. + +### Known gotcha: process termination on Windows + +PyInstaller onedir creates a parent bootloader + child Python process on Windows. `child.kill()` only hits the outer process in some cases (Tauri issue #11686). Mitigation: keep a reference to the parent PID or use `taskkill /F /T` for clean shutdown. This is not a blocker -- our existing `--parent-pid` watchdog mechanism in `server.py` already handles orphan cleanup. + +## Next Steps + +1. 
Prototype: Build the current CUDA binary with `--onedir` and verify torch CUDA works from the output directory +2. Measure the size split: how much is NVIDIA libs vs everything else +3. Design the two-archive download flow and dual version checking +4. Update `cuda.py` for dual-archive extraction (server core + cuda-libs) +5. Update `main.rs`: change launch path to `backends/cuda/` dir + add `.current_dir()` +6. Add `ensure_cuda_structure()` helper in Rust to verify exe + nvidia/ subdirs exist before spawning +7. Update CI pipeline: `build-cuda-windows` produces two archives instead of split parts +8. ~~Update `split_binary.py` or replace with archive-based distribution~~ Done: replaced with `package_cuda.py` diff --git a/justfile b/justfile index fd8bf962..a1d40133 100644 --- a/justfile +++ b/justfile @@ -208,10 +208,11 @@ build-server-cuda: _ensure-venv $env:PATH = "{{ venv_bin }};$env:PATH"; \ & "{{ python }}" backend/build_binary.py --cuda; \ if ($LASTEXITCODE -ne 0) { throw "build_binary.py --cuda failed with exit code $LASTEXITCODE" }; \ - $dest = "$env:APPDATA/com.voicebox.app/backends"; \ + $dest = "$env:APPDATA/sh.voicebox.app/backends/cuda"; \ + if (Test-Path $dest) { Remove-Item -Recurse -Force $dest }; \ New-Item -ItemType Directory -Path $dest -Force | Out-Null; \ - Copy-Item "backend/dist/voicebox-server-cuda.exe" "$dest/voicebox-server-cuda.exe" -Force; \ - Write-Host "Copied CUDA binary to $dest" + Copy-Item "backend/dist/voicebox-server-cuda/*" $dest -Recurse -Force; \ + Write-Host "Copied CUDA backend to $dest" # Build everything locally: CPU server + CUDA server + installable Tauri app [windows] diff --git a/landing/package.json b/landing/package.json index f9aa7e74..316fc7dd 100644 --- a/landing/package.json +++ b/landing/package.json @@ -1,6 +1,6 @@ { "name": "@voicebox/landing", - "version": "0.3.0", + "version": "0.3.1", "description": "Landing page for voicebox.sh", "scripts": { "dev": "bun --bun next dev --turbo", diff --git a/package.json 
b/package.json index d6d94b43..72b229f5 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "voicebox", - "version": "0.3.0", + "version": "0.3.1", "private": true, "workspaces": [ "app", diff --git a/scripts/package_cuda.py b/scripts/package_cuda.py new file mode 100644 index 00000000..189f531a --- /dev/null +++ b/scripts/package_cuda.py @@ -0,0 +1,232 @@ +""" +Package the PyInstaller --onedir CUDA build into two archives. + +Takes the PyInstaller --onedir output directory and splits it into: + 1. voicebox-server-cuda.tar.gz — server core (exe + non-NVIDIA deps) + 2. cuda-libs-cu126.tar.gz — NVIDIA runtime libraries only + 3. cuda-libs.json — version manifest for the CUDA libs + +Usage: + python scripts/package_cuda.py backend/dist/voicebox-server-cuda/ + python scripts/package_cuda.py backend/dist/voicebox-server-cuda/ --output release-assets/ + python scripts/package_cuda.py backend/dist/voicebox-server-cuda/ --cuda-libs-version cu126-v1 +""" + +import argparse +import hashlib +import json +import sys +import tarfile +from pathlib import Path + +# DLL name prefixes that identify NVIDIA CUDA runtime libraries. +# These DLLs may appear in different locations depending on the torch +# and PyInstaller version: +# - nvidia/ subdirectories (older torch with separate nvidia-* packages) +# - _internal/torch/lib/ (torch 2.10+ bundles NVIDIA DLLs directly) +# - Top-level directory (some PyInstaller versions) +NVIDIA_DLL_PREFIXES = ( + "cublas", + "cublaslt", + "cudart", + "cudnn", + "cufft", + "cufftw", + "curand", + "cusolver", + "cusolvermg", + "cusparse", + "nvjitlink", + "nvrtc", + "nccl", + "caffe2_nvrtc", +) + +# Files to keep in the server core even if they match NVIDIA prefixes. +# These are small Python modules or stubs, not the large runtime DLLs. 
+NVIDIA_KEEP_IN_CORE = { + "torch/cuda/nccl.py", + "torch/_inductor/codegen/cuda/cutlass_lib_extensions/cutlass_mock_imports/cuda/cudart.py", +} + + +def is_nvidia_file(rel_path: str) -> bool: + """Check if a relative path belongs to the NVIDIA CUDA libs. + + Identifies large NVIDIA runtime DLLs (.dll/.so) regardless of where + PyInstaller placed them. Excludes small Python stubs that happen to + share NVIDIA-related names. + """ + rel_lower = rel_path.lower().replace("\\", "/") + + # Never split out Python source files or small stubs + if rel_lower in NVIDIA_KEEP_IN_CORE: + return False + + # Files under nvidia/ subdirectory tree (older torch layout) + if rel_lower.startswith("nvidia/") or "/nvidia/" in rel_lower: + # Only DLLs/shared objects — not .py, .dist-info, etc. + if rel_lower.endswith((".dll", ".so")): + return True + # Include entire nvidia/ namespace package tree + for part in rel_lower.split("/"): + if part == "nvidia": + return True + + # NVIDIA DLLs anywhere in the tree (e.g. 
_internal/torch/lib/cublas64_12.dll) + name = rel_lower.rsplit("/", 1)[-1] + if name.endswith(".dll") or name.endswith(".so"): + name_no_ext = name.rsplit(".", 1)[0] + for prefix in NVIDIA_DLL_PREFIXES: + if name_no_ext.startswith(prefix): + return True + + return False + + +def sha256_file(path: Path) -> str: + """Compute SHA-256 hex digest of a file.""" + h = hashlib.sha256() + with open(path, "rb") as f: + while True: + chunk = f.read(1024 * 1024) + if not chunk: + break + h.update(chunk) + return h.hexdigest() + + +def package( + onedir_path: Path, + output_dir: Path, + cuda_libs_version: str, + torch_compat: str, +): + output_dir.mkdir(parents=True, exist_ok=True) + + # Collect all files in the onedir output, split into core vs nvidia + core_files = [] + nvidia_files = [] + + for item in sorted(onedir_path.rglob("*")): + if item.is_dir(): + continue + rel = item.relative_to(onedir_path) + rel_str = str(rel) + if is_nvidia_file(rel_str): + nvidia_files.append((rel_str, item)) + else: + core_files.append((rel_str, item)) + + core_size = sum(f.stat().st_size for _, f in core_files) + nvidia_size = sum(f.stat().st_size for _, f in nvidia_files) + + print(f"Input directory: {onedir_path}") + print(f"Core files: {len(core_files)} ({core_size / (1024**2):.1f} MB)") + print(f"NVIDIA files: {len(nvidia_files)} ({nvidia_size / (1024**2):.1f} MB)") + + if not nvidia_files: + print( + f"ERROR: No NVIDIA files found in {onedir_path}. " + "Refusing to create an empty CUDA libs archive.", + file=sys.stderr, + ) + print( + "Make sure you built with --cuda and the NVIDIA packages are present.", + file=sys.stderr, + ) + sys.exit(1) + + # Create server core archive + # Files are stored relative to the archive root (no parent directory prefix) + # so extracting to backends/cuda/ puts everything at the right level. 
+ server_archive = output_dir / "voicebox-server-cuda.tar.gz" + print(f"\nCreating server core archive: {server_archive.name}") + with tarfile.open(server_archive, "w:gz") as tar: + for rel_str, full_path in core_files: + tar.add(full_path, arcname=rel_str) + server_sha = sha256_file(server_archive) + (output_dir / "voicebox-server-cuda.tar.gz.sha256").write_text( + f"{server_sha} voicebox-server-cuda.tar.gz\n" + ) + print(f" Size: {server_archive.stat().st_size / (1024**2):.1f} MB") + print(f" SHA-256: {server_sha[:16]}...") + + # Create CUDA libs archive + cuda_libs_archive = output_dir / f"cuda-libs-{cuda_libs_version}.tar.gz" + print(f"\nCreating CUDA libs archive: {cuda_libs_archive.name}") + with tarfile.open(cuda_libs_archive, "w:gz") as tar: + for rel_str, full_path in nvidia_files: + tar.add(full_path, arcname=rel_str) + cuda_sha = sha256_file(cuda_libs_archive) + (output_dir / f"cuda-libs-{cuda_libs_version}.tar.gz.sha256").write_text( + f"{cuda_sha} cuda-libs-{cuda_libs_version}.tar.gz\n" + ) + print(f" Size: {cuda_libs_archive.stat().st_size / (1024**2):.1f} MB") + print(f" SHA-256: {cuda_sha[:16]}...") + + # Write cuda-libs.json manifest + manifest = { + "version": cuda_libs_version, + "torch_compat": torch_compat, + "archive": cuda_libs_archive.name, + "sha256": cuda_sha, + } + manifest_path = output_dir / "cuda-libs.json" + manifest_path.write_text(json.dumps(manifest, indent=2) + "\n") + print(f"\nManifest: {manifest_path.name}") + print(json.dumps(manifest, indent=2)) + + # Summary + total_input = core_size + nvidia_size + total_output = server_archive.stat().st_size + cuda_libs_archive.stat().st_size + print(f"\nTotal input: {total_input / (1024**3):.2f} GB") + print(f"Total output: {total_output / (1024**3):.2f} GB (compressed)") + print( + f"Server core: {server_archive.stat().st_size / (1024**2):.1f} MB (redownloaded on app update)" + ) + print( + f"CUDA libs: {cuda_libs_archive.stat().st_size / (1024**2):.1f} MB (cached until CUDA toolkit 
bump)" + ) + + +def main(): + parser = argparse.ArgumentParser( + description="Package PyInstaller --onedir CUDA build into server + CUDA libs archives" + ) + parser.add_argument( + "input", + type=Path, + help="Path to PyInstaller --onedir output directory (e.g. backend/dist/voicebox-server-cuda/)", + ) + parser.add_argument( + "--output", + type=Path, + default=None, + help="Output directory for archives (default: same as input parent)", + ) + parser.add_argument( + "--cuda-libs-version", + type=str, + default="cu126-v1", + help="Version string for the CUDA libs archive (default: cu126-v1)", + ) + parser.add_argument( + "--torch-compat", + type=str, + default=">=2.6.0,<2.11.0", + help="Torch version compatibility range (default: >=2.6.0,<2.11.0)", + ) + args = parser.parse_args() + + if not args.input.is_dir(): + print(f"Error: {args.input} is not a directory", file=sys.stderr) + print("Expected a PyInstaller --onedir output directory.", file=sys.stderr) + sys.exit(1) + + output_dir = args.output or args.input.parent + package(args.input, output_dir, args.cuda_libs_version, args.torch_compat) + + +if __name__ == "__main__": + main() diff --git a/scripts/split_binary.py b/scripts/split_binary.py deleted file mode 100644 index 0310fbd8..00000000 --- a/scripts/split_binary.py +++ /dev/null @@ -1,82 +0,0 @@ -""" -Split a large binary into chunks for GitHub Releases (<2 GB each). - -Usage: - python scripts/split_binary.py backend/dist/voicebox-server-cuda.exe - python scripts/split_binary.py backend/dist/voicebox-server-cuda.exe --chunk-size 1900000000 - python scripts/split_binary.py backend/dist/voicebox-server-cuda.exe --output release-assets/ - -The script produces: - - voicebox-server-cuda.part00.exe, .part01.exe, ... 
(binary chunks) - - voicebox-server-cuda.sha256 (SHA-256 checksum of the complete file) - - voicebox-server-cuda.manifest (ordered list of part filenames) -""" - -import argparse -import hashlib -import sys -from pathlib import Path - - -def split(input_path: Path, chunk_size: int, output_dir: Path): - output_dir.mkdir(parents=True, exist_ok=True) - data = input_path.read_bytes() - total_size = len(data) - - # Write SHA-256 of the complete file - sha256 = hashlib.sha256(data).hexdigest() - checksum_file = output_dir / f"{input_path.stem}.sha256" - checksum_file.write_text(f"{sha256} {input_path.name}\n") - - # Split into chunks - parts = [] - for i in range(0, total_size, chunk_size): - part_index = len(parts) - part_name = f"{input_path.stem}.part{part_index:02d}{input_path.suffix}" - part_path = output_dir / part_name - part_path.write_bytes(data[i:i + chunk_size]) - parts.append(part_name) - - # Write manifest (ordered list of part filenames) - manifest_file = output_dir / f"{input_path.stem}.manifest" - manifest_file.write_text("\n".join(parts) + "\n") - - print(f"Input: {input_path} ({total_size / (1024**3):.2f} GB)") - print(f"Output: {output_dir}/") - print(f"Parts: {len(parts)} (chunk size: {chunk_size / (1024**3):.2f} GB)") - print(f"SHA-256: {sha256}") - print(f"Manifest: {manifest_file.name}") - for p in parts: - size = (output_dir / p).stat().st_size - print(f" {p} ({size / (1024**3):.2f} GB)") - - -def main(): - parser = argparse.ArgumentParser( - description="Split a large binary into chunks for GitHub Releases" - ) - parser.add_argument("input", type=Path, help="Path to the binary file to split") - parser.add_argument( - "--chunk-size", - type=int, - default=1_900_000_000, # 1.9 GB — safely under 2 GB GitHub limit - help="Maximum chunk size in bytes (default: 1.9 GB)", - ) - parser.add_argument( - "--output", - type=Path, - default=None, - help="Output directory (default: same directory as input)", - ) - args = parser.parse_args() - - if not 
args.input.exists(): - print(f"Error: {args.input} does not exist", file=sys.stderr) - sys.exit(1) - - output_dir = args.output or args.input.parent - split(args.input, args.chunk_size, output_dir) - - -if __name__ == "__main__": - main() diff --git a/tauri/package.json b/tauri/package.json index 31eb0790..6f569f6a 100644 --- a/tauri/package.json +++ b/tauri/package.json @@ -1,7 +1,7 @@ { "name": "@voicebox/tauri", "private": true, - "version": "0.3.0", + "version": "0.3.1", "type": "module", "scripts": { "dev": "vite", diff --git a/tauri/src-tauri/Cargo.toml b/tauri/src-tauri/Cargo.toml index 1f707f8e..e587de9f 100644 --- a/tauri/src-tauri/Cargo.toml +++ b/tauri/src-tauri/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "voicebox" -version = "0.3.0" +version = "0.3.1" description = "A production-quality desktop app for Qwen3-TTS voice cloning and generation" authors = ["you"] license = "" diff --git a/tauri/src-tauri/src/main.rs b/tauri/src-tauri/src/main.rs index a881f46e..415961f2 100644 --- a/tauri/src-tauri/src/main.rs +++ b/tauri/src-tauri/src/main.rs @@ -197,22 +197,24 @@ async fn start_server( println!("Data directory: {:?}", data_dir); println!("Remote mode: {}", remote.unwrap_or(false)); - // Check for CUDA backend binary in data directory + // Check for CUDA backend in data directory (onedir layout: backends/cuda/) let cuda_binary = { - let backends_dir = data_dir.join("backends"); + let cuda_dir = data_dir.join("backends").join("cuda"); let cuda_name = if cfg!(windows) { "voicebox-server-cuda.exe" } else { "voicebox-server-cuda" }; - let path = backends_dir.join(cuda_name); - if path.exists() { - println!("Found CUDA backend binary at {:?}", path); + let exe_path = cuda_dir.join(cuda_name); + if exe_path.exists() { + println!("Found CUDA backend at {:?}", cuda_dir); - // Version check: run --version and compare to app version + // Version check: run --version from the onedir directory so + // PyInstaller can find its support files for the fast --version path 
let app_version = app.config().version.clone().unwrap_or_default(); - let version_ok = match std::process::Command::new(&path) + let version_ok = match std::process::Command::new(&exe_path) .arg("--version") + .current_dir(&cuda_dir) .output() { Ok(output) => { @@ -237,7 +239,7 @@ async fn start_server( }; if version_ok { - Some(path) + Some(exe_path) } else { None } @@ -300,10 +302,14 @@ async fn start_server( println!("Custom models directory: {}", dir); } - // If CUDA binary exists, launch it directly instead of the bundled sidecar + // If CUDA binary exists, launch it from the onedir directory. + // .current_dir() is critical: PyInstaller onedir expects all DLLs and + // support files (nvidia/, _internal/, etc.) relative to the exe. let spawn_result = if let Some(ref cuda_path) = cuda_binary { - println!("Launching CUDA backend: {:?}", cuda_path); + let cuda_dir = cuda_path.parent().unwrap(); + println!("Launching CUDA backend: {:?} (cwd: {:?})", cuda_path, cuda_dir); let mut cmd = app.shell().command(cuda_path.to_str().unwrap()); + cmd = cmd.current_dir(cuda_dir); cmd = cmd.args(["--data-dir", &data_dir_str, "--port", &port_str, "--parent-pid", &parent_pid_str]); if is_remote { cmd = cmd.args(["--host", "0.0.0.0"]); diff --git a/tauri/src-tauri/tauri.conf.json b/tauri/src-tauri/tauri.conf.json index c4c5da85..89fc038f 100644 --- a/tauri/src-tauri/tauri.conf.json +++ b/tauri/src-tauri/tauri.conf.json @@ -1,7 +1,7 @@ { "$schema": "https://schema.tauri.app/config/2", "productName": "Voicebox", - "version": "0.3.0", + "version": "0.3.1", "identifier": "sh.voicebox.app", "build": { "beforeDevCommand": "bun run dev", diff --git a/web/package.json b/web/package.json index 74247f87..8d59b9cc 100644 --- a/web/package.json +++ b/web/package.json @@ -1,7 +1,7 @@ { "name": "@voicebox/web", "private": true, - "version": "0.3.0", + "version": "0.3.1", "type": "module", "scripts": { "dev": "vite",