28 changes: 16 additions & 12 deletions .github/workflows/python-wheels.yaml
@@ -31,12 +31,12 @@ permissions:

jobs:
linux-cuda:
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: 3.x
python-version: "3.9"
- uses: Jimver/cuda-toolkit@v0.2.21
id: cuda-toolkit
with:
@@ -49,12 +49,17 @@ jobs:
- run: nvcc -V
- uses: PyO3/maturin-action@v1
env:
# Must be set, or bindgen-cuda will look for nvidia-smi, which will never work
# https://github.com/huggingface/candle/issues/1516
CUDA_COMPUTE_CAP: "80"
with:
target: x86_64
args: --release --out dist --find-interpreter --manifest-path fish_speech_python/Cargo.toml --features cuda
manylinux: off
args: --release --out dist --interpreter '3.9 3.10 3.11 3.12 3.13' --manifest-path fish_speech_python/Cargo.toml --features cuda
manylinux: off # Yes this is terrible, but I need the compiler to see nvcc
- name: Fix Manylinux Compliance
run: |
pip install auditwheel
auditwheel repair dist/*.whl --plat manylinux_2_34_x86_64 -w dist/

- uses: actions/upload-artifact@v4
with:
@@ -67,11 +72,11 @@ jobs:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: 3.x
python-version: "3.9"
- uses: PyO3/maturin-action@v1
with:
target: aarch64
args: --release --out dist --find-interpreter --manifest-path fish_speech_python/Cargo.toml --features metal
args: --release --out dist --interpreter '3.9 3.10 3.11 3.12 3.13' --manifest-path fish_speech_python/Cargo.toml --features metal
- uses: actions/upload-artifact@v4
with:
name: wheels-macos-arm64
@@ -83,11 +88,10 @@
runs-on: ubuntu-latest
steps:
- uses: actions/download-artifact@v4
- name: Create Release
uses: softprops/action-gh-release@v1
- name: Publish to PyPI
uses: PyO3/maturin-action@v1
with:
files: |
wheels-linux-cuda/*
wheels-macos-arm64/*
command: upload
args: --non-interactive --skip-existing wheels-linux-cuda/* wheels-macos-arm64/*
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }} # maturin upload reads MATURIN_PYPI_TOKEN
1 change: 1 addition & 0 deletions .gitignore
@@ -1,6 +1,7 @@
/target
.venv
*.safetensors
*.whl
checkpoints
*.npy
# For debugging purposes; remove before production!
8 changes: 4 additions & 4 deletions Cargo.lock

(Generated file; diff not rendered.)

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -6,7 +6,7 @@ members = [
"server",
]
resolver = "2"
package.version = "0.2.0"
package.version = "0.2.1"

[profile.release-with-debug]
inherits = "release"
104 changes: 70 additions & 34 deletions fish_speech_python/README.md
@@ -1,56 +1,92 @@
## fish_speech_python
# fish_speech_python

Python bindings for the Fish Speech Candle implementation, using PyO3.

### Installation
## Installation
Supports Python 3.9+.

TODO: PyPI installation instructions
**Supported Platforms**
- Linux:
  - x86_64, glibc 2.34+
  - CUDA 12+ with compute capability >= 8.0 (RTX 30 series or newer, A100 or newer)
- macOS 14.0+ on Apple Silicon (M1 or newer)

### Usage
Windows and AMD hardware will never be supported, so don't ask.
Feel free to raise an issue if you need ARM or Alpine Linux.
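
Not sure whether your machine qualifies? A rough check you can run yourself (this is just a sketch, not part of the package; the `compute_cap` query assumes a reasonably recent NVIDIA driver):

```python
import platform
import subprocess

# glibc version -- the Linux wheels need 2.34+
print("glibc:", platform.libc_ver())

# CUDA compute capability -- needs 8.0+
result = subprocess.run(
    ["nvidia-smi", "--query-gpu=compute_cap", "--format=csv,noheader"],
    capture_output=True,
    text=True,
)
print("compute capability:", result.stdout.strip())
```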

### Local installation
```bash
# From PyPI
pip install fish_speech_python
```

Requires Python and Rust toolchains.
## Usage

1. `python -m venv .venv`
2. `pip install -r requirements.txt`
3. `maturin develop`
### Codec

TODO: Library packaging
This is the low-level API. You feed it PCM audio; it compresses the audio into discrete codes and decompresses them back into PCM.

### Scripts
```python
from fish_speech_rs import FireflyCodec
from huggingface_hub import snapshot_download
import numpy as np

Generate speaker conditioning tokens as a `.npy` file:
# Download weights from Hugging Face
dir = snapshot_download("jkeisling/fish-speech-1.5")

```python
# Saves to input.npy by default
python encode_audio.py --output_path ../fake.npy ../tests/resources/sky.wav
```
# Load the codec model (set device to "cuda" for speed)
codec = FireflyCodec(
    dir,
    version="1.5",  # Supports 1.4 and 1.5
    device="cuda",  # Or "cpu" if you hate yourself, "metal" on Apple Silicon
)

You can use these drop-in with the [official Fish Audio inference script](https://github.com/fishaudio/fish-speech):
# Encode raw PCM into compressed codes
pcm = np.random.randn(1, 1, 44_100).astype(np.float32) # (batch, channels, samples)
codes = codec.encode(pcm)

```bash
# Follow their steps for inference.
# If anything goes wrong take it up with them, the whole point of this repo is to not use that inference stack
python -m tools.vqgan.inference -i ./output.npy --checkpoint-path "checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth"
# Decode the compressed codes back into PCM
decoded_pcm = codec.decode(codes)
```

### Entry point
- Input: Raw PCM audio (please handle resampling to 44.1 kHz yourself; see the sketch below)
- Output: Encoded NumPy uint32 "codes" (compressed speech)
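
If your source audio isn't already 44.1 kHz, a minimal resampling sketch using `librosa` (not a dependency of this package; the file path is just an example):

```python
import librosa
import numpy as np

# Load any audio file as 44.1 kHz mono float32
audio, sr = librosa.load("reference.wav", sr=44_100, mono=True)

# Reshape to (batch, channels, samples) as expected by FireflyCodec.encode
pcm = audio.astype(np.float32).reshape(1, 1, -1)
codes = codec.encode(pcm)
```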

## LM

As of 2024-09-08, speaker encoding is implemented (the rest to come):
The language model (LM) takes text and turns it into speech codes, which you then decode back to audio.

```python
import numpy as np
from fish_speech import FishSpeechModel
from fish_speech_rs import LM, preprocess_text
from typing import List

# Load the TTS model
lm = LM(
    dir,
    version="1.5",
    device="cuda",
)

# Extract the speaker prompt from reference audio
speaker_prompt = lm.get_speaker_prompt([{
    'text': 'foobar',
    'audio': codes,  # From previous encoding step
}], sysprompt="Speak out the provided text.")

# Preprocess text (splits into chunks)
chunks: List[str] = preprocess_text("Hello world. This is fast as hell.")

# Generate speech codes (you can stream this too)
generated_codes = lm.generate(chunks, speaker_prompt=speaker_prompt)

# Decode to PCM audio using codec from earlier
pcm = codec.decode(generated_codes)
```
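
To write the decoded audio to a WAV file, one option is `soundfile` (also not a dependency of this package; this assumes the decoded PCM comes back as a float32 array shaped `(batch, channels, samples)`, matching the encoder input):

```python
import numpy as np
import soundfile as sf

# (batch, channels, samples) -> (samples, channels) for the first item in the batch
audio = np.asarray(pcm)[0].T
sf.write("generated.wav", audio, samplerate=44_100)
```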

# Downloads VQGan the first time using HF Hub
model = FishSpeechModel()

# Fake input ndarray of "audio".
# In reality, you'd run preprocessing with librosa
mels_shape = (1, 160, 400)
random_array = np.random.uniform(-1, 1, size=mels_shape)
### Local installation

indices = model.encode(mels)
np.save("output.py", indices)
```
Requires Python and Rust toolchains.

1. `python -m venv .venv`
2. `pip install -r requirements.txt`
3. `maturin develop`
15 changes: 12 additions & 3 deletions fish_speech_python/pyproject.toml
@@ -1,10 +1,19 @@
[project]
name = "fish_speech_python"
version = "0.2.0"
description = "Add your description here"
# Handled by maturin
description = "High-performance speech synthesis"
version = "0.2.1"
readme = "README.md"
requires-python = ">=3.10"
requires-python = ">=3.9"
dependencies = [
"numpy>=1.21.0"
]

classifiers = [
"Programming Language :: Rust",
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy",
]

[build-system]
requires = ["maturin>=1.0,<2.0"]
5 changes: 3 additions & 2 deletions scripts/bump_version.sh
@@ -51,9 +51,10 @@ rm fish_speech_python/pyproject.toml.bak
echo "Creating git tag v$NEW_VERSION..."
git add Cargo.toml Cargo.lock fish_speech_python/pyproject.toml
git commit -m "chore: bump version from $CURRENT_VERSION to $NEW_VERSION"
git tag -a "v$NEW_VERSION" -m "Version $NEW_VERSION"
# Not doing this, create tag manually
# git tag -a "v$NEW_VERSION" -m "Version $NEW_VERSION"

echo "Done! Changes committed and tag created."
echo "Done! Changes committed."
echo ""
echo "Next steps:"
echo "1. Review the changes: git show HEAD"