Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ members = [
"server",
]
resolver = "2"
package.version = "0.2.2"
package.version = "0.2.3"

[profile.release-with-debug]
inherits = "release"
Expand Down
67 changes: 47 additions & 20 deletions fish_speech_python/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,37 @@

Python bindings for the Fish Speech Candle implementation, using PyO3.

## Installation
Supports Python 3.9+.
## Supported Platforms

> [!WARNING]
> Read this list very carefully. Hardware support is very limited.
> If you try to use this library on unsupported hardware, it will probably not work.
>
> You have been warned.

**Supported Platforms**
Python: 3.9+.

OS + hardware:
- Linux:
- x86_64, glibc 2.34+
- CUDA 12+ with compute capability >= 8.0 (RTX 30 series+, A100 series+)
- CPU: x86_64, glibc 2.34+
- Example: Ubuntu 22.04 IS supported, Ubuntu 20.04 IS NOT supported
- GPU: Nvidia CUDA 12+ with compute capability >= 8.0 (RTX 30 series+, A100 series+)
- Example: 2080 Ti is NOT supported (Turing)
- Example: RX5700 XT is NOT supported (AMD)
- macOS: Apple Silicon (M1 or later), macOS 14.0+ (Sonoma)

Windows and AMD hardware will never be supported, so don't ask.
Feel free to raise an issue if you need ARM or Alpine Linux.

## Installation

```bash
# From PyPI
pip install fish_speech_rs
```

That's it — no further setup is required.

## Usage

### Codec
Expand All @@ -27,21 +41,25 @@ This is the low-level API. You feed it PCM audio, it compresses it into codes, a

```python
from fish_speech import FireflyCodec
from huggingface_hub import snapshot_download
import numpy as np
# optional but highly recommended
from huggingface_hub import snapshot_download

# Download weights from Hugging Face
# This just returns a directory path.
# Substitute with your own directory path if you don't want to download from Hugging Face.
dir = snapshot_download("jkeisling/fish-speech-1.5")

# Load the codec model (set device to "cuda" for speed)
codec = FireflyCodec(
dir,
version="1.5", # Supports 1.4 and 1.5
device="cuda" # Or "cpu" if you hate yourself, "metal" on Apple Silicon
version="1.5", # Supports 1.2 to 1.5; 1.5 is default
device="cuda" # Or "cpu" (much slower), "metal" on Apple Silicon
)

# 1s of random audio. Substitute with your own audio.
# You will need to resample to codec.sample_rate yourself. Soundfile is recommended.
pcm = np.random.randn(1, 1, codec.sample_rate).astype(np.float32) # (batch, channels, samples)
# Encode raw PCM into compressed codes
pcm = np.random.randn(1, 1, 44_100).astype(np.float32) # (batch, channels, samples)
codes = codec.encode(pcm)

# Decode the compressed codes back into PCM
Expand All @@ -56,36 +74,45 @@ decoded_pcm = codec.decode(codes)
The language model (LM) takes text and turns it into speech codes, which you then decode back to audio.

```python
from fish_speech import LM, preprocess_text
from fish_speech import LM
from typing import List

# Load the TTS model
lm = LM(
dir,
version="1.5",
device="cuda"
device="cuda",
# bf16 only recommended for CUDA, otherwise leave it default (f32)
dtype="bf16"
)

# Extract the speaker prompt from reference audio
speaker_prompt = lm.get_speaker_prompt([{
'text': 'foobar',
'audio': codes # From previous encoding step
'codes': codes # From previous encoding step
}], sysprompt="Speak out the provided text.")

# Preprocess text (splits into chunks)
chunks: List[str] = preprocess_text("Hello world. This is fast as hell.")

# Generate speech codes (you can stream this too)
generated_codes = lm.generate(chunks, speaker_prompt=speaker_prompt)
# Generate speech codes
# Text chunking and normalization are your responsibility (sorry!);
# official text preprocessing helper function coming soon
generated_codes = lm.generate(["This is a test", "This is another test"], speaker_prompt=speaker_prompt)

# Decode to PCM audio using codec from earlier
pcm = codec.decode(generated_codes)
```

If you're in a Jupyter notebook, you can use the following code to play the audio in a widget:

```python
# assumes you ran the above code
from IPython.display import Audio

Audio(pcm.flatten(), rate=codec.sample_rate)
```

### Local installation
### Developing

Requires Python and Rust toolchains.
Requires both Python and Rust toolchains. Clone this repo, then set up each toolchain as described below.

1. `python -m venv .venv`
2. `pip install -r requirements.txt`
Expand Down
2 changes: 1 addition & 1 deletion fish_speech_python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
name = "fish_speech_rs"
# Handled by maturin
description = "High-performance speech synthesis"
version = "0.2.2"
version = "0.2.3"
readme = "README.md"
requires-python = ">=3.9"
dependencies = [
Expand Down
7 changes: 6 additions & 1 deletion fish_speech_python/src/codec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,11 @@ impl FireflyCodec {
})
}

/// Sample rate the codec operates at, read from the model's
/// spectrogram-transform configuration.
///
/// Exposed to Python as a read-only property via PyO3's `#[getter]`.
/// NOTE(review): `sample_rate` is cast with `as u32` — assumes the config
/// value is non-negative and fits in u32; confirm the config type upstream.
#[getter]
pub fn sample_rate(&self) -> u32 {
self.model.cfg.spec_transform.sample_rate as u32
}

fn encode(&self, pcm_data: numpy::PyReadonlyArray3<f32>) -> PyResult<PyObject> {
let py = pcm_data.py();
let pcm_data = pcm_data.as_array();
Expand All @@ -73,7 +78,7 @@ impl FireflyCodec {
.allow_threads(|| {
let pcm_data = candle_core::Tensor::from_slice(pcm_data, pcm_shape, &self.device)?
.to_dtype(self.dtype)?;
let codes = self.model.encode(&pcm_data)?;
let codes = self.model.encode(&pcm_data)?.to_dtype(DType::U32)?;
codes.to_vec3::<u32>()
})
.w()?;
Expand Down
35 changes: 25 additions & 10 deletions fish_speech_python/src/lm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,6 @@ use tokenizers::Tokenizer;

use super::utils::{get_device, get_version, wrap_err, PyRes};

#[derive(Debug, FromPyObject)]
struct AudioSample<'py> {
pub text: String,
pub audio: numpy::PyReadonlyArray3<'py, f32>,
}

#[pyclass]
pub struct LM {
model: DualARTransformer,
Expand Down Expand Up @@ -94,7 +88,10 @@ impl LM {
.to_slice()
.ok_or(PyException::new_err("input data is not contiguous"))?;

Some(Tensor::from_slice(codes, codes_shape, &self.device).map_err(wrap_err)?)
let codes =
Tensor::from_slice(codes, codes_shape, &self.device).map_err(wrap_err)?;
let codes = codes.squeeze(0).map_err(wrap_err)?;
Some(codes)
}
None => None,
};
Expand Down Expand Up @@ -156,22 +153,40 @@ impl LM {
let py = input[0].py();
let mut prompts: Vec<Tensor> = Vec::with_capacity(input.len());
for sample in input {
let sample: AudioSample = sample.extract()?;
let codes = sample.audio.as_array();
// Extract "text" as a String
let text: String = sample
.get_item("text")?
.ok_or(PyException::new_err(format!(
"Missing 'text' field in sample"
)))?
.extract()?;
let audio: numpy::PyReadonlyArray3<u32> = sample
.get_item("codes")?
.ok_or(PyException::new_err(format!(
"Missing 'codes' field in sample (encoded audio only)"
)))?
.extract()?;
let codes = audio.as_array();
let codes_shape = codes.shape().to_vec();
let codes = codes
.to_slice()
.ok_or(PyException::new_err("input data is not contiguous"))?;
let codes_tensor =
Tensor::from_slice(&codes, codes_shape, &self.device).map_err(wrap_err)?;
let codes_tensor = if codes_tensor.rank() == 3 {
codes_tensor.squeeze(0).map_err(wrap_err)?
} else {
codes_tensor
};
prompts.push(
prompt_encoder
.encode_conditioning_prompt(&sample.text, &codes_tensor)
.encode_conditioning_prompt(&text, &codes_tensor)
.map_err(wrap_err)?,
);
}
let prompts = Tensor::cat(&prompts, D::Minus1).map_err(wrap_err)?;
// move to npy
let prompts = prompts.unsqueeze(0).map_err(wrap_err)?;
let prompts = prompts.to_vec3::<u32>().map_err(wrap_err)?;
let prompts = numpy::PyArray3::from_vec3(py, &prompts)?;

Expand Down