Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ members = [
"server",
]
resolver = "2"
package.version = "0.2.2"
package.version = "0.2.3"

[profile.release-with-debug]
inherits = "release"
Expand Down
67 changes: 47 additions & 20 deletions fish_speech_python/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,37 @@

Python bindings for the Fish Speech Candle implementation, using PyO3.

## Installation
Supports Python 3.9+.
## Supported Platforms

> [!WARNING]
> Read this list very carefully. Hardware support is very limited.
> If you try to use this library on unsupported hardware, it will probably not work.
>
> You have been warned.

**Supported Platforms**
Python: 3.9+.

OS + hardware:
- Linux:
- x86_64, glibc 2.34+
- CUDA 12+ with compute capability >= 8.0 (RTX 30 series+, A100 series+)
- CPU: x86_64, glibc 2.34+
- Example: Ubuntu 22.04 IS supported, Ubuntu 20.04 IS NOT supported
- GPU: Nvidia CUDA 12+ with compute capability >= 8.0 (RTX 30 series+, A100 series+)
- Example: 2080 Ti is NOT supported (Turing)
- Example: RX5700 XT is NOT supported (AMD)
- macOS: Apple Silicon (M1 or later), macOS 14.0+ (Sonoma)

Windows and AMD hardware will never be supported, so don't ask.
Feel free to raise an issue if you need ARM or Alpine Linux.

## Installation

```bash
# From PyPI
pip install fish_speech_rs
```

That's it — no further setup is required.

## Usage

### Codec
Expand All @@ -27,21 +41,25 @@ This is the low-level API. You feed it PCM audio, it compresses it into codes, a

```python
from fish_speech import FireflyCodec
from huggingface_hub import snapshot_download
import numpy as np
# optional but highly recommended
from huggingface_hub import snapshot_download

# Download weights from Hugging Face
# This just returns a directory path.
# Substitute with your own directory path if you don't want to download from Hugging Face.
dir = snapshot_download("jkeisling/fish-speech-1.5")

# Load the codec model (set device to "cuda" for speed)
codec = FireflyCodec(
dir,
version="1.5", # Supports 1.4 and 1.5
device="cuda" # Or "cpu" if you hate yourself, "metal" on Apple Silicon
version="1.5", # Supports 1.2 to 1.5; 1.5 is default
device="cuda" # Or "cpu" (much slower), "metal" on Apple Silicon
)

# 1s of random audio. Substitute with your own audio.
# You will need to resample to codec.sample_rate yourself. Soundfile is recommended.
pcm = np.random.randn(1, 1, codec.sample_rate).astype(np.float32) # (batch, channels, samples)
# Encode raw PCM into compressed codes
pcm = np.random.randn(1, 1, 44_100).astype(np.float32) # (batch, channels, samples)
codes = codec.encode(pcm)

# Decode the compressed codes back into PCM
Expand All @@ -56,36 +74,45 @@ decoded_pcm = codec.decode(codes)
The language model (LM) takes text and turns it into speech codes, which you then decode back to audio.

```python
from fish_speech import LM, preprocess_text
from fish_speech import LM
from typing import List

# Load the TTS model
lm = LM(
dir,
version="1.5",
device="cuda"
device="cuda",
# bf16 only recommended for CUDA, otherwise leave it default (f32)
dtype="bf16"
)

# Extract the speaker prompt from reference audio
speaker_prompt = lm.get_speaker_prompt([{
'text': 'foobar',
'audio': codes # From previous encoding step
'codes': codes # From previous encoding step
}], sysprompt="Speak out the provided text.")

# Preprocess text (splits into chunks)
chunks: List[str] = preprocess_text("Hello world. This is fast as hell.")

# Generate speech codes (you can stream this too)
generated_codes = lm.generate(chunks, speaker_prompt=speaker_prompt)
# Generate speech codes
# Text chunking and normalization are your responsibility (sorry!);
# official text preprocessing helper function coming soon
generated_codes = lm.generate(["This is a test", "This is another test"], speaker_prompt=speaker_prompt)

# Decode to PCM audio using codec from earlier
pcm = codec.decode(generated_codes)
```

If you're in a Jupyter notebook, you can use the following code to play the audio in a widget:

```python
# assumes you ran the above code
from IPython.display import Audio

Audio(pcm.flatten(), rate=codec.sample_rate)
```

### Local installation
### Developing

Requires Python and Rust toolchains.
Requires both Python and Rust toolchains. Clone this repo, then set up each toolchain as described below.

1. `python -m venv .venv`
2. `pip install -r requirements.txt`
Expand Down
2 changes: 1 addition & 1 deletion fish_speech_python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
name = "fish_speech_rs"
# Handled by maturin
description = "High-performance speech synthesis"
version = "0.2.2"
version = "0.2.3"
readme = "README.md"
requires-python = ">=3.9"
dependencies = [
Expand Down
7 changes: 6 additions & 1 deletion fish_speech_python/src/codec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,11 @@ impl FireflyCodec {
})
}

/// Sample rate the codec operates at, read from the model's
/// spectrogram-transform configuration.
///
/// Exposed to Python as a read-only property via PyO3's `#[getter]`.
/// NOTE(review): `sample_rate` is cast with `as u32` — assumes the config
/// value is non-negative and fits in u32; confirm the config type upstream.
#[getter]
pub fn sample_rate(&self) -> u32 {
self.model.cfg.spec_transform.sample_rate as u32
}

fn encode(&self, pcm_data: numpy::PyReadonlyArray3<f32>) -> PyResult<PyObject> {
let py = pcm_data.py();
let pcm_data = pcm_data.as_array();
Expand All @@ -73,7 +78,7 @@ impl FireflyCodec {
.allow_threads(|| {
let pcm_data = candle_core::Tensor::from_slice(pcm_data, pcm_shape, &self.device)?
.to_dtype(self.dtype)?;
let codes = self.model.encode(&pcm_data)?;
let codes = self.model.encode(&pcm_data)?.to_dtype(DType::U32)?;
codes.to_vec3::<u32>()
})
.w()?;
Expand Down
35 changes: 25 additions & 10 deletions fish_speech_python/src/lm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,6 @@ use tokenizers::Tokenizer;

use super::utils::{get_device, get_version, wrap_err, PyRes};

#[derive(Debug, FromPyObject)]
struct AudioSample<'py> {
pub text: String,
pub audio: numpy::PyReadonlyArray3<'py, f32>,
}

#[pyclass]
pub struct LM {
model: DualARTransformer,
Expand Down Expand Up @@ -94,7 +88,10 @@ impl LM {
.to_slice()
.ok_or(PyException::new_err("input data is not contiguous"))?;

Some(Tensor::from_slice(codes, codes_shape, &self.device).map_err(wrap_err)?)
let codes =
Tensor::from_slice(codes, codes_shape, &self.device).map_err(wrap_err)?;
let codes = codes.squeeze(0).map_err(wrap_err)?;
Some(codes)
}
None => None,
};
Expand Down Expand Up @@ -156,22 +153,40 @@ impl LM {
let py = input[0].py();
let mut prompts: Vec<Tensor> = Vec::with_capacity(input.len());
for sample in input {
let sample: AudioSample = sample.extract()?;
let codes = sample.audio.as_array();
// Extract "text" as a String
let text: String = sample
.get_item("text")?
.ok_or(PyException::new_err(format!(
"Missing 'text' field in sample"
)))?
.extract()?;
let audio: numpy::PyReadonlyArray3<u32> = sample
.get_item("codes")?
.ok_or(PyException::new_err(format!(
"Missing 'codes' field in sample (encoded audio only)"
)))?
.extract()?;
let codes = audio.as_array();
let codes_shape = codes.shape().to_vec();
let codes = codes
.to_slice()
.ok_or(PyException::new_err("input data is not contiguous"))?;
let codes_tensor =
Tensor::from_slice(&codes, codes_shape, &self.device).map_err(wrap_err)?;
let codes_tensor = if codes_tensor.rank() == 3 {
codes_tensor.squeeze(0).map_err(wrap_err)?
} else {
codes_tensor
};
prompts.push(
prompt_encoder
.encode_conditioning_prompt(&sample.text, &codes_tensor)
.encode_conditioning_prompt(&text, &codes_tensor)
.map_err(wrap_err)?,
);
}
let prompts = Tensor::cat(&prompts, D::Minus1).map_err(wrap_err)?;
// move to npy
let prompts = prompts.unsqueeze(0).map_err(wrap_err)?;
let prompts = prompts.to_vec3::<u32>().map_err(wrap_err)?;
let prompts = numpy::PyArray3::from_vec3(py, &prompts)?;

Expand Down