CLSP (pronounced /klɪsp/) is a contrastive language–speech pretraining model that integrates global and fine-grained supervision to learn unified representations across multiple granularities. It performs reliably on global and fine-grained speech–text retrieval, zero-shot paralinguistic classification, and speech style similarity scoring, aligning closely with human judgments.
CLSP adopts a dual-encoder architecture, where speech and text are processed by separate encoders and projected into a shared embedding space.
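Because both modalities are projected into the same space, cross-modal scoring reduces to a vector similarity. The sketch below illustrates the idea only; the layer sizes, module names, and normalization are assumptions for illustration, not CLSP's actual components:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class DualEncoderSketch(nn.Module):
    """Illustrative dual-encoder scoring; dimensions and names are placeholders, not CLSP's."""

    def __init__(self, speech_dim=768, text_dim=512, embed_dim=256):
        super().__init__()
        # Projection heads map each modality into the shared embedding space.
        self.speech_proj = nn.Linear(speech_dim, embed_dim)
        self.text_proj = nn.Linear(text_dim, embed_dim)

    def forward(self, speech_feats, text_feats):
        # L2-normalize so the dot product becomes a cosine similarity.
        s = F.normalize(self.speech_proj(speech_feats), dim=-1)
        t = F.normalize(self.text_proj(text_feats), dim=-1)
        return s @ t.t()  # (num_speech, num_text) similarity matrix

# Toy usage with random "encoder outputs":
scores = DualEncoderSketch()(torch.randn(2, 768), torch.randn(3, 512))
print(scores.shape)  # torch.Size([2, 3])
```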
With CLSP, you can extract latent representations of any given audio and text.

```python
import torch
import torchaudio
from transformers import AutoModel

# Load the pretrained CLSP model from the Hugging Face Hub.
model = AutoModel.from_pretrained(
    "yfyeung/CLSP",
    trust_remote_code=True,
)
if torch.cuda.is_available():
    model = model.to("cuda")
device = next(model.parameters()).device

# Load an audio clip and resample it to 16 kHz if necessary.
audio_path = "asserts/00099.wav"
audio, sr = torchaudio.load(audio_path)
if sr != 16000:
    audio = torchaudio.functional.resample(audio, sr, 16000)
    sr = 16000
audio = audio.to(device)
audio_lens = torch.tensor([audio.size(1)], device=device)

# Style descriptions at different levels of detail.
text = [
    "A female speaker with a medium-pitched British accent.",
    "A male speaker with a medium-pitched British accent.",
    "A female speaker delivers her enunciated words rapidly in a medium-pitched British accent, conveying an authoritative tone.",
    "A female speaker delivers her enunciated words slowly in a medium-pitched Chinese accent, conveying an authoritative tone.",
    "A mature female with a clear, medium-pitched voice and a British accent speaks in a formal, presentational style, characteristic of a newsreader or broadcaster. She delivers her speech at a fast pace with deliberate enunciation and a measured, authoritative rhythm. Her tone remains neutral and informative, with subtle emphasis on specific phrases, and her volume is consistently loud and steady. The delivery is fluent and controlled."
]
with torch.no_grad():
    audio_embedding, text_embedding, _ = model(audio, audio_lens, text)

print(audio_embedding)
print(text_embedding)
```
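The returned embeddings live in a shared space, so the speech–text retrieval and style similarity scoring mentioned above reduce to a similarity lookup. A minimal sketch continuing the example (dot-product scoring mirrors the zero-shot usage below; whether the embeddings are already L2-normalized is an assumption, and `torch.nn.functional.normalize` can be applied first if not):

```python
# Rank the five text descriptions against the audio clip by similarity.
similarity = torch.matmul(audio_embedding, text_embedding.t())  # shape: (1, len(text))
ranking = similarity.squeeze(0).argsort(descending=True)
for rank, idx in enumerate(ranking.tolist(), start=1):
    print(f"{rank}. score={similarity[0, idx].item():.4f}  {text[idx]}")
```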
With CLSP, you can recognize paralinguistic attributes (e.g., emotion, gender, age) under diverse attribute sets, without any task-specific training.

```python
import torch
import torchaudio
from transformers import AutoModel

# Load the pretrained CLSP model from the Hugging Face Hub.
model = AutoModel.from_pretrained(
    "yfyeung/CLSP",
    trust_remote_code=True,
)
if torch.cuda.is_available():
    model = model.to("cuda")
device = next(model.parameters()).device

# Load an audio clip and resample it to 16 kHz if necessary.
audio_path = "asserts/00099.wav"
audio, sr = torchaudio.load(audio_path)
if sr != 16000:
    audio = torchaudio.functional.resample(audio, sr, 16000)
    sr = 16000
audio = audio.to(device)
audio_lens = torch.tensor([audio.size(1)], device=device)
# emotion
prompts = [
    "A speaker in a happy tone.",
    "A speaker in an angry tone.",
    "A speaker in a sad tone.",
    "A speaker in a neutral tone.",
]
with torch.no_grad():
    _, text_features, _ = model(text=prompts)
    audio_features, _, _ = model(audio=audio, audio_lens=audio_lens)
    logits_per_audio = torch.matmul(audio_features, text_features.t())
    preds = logits_per_audio.argmax(dim=1)
print(prompts[preds.item()])
# gender
prompts = [
    "A male speaker.",
    "A female speaker.",
]
with torch.no_grad():
    _, text_features, _ = model(text=prompts)
    audio_features, _, _ = model(audio=audio, audio_lens=audio_lens)
    logits_per_audio = torch.matmul(audio_features, text_features.t())
    preds = logits_per_audio.argmax(dim=1)
print(prompts[preds.item()])
# age
prompts = [
    "A child or young teenager speaker.",
    "An adult speaker.",
    "A middle-aged speaker.",
    "An older or elder speaker.",
]
with torch.no_grad():
    _, text_features, _ = model(text=prompts)
    audio_features, _, _ = model(audio=audio, audio_lens=audio_lens)
    logits_per_audio = torch.matmul(audio_features, text_features.t())
    preds = logits_per_audio.argmax(dim=1)
print(prompts[preds.item()])
```
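The three probes above share the same pattern, so it may be convenient to wrap zero-shot classification in a small helper. This is a sketch built on the calls shown above, not part of the CLSP API; the softmax merely turns raw similarities into a relative ranking:

```python
import torch

def classify(model, audio, audio_lens, prompts):
    """Zero-shot attribute classification: pick the prompt most similar to the audio clip."""
    with torch.no_grad():
        _, text_features, _ = model(text=prompts)
        audio_features, _, _ = model(audio=audio, audio_lens=audio_lens)
    logits = torch.matmul(audio_features, text_features.t())
    probs = logits.softmax(dim=-1).squeeze(0)
    return prompts[int(probs.argmax())], probs.tolist()

# Example, reusing the audio tensors prepared above:
label, scores = classify(model, audio, audio_lens, ["A male speaker.", "A female speaker."])
print(label, scores)
```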
Please cite our paper if you find this work useful:

```bibtex
@misc{yang2026clsp,
  title={Towards Fine-Grained and Multi-Granular Contrastive Language-Speech Pre-training},
  author={Yifan Yang and Bing Han and Hui Wang and Wei Wang and Ziyang Ma and Long Zhou and Zengrui Jin and Guanrou Yang and Tianrui Wang and Xu Tan and Xie Chen},
  year={2026},
  eprint={2601.03065},
  archivePrefix={arXiv},
  primaryClass={eess.AS},
  url={https://arxiv.org/abs/2601.03065},
}
```

