Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@ dependencies = [
"mypy>=0.782",
"pylint>=2.8.0",
"flake8",
"types-tqdm"
"types-tqdm",
"scipy>=1.7",
]
cpu = [
"torch==2.6.0+cpu",
Expand Down Expand Up @@ -135,4 +136,4 @@ explicit = true
[[tool.uv.index]]
name = "pytorch-cpu"
url = "https://download.pytorch.org/whl/cpu"
explicit = true
explicit = true
8 changes: 6 additions & 2 deletions tests/integration_tests/test_sonar_speech_pipeline_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@

from pathlib import Path

import numpy as np
import torch
import torchaudio # type: ignore
from scipy.io import wavfile
from torch.testing import assert_close # type: ignore

from sonar.inference_pipelines.speech import (
Expand All @@ -21,8 +22,11 @@
str(DATA_DIR.joinpath("audio_files/audio_2.wav")),
]

WAV, sr = torchaudio.load(AUDIO_Paths[0])
# Use scipy for WAV file reading (torchaudio I/O removed in 2.9+)
sr, audio_numpy = wavfile.read(AUDIO_Paths[0])
assert sr == 16000, "Sample rate should be 16kHz"
# Convert int16 to float32 and add channel dimension
WAV = torch.from_numpy(audio_numpy.astype(np.float32) / 32767.0).unsqueeze(0)


def test_speech_to_embedding_model_pipeline():
Expand Down
8 changes: 5 additions & 3 deletions tests/unit_tests/test_sonar_speech.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@

from pathlib import Path

import numpy as np
import pytest
import torch
import torchaudio # type: ignore[import]
from scipy.io import wavfile

from sonar.inference_pipelines.speech import SpeechToEmbeddingModelPipeline

Expand Down Expand Up @@ -39,9 +40,10 @@ def test_speech_embedding_with_waveform_input(encoder):

# Parsing audio within sonar does not support fp16 audio decoding yet
def test_speech_embedding_pipeline_with_audio_files(tmp_path: Path, encoder):
print(torchaudio.list_audio_backends())
fake_audio = torch.rand(1, 175920)
audio_file = tmp_path / "audio.wav"
torchaudio.save(audio_file, fake_audio, 16000)
# Use scipy for WAV file writing (torchaudio I/O removed in 2.9+)
audio_numpy = fake_audio.squeeze().numpy()
wavfile.write(str(audio_file), 16000, (audio_numpy * 32767).astype(np.int16))
embedding = encoder.predict([str(audio_file.resolve())])
assert embedding.shape == torch.Size([1, 1024])
Loading