diff --git a/pyproject.toml b/pyproject.toml index 729974d..ac0981e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,7 +51,8 @@ dependencies = [ "mypy>=0.782", "pylint>=2.8.0", "flake8", - "types-tqdm" + "types-tqdm", + "scipy>=1.7", ] cpu = [ "torch==2.6.0+cpu", @@ -135,4 +136,4 @@ explicit = true [[tool.uv.index]] name = "pytorch-cpu" url = "https://download.pytorch.org/whl/cpu" -explicit = true \ No newline at end of file +explicit = true diff --git a/tests/integration_tests/test_sonar_speech_pipeline_models.py b/tests/integration_tests/test_sonar_speech_pipeline_models.py index 4f0e557..6ed7c9f 100644 --- a/tests/integration_tests/test_sonar_speech_pipeline_models.py +++ b/tests/integration_tests/test_sonar_speech_pipeline_models.py @@ -6,8 +6,9 @@ from pathlib import Path +import numpy as np import torch -import torchaudio # type: ignore +from scipy.io import wavfile from torch.testing import assert_close # type: ignore from sonar.inference_pipelines.speech import ( @@ -21,8 +22,11 @@ str(DATA_DIR.joinpath("audio_files/audio_2.wav")), ] -WAV, sr = torchaudio.load(AUDIO_Paths[0]) +# Use scipy for WAV file reading (torchaudio I/O removed in 2.9+) +sr, audio_numpy = wavfile.read(AUDIO_Paths[0]) assert sr == 16000, "Sample rate should be 16kHz" +# Convert int16 PCM to float32 in [-1, 1) (2**15 divisor matches torchaudio's normalization); assumes mono, adds channel dim +WAV = torch.from_numpy(audio_numpy.astype(np.float32) / 32768.0).unsqueeze(0) def test_speech_to_embedding_model_pipeline(): diff --git a/tests/unit_tests/test_sonar_speech.py b/tests/unit_tests/test_sonar_speech.py index 4ae7e66..9940233 100644 --- a/tests/unit_tests/test_sonar_speech.py +++ b/tests/unit_tests/test_sonar_speech.py @@ -6,9 +6,10 @@ from pathlib import Path +import numpy as np import pytest import torch -import torchaudio # type: ignore[import] +from scipy.io import wavfile from sonar.inference_pipelines.speech import SpeechToEmbeddingModelPipeline @@ -39,9 +40,10 @@ def test_speech_embedding_with_waveform_input(encoder): # Parsing audio within 
sonar does not support fp16 audio decoding yet def test_speech_embedding_pipeline_with_audio_files(tmp_path: Path, encoder): - print(torchaudio.list_audio_backends()) fake_audio = torch.rand(1, 175920) audio_file = tmp_path / "audio.wav" - torchaudio.save(audio_file, fake_audio, 16000) + # Use scipy for WAV file writing (torchaudio I/O removed in 2.9+) + audio_numpy = fake_audio.squeeze().numpy() + wavfile.write(str(audio_file), 16000, (audio_numpy * 32767).astype(np.int16)) embedding = encoder.predict([str(audio_file.resolve())]) assert embedding.shape == torch.Size([1, 1024])