facebookresearch · YunchaoYang · Feb 2, 2026 · Feb 4, 2026
diff --git a/pyproject.toml b/pyproject.toml
@@ -51,7 +51,8 @@ dependencies = [
       "mypy>=0.782",
       "pylint>=2.8.0",
       "flake8",
-      "types-tqdm"
+      "types-tqdm",
+      "scipy>=1.7",
   ]
 cpu = [
   "torch==2.6.0+cpu",
@@ -135,4 +136,4 @@ explicit = true
 [[tool.uv.index]]
 name = "pytorch-cpu"
 url = "https://download.pytorch.org/whl/cpu"
-explicit = true
+explicit = true
diff --git a/tests/integration_tests/test_sonar_speech_pipeline_models.py b/tests/integration_tests/test_sonar_speech_pipeline_models.py
@@ -6,8 +6,9 @@
 
 from pathlib import Path
 
+import numpy as np
 import torch
-import torchaudio  # type: ignore
+from scipy.io import wavfile
 from torch.testing import assert_close  # type: ignore
 
 from sonar.inference_pipelines.speech import (
@@ -21,8 +22,11 @@
     str(DATA_DIR.joinpath("audio_files/audio_2.wav")),
 ]
 
-WAV, sr = torchaudio.load(AUDIO_Paths[0])
+# Use scipy for WAV file reading (torchaudio I/O removed in 2.9+)
+sr, audio_numpy = wavfile.read(AUDIO_Paths[0])
 assert sr == 16000, "Sample rate should be 16kHz"
+# Scale int16 by 2**15 to float32 in [-1, 1), as torchaudio.load did; add channel dim
+WAV = torch.from_numpy(audio_numpy.astype(np.float32) / 32768.0).unsqueeze(0)
 
 
 def test_speech_to_embedding_model_pipeline():

diff --git a/tests/unit_tests/test_sonar_speech.py b/tests/unit_tests/test_sonar_speech.py
@@ -6,9 +6,10 @@
 
 from pathlib import Path
 
+import numpy as np
 import pytest
 import torch
-import torchaudio  # type: ignore[import]
+from scipy.io import wavfile
 
 from sonar.inference_pipelines.speech import SpeechToEmbeddingModelPipeline
 
@@ -39,9 +40,10 @@ def test_speech_embedding_with_waveform_input(encoder):
 
 # Parsing audio within sonar does not support fp16 audio decoding yet
 def test_speech_embedding_pipeline_with_audio_files(tmp_path: Path, encoder):
-    print(torchaudio.list_audio_backends())
     fake_audio = torch.rand(1, 175920)
     audio_file = tmp_path / "audio.wav"
-    torchaudio.save(audio_file, fake_audio, 16000)
+    # Save int16 WAV via scipy (torchaudio I/O removed in 2.9+); rand() < 1, no overflow
+    audio_numpy = fake_audio.squeeze().numpy()
+    wavfile.write(str(audio_file), 16000, (audio_numpy * 32767).astype(np.int16))
     embedding = encoder.predict([str(audio_file.resolve())])
     assert embedding.shape == torch.Size([1, 1024])