microsoft · mrbraden56 · Jun 25, 2022
diff --git a/audiolib.py b/audiolib.py
@@ -7,14 +7,15 @@
 import soundfile as sf
 import os
 import numpy as np
+import librosa
 
 # Function to read audio
-def audioread(path, norm = True, start=0, stop=None):
+def audioread(path, norm = True, sr=16000):
     path = os.path.abspath(path)
     if not os.path.exists(path):
         raise ValueError("[{}] does not exist!".format(path))
     try:
-        x, sr = sf.read(path, start=start, stop=stop)
+        x, sr = librosa.load(path, sr=sr)
     except RuntimeError:  # fix for sph pcm-embedded shortened v2
         print('WARNING: Audio type not supported')
 
@@ -47,8 +48,8 @@ def audiowrite(data, fs, destpath, norm=False):
 
     if not os.path.exists(destdir):
         os.makedirs(destdir)
-    
-    sf.write(destpath, data, fs)
+
+    sf.write(destpath, data, int(fs))
     return
 
 # Function to mix clean speech and noise at various SNR levels

diff --git a/noisyspeech_synthesizer.py b/noisyspeech_synthesizer.py
@@ -19,7 +19,7 @@ def main(cfg):
         clean_dir = cfg["speech_dir"]
     if not os.path.exists(clean_dir):
         assert False, ("Clean speech data is required")
-    
+
     noise_dir = os.path.join(os.path.dirname(__file__), 'noise_train')
     if cfg["noise_dir"]!='None':
         noise_dir = cfg["noise_dir"]
@@ -59,7 +59,7 @@ def main(cfg):
 
     while num_samples < total_samples:
         idx_s = np.random.randint(0, np.size(cleanfilenames))
-        clean, fs = audioread(cleanfilenames[idx_s])
+        clean, fs = audioread(cleanfilenames[idx_s], sr=fs)
 
         if len(clean)>audio_length:
             clean = clean
@@ -70,12 +70,12 @@ def main(cfg):
                 idx_s = idx_s + 1
                 if idx_s >= np.size(cleanfilenames)-1:
                     idx_s = np.random.randint(0, np.size(cleanfilenames)) 
-                newclean, fs = audioread(cleanfilenames[idx_s])
+                newclean, fs = audioread(cleanfilenames[idx_s], sr=fs)
                 cleanconcat = np.append(clean, np.zeros(int(fs*silence_length)))
                 clean = np.append(cleanconcat, newclean)
 
         idx_n = np.random.randint(0, np.size(noisefilenames))
-        noise, fs = audioread(noisefilenames[idx_n])
+        noise, fs = audioread(noisefilenames[idx_n], sr=fs)
 
         if len(noise)>=len(clean):
             noise = noise[0:len(clean)]
@@ -86,7 +86,7 @@ def main(cfg):
                 idx_n = idx_n + 1
                 if idx_n >= np.size(noisefilenames)-1:
                     idx_n = np.random.randint(0, np.size(noisefilenames))
-                newnoise, fs = audioread(noisefilenames[idx_n])
+                newnoise, fs = audioread(noisefilenames[idx_n], sr=fs)
                 noiseconcat = np.append(noise, np.zeros(int(fs*silence_length)))
                 noise = np.append(noiseconcat, newnoise)
         noise = noise[0:len(clean)]