Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions cfgs/extract_noise_example.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
audio: /path/to/audio/file/dir/
out: /path/to/output/folder/
frame_length: 4096
hop_length: 2048
num_sec_slice: 3
14 changes: 14 additions & 0 deletions tools/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
Tools for handling unlabeled raw audio.

Use these tools to investigate and better understand your raw
audio data, and to isolate potentially significant acoustic
events so that less time is spent labeling.

run_extract_noise.py generates short clips (3 s with the example
config) from larger wav files, centered on stretches where the RMS
exceeds 1.5x the average RMS of the entire file. This can highlight
loud events in an audio file.

display_rms_and_mel.py will give a visual graph with the mel
spectrogram and RMS chart for a given wav for a sanity check
and to get a better idea of what the spectrogram looks like for
a given RMS peak.
47 changes: 47 additions & 0 deletions tools/display_rms_and_mel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
"""Display RMS and Mel-Spectrogram

For a given audio file, you can visualize the RMS and
the associated Mel-Spectrogram with the same time-step to
see how they relate. Replace the filename variable with the
path to your specific audio file.

Usage:
python3 display_rms_and_mel.py
"""
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np


# Path of the wav file to visualize; edit before running.
FILENAME = '<path/to/audio/file.wav>'
# Analysis window size and step, in samples, shared by the RMS and STFT.
FRAME_LENGTH = 2048
HOP_LENGTH = 512
# NOTE(review): not used anywhere in this script at the moment.
NUM_SECONDS_OF_SLICE = 3

# sr=None keeps the file's native sample rate instead of resampling.
sound, sr = librosa.load(FILENAME, sr=None)

clip_rms = librosa.feature.rms(y=sound,
                               frame_length=FRAME_LENGTH,
                               hop_length=HOP_LENGTH)

# Report the loudest frame and the sample position derived from it.
clip_rms = clip_rms.squeeze()
peak_rms_index = clip_rms.argmax()
print(f"Peak RMS index: {peak_rms_index}")
peak_index = peak_rms_index * HOP_LENGTH + int(FRAME_LENGTH/2)
print(f"Peak index: {peak_index}")

# Use the same framing for the spectrogram as for the RMS above so the
# two plots line up frame for frame.
S, phase = librosa.magphase(librosa.stft(sound,
                                         n_fft=FRAME_LENGTH,
                                         hop_length=HOP_LENGTH))
rms = librosa.feature.rms(S=S,
                          frame_length=FRAME_LENGTH,
                          hop_length=HOP_LENGTH)
fig, ax = plt.subplots(nrows=2, sharex=True)
# Pass the real sample rate: the audio was loaded at its native rate, and
# librosa would otherwise assume 22050 Hz and mislabel the time axis.
times = librosa.times_like(rms, sr=sr, hop_length=HOP_LENGTH)
ax[0].semilogy(times, rms[0], label='RMS Energy')
ax[0].set(xticks=[])
ax[0].legend()
ax[0].label_outer()
librosa.display.specshow(librosa.amplitude_to_db(S, ref=np.max),
                         sr=sr, hop_length=HOP_LENGTH,
                         y_axis='log', x_axis='time', ax=ax[1])
ax[1].set(title='log Power spectrogram')


plt.show()
39 changes: 39 additions & 0 deletions tools/run_extract_noise.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
"""Create segments of noisy audio from wavs.

This script uses the clip_loud_segments function to
calculate the average RMS of a given wav file and
then write out short segments (3 seconds in the example
config) where the RMS rose above 1.5x that average.
The script walks a directory and sends every file in it
through the function. cfgs/extract_noise_example.yaml
is an example of the required config file; copy it and
fill it out before running the script.

Usage:

python3 run_extract_noise.py
-config /path/to/extract_noise_copy.yaml

"""
import argparse
import os
import yaml
from whoot.extract_noise import clip_loud_segments


if __name__ == "__main__":
    PARSER = argparse.ArgumentParser(
        description='Path to config file.'
    )
    # required=True: without the flag ARGS.config would be None and the
    # open() below would fail with an unhelpful TypeError.
    PARSER.add_argument('-config', type=str, required=True,
                        help='Path to config.')
    ARGS = PARSER.parse_args()
    with open(ARGS.config, 'r', encoding='UTF-8') as f:
        config = yaml.safe_load(f)
    all_files = os.listdir(config['audio'])
    for file in all_files:
        try:
            print(f"running {file}")
            clip_loud_segments(os.path.join(config['audio'], file), config)
        # Deliberately broad: this is the per-file boundary of a batch
        # run, so one unreadable file is reported and the rest continue.
        except Exception as e:
            print(f"couldnt load {file} because {e}")
3 changes: 3 additions & 0 deletions whoot/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,4 @@
# Version string of the whoot package.
__version__ = "0.0.2.dev0"


from .extract_noise import clip_loud_segments
108 changes: 108 additions & 0 deletions whoot/extract_noise.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
"""Extract noisy segments from a wav file.

Takes in a wav file and a config dict (which includes the
output path) and stores short segments centered on stretches
whose RMS exceeds 1.5x the average RMS for that wav file.
"""
import os
import librosa
import librosa.display
import numpy as np
import soundfile as sf


def clip_loud_segments(file, config):
    """Write fixed-length clips centered on the loud stretches of a wav file.

    Every run of consecutive frames flagged by ``find_peaks`` yields one
    candidate clip of ``num_sec_slice`` seconds centered on the middle of
    the run. A candidate that would overlap the previously written clip
    is skipped. Clips are written to the ``out`` directory as
    ``<input name without extension>_<frame index>.wav``, where the
    frame index is the first frame after the run.

    Args:
        file (str): The path of the current wav file.
        config (dict): Settings with the keys ``frame_length`` and
            ``hop_length`` (in samples), ``num_sec_slice`` (clip length
            in seconds) and ``out`` (directory to store the clips in).
    """
    frame_length = config['frame_length']
    hop_length = config['hop_length']
    num_sec_slice = config['num_sec_slice']
    # sr=None keeps the file's native sample rate.
    sound, sr = librosa.load(file, sr=None)
    print(f"sample rate: {sr}")

    above_avg_rms = find_peaks(frame_length, hop_length, sound)

    # splitext removes only the extension; str.strip('.wav') strips any
    # of the characters '.', 'w', 'a', 'v' from both ends of the name.
    stem = os.path.splitext(os.path.basename(file))[0]
    half_slice_width = int(num_sec_slice * sr / 2)

    yes_counter = 0
    start_index = None
    # -1 so that a first clip starting at sample 0 is not rejected.
    last_right_index = -1

    # The appended 0 closes a loud run that lasts until the final frame,
    # so it is handled by the same code as every other run.
    for index, value in enumerate(np.append(above_avg_rms, 0)):
        if value == 1:
            if yes_counter == 0:
                start_index = index
            yes_counter += 1
            continue
        if yes_counter == 0:
            continue

        # A loud run [start_index, index) just ended. Reset the counter
        # whether or not a clip is written, so the next run gets its own
        # start index.
        yes_counter = 0
        mid_index = start_index + int((index - start_index) / 2)
        # NOTE(review): the frame_length/2 offset assumes frames are not
        # centered on mid_index * hop_length -- confirm against the
        # `center` setting of librosa.feature.rms.
        real_index = mid_index * hop_length + int(frame_length / 2)
        left_index = max(0, real_index - half_slice_width)
        if left_index <= last_right_index:
            print("skipping clip bc it would overlap with last clip")
            continue

        right_index = real_index + half_slice_width
        # left index needs to be greater than the last right
        last_right_index = right_index + 1
        # Slicing clamps right_index to the end of the audio.
        sound_slice = sound[left_index:right_index]
        name = os.path.join(config['out'], f"{stem}_{index}.wav")
        sf.write(name, sound_slice, sr)
        print(f"created {name}, setting yes_counter back to 0")


def find_peaks(frame_length, hop_length, sound):
    """Flag the frames whose RMS reaches 1.5x the clip's mean RMS.

    Args:
        frame_length (int): Window size, in samples, of each RMS frame.
        hop_length (int): Number of samples between the starts of
            consecutive frames.
        sound (numpy.ndarray): The audio as a time series array.

    Returns:
        numpy.ndarray: One entry per frame, holding 1 where the frame's
                       RMS is not below 1.5x the mean RMS and 0
                       otherwise.
    """
    clip_rms = librosa.feature.rms(y=sound,
                                   frame_length=frame_length,
                                   hop_length=hop_length)

    # atleast_1d: squeeze() turns a single-frame result into a 0-d array,
    # which callers cannot iterate over.
    clip_rms = np.atleast_1d(clip_rms.squeeze())
    threshold = np.mean(clip_rms) * (3/2)
    # A frame is flagged unless the threshold is strictly greater than
    # its RMS; the mask keeps the dtype of the RMS array.
    above_avg_rms = np.logical_not(threshold > clip_rms).astype(
        clip_rms.dtype)

    num_frames = np.sum(above_avg_rms)
    print(f"num frames with above the 1.5x average rms value: {num_frames}")

    return above_avg_rms