diff --git a/cfgs/extract_noise_example.yaml b/cfgs/extract_noise_example.yaml
new file mode 100644
index 0000000..018e9de
--- /dev/null
+++ b/cfgs/extract_noise_example.yaml
@@ -0,0 +1,5 @@
+audio: /path/to/audio/file/dir/
+out: /path/to/output/folder/
+frame_length: 4096
+hop_length: 2048
+num_sec_slice: 3
diff --git a/tools/README.md b/tools/README.md
new file mode 100644
index 0000000..b34c6af
--- /dev/null
+++ b/tools/README.md
@@ -0,0 +1,14 @@
+Tools for handling unlabeled raw audio.
+
+Use these tools to investigate and understand your raw audio data
+and to isolate potentially significant acoustic events, which
+reduces the time spent labeling.
+
+run_extract_noise.py generates short clips (num_sec_slice seconds,
+3 in the example config) from larger wav files, centered on runs of
+frames whose RMS is at least 1.5x the average RMS of the whole file.
+This can highlight loud events in an audio file.
+
+display_rms_and_mel.py plots the RMS curve and a log-frequency
+spectrogram of a given wav on a shared time axis, as a sanity check
+and to see what the spectrogram looks like around a given RMS peak.
diff --git a/tools/display_rms_and_mel.py b/tools/display_rms_and_mel.py
new file mode 100644
index 0000000..4d3a26b
--- /dev/null
+++ b/tools/display_rms_and_mel.py
@@ -0,0 +1,51 @@
+"""Display RMS and a log-frequency spectrogram.
+
+For a given audio file, you can visualize the RMS and
+the associated spectrogram on the same time axis to
+see how they relate. Set the FILENAME constant below to the
+path to your specific audio file.
+
+Usage:
+    python3 display_rms_and_mel.py
+"""
+import librosa
+import librosa.display
+import matplotlib.pyplot as plt
+import numpy as np
+
+
+FILENAME = ''
+FRAME_LENGTH = 2048
+HOP_LENGTH = 512
+NUM_SECONDS_OF_SLICE = 3
+
+sound, sr = librosa.load(FILENAME, sr=None)
+
+clip_rms = librosa.feature.rms(y=sound,
+                               frame_length=FRAME_LENGTH,
+                               hop_length=HOP_LENGTH)
+
+clip_rms = clip_rms.squeeze()
+peak_rms_index = clip_rms.argmax()
+print(f"Peak RMS index: {peak_rms_index}")
+peak_index = peak_rms_index * HOP_LENGTH + int(FRAME_LENGTH/2)
+print(f"Peak index: {peak_index}")
+
+S, phase = librosa.magphase(librosa.stft(sound, n_fft=FRAME_LENGTH,
+                                         hop_length=HOP_LENGTH))
+rms = librosa.feature.rms(S=S, frame_length=FRAME_LENGTH)
+fig, ax = plt.subplots(nrows=2, sharex=True)
+# Audio is loaded at its native rate, so pass sr explicitly; the
+# librosa defaults assume 22050 Hz and would mislabel both axes.
+times = librosa.times_like(rms, sr=sr, hop_length=HOP_LENGTH)
+ax[0].semilogy(times, rms[0], label='RMS Energy')
+ax[0].set(xticks=[])
+ax[0].legend()
+ax[0].label_outer()
+librosa.display.specshow(librosa.amplitude_to_db(S, ref=np.max),
+                         sr=sr, hop_length=HOP_LENGTH,
+                         y_axis='log', x_axis='time', ax=ax[1])
+ax[1].set(title='log Power spectrogram')
+
+
+plt.show()
diff --git a/tools/run_extract_noise.py b/tools/run_extract_noise.py
new file mode 100644
index 0000000..3c03240
--- /dev/null
+++ b/tools/run_extract_noise.py
@@ -0,0 +1,45 @@
+"""Create segments of noisy audio from wavs.
+
+This script uses the extract noise function to
+calculate the average RMS of a given wav file,
+and then creates short segments (num_sec_slice seconds)
+where the RMS reached 1.5x the average. This main script
+parses through a directory and sends each wav
+file through the function. cfgs/extract_noise_example.yaml
+is an example of the config file needed, copy
+it and fill it out prior to running script.
+
+Usage:
+
+    python3 run_extract_noise.py
+        -config /path/to/extract_noise_copy.yaml
+
+"""
+import argparse
+import os
+import yaml
+from whoot.extract_noise import clip_loud_segments
+
+
+if __name__ == "__main__":
+    PARSER = argparse.ArgumentParser(
+        description='Path to config file.'
+    )
+    # Without required=True a missing flag only fails later, at open().
+    PARSER.add_argument('-config', type=str, required=True,
+                        help='Path to config.')
+    ARGS = PARSER.parse_args()
+    with open(ARGS.config, 'r', encoding='UTF-8') as f:
+        config = yaml.safe_load(f)
+    # Clips are written here; create it so a fresh run does not fail.
+    os.makedirs(config['out'], exist_ok=True)
+    all_files = os.listdir(config['audio'])
+    for file in all_files:
+        path = os.path.join(config['audio'], file)
+        if not os.path.isfile(path):
+            continue  # sub-directories cannot be loaded as audio
+        try:
+            print(f"running {file}")
+            clip_loud_segments(path, config)
+        except Exception as e:
+            print(f"couldnt load {file} because {e}")
diff --git a/whoot/__init__.py b/whoot/__init__.py
index c0ae2e0..4fb2222 100644
--- a/whoot/__init__.py
+++ b/whoot/__init__.py
@@ -1 +1,4 @@
 __version__ = "0.0.2.dev0"
+
+
+from .extract_noise import clip_loud_segments
diff --git a/whoot/extract_noise.py b/whoot/extract_noise.py
new file mode 100644
index 0000000..a7bf47a
--- /dev/null
+++ b/whoot/extract_noise.py
@@ -0,0 +1,100 @@
+"""Extract noisy segments from a wav file.
+
+Takes in a wav file and a config and stores fixed-length
+segments centered on runs of frames whose RMS is at least
+1.5x the average RMS for that wav file.
+"""
+import os
+import librosa
+import librosa.display
+import numpy as np
+import soundfile as sf
+
+
+def clip_loud_segments(file, config):
+    """Extract loud segments from a wav file.
+
+    Every run of consecutive loud frames (see find_peaks) yields one
+    clip of num_sec_slice seconds centered on the middle of the run.
+    A clip that would overlap the previously written clip is skipped.
+
+    Args:
+        file (str): The path of the current wav file.
+        config (dict): Settings with the keys 'out' (directory to
+            store the loud segments), 'frame_length', 'hop_length'
+            and 'num_sec_slice'.
+    """
+    frame_length = config['frame_length']
+    hop_length = config['hop_length']
+    num_sec_slice = config['num_sec_slice']
+    sound, sr = librosa.load(file, sr=None)
+    print(f"sample rate: {sr}")
+
+    above_avg_rms = find_peaks(frame_length, hop_length, sound)
+    half_slice_width = int(num_sec_slice * sr / 2)
+    # splitext drops only the extension; str.strip('.wav') would also
+    # eat leading/trailing '.', 'w', 'a' and 'v' characters of the name.
+    stem = os.path.splitext(os.path.basename(file))[0]
+
+    yes_counter = 0
+    start_index = None
+    # -1 so a clip clamped to sample 0 is not mistaken for an overlap.
+    last_right_index = -1
+
+    # The trailing 0 closes a loud run that reaches the end of the file,
+    # so the final run is handled by the same code as every other run.
+    for index, value in enumerate(np.append(above_avg_rms, 0)):
+        if value == 1:
+            if yes_counter == 0:
+                start_index = index
+            yes_counter += 1
+            continue
+        if yes_counter == 0:
+            continue
+
+        # A loud run covering frames [start_index, index) just ended.
+        # Reset now so a skipped run cannot leak into the next one.
+        yes_counter = 0
+        mid_index = start_index + int((index - start_index) / 2)
+        real_index = mid_index * hop_length + int(frame_length / 2)
+        left_index = max(0, real_index - half_slice_width)
+        if left_index <= last_right_index:
+            print("skipping clip bc it would overlap with last clip")
+            continue
+
+        right_index = real_index + half_slice_width
+        # left index needs to be greater than the last right
+        last_right_index = right_index + 1
+        name = os.path.join(config['out'], f"{stem}_{index}.wav")
+        sf.write(name, sound[left_index:right_index], sr)
+        print(f"created {name}, setting yes_counter back to 0")
+
+
+def find_peaks(frame_length, hop_length, sound):
+    """Find peak RMS moments in a sound file.
+
+    Args:
+        frame_length (int): Window size in samples.
+        hop_length (int): Number of samples between the starts of
+            consecutive frames.
+        sound (numpy.ndarray): The audio as a time series array.
+
+    Returns:
+        numpy.ndarray: One value per frame, 1 where the frame's RMS
+            is at least 1.5x the average RMS of the clip, else 0.
+    """
+    clip_rms = librosa.feature.rms(y=sound,
+                                   frame_length=frame_length,
+                                   hop_length=hop_length)
+
+    # atleast_1d keeps a single-frame result iterable after squeeze().
+    clip_rms = np.atleast_1d(clip_rms.squeeze())
+    average_rms = np.mean(clip_rms) * (3/2)
+    # Build a new array so the RMS values are not overwritten in place.
+    above_avg_rms = np.logical_not(average_rms > clip_rms)
+    above_avg_rms = above_avg_rms.astype(clip_rms.dtype)
+
+    num_frames = np.sum(above_avg_rms)
+    print(f"num frames with above the 1.5x average rms value: {num_frames}")
+
+    return above_avg_rms