diff --git a/magnolia/python/analysis/bss_evaluate.py b/magnolia/python/analysis/bss_evaluate.py index 3bb3925..c2cff4f 100644 --- a/magnolia/python/analysis/bss_evaluate.py +++ b/magnolia/python/analysis/bss_evaluate.py @@ -107,10 +107,12 @@ def evaluate(input_path, output_csv_file, target_stype=None, eval_sr=8000, num_s '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/bss/lab41/in_sample_test.csv'], ['/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/lab41/out_of_sample_test', '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/bss/lab41/out_of_sample_test.csv'], + ['/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/large_lab41/in_sample_test', '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/bss/large_lab41/in_sample_test.csv'], ['/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/large_lab41/out_of_sample_test', '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/bss/large_lab41/out_of_sample_test.csv'], + ['/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/chimera/in_sample_test', '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/bss/chimera/mi_in_sample_test.csv', 'mi'], @@ -123,13 +125,27 @@ def evaluate(input_path, output_csv_file, target_stype=None, eval_sr=8000, num_s ['/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/chimera/out_of_sample_test', '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/bss/chimera/dc_out_of_sample_test.csv', 'dc'], + + ['/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/mask_sce/in_sample_test', + '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/bss/mask_sce/mi_in_sample_test.csv', + 'mi'], + 
['/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/mask_sce/out_of_sample_test', + '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/bss/mask_sce/mi_out_of_sample_test.csv', + 'mi'], + ['/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/mask_sce/in_sample_test', + '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/bss/mask_sce/dc_in_sample_test.csv', + 'dc'], + ['/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/mask_sce/out_of_sample_test', + '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/bss/mask_sce/dc_out_of_sample_test.csv', + 'dc'], + ['/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/snmf/in_sample_test', '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/bss/snmf/in_sample_test.csv'], ['/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/snmf/out_of_sample_test', '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/bss/snmf/out_of_sample_test.csv'] ] - args = args[8:] + args = args[8:12] # Parallel #processes = [] @@ -142,9 +158,9 @@ def evaluate(input_path, output_csv_file, target_stype=None, eval_sr=8000, num_s # Parallel #pool = mp.Pool(processes=min(len(args), os.cpu_count() - 1)) - pool = mp.Pool(processes=2) - pool.starmap(evaluate, args) + #pool = mp.Pool(processes=2) + #pool.starmap(evaluate, args) # Sequential - #for arg in args: - # evaluate(*arg) + for arg in args: + evaluate(*arg) diff --git a/magnolia/python/analysis/comparison_plot.py b/magnolia/python/analysis/comparison_plot.py index 03ddb36..1d43c32 100644 --- a/magnolia/python/analysis/comparison_plot.py +++ b/magnolia/python/analysis/comparison_plot.py @@ -12,7 +12,7 @@ def format_dae_columns(df): cols[5] = 'Input_SNR' cols[6] = 'Input_SDR' cols[7] = 'Output_SDR' - + df.columns = cols @@ 
-20,12 +20,12 @@ def load_dataframes(models): for model in models: if 'in_set' in model: model['in_set_df'] = pd.read_csv(model['in_set']) - if model['name'] == 'DAE': - format_dae_columns(model['in_set_df']) + #if model['name'] == 'DAE': + # format_dae_columns(model['in_set_df']) if 'out_of_set' in model: model['out_of_set_df'] = pd.read_csv(model['out_of_set']) - if model['name'] == 'DAE': - format_dae_columns(model['out_of_set_df']) + #if model['name'] == 'DAE': + # format_dae_columns(model['out_of_set_df']) def error_on_the_mean(x): @@ -50,7 +50,7 @@ def make_sdr_delta_versus_noise_source_plot(models, df_base_name): df_name = '{}_df'.format(df_base_name) mean_multiindex_name = ('SDR_Improvement', 'mean') eotm_multiindex_name = ('SDR_Improvement', 'error_on_the_mean') - + all_groups = {} all_colors = {} all_names = [] @@ -68,17 +68,19 @@ def make_sdr_delta_versus_noise_source_plot(models, df_base_name): all_label_df = all_groups[model['name']] else: all_label_df = all_label_df.merge(all_groups[model['name']], how='outer') - + labels = all_label_df['Noise_Type'].unique() n_groups = len(labels) del all_label_df - + # create plot fig, ax = plt.subplots(figsize=(8, 6)) index = np.arange(n_groups) - bar_width = 0.25 + plt.xlim(-0.5, n_groups + 0.5) + bar_width = (n_groups + 1)/(1.15*n_groups*len(models)) + #bar_width = 0.15 opacity = 0.8 - + offset = 0 all_rects = [] for entry_name in all_names: @@ -86,7 +88,7 @@ def make_sdr_delta_versus_noise_source_plot(models, df_base_name): groups = all_groups[entry_name] if entry_name in all_colors: color = all_colors[entry_name] - + #male_means = groups[groups['Speaker_Sex'] == 'M'] #male_means = male_means[male_means['Noise_Type'] == labels].fillna(0) male_means = groups[groups['Noise_Type'] == labels].fillna(0) @@ -98,9 +100,9 @@ def make_sdr_delta_versus_noise_source_plot(models, df_base_name): color=color, label=entry_name, yerr=male_errors) - + offset += 1 - + for i in range(len(labels)): labels[i] = 
labels[i].replace('_', ' ') plt.xticks(index + (len(all_names)/2 - 0.5)*bar_width, labels) @@ -108,18 +110,20 @@ def make_sdr_delta_versus_noise_source_plot(models, df_base_name): tick.set_rotation(75) tick.set_fontsize(12) plt.xlabel('Noise Type') - plt.ylabel('SDR Improvement') + plt.ylabel('SDR Improvement (dB)') plt.title('SDR Improvement Versus Noise Type', fontsize=20) ax.xaxis.label.set_size(15) ax.yaxis.label.set_size(15) - - ylim = [-0.5, ax.get_ylim()[1]] + + ylim = [-0.5, 1.3*ax.get_ylim()[1]] #ylim[0] = -0.5 ax.set_ylim(ylim) #plt.axis([0, 11, -.5, 16]) - plt.legend(fontsize=12, edgecolor='black') + plt.legend(fontsize=12, edgecolor='black', + loc='upper center', ncol=3, mode='expand') + #ax.legend(bbox_to_anchor=(1.5, 1.5)) plt.tight_layout() - + plt.savefig('{}_sdr_delta_versus_noise_type.pdf'.format(df_base_name), format='pdf') @@ -127,7 +131,7 @@ def make_sdr_delta_versus_input_snr_plot(models, df_base_name, bins): df_name = '{}_df'.format(df_base_name) mean_multiindex_name = ('SDR_Improvement', 'mean') eotm_multiindex_name = ('SDR_Improvement', 'error_on_the_mean') - + all_groups = {} all_colors = {} all_names = [] @@ -145,17 +149,17 @@ def make_sdr_delta_versus_input_snr_plot(models, df_base_name, bins): all_label_df = all_groups[model['name']] else: all_label_df = all_label_df.merge(all_groups[model['name']], how='outer') - + labels = all_label_df['Input_SNR_Bin'].unique() n_groups = len(labels) del all_label_df - + # create plot fig, ax = plt.subplots(figsize=(8, 6)) index = np.arange(n_groups) - bar_width = 0.25 + bar_width = (bins[-1] - bins[0])/(1.15*n_groups*len(models)) opacity = 0.8 - + offset = 0 all_rects = [] for entry_name in all_names: @@ -163,7 +167,7 @@ def make_sdr_delta_versus_input_snr_plot(models, df_base_name, bins): groups = all_groups[entry_name] if entry_name in all_colors: color = all_colors[entry_name] - + #male_means = groups[groups['Speaker_Sex'] == 'M'] #male_means = male_means[male_means['Input_SNR_Bin'] == 
labels].fillna(0) male_means = groups[groups['Input_SNR_Bin'] == labels].fillna(0) @@ -175,9 +179,9 @@ def make_sdr_delta_versus_input_snr_plot(models, df_base_name, bins): color=color, label=entry_name, yerr=male_errors) - + offset += 1 - + print_labels = [] for i in range(len(labels)): #print_labels.append('[{}, {})'.format(i - 5, i - 4)) @@ -187,19 +191,20 @@ def make_sdr_delta_versus_input_snr_plot(models, df_base_name, bins): tick.set_rotation(0) tick.set_fontsize(12) #plt.xlabel('Input SNR Range') - plt.xlabel('Input SNR') - plt.ylabel('SDR Improvement') + plt.xlabel('Input SNR (dB)') + plt.ylabel('SDR Improvement (dB)') plt.title('SDR Improvement Versus Input SNR', fontsize=20) ax.xaxis.label.set_size(15) ax.yaxis.label.set_size(15) - - ylim = [-0.5, ax.get_ylim()[1]] + + ylim = [-0.5, 1.3*ax.get_ylim()[1]] #ylim[0] = -0.5 ax.set_ylim(ylim) #plt.axis([0, 11, -.5, 16]) - plt.legend(fontsize=12, edgecolor='black') + plt.legend(fontsize=12, edgecolor='black', + loc='upper center', ncol=3, mode='expand') plt.tight_layout() - + plt.savefig('{}_sdr_delta_versus_input_snr.pdf'.format(df_base_name), format='pdf') @@ -211,23 +216,36 @@ def main(): 'out_of_set': '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/bss/snmf/out_of_sample_test_sdr_summary.csv', 'color': '#98C1D9' }, - #{ - # 'name': 'DAE', - # 'out_of_set': '/data/fs4/home/pgamble/Magnolia/Denoising/Autoencoder/Final Results/eval_test_A.csv', - # 'color': '#E0FBFC' - #}, - #{ - # 'name': 'Chimera MI', - # 'in_set': '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/bss/chimera/mi_in_sample_test_sdr_summary.csv', - # 'out_of_set': '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/bss/chimera/mi_out_of_sample_test_sdr_summary.csv', - # 'color': '#3D5A80' - #}, { - 'name': 'DC',#'Chimera DC', + 'name': 'DAE', + 'in_set': 
'/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/bss/dae/ae_in_sample_test.csv', + 'out_of_set': '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/bss/dae/ae_out_of_sample_test.csv', + 'color': '#828C51'#'#E0FBFC' + }, + { + 'name': 'DC + MI (MI)', + 'in_set': '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/bss/chimera/mi_in_sample_test_sdr_summary.csv', + 'out_of_set': '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/bss/chimera/mi_out_of_sample_test_sdr_summary.csv', + 'color': '#3D5A80' + }, + { + 'name': 'DC + MI (C)', 'in_set': '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/bss/chimera/dc_in_sample_test_sdr_summary.csv', 'out_of_set': '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/bss/chimera/dc_out_of_sample_test_sdr_summary.csv', 'color': '#0C0A3E' }, + { + 'name': 'SCE + MI (MI)', + 'in_set': '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/bss/mask_sce/mi_in_sample_test_sdr_summary.csv', + 'out_of_set': '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/bss/mask_sce/mi_out_of_sample_test_sdr_summary.csv', + 'color': '#CA054D' + }, + { + 'name': 'SCE + MI (C)', + 'in_set': '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/bss/mask_sce/dc_in_sample_test_sdr_summary.csv', + 'out_of_set': '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/bss/mask_sce/dc_out_of_sample_test_sdr_summary.csv', + 'color': '#393E41' + }, { 'name': 'SCE', 'in_set': '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/bss/large_lab41/in_sample_test_sdr_summary.csv', @@ -235,15 +253,18 @@ def main(): 'color': '#A4303F' }, ] + # TODO: the input SNR range should be determined automatically bins = np.linspace(-5, 5, 11) bins[-1] = 1.02*bins[-1] - 
+ load_dataframes(models) - + make_sdr_delta_versus_input_snr_plot(models, 'out_of_set', bins) make_sdr_delta_versus_noise_source_plot(models, 'out_of_set') #make_sdr_delta_versus_sex_plot(models, 'out_of_set') - + make_sdr_delta_versus_input_snr_plot(models, 'in_set', bins) + make_sdr_delta_versus_noise_source_plot(models, 'in_set') + if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/magnolia/python/analysis/make_sdr_table.py b/magnolia/python/analysis/make_sdr_table.py index e3ad40f..00d778c 100644 --- a/magnolia/python/analysis/make_sdr_table.py +++ b/magnolia/python/analysis/make_sdr_table.py @@ -53,6 +53,19 @@ def main(): ['/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/bss/large_lab41/out_of_sample_test.csv', '/local_data/magnolia/pipeline_data/date_2017_09_27_time_13_25/aux/out_of_sample_test_mixes.csv', '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/bss/large_lab41/out_of_sample_test_sdr_summary.csv'], + + ['/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/bss/mask_sce/mi_in_sample_test.csv', + '/local_data/magnolia/pipeline_data/date_2017_09_27_time_13_25/aux/in_sample_test_mixes.csv', + '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/bss/mask_sce/mi_in_sample_test_sdr_summary.csv'], + ['/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/bss/mask_sce/mi_out_of_sample_test.csv', + '/local_data/magnolia/pipeline_data/date_2017_09_27_time_13_25/aux/out_of_sample_test_mixes.csv', + '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/bss/mask_sce/mi_out_of_sample_test_sdr_summary.csv'], + ['/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/bss/mask_sce/dc_in_sample_test.csv', + '/local_data/magnolia/pipeline_data/date_2017_09_27_time_13_25/aux/in_sample_test_mixes.csv', + 
'/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/bss/mask_sce/dc_in_sample_test_sdr_summary.csv'], + ['/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/bss/mask_sce/dc_out_of_sample_test.csv', + '/local_data/magnolia/pipeline_data/date_2017_09_27_time_13_25/aux/out_of_sample_test_mixes.csv', + '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/bss/mask_sce/dc_out_of_sample_test_sdr_summary.csv'], ['/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/bss/chimera/mi_in_sample_test.csv', '/local_data/magnolia/pipeline_data/date_2017_09_27_time_13_25/aux/in_sample_test_mixes.csv', '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/bss/chimera/mi_in_sample_test_sdr_summary.csv'], @@ -77,7 +90,7 @@ def main(): # args = args[2:4] #args = args[4:6] #args = args[6:8] - args = args[8:] + #args = args[8:] for arg in args: make_nice_table(*arg) diff --git a/magnolia/python/analysis/out_of_set_sdr_delta_versus_input_snr.pdf b/magnolia/python/analysis/out_of_set_sdr_delta_versus_input_snr.pdf index f84773b..55b43ab 100644 Binary files a/magnolia/python/analysis/out_of_set_sdr_delta_versus_input_snr.pdf and b/magnolia/python/analysis/out_of_set_sdr_delta_versus_input_snr.pdf differ diff --git a/magnolia/python/analysis/out_of_set_sdr_delta_versus_noise_type.pdf b/magnolia/python/analysis/out_of_set_sdr_delta_versus_noise_type.pdf index 4d1747f..ef3f556 100644 Binary files a/magnolia/python/analysis/out_of_set_sdr_delta_versus_noise_type.pdf and b/magnolia/python/analysis/out_of_set_sdr_delta_versus_noise_type.pdf differ diff --git a/magnolia/python/inference/denoising/Chimera/separate_mix.py b/magnolia/python/inference/denoising/Chimera/separate_mix.py index dcdbded..7699ee6 100644 --- a/magnolia/python/inference/denoising/Chimera/separate_mix.py +++ b/magnolia/python/inference/denoising/Chimera/separate_mix.py @@ -1,28 +1,54 @@ # 
Generic imports +# Generic imports import os +import argparse +import logging.config import json + import numpy as np import pandas as pd import librosa as lr import tqdm # Import the Chimera separation model -from magnolia.dnnseparate.chimera import Chimera +from magnolia.models import make_model # Import utilities for using the model from magnolia.utils.postprocessing import convert_preprocessing_parameters -from magnolia.features.preprocessing import undo_preprocessing -from magnolia.iterate.mix_iterator import MixIterator +from magnolia.preprocessing.preprocessing import undo_preprocessing +from magnolia.training.data_iteration.mix_iterator import MixIterator from magnolia.utils.clustering_utils import chimera_clustering_separate, chimera_mask + def standardize_waveform(y): return (y - y.mean())/y.std() def main(): + # parse command line arguments + parser = argparse.ArgumentParser(description='Denoise mixed samples using the Chimera network.') + # parser.add_argument('--model_settings', '-s', + # default='../../../../data/models_settings/chimera_template.json', + # help='model settings JSON file') + # parser.add_argument('--_settings', '-s', + # default='../../../../data/models_settings/chimera_template.json', + # help='model settings JSON file') + parser.add_argument('--logger_settings', '-l', + default='../../../../data/logging_settings/logging.conf', + help='logging configuration file') + args = parser.parse_args() + + # Load logging configuration + logging.config.fileConfig(args.logger_settings) + logger = logging.getLogger('model') + # from model settings model_params = { + 'layer_size': 500, + 'embedding_size': 10, + 'alpha': 0.1, + 'nonlinearity': 'tf.tanh', } uid_settings = '/local_data/magnolia/pipeline_data/date_2017_09_27_time_13_25/settings/assign_uids_LibriSpeech_UrbanSound8K.json' model_save_base = '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/model_saves/chimera' @@ -38,6 +64,7 @@ def main(): mixer = 
MixIterator(mixes_settings_filenames=mixes, batch_size=1, + read_waveform=False, from_disk=from_disk) # get frequency dimension @@ -49,9 +76,10 @@ def main(): uid_csv = pd.read_csv(uid_file) number_of_sources = uid_csv['uid'].max() + 1 - model = Chimera(**model_params, - F=frequency_dim, - device=model_location) + model_params['F'] = frequency_dim + config = {'model_params': model_params, + 'device': model_location} + model = make_model('Chimera', config) model.load(model_save_base) diff --git a/magnolia/python/inference/denoising/Chimera/separate_sample_from_mix.py b/magnolia/python/inference/denoising/Chimera/separate_sample_from_mix.py index 3f9d19c..0d36145 100644 --- a/magnolia/python/inference/denoising/Chimera/separate_sample_from_mix.py +++ b/magnolia/python/inference/denoising/Chimera/separate_sample_from_mix.py @@ -1,17 +1,20 @@ # Generic imports import os +import argparse +import logging.config import json + import numpy as np import pandas as pd import librosa as lr # Import the Chimera separation model -from magnolia.dnnseparate.chimera import Chimera +from magnolia.models import make_model # Import utilities for using the model from magnolia.utils.postprocessing import convert_preprocessing_parameters -from magnolia.features.preprocessing import undo_preprocessing -from magnolia.iterate.mix_iterator import MixIterator +from magnolia.preprocessing.preprocessing import undo_preprocessing +from magnolia.training.data_iteration.mix_iterator import MixIterator from magnolia.utils.clustering_utils import chimera_clustering_separate, chimera_mask @@ -20,8 +23,29 @@ def standardize_waveform(y): def main(): + # parse command line arguments + parser = argparse.ArgumentParser(description='Denoise mixed sample using the Chimera network.') + # parser.add_argument('--model_settings', '-s', + # default='../../../../data/models_settings/chimera_template.json', + # help='model settings JSON file') + # parser.add_argument('--_settings', '-s', + # 
default='../../../../data/models_settings/chimera_template.json', + # help='model settings JSON file') + parser.add_argument('--logger_settings', '-l', + default='../../../../data/logging_settings/logging.conf', + help='logging configuration file') + args = parser.parse_args() + + # Load logging configuration + logging.config.fileConfig(args.logger_settings) + logger = logging.getLogger('model') + # from model settings model_params = { + 'layer_size': 500, + 'embedding_size': 10, + 'alpha': 0.1, + 'nonlinearity': 'tf.tanh', } uid_settings = '/local_data/magnolia/pipeline_data/date_2017_09_27_time_13_25/settings/assign_uids_LibriSpeech_UrbanSound8K.json' model_save_base = '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/model_saves/chimera' @@ -30,7 +54,7 @@ def main(): model_settings = '' mixes = ['/local_data/magnolia/pipeline_data/date_2017_09_27_time_13_25/settings/mixing_LibriSpeech_UrbanSound8K_test_in_sample.json'] from_disk = True - mix_number = 1 + mix_number = 1010 output_path = '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/sample_wav_files/chimera' @@ -38,6 +62,7 @@ def main(): mixer = MixIterator(mixes_settings_filenames=mixes, batch_size=1, + read_waveform=False, from_disk=from_disk) # get frequency dimension @@ -49,9 +74,10 @@ def main(): uid_csv = pd.read_csv(uid_file) number_of_sources = uid_csv['uid'].max() + 1 - model = Chimera(**model_params, - F=frequency_dim, - device=model_location) + model_params['F'] = frequency_dim + config = {'model_params': model_params, + 'device': model_location} + model = make_model('Chimera', config) model.load(model_save_base) diff --git a/magnolia/python/inference/denoising/RatioMaskCluster/__init__.py b/magnolia/python/inference/denoising/RatioMaskCluster/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/magnolia/python/inference/denoising/RatioMaskCluster/separate_mix.py b/magnolia/python/inference/denoising/RatioMaskCluster/separate_mix.py new file mode 
100644 index 0000000..7c34d07 --- /dev/null +++ b/magnolia/python/inference/denoising/RatioMaskCluster/separate_mix.py @@ -0,0 +1,196 @@ +# Generic imports +# Generic imports +import os +import argparse +import logging.config +import json + +import numpy as np +import pandas as pd +import librosa as lr +import tqdm + +# Import the RatioMaskCluster separation model +from magnolia.models import make_model + +# Import utilities for using the model +from magnolia.utils.postprocessing import convert_preprocessing_parameters +from magnolia.preprocessing.preprocessing import undo_preprocessing +from magnolia.training.data_iteration.mix_iterator import MixIterator +from magnolia.utils.clustering_utils import chimera_clustering_separate, chimera_mask + + + +def standardize_waveform(y): + return (y - y.mean())/y.std() + + +def main(): + # parse command line arguments + parser = argparse.ArgumentParser(description='Denoise mixed samples using the RatioMaskCluster network.') + # parser.add_argument('--model_settings', '-s', + # default='../../../../data/models_settings/chimera_template.json', + # help='model settings JSON file') + # parser.add_argument('--_settings', '-s', + # default='../../../../data/models_settings/chimera_template.json', + # help='model settings JSON file') + parser.add_argument('--logger_settings', '-l', + default='../../../../data/logging_settings/logging.conf', + help='logging configuration file') + args = parser.parse_args() + + # Load logging configuration + logging.config.fileConfig(args.logger_settings) + logger = logging.getLogger('model') + + # from model settings + model_params = { + 'layer_size': 500, + 'embedding_size': 10, + 'auxiliary_size': 0, + 'alpha': 0.1, # try 0.9 + 'nonlinearity': 'tf.tanh', + 'fuzzifier': 2, + 'num_reco_sources': 2, + 'normalize': False, + 'collapse_sources': False, + } + uid_settings = '/local_data/magnolia/pipeline_data/date_2017_09_27_time_13_25/settings/assign_uids_LibriSpeech_UrbanSound8K.json' + model_save_base 
= '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/model_saves/mask_cluster' + + model_location = '/cpu:0' + model_settings = '' + mixes = ['/local_data/magnolia/pipeline_data/date_2017_09_27_time_13_25/settings/mixing_LibriSpeech_UrbanSound8K_test_in_sample.json'] + # mixes = ['/local_data/magnolia/pipeline_data/date_2017_09_27_time_13_25/settings/mixing_LibriSpeech_UrbanSound8K_test_out_of_sample.json'] + from_disk = True + output_path = '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/mask_cluster/in_sample_test' + # output_path = '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/mask_cluster/out_of_sample_test' + eval_sr = 8000 + + mixer = MixIterator(mixes_settings_filenames=mixes, + batch_size=1, + read_waveform=False, + from_disk=from_disk) + + # get frequency dimension + frequency_dim = mixer.sample_dimensions()[0] + + # get number of sources + settings = json.load(open(uid_settings)) + uid_file = settings['output_file'] + uid_csv = pd.read_csv(uid_file) + number_of_sources = uid_csv['uid'].max() + 1 + + model_params['F'] = frequency_dim + model_params['num_training_sources'] = number_of_sources + config = {'model_params': model_params, + 'device': model_location} + model = make_model('RatioMaskCluster', config) + + model.load(model_save_base) + + mix_settings = json.load(open(mixes[0])) + + signal = mix_settings['signals'][0] + preprocessing_settings = json.load(open(signal['preprocessing_settings'])) + stft_args = preprocessing_settings['processing_parameters']['stft_args'] + istft_args = convert_preprocessing_parameters(stft_args) + preemphasis_coeff = preprocessing_settings['processing_parameters']['preemphasis_coeff'] + n_fft = 2048 + if 'n_fft' in stft_args: + n_fft = stft_args['n_fft'] + + + os.makedirs(output_path, exist_ok=True) + mix_count = 0 + for _ in tqdm.trange(mixer.epoch_size()): + spec, bin_masks, source_specs, uids, snrs = next(mixer) + model_spec = spec + 
spec = spec[0] + bin_masks = bin_masks[0] + source_specs = source_specs[0] + uids = uids[0] + snrs = snrs[0] + + # print('SNR of mix {}: {}'.format(mix_count + 1, snrs)) + + y_mix = undo_preprocessing(spec, mixer.sample_length_in_bits(), + preemphasis_coeff=preemphasis_coeff, + istft_args=istft_args) + + + # NOTE: this is only to make comparisons to the reconstructed waveforms later + y_mix[-n_fft:] = 0.0 + y_mix = lr.core.resample(y_mix, mixer.sample_rate(), eval_sr, scale=True) + y_mix = standardize_waveform(y_mix) + + filename = os.path.join(output_path, 'mix_{}_snr_{:.2f}.wav'.format(mix_count + 1, snrs)) + lr.output.write_wav(filename, y_mix, eval_sr, norm=True) + + originals = {} + for i, source_spec in enumerate(source_specs): + y = undo_preprocessing(source_spec, mixer.sample_length_in_bits(), + preemphasis_coeff=preemphasis_coeff, + istft_args=istft_args) + # NOTE: this is only to make comparisons to the reconstructed waveforms later + y[-n_fft:] = 0.0 + y = lr.core.resample(y, mixer.sample_rate(), eval_sr, scale=True) + y = standardize_waveform(y) + + originals[i] = y + + # use dc-head of model + clustering to source-separate the spectrogram + source_specs = chimera_clustering_separate(model_spec, model, mixer.number_of_samples_in_mixes()) + + for i, source_spec in enumerate(source_specs): + y = undo_preprocessing(source_spec, mixer.sample_length_in_bits(), + preemphasis_coeff=preemphasis_coeff, + istft_args=istft_args) + # NOTE: this is only because the masking creates a chirp in the last + # fft frame (likely due to the binary mask) + y[-n_fft:] = 0.0 + y = lr.core.resample(y, mixer.sample_rate(), eval_sr, scale=True) + y = standardize_waveform(y) + + # match this waveform with an original source waveform + min_key = 0 + min_mse = np.inf + for key in originals: + mse = np.mean((y - originals[key])**2) + if mse < min_mse: + min_key = key + min_mse = mse + + # print('Separated sample for source {}'.format(i + 1)) + filename = os.path.join(output_path, 
'mix_{}_original_source_{}.wav'.format(mix_count + 1, min_key + 1)) + lr.output.write_wav(filename, originals[min_key], eval_sr, norm=True) + filename = os.path.join(output_path, 'mix_{}_dc_separated_source_{}.wav'.format(mix_count + 1, min_key + 1)) + lr.output.write_wav(filename, y, eval_sr, norm=True) + + y_original = originals.pop(min_key, None) + if y_original is None: + print("something went horribly wrong") + + # use mi-head of model to source-separate the spectrogram + source_specs = chimera_mask(model_spec, model)[0] + + for i in range(source_specs.shape[2]): + source_spec = source_specs[:, :, i] + + y = undo_preprocessing(source_spec, mixer.sample_length_in_bits(), + preemphasis_coeff=preemphasis_coeff, + istft_args=istft_args) + # NOTE: this is only because the masking creates a chirp in the last + # fft frame (likely due to the binary mask) + y[-n_fft:] = 0.0 + y = lr.core.resample(y, mixer.sample_rate(), eval_sr, scale=True) + y = standardize_waveform(y) + + filename = os.path.join(output_path, 'mix_{}_mi_separated_source_{}.wav'.format(mix_count + 1, i + 1)) + lr.output.write_wav(filename, y, eval_sr, norm=True) + + mix_count += 1 + + +if __name__ == '__main__': + main() diff --git a/magnolia/python/inference/denoising/RatioMaskCluster/separate_sample_from_mix.py b/magnolia/python/inference/denoising/RatioMaskCluster/separate_sample_from_mix.py new file mode 100644 index 0000000..6b09563 --- /dev/null +++ b/magnolia/python/inference/denoising/RatioMaskCluster/separate_sample_from_mix.py @@ -0,0 +1,172 @@ +# Generic imports +import os +import argparse +import logging.config +import json + +import numpy as np +import pandas as pd +import librosa as lr + +# Import the RatioMaskCluster separation model +from magnolia.models import make_model + +# Import utilities for using the model +from magnolia.utils.postprocessing import convert_preprocessing_parameters +from magnolia.preprocessing.preprocessing import undo_preprocessing +from 
magnolia.training.data_iteration.mix_iterator import MixIterator +from magnolia.utils.clustering_utils import mask_cluster_clustering_separate, mask_cluster_mask + + +def standardize_waveform(y): + return (y - y.mean())/y.std() + + +def main(): + # parse command line arguments + parser = argparse.ArgumentParser(description='Denoise mixed sample using the RatioMaskCluster network.') + # parser.add_argument('--model_settings', '-s', + # default='../../../../data/models_settings/chimera_template.json', + # help='model settings JSON file') + # parser.add_argument('--_settings', '-s', + # default='../../../../data/models_settings/chimera_template.json', + # help='model settings JSON file') + parser.add_argument('--logger_settings', '-l', + default='../../../../data/logging_settings/logging.conf', + help='logging configuration file') + args = parser.parse_args() + + # Load logging configuration + logging.config.fileConfig(args.logger_settings) + logger = logging.getLogger('model') + + # from model settings + model_params = { + 'layer_size': 500, + 'embedding_size': 10, + 'auxiliary_size': 0, + 'alpha': 0.1, # try 0.9 + 'nonlinearity': 'tf.tanh', + 'fuzzifier': 2, + 'num_reco_sources': 2, + 'normalize': False, + 'collapse_sources': False, + } + uid_settings = '/local_data/magnolia/pipeline_data/date_2017_09_27_time_13_25/settings/assign_uids_LibriSpeech_UrbanSound8K.json' + model_save_base = '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/model_saves/mask_cluster' + + model_location = '/cpu:0' + model_settings = '' + mixes = ['/local_data/magnolia/pipeline_data/date_2017_09_27_time_13_25/settings/mixing_LibriSpeech_UrbanSound8K_test_in_sample.json'] + from_disk = True + mix_number = 1 + output_path = '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/sample_wav_files/mask_cluster' + + + os.makedirs(output_path, exist_ok=True) + + mixer = MixIterator(mixes_settings_filenames=mixes, + batch_size=1, + read_waveform=False, + 
from_disk=from_disk) + + # get frequency dimension + frequency_dim = mixer.sample_dimensions()[0] + + # get number of sources + settings = json.load(open(uid_settings)) + uid_file = settings['output_file'] + uid_csv = pd.read_csv(uid_file) + number_of_sources = uid_csv['uid'].max() + 1 + + model_params['F'] = frequency_dim + model_params['num_training_sources'] = number_of_sources + config = {'model_params': model_params, + 'device': model_location} + model = make_model('RatioMaskCluster', config) + + model.load(model_save_base) + + assert(mix_number <= mixer.epoch_size()) + + mix_settings = json.load(open(mixes[0])) + + signal = mix_settings['signals'][0] + preprocessing_settings = json.load(open(signal['preprocessing_settings'])) + stft_args = preprocessing_settings['processing_parameters']['stft_args'] + istft_args = convert_preprocessing_parameters(stft_args) + preemphasis_coeff = preprocessing_settings['processing_parameters']['preemphasis_coeff'] + n_fft = 2048 + if 'n_fft' in stft_args: + n_fft = stft_args['n_fft'] + + + for i in range(mix_number): + spec, bin_masks, source_specs, uids, snrs = next(mixer) + + model_spec = spec + spec = spec[0] + bin_masks = bin_masks[0] + source_specs = source_specs[0] + uids = uids[0] + snrs = snrs[0] + + print('SNR of this mix: {}'.format(snrs)) + + y_mix = undo_preprocessing(spec, mixer.sample_length_in_bits(), + preemphasis_coeff=preemphasis_coeff, + istft_args=istft_args) + + # NOTE: this is only to make comparisons to the reconstructed waveforms later + y_mix[-n_fft:] = 0.0 + y_mix = standardize_waveform(y_mix) + + # print('Mixed sample') + lr.output.write_wav(os.path.join(output_path, 'mix_{}.wav'.format(mix_number)), y_mix, mixer.sample_rate(), norm=True) + + for i, source_spec in enumerate(source_specs): + y = undo_preprocessing(source_spec, mixer.sample_length_in_bits(), + preemphasis_coeff=preemphasis_coeff, + istft_args=istft_args) + + # NOTE: this is only to make comparisons to the reconstructed waveforms later 
+ y[-n_fft:] = 0.0 + y = standardize_waveform(y) + + # print('Sample for source {}'.format(i + 1)) + lr.output.write_wav(os.path.join(output_path, 'mix_{}_original_source_{}.wav'.format(mix_number, i + 1)), y, mixer.sample_rate(), norm=True) + + source_specs = mask_cluster_clustering_separate(model_spec, model, mixer.number_of_samples_in_mixes()) + + for i, source_spec in enumerate(source_specs): + y = undo_preprocessing(source_spec, mixer.sample_length_in_bits(), + preemphasis_coeff=preemphasis_coeff, + istft_args=istft_args) + + # NOTE: this is only because the masking creates a chirp in the last + # fft frame (likely due to the mask) + y[-n_fft:] = 0.0 + y = standardize_waveform(y) + + # print('Separated sample for source {}'.format(i + 1)) + lr.output.write_wav(os.path.join(output_path, 'mix_{}_dc_separated_{}.wav'.format(mix_number, i + 1)), y, mixer.sample_rate(), norm=True) + + source_specs = mask_cluster_mask(model_spec, model, mixer.number_of_samples_in_mixes())[0] + + for i in range(source_specs.shape[2]): + source_spec = source_specs[:, :, i] + y = undo_preprocessing(source_spec, mixer.sample_length_in_bits(), + preemphasis_coeff=preemphasis_coeff, + istft_args=istft_args) + + # NOTE: this is only because the masking creates a chirp in the last + # fft frame (likely due to the mask) + y[-n_fft:] = 0.0 + y = standardize_waveform(y) + + # print('Separated sample for source {}'.format(i + 1)) + lr.output.write_wav(os.path.join(output_path, 'mix_{}_mi_separated_{}.wav'.format(mix_number, i + 1)), y, mixer.sample_rate(), norm=True) + + +if __name__ == '__main__': + main() diff --git a/magnolia/python/inference/denoising/RatioMaskSCE/__init__.py b/magnolia/python/inference/denoising/RatioMaskSCE/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/magnolia/python/inference/denoising/RatioMaskSCE/separate_mix.py b/magnolia/python/inference/denoising/RatioMaskSCE/separate_mix.py new file mode 100644 index 0000000..266836d --- /dev/null +++ 
b/magnolia/python/inference/denoising/RatioMaskSCE/separate_mix.py @@ -0,0 +1,194 @@ +# Generic imports +# Generic imports +import os +import argparse +import logging.config +import json + +import numpy as np +import pandas as pd +import librosa as lr +import tqdm + +# Import the RatioMaskSCE separation model +from magnolia.models import make_model + +# Import utilities for using the model +from magnolia.utils.postprocessing import convert_preprocessing_parameters +from magnolia.preprocessing.preprocessing import undo_preprocessing +from magnolia.training.data_iteration.mix_iterator import MixIterator +from magnolia.utils.clustering_utils import chimera_clustering_separate, chimera_mask + + + +def standardize_waveform(y): + return (y - y.mean())/y.std() + + +def main(): + # parse command line arguments + parser = argparse.ArgumentParser(description='Denoise mixed samples using the RatioMaskSCE network.') + # parser.add_argument('--model_settings', '-s', + # default='../../../../data/models_settings/chimera_template.json', + # help='model settings JSON file') + # parser.add_argument('--_settings', '-s', + # default='../../../../data/models_settings/chimera_template.json', + # help='model settings JSON file') + parser.add_argument('--logger_settings', '-l', + default='../../../../data/logging_settings/logging.conf', + help='logging configuration file') + args = parser.parse_args() + + # Load logging configuration + logging.config.fileConfig(args.logger_settings) + logger = logging.getLogger('model') + + # from model settings + model_params = { + 'layer_size': 500, + 'embedding_size': 10, + 'alpha': 0.9, # try 0.9 + 'nonlinearity': 'tf.tanh', + 'num_reco_sources': 2, + 'normalize': False, + 'collapse_sources': False, + } + uid_settings = '/local_data/magnolia/pipeline_data/date_2017_09_27_time_13_25/settings/assign_uids_LibriSpeech_UrbanSound8K.json' + model_save_base = '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/model_saves/mask_sce' + + 
model_location = '/cpu:0' + model_settings = '' + mixes = ['/local_data/magnolia/pipeline_data/date_2017_09_27_time_13_25/settings/mixing_LibriSpeech_UrbanSound8K_test_in_sample.json'] + # mixes = ['/local_data/magnolia/pipeline_data/date_2017_09_27_time_13_25/settings/mixing_LibriSpeech_UrbanSound8K_test_out_of_sample.json'] + from_disk = True + output_path = '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/mask_sce/in_sample_test' + # output_path = '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/mask_sce/out_of_sample_test' + eval_sr = 8000 + + mixer = MixIterator(mixes_settings_filenames=mixes, + batch_size=1, + read_waveform=False, + from_disk=from_disk) + + # get frequency dimension + frequency_dim = mixer.sample_dimensions()[0] + + # get number of sources + settings = json.load(open(uid_settings)) + uid_file = settings['output_file'] + uid_csv = pd.read_csv(uid_file) + number_of_sources = uid_csv['uid'].max() + 1 + + model_params['F'] = frequency_dim + model_params['num_training_sources'] = number_of_sources + config = {'model_params': model_params, + 'device': model_location} + model = make_model('RatioMaskSCE', config) + + model.load(model_save_base) + + mix_settings = json.load(open(mixes[0])) + + signal = mix_settings['signals'][0] + preprocessing_settings = json.load(open(signal['preprocessing_settings'])) + stft_args = preprocessing_settings['processing_parameters']['stft_args'] + istft_args = convert_preprocessing_parameters(stft_args) + preemphasis_coeff = preprocessing_settings['processing_parameters']['preemphasis_coeff'] + n_fft = 2048 + if 'n_fft' in stft_args: + n_fft = stft_args['n_fft'] + + + os.makedirs(output_path, exist_ok=True) + mix_count = 0 + for _ in tqdm.trange(mixer.epoch_size()): + spec, bin_masks, source_specs, uids, snrs = next(mixer) + model_spec = spec + spec = spec[0] + bin_masks = bin_masks[0] + source_specs = source_specs[0] + uids = uids[0] + snrs = snrs[0] + + 
# print('SNR of mix {}: {}'.format(mix_count + 1, snrs)) + + y_mix = undo_preprocessing(spec, mixer.sample_length_in_bits(), + preemphasis_coeff=preemphasis_coeff, + istft_args=istft_args) + + + # NOTE: this is only to make comparisons to the reconstructed waveforms later + y_mix[-n_fft:] = 0.0 + y_mix = lr.core.resample(y_mix, mixer.sample_rate(), eval_sr, scale=True) + y_mix = standardize_waveform(y_mix) + + filename = os.path.join(output_path, 'mix_{}_snr_{:.2f}.wav'.format(mix_count + 1, snrs)) + lr.output.write_wav(filename, y_mix, eval_sr, norm=True) + + originals = {} + for i, source_spec in enumerate(source_specs): + y = undo_preprocessing(source_spec, mixer.sample_length_in_bits(), + preemphasis_coeff=preemphasis_coeff, + istft_args=istft_args) + # NOTE: this is only to make comparisons to the reconstructed waveforms later + y[-n_fft:] = 0.0 + y = lr.core.resample(y, mixer.sample_rate(), eval_sr, scale=True) + y = standardize_waveform(y) + + originals[i] = y + + # use dc-head of model + clustering to source-separate the spectrogram + source_specs = chimera_clustering_separate(model_spec, model, mixer.number_of_samples_in_mixes()) + + for i, source_spec in enumerate(source_specs): + y = undo_preprocessing(source_spec, mixer.sample_length_in_bits(), + preemphasis_coeff=preemphasis_coeff, + istft_args=istft_args) + # NOTE: this is only because the masking creates a chirp in the last + # fft frame (likely due to the binary mask) + y[-n_fft:] = 0.0 + y = lr.core.resample(y, mixer.sample_rate(), eval_sr, scale=True) + y = standardize_waveform(y) + + # match this waveform with an original source waveform + min_key = 0 + min_mse = np.inf + for key in originals: + mse = np.mean((y - originals[key])**2) + if mse < min_mse: + min_key = key + min_mse = mse + + # print('Separated sample for source {}'.format(i + 1)) + filename = os.path.join(output_path, 'mix_{}_original_source_{}.wav'.format(mix_count + 1, min_key + 1)) + lr.output.write_wav(filename, 
originals[min_key], eval_sr, norm=True) + filename = os.path.join(output_path, 'mix_{}_dc_separated_source_{}.wav'.format(mix_count + 1, min_key + 1)) + lr.output.write_wav(filename, y, eval_sr, norm=True) + + y_original = originals.pop(min_key, None) + if y_original is None: + print("something went horribly wrong") + + # use mi-head of model to source-separate the spectrogram + source_specs = chimera_mask(model_spec, model)[0] + + for i in range(source_specs.shape[2]): + source_spec = source_specs[:, :, i] + + y = undo_preprocessing(source_spec, mixer.sample_length_in_bits(), + preemphasis_coeff=preemphasis_coeff, + istft_args=istft_args) + # NOTE: this is only because the masking creates a chirp in the last + # fft frame (likely due to the binary mask) + y[-n_fft:] = 0.0 + y = lr.core.resample(y, mixer.sample_rate(), eval_sr, scale=True) + y = standardize_waveform(y) + + filename = os.path.join(output_path, 'mix_{}_mi_separated_source_{}.wav'.format(mix_count + 1, i + 1)) + lr.output.write_wav(filename, y, eval_sr, norm=True) + + mix_count += 1 + + +if __name__ == '__main__': + main() diff --git a/magnolia/python/inference/denoising/RatioMaskSCE/separate_sample_from_mix.py b/magnolia/python/inference/denoising/RatioMaskSCE/separate_sample_from_mix.py new file mode 100644 index 0000000..bdec6f1 --- /dev/null +++ b/magnolia/python/inference/denoising/RatioMaskSCE/separate_sample_from_mix.py @@ -0,0 +1,170 @@ +# Generic imports +import os +import argparse +import logging.config +import json + +import numpy as np +import pandas as pd +import librosa as lr + +# Import the RatioMaskSCE separation model +from magnolia.models import make_model + +# Import utilities for using the model +from magnolia.utils.postprocessing import convert_preprocessing_parameters +from magnolia.preprocessing.preprocessing import undo_preprocessing +from magnolia.training.data_iteration.mix_iterator import MixIterator +from magnolia.utils.clustering_utils import chimera_clustering_separate, 
chimera_mask + + +def standardize_waveform(y): + return (y - y.mean())/y.std() + + +def main(): + # parse command line arguments + parser = argparse.ArgumentParser(description='Denoise mixed sample using the RatioMaskSCE network.') + # parser.add_argument('--model_settings', '-s', + # default='../../../../data/models_settings/chimera_template.json', + # help='model settings JSON file') + # parser.add_argument('--_settings', '-s', + # default='../../../../data/models_settings/chimera_template.json', + # help='model settings JSON file') + parser.add_argument('--logger_settings', '-l', + default='../../../../data/logging_settings/logging.conf', + help='logging configuration file') + args = parser.parse_args() + + # Load logging configuration + logging.config.fileConfig(args.logger_settings) + logger = logging.getLogger('model') + + # from model settings + model_params = { + 'layer_size': 500, + 'embedding_size': 10, + 'alpha': 0.9, # try 0.9 + 'nonlinearity': 'tf.tanh', + 'num_reco_sources': 2, + 'normalize': False, + 'collapse_sources': False, + } + uid_settings = '/local_data/magnolia/pipeline_data/date_2017_09_27_time_13_25/settings/assign_uids_LibriSpeech_UrbanSound8K.json' + model_save_base = '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/model_saves/mask_sce' + + model_location = '/cpu:0' + model_settings = '' + mixes = ['/local_data/magnolia/pipeline_data/date_2017_09_27_time_13_25/settings/mixing_LibriSpeech_UrbanSound8K_test_in_sample.json'] + from_disk = True + mix_number = 1010 + output_path = '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/sample_wav_files/mask_sce' + + + os.makedirs(output_path, exist_ok=True) + + mixer = MixIterator(mixes_settings_filenames=mixes, + batch_size=1, + read_waveform=False, + from_disk=from_disk) + + # get frequency dimension + frequency_dim = mixer.sample_dimensions()[0] + + # get number of sources + settings = json.load(open(uid_settings)) + uid_file = settings['output_file'] + 
uid_csv = pd.read_csv(uid_file) + number_of_sources = uid_csv['uid'].max() + 1 + + model_params['F'] = frequency_dim + model_params['num_training_sources'] = number_of_sources + config = {'model_params': model_params, + 'device': model_location} + model = make_model('RatioMaskSCE', config) + + model.load(model_save_base) + + assert(mix_number <= mixer.epoch_size()) + + mix_settings = json.load(open(mixes[0])) + + signal = mix_settings['signals'][0] + preprocessing_settings = json.load(open(signal['preprocessing_settings'])) + stft_args = preprocessing_settings['processing_parameters']['stft_args'] + istft_args = convert_preprocessing_parameters(stft_args) + preemphasis_coeff = preprocessing_settings['processing_parameters']['preemphasis_coeff'] + n_fft = 2048 + if 'n_fft' in stft_args: + n_fft = stft_args['n_fft'] + + + for i in range(mix_number): + spec, bin_masks, source_specs, uids, snrs = next(mixer) + + model_spec = spec + spec = spec[0] + bin_masks = bin_masks[0] + source_specs = source_specs[0] + uids = uids[0] + snrs = snrs[0] + + print('SNR of this mix: {}'.format(snrs)) + + y_mix = undo_preprocessing(spec, mixer.sample_length_in_bits(), + preemphasis_coeff=preemphasis_coeff, + istft_args=istft_args) + + # NOTE: this is only to make comparisons to the reconstructed waveforms later + y_mix[-n_fft:] = 0.0 + y_mix = standardize_waveform(y_mix) + + # print('Mixed sample') + lr.output.write_wav(os.path.join(output_path, 'mix_{}.wav'.format(mix_number)), y_mix, mixer.sample_rate(), norm=True) + + for i, source_spec in enumerate(source_specs): + y = undo_preprocessing(source_spec, mixer.sample_length_in_bits(), + preemphasis_coeff=preemphasis_coeff, + istft_args=istft_args) + + # NOTE: this is only to make comparisons to the reconstructed waveforms later + y[-n_fft:] = 0.0 + y = standardize_waveform(y) + + # print('Sample for source {}'.format(i + 1)) + lr.output.write_wav(os.path.join(output_path, 'mix_{}_original_source_{}.wav'.format(mix_number, i + 1)), y, 
mixer.sample_rate(), norm=True) + + source_specs = chimera_clustering_separate(model_spec, model, mixer.number_of_samples_in_mixes()) + + for i, source_spec in enumerate(source_specs): + y = undo_preprocessing(source_spec, mixer.sample_length_in_bits(), + preemphasis_coeff=preemphasis_coeff, + istft_args=istft_args) + + # NOTE: this is only because the masking creates a chirp in the last + # fft frame (likely due to the mask) + y[-n_fft:] = 0.0 + y = standardize_waveform(y) + + # print('Separated sample for source {}'.format(i + 1)) + lr.output.write_wav(os.path.join(output_path, 'mix_{}_dc_separated_{}.wav'.format(mix_number, i + 1)), y, mixer.sample_rate(), norm=True) + + source_specs = chimera_mask(model_spec, model)[0] + + for i in range(source_specs.shape[2]): + source_spec = source_specs[:, :, i] + y = undo_preprocessing(source_spec, mixer.sample_length_in_bits(), + preemphasis_coeff=preemphasis_coeff, + istft_args=istft_args) + + # NOTE: this is only because the masking creates a chirp in the last + # fft frame (likely due to the mask) + y[-n_fft:] = 0.0 + y = standardize_waveform(y) + + # print('Separated sample for source {}'.format(i + 1)) + lr.output.write_wav(os.path.join(output_path, 'mix_{}_mi_separated_{}.wav'.format(mix_number, i + 1)), y, mixer.sample_rate(), norm=True) + + +if __name__ == '__main__': + main() diff --git a/magnolia/python/models/__init__.py b/magnolia/python/models/__init__.py index 9f10f20..0a65933 100644 --- a/magnolia/python/models/__init__.py +++ b/magnolia/python/models/__init__.py @@ -1,9 +1,15 @@ +from .dnnseparate.jflec import JFLEC from .dnndenoise.chimera import Chimera from .dnndenoise.L41_regression_model import L41RegressionModel +from .dnndenoise.sce_mask import RatioMaskSCE +from .dnndenoise.cluster_mask import RatioMaskCluster __all__ = [ + "JFLEC", "Chimera", "L41RegressionModel", + "RatioMaskSCE", + "RatioMaskCluster", ] def make_model(model_name, config): diff --git 
a/magnolia/python/models/dnndenoise/L41_regression_model.py b/magnolia/python/models/dnndenoise/L41_regression_model.py index 7a7ee89..5a647f1 100644 --- a/magnolia/python/models/dnndenoise/L41_regression_model.py +++ b/magnolia/python/models/dnndenoise/L41_regression_model.py @@ -244,6 +244,7 @@ def get_cost(self, X_in, y_in, sig_in, I_in): @staticmethod def scale_signal(S, amin=1e-5, log_base=10, ref_value=1.0): + # NOTE: should possibly min/max scale log_base = np.log(log_base) log_spec = 20.0 * tf.log(tf.maximum(amin, S)) / log_base log_spec -= 20.0 * tf.log(tf.maximum(amin, ref_value)) / log_base diff --git a/magnolia/python/models/dnndenoise/chimera.py b/magnolia/python/models/dnndenoise/chimera.py index 3345097..2fa6b4c 100644 --- a/magnolia/python/models/dnndenoise/chimera.py +++ b/magnolia/python/models/dnndenoise/chimera.py @@ -128,22 +128,17 @@ def learn_from_epoch(self, epoch_id, # Store the validation cost self.v_costs.append(ave_c_v) - # Store the current batch number - self.nbatches.append(batch_count + 1 + start) - - # Compute scale quantities for plotting - length = len(self.nbatches) - cutoff = int(0.5*length) - lowline = [min(self.v_costs)]*length - - logger.info("Training cost on batch {} is {}.".format(self.nbatches[-1], self.t_costs[-1])) - logger.info("Validation cost on batch {} is {}.".format(self.nbatches[-1], self.v_costs[-1])) - logger.info("Last saved {} batches ago.".format(self.nbatches[-1] - self.last_saved)) - - # Stop training if the number of iterations since the last save point exceeds the threshold - if self.nbatches[-1] - self.last_saved > stop_threshold: - logger.info("Early stopping criteria met!") - break + # Store the current batch number + self.nbatches.append(batch_count + start) + + logger.info("Training cost on batch {} is {}.".format(self.nbatches[-1], self.t_costs[-1])) + logger.info("Validation cost on batch {} is {}.".format(self.nbatches[-1], self.v_costs[-1])) + logger.info("Last saved {} batches 
ago.".format(self.nbatches[-1] - self.last_saved)) + + # Stop training if the number of iterations since the last save point exceeds the threshold + if self.nbatches[-1] - self.last_saved > stop_threshold: + logger.info("Early stopping criteria met!") + break batch_count += 1 diff --git a/magnolia/python/models/dnndenoise/cluster_mask.py b/magnolia/python/models/dnndenoise/cluster_mask.py new file mode 100644 index 0000000..d1dab8c --- /dev/null +++ b/magnolia/python/models/dnndenoise/cluster_mask.py @@ -0,0 +1,524 @@ +import logging.config +import numpy as np +import tensorflow as tf + +from magnolia.models.model_base import ModelBase +from magnolia.utils import tf_utils + + +logger = logging.getLogger('model') + + +class RatioMaskCluster(ModelBase): + """ + Chimera network from [1] but uses the soft C-means loss. + Defaults correspond to the parameters used by the best + performing model in the paper. + + [1] Luo, Yi., et al. "Deep Clustering and Conventional Networks for Music + Separation: Stronger Together" Published in Acoustics, Speech, and + Signal Processing (ICASSP) 2017; doi:10.1109/ICASSP.2017.7952118 + + Hyperparameters: + F: Number of frequency bins in the input data + num_reco_sources: Number sources to reconstruct + num_training_sources: Number sources in the training set + layer_size: Size of BLSTM layers + embedding_size: Dimension of embedding vector + alpha: Relative mixture of cost terms + nonlinearity: Nonlinearity to use in BLSTM layers + device: Which device to run the model on + """ + + def initialize(self): + self.F = self.config['model_params']['F'] + # should always be 2 + self.num_reco_sources = self.config['model_params']['num_reco_sources'] + self.num_training_sources = self.config['model_params']['num_training_sources'] + self.layer_size = self.config['model_params']['layer_size'] + self.fuzzifier = self.config['model_params']['fuzzifier'] + self.embedding_size = self.config['model_params']['embedding_size'] + self.auxiliary_size = 
self.config['model_params']['auxiliary_size'] + self.normalize = self.config['model_params']['normalize'] + self.alpha = self.config['model_params']['alpha'] + self.nonlinearity = eval(self.config['model_params']['nonlinearity']) + self.collapse_sources = self.config['model_params']['collapse_sources'] + + self.batch_count = 0 + self.costs = [] + self.t_costs = [] + self.v_costs = [] + self.last_saved = 0 + + def build_graph(self, graph): + with graph.as_default(): + with tf.device(self.config['device']): + # Placeholder tensor for the input data + self.X = tf.placeholder("float", [None, None, self.F]) + # Placeholder tensor for the unscaled input data + self.X_clean = tf.placeholder("float", [None, None, self.F]) + + # Placeholder tensor for the labels/targets + self.y = tf.placeholder("float", [None, None, self.F, None]) + # Placeholder tensor for the unscaled labels/targets + self.y_clean = tf.placeholder( + "float", [None, None, self.F, None]) + + # Placeholder for the speaker indicies + self.I = tf.placeholder(tf.int32, [None, None]) + + # Define the speaker vectors to use during training + self.speaker_vectors = tf_utils.weight_variable( + [self.num_training_sources, self.embedding_size], + tf.sqrt(2 / self.embedding_size)) + if self.auxiliary_size > 0: + # Define the auxiliary vectors to use during training + self.auxiliary_vectors = tf_utils.weight_variable( + [self.auxiliary_size, self.auxiliary_size], + tf.sqrt(2 / self.auxiliary_size)) + else: + self.auxiliary_vectors = None + + # Model methods + self.network + # TODO: COMMENT BACK IN!!! 
+ self.clustering_cost + self.mi_cost + self.cost + self.optimizer + + return graph + + def learn_from_epoch(self, epoch_id, + validate_every, + stop_threshold, + training_mixer, + validation_mixer, + batch_formatter, + model_save_base): + + batch_count = self.batch_count + # Training epoch loop + for batch in iter(training_mixer): + unscaled_spectral_sum_batch, scaled_spectral_sum_batch, spectral_masks_batch, spectral_sources_batch = batch_formatter( + batch[0], batch[1], batch[2]) + # should be dimensions of (batch size, source) + uids_batch = batch[3] + + # override ids for simply signal/noise + if self.collapse_sources: + uids_batch[:, 0] = 0 + uids_batch[:, 1] = 1 + + # Train the model on one batch and get the cost + c = self.train_on_batch(scaled_spectral_sum_batch, unscaled_spectral_sum_batch, + spectral_masks_batch, spectral_sources_batch, + uids_batch) + + # Store the training cost + self.costs.append(c) + + # Evaluate the model on the validation data + if (batch_count + 1) % validate_every == 0: + # Store the training cost + self.t_costs.append(np.mean(self.costs)) + # Reset the cost over the last 10 batches + self.costs = [] + + # Compute average validation score + all_c_v = [] + for vbatch in iter(validation_mixer): + unscaled_spectral_sum_batch, scaled_spectral_sum_batch, spectral_masks_batch, spectral_sources_batch = batch_formatter( + vbatch[0], vbatch[1], vbatch[2]) + # dimensions of (batch size, source) + uids_batch = vbatch[3] + + # override ids for simply signal/noise + if self.collapse_sources: + uids_batch[:, 0] = 0 + uids_batch[:, 1] = 1 + + # Get the cost on the validation batch + c_v = self.get_cost(scaled_spectral_sum_batch, unscaled_spectral_sum_batch, + spectral_masks_batch, spectral_sources_batch, + uids_batch) + all_c_v.append(c_v) + + ave_c_v = np.mean(all_c_v) + + # Check if the validation cost is below the minimum validation cost, and if so, save it. 
+ # and len(self.nbatches) > 0: + if len(self.v_costs) > 0 and ave_c_v < min(self.v_costs): + logger.info("Saving the model because validation score is {} below the old minimum.".format( + min(self.v_costs) - ave_c_v)) + + # Save the model to the specified path + self.save(model_save_base) + + # Record the batch that the model was last saved on + self.last_saved = batch_count # self.nbatches[-1] + + # Store the validation cost + self.v_costs.append(ave_c_v) + + # Store the current batch number + # self.nbatches.append(batch_count) + + logger.info("Training cost on batch {} is {}.".format( + batch_count, self.t_costs[-1])) + logger.info("Validation cost on batch {} is {}.".format( + batch_count, self.v_costs[-1])) + logger.info("Last saved {} batches ago.".format( + batch_count - self.last_saved)) + + # Stop training if the number of iterations since the last save point exceeds the threshold + if batch_count - self.last_saved > stop_threshold: + logger.info("Early stopping criteria met!") + break + + batch_count += 1 + + self.batch_count = batch_count + + def infer(self, **kw_args): + pass + + @tf_utils.scope_decorator + def network(self): + """ + Construct the op for the network used in [1]. 
This consists of four + BLSTM layers followed by a dense layer giving a set of T-F vectors of + dimension embedding_size + """ + + reduced_contrast = False + m = self.fuzzifier + + # Get the shape of the input + shape = tf.shape(self.X) + shapeI = tf.shape(self.I) + + # BLSTM layer one + BLSTM_1 = tf_utils.BLSTM_(self.X, self.layer_size, 'one', + activation=self.nonlinearity) + + # BLSTM layer two + BLSTM_2 = tf_utils.BLSTM_(BLSTM_1, self.layer_size, 'two', + activation=self.nonlinearity) + + # BLSTM layer three + BLSTM_3 = tf_utils.BLSTM_(BLSTM_2, self.layer_size, 'three', + activation=self.nonlinearity) + + # BLSTM layer four + BLSTM_4 = tf_utils.BLSTM_(BLSTM_3, self.layer_size, 'four', + activation=self.nonlinearity) + + # Feedforward layer + feedforward = tf_utils.conv1d_layer(BLSTM_4, + [1, self.layer_size, (self.embedding_size + self.auxiliary_size) * self.F]) + + # Reshape the feedforward output to have shape (T,F,D) + z = tf.reshape(feedforward, + [shape[0], shape[1], self.F, self.embedding_size + self.auxiliary_size]) + + # indices helpers for fuzzy c-means + #known_sources_init = np.zeros(self.num_training_sources) + # known_sources = tf.get_variable('known_sources', + # dtype=tf.bool, trainable=False, + # initializer=tf.constant(known_sources_init, dtype=tf.bool)) + #current_sources_indices, _ = tf.unique(tf.reshape(self.I, shape=[shapeI[0]*shapeI[1]])) + # known_sources = tf.scatter_update(known_sources, current_sources_indices, + # tf.fill(tf.shape(current_sources_indices), True)) + + # current_sources = tf.cast(tf.scatter_nd(tf.expand_dims(current_sources_indices, -1), + # tf.ones_like(current_sources_indices, dtype=tf.int32), + # [self.num_training_sources]), + # dtype=tf.bool) + + # batch_sources = tf.reshape(tf.gather(self.speaker_vectors, tf.reshape(self.I, shape=[shapeI[0]*shapeI[1]])), + # shape=[shapeI[0], shapeI[1], self.embedding_size]) + + flattened_I = tf.reshape(self.I, shape=[shapeI[0] * shapeI[1]]) + batch_range = tf.range(shape[0] * 
shapeI[1]) + + current_sources_indices, current_sources_indices_batch = tf.unique( + flattened_I) + known_sources_indices = tf.get_variable('known_sources_indices', + initializer=tf.constant( + [], dtype=tf.int32), + dtype=tf.int32, + validate_shape=False, trainable=False) + known_sources_indices = tf.sets.set_union(tf.expand_dims(current_sources_indices, 0), + tf.expand_dims(known_sources_indices, 0)).values + + # clustering head + embedding = self.nonlinearity(z) + # Normalize the T-F vectors to get the network output + embedding = tf.nn.l2_normalize(embedding, 3) + + # batch, features, embedding + embeddings = tf.reshape(embedding, + [shape[0], shape[1] * self.F, self.embedding_size + self.auxiliary_size]) + + # compute fuzzy assignments + # batch, nfeatures, nsources + if self.auxiliary_vectors is None: + ## batch, nfeatures + # batch, nsource in mix, nfeatures + #squared_diffs_batch = tf.reduce_sum(tf.square(embeddings - tf.expand_dims(tf.gather(self.speaker_vectors, flattened_I), 1)), -1) + squared_diffs_batch = tf.reduce_sum(tf.square(tf.expand_dims(embeddings, 1) - + tf.reshape(tf.expand_dims(tf.gather(self.speaker_vectors, flattened_I), 1), + [shape[0], shapeI[1], 1, self.embedding_size])), + -1) + # squared_diffs_batch = tf.reshape( + # squared_diffs_batch, [shape[0] * shapeI[1], shape[1] * self.F]) + diffs_pow_matrix_batch = tf.pow(squared_diffs_batch, 1. / (m - 1.)) + + # W_denom = tf.reduce_sum(tf.reciprocal( + # tf.pow( + # tf.reduce_sum(tf.square(tf.expand_dims(embeddings, 2) - tf.expand_dims( + # tf.expand_dims(tf.gather(self.speaker_vectors, known_sources_indices), 0), 0)), -1), + # 1. / (m - 1.) + # ) + # ), -1) + # W_denom = tf.expand_dims(W_denom, 1) + W_denom = tf.reduce_sum(tf.reciprocal( + diffs_pow_matrix_batch + # tf.pow( + # tf.reduce_sum(tf.square(tf.expand_dims(embeddings, 2) - tf.expand_dims( + # tf.expand_dims(tf.gather(self.speaker_vectors, known_sources_indices), 0), 0)), -1), + # 1. / (m - 1.) 
+ # ) + ), 1) + W_denom = tf.expand_dims(W_denom, 1) + + #squared_diffs_current = tf.reduce_sum(tf.square(tf.expand_dims(embeddings, 2) - tf.expand_dims(tf.expand_dims(tf.gather(self.speaker_vectors, current_sources_indices), 0), 0)), -1) + #diffs_pow_matrix_current = tf.pow(squared_diffs_current, 1./(m - 1.)) + # + # if reduced_contrast: + # W_denom = tf.expand_dims(tf.reduce_sum(tf.reciprocal(diffs_pow_matrix_current), -1), -1) + # else: + # W_denom = tf.expand_dims(tf.reduce_sum(tf.reciprocal( + # tf.pow( + # tf.reduce_sum(tf.square(tf.expand_dims(embeddings, 2) - tf.expand_dims(tf.expand_dims(tf.gather(self.speaker_vectors, known_sources_indices), 0), 0)), -1), + # 1./(m - 1.) + # ) + # ), -1), -1) + + #squared_diffs_batch = tf.reduce_sum(tf.square(tf.expand_dims(embeddings, 2) - tf.expand_dims(batch_sources, 1)), -1) + #squared_diffs_known = tf.reduce_sum(tf.square(tf.expand_dims(embeddings, 2) - tf.expand_dims(tf.expand_dims(tf.boolean_mask(self.speaker_vectors, known_sources), 0), 0)), -1) + #squared_diffs_current = tf.reduce_sum(tf.square(tf.expand_dims(embeddings, 2) - tf.expand_dims(tf.expand_dims(tf.boolean_mask(self.speaker_vectors, current_sources), 0), 0)), -1) + #diffs_pow_matrix_batch = tf.pow(squared_diffs_batch, 1./(m - 1.)) + #diffs_pow_matrix_known = tf.pow(squared_diffs_known, 1./(m - 1.)) + #diffs_pow_matrix_current = tf.pow(squared_diffs_current, 1./(m - 1.)) + else: + # NOTE: true/aux refers to both the coordinates and cluster centers + true_embeddings = embeddings[:, :, :-self.auxiliary_size] + aux_embeddings = embeddings[:, :, -self.auxiliary_size:] + true_embeddings_l2 = tf.reduce_sum( + tf.square(true_embeddings), axis=-1) + aux_embeddings_l2 = tf.reduce_sum( + tf.square(aux_embeddings), axis=-1) + #true_squared_diffs_batch = tf.reduce_sum(tf.square(tf.expand_dims(true_embeddings, 2) - tf.expand_dims(batch_sources, 1)), -1) + #true_squared_diffs_known = tf.reduce_sum(tf.square(tf.expand_dims(true_embeddings, 2) - 
tf.expand_dims(tf.expand_dims(tf.boolean_mask(self.speaker_vectors, known_sources), 0), 0)), -1) + #true_squared_diffs_current = tf.reduce_sum(tf.square(tf.expand_dims(true_embeddings, 2) - tf.expand_dims(tf.expand_dims(tf.gather(self.speaker_vectors, current_sources_indices), 0), 0)), -1) + #aux_squared_diffs = tf.reduce_sum(tf.square(tf.expand_dims(aux_embeddings, 2) - tf.expand_dims(tf.expand_dims(self.auxiliary_vectors, 0), 0)), -1) + ##diffs_pow_matrix_batch = tf.pow(true_squared_diffs_batch + tf.expand_dims(aux_embeddings_l2, -1), 1./(m - 1.)) + # diffs_pow_matrix_known = tf.concat([tf.pow(true_squared_diffs_known + tf.expand_dims(aux_embeddings_l2, -1), 1./(m - 1.)), + # tf.pow(aux_squared_diffs + tf.expand_dims(true_embeddings_l2, -1), 1./(m - 1.))], axis=2) + # diffs_pow_matrix_current = tf.concat([tf.pow(true_squared_diffs_current + tf.expand_dims(aux_embeddings_l2, -1), 1./(m - 1.)), + # tf.pow(aux_squared_diffs + tf.expand_dims(true_embeddings_l2, -1), 1./(m - 1.))], axis=2) + + # if reduced_contrast: + # W_denom = tf.expand_dims(tf.reduce_sum(tf.reciprocal(diffs_pow_matrix_current), -1), -1) + # else: + # W_denom = tf.expand_dims(tf.reduce_sum(tf.reciprocal( + # tf.concat([ + # tf.pow( + # tf.reduce_sum(tf.square(tf.expand_dims(true_embeddings, 2) - tf.expand_dims(tf.expand_dims(tf.gather(self.speaker_vectors, known_sources_indices), 0), 0)), + # -1) + tf.expand_dims(aux_embeddings_l2, -1), + # 1./(m - 1.) 
+ # ), + # tf.pow(aux_squared_diffs + tf.expand_dims(true_embeddings_l2, -1), 1./(m - 1.)) + # ], axis=2) + # ), -1), -1) + + # batch, nsource in mix, nfeatures + true_squared_diffs_batch = tf.reduce_sum(tf.square(tf.expand_dims(true_embeddings, 1) - + tf.reshape(tf.expand_dims(tf.gather(self.speaker_vectors, flattened_I), 1), + [shape[0], shapeI[1], 1, self.embedding_size])), + -1) + # true_squared_diffs_batch = tf.reduce_sum(tf.square( + # true_embeddings - tf.expand_dims(tf.gather(self.speaker_vectors, flattened_I), 1)), -1) + # batch, nfeatures, nsources (aux) + aux_squared_diffs = tf.reduce_sum(tf.square(tf.expand_dims( + aux_embeddings, 2) - tf.expand_dims(tf.expand_dims(self.auxiliary_vectors, 0), 0)), -1) + aux_diffs_pow_matrix = tf.pow(aux_squared_diffs + tf.expand_dims(true_embeddings_l2, -1), 1. / (m - 1.)) + # batch, nsource in mix, nfeatures + squared_diffs_batch = true_squared_diffs_batch + tf.expand_dims(aux_embeddings_l2, 1) + diffs_pow_matrix_batch = tf.pow(squared_diffs_batch, 1. 
/ (m - 1.)) + + W_denom = tf.reduce_sum(tf.reciprocal( + tf.concat([ + tf.transpose(diffs_pow_matrix_batch, perm=[0, 2, 1]), + aux_diffs_pow_matrix + ], axis=2) + ), -1) + # batch, 1, nfeatures + W_denom = tf.expand_dims(W_denom, 1) + + #W = tf.reciprocal(diffs_pow_matrix_current*tf.expand_dims(tf.reduce_sum(tf.reciprocal(diffs_pow_matrix_known), -1), -1), name='W') + #clustering_factors = tf.reciprocal(diffs_pow_matrix_batch*tf.expand_dims(tf.reduce_sum(tf.reciprocal(diffs_pow_matrix_known), -1), -1)) + #W = tf.reciprocal(diffs_pow_matrix_current*W_denom, name='W') + #clustering_factors = tf.gather_nd(tf.transpose(W, perm=[0, 2, 1]), tf.stack((batch_range, current_sources_indices_batch), axis=1)) + # batch, nsource in mix, nfeatures + W = tf.reciprocal(diffs_pow_matrix_batch * W_denom, name='W') + # clustering_factors = W + # batch, nfeatures, nsource in mix + clustering_factors = tf.transpose(W, perm=[0, 2, 1]) + + # MI head + mi_head = tf.reshape(clustering_factors, + [shape[0], shape[1], self.F, shapeI[1]]) + # MI head + # Feedforward layer + # feedforward_fc = tf_utils.conv2d_layer(z, + # [1, 1, self.embedding_size, self.num_reco_sources]) + # perform a softmax along the source dimension + #mi_head = tf.nn.softmax(feedforward_fc, dim=3) + + return embedding, mi_head, W, squared_diffs_batch + #if self.auxiliary_vectors is None: + # return embedding, mi_head, W, squared_diffs_batch + #else: + # return embedding, mi_head, W, true_squared_diffs_batch + aux_embeddings_l2 + + @tf_utils.scope_decorator + def clustering_cost(self): + """ + Constuct the cost function op for the cost function used for clustering + """ + + cluster_output, mi_output, W, squared_diffs = self.network + + clustering_loss = tf.reduce_mean( + tf.pow(W, self.fuzzifier) * squared_diffs) + + return clustering_loss + + @tf_utils.scope_decorator + def mi_cost(self): + """ + Constuct the cost function op for the cost function used for mask inference head + """ + + cluster_output, mi_output, W, 
squared_diffs = self.network + + # broadcast product along source dimension + mi_cost = tf.square(self.y_clean - mi_output * + tf.expand_dims(self.X_clean, -1)) + + return mi_cost + + @tf_utils.scope_decorator + def cost(self): + """ + Constuct the cost function op for the cost function used for clustering + and the mask inference head + """ + + # # TODO: REMOVE!!! + # cluster_output, mi_output, W, squared_diffs = self.network + + # clustering_loss = tf.reduce_mean( + # tf.pow(W, self.fuzzifier) * squared_diffs) + + # # broadcast product along source dimension + # mi_loss = tf.square(self.y_clean - mi_output * + # tf.expand_dims(self.X_clean, -1)) + + # TODO: COMMENT BACK IN!!! + clustering_loss = self.clustering_cost + mi_loss = self.mi_cost + + return self.alpha * clustering_loss + (1.0 - self.alpha) * tf.reduce_mean(mi_loss) + + @tf_utils.scope_decorator + def optimizer(self): + """ + Constructs the optimizer op used to train the network + """ + opt = tf.train.AdamOptimizer() + return opt.minimize(self.cost) + + # def save(self, path): + # """ + # Saves the model to the specified path. + # """ + # self.saver.save(self.sess, path) + + # def load(self, path): + # """ + # Load the model from the specified path. + # """ + # self.saver.restore(self.sess, path) + + def train_on_batch(self, X_train, X_train_clean, y_train, y_train_clean, I_train): + """ + Train the model on a batch with input X and target y. Returns the cost + computed on this batch. 
+ """ + + cost, _ = self.sess.run([self.cost, self.optimizer], + {self.X: X_train, self.y: y_train, + self.X_clean: X_train_clean, + self.y_clean: y_train_clean, + self.I: I_train}) + + return cost + + #def get_masks(self, X_in, nsources=2, nclustering_iterations_max=500, iterations_stop=10): + def get_masks(self, X_in, cluster_centers, nclustering_iterations_max=500, iterations_stop=10): + """ + Compute the masks for the input spectrograms + """ + + nspectrograms = len(X_in) + + num_sources = len(cluster_centers) + cluster_centers_init = np.zeros((self.num_training_sources, self.embedding_size)) + cluster_centers_init[:num_sources] = cluster_centers + + #I = np.arange(nspectrograms * num_sources, dtype=np.int32).reshape(nspectrograms, num_sources) + I = np.tile(np.arange(num_sources, dtype=np.int32), reps=(nspectrograms, 1)) + + with self.graph.as_default(): + assign_cc = self.speaker_vectors.assign(cluster_centers_init) + + self.sess.run(assign_cc, {self.X: X_in, self.I: I}) + masks = self.sess.run(self.network, {self.X: X_in, self.I: I})[1] + + return masks + + def get_vectors(self, X_in, nsources=2): + """ + Compute the embedding vectors for the input spectrograms + """ + + nspectrograms = len(X_in) + #I = np.arange(nspectrograms * nsources, dtype=np.int32).reshape(nspectrograms, nsources) + I = np.tile(np.arange(nsources, dtype=np.int32), reps=(nspectrograms, 1)) + + vectors = self.sess.run(self.network, {self.X: X_in, self.I: I})[0] + return vectors + + def get_cost(self, X_in, X_clean_in, y_in, y_clean_in, I_in): + """ + Computes the cost of a batch, but does not update any model parameters. 
+ """ + cost = self.sess.run(self.cost, {self.X: X_in, self.y: y_in, + self.X_clean: X_clean_in, + self.y_clean: y_clean_in, + self.I: I_in}) + return cost diff --git a/magnolia/python/models/dnndenoise/sce_mask.py b/magnolia/python/models/dnndenoise/sce_mask.py new file mode 100644 index 0000000..b67fddc --- /dev/null +++ b/magnolia/python/models/dnndenoise/sce_mask.py @@ -0,0 +1,326 @@ +import logging.config +import numpy as np +import tensorflow as tf + +from magnolia.models.model_base import ModelBase +from magnolia.utils import tf_utils + + +logger = logging.getLogger('model') + + +class RatioMaskSCE(ModelBase): + """ + Chimera network from [1] but uses the SCE loss from Lab41. + Defaults correspond to the parameters used by the best + performing model in the paper. + + [1] Luo, Yi., et al. "Deep Clustering and Conventional Networks for Music + Separation: Stronger Together" Published in Acoustics, Speech, and + Signal Processing (ICASSP) 2017; doi:10.1109/ICASSP.2017.7952118 + + Hyperparameters: + F: Number of frequency bins in the input data + num_reco_sources: Number sources to reconstruct + num_training_sources: Number sources in the training set + layer_size: Size of BLSTM layers + embedding_size: Dimension of embedding vector + alpha: Relative mixture of cost terms + nonlinearity: Nonlinearity to use in BLSTM layers + device: Which device to run the model on + """ + + def initialize(self): + self.F = self.config['model_params']['F'] + self.num_reco_sources = self.config['model_params']['num_reco_sources'] # should always be 2 + self.num_training_sources = self.config['model_params']['num_training_sources'] + self.layer_size = self.config['model_params']['layer_size'] + self.embedding_size = self.config['model_params']['embedding_size'] + self.normalize = self.config['model_params']['normalize'] + self.alpha = self.config['model_params']['alpha'] + self.nonlinearity = eval(self.config['model_params']['nonlinearity']) + self.collapse_sources = 
self.config['model_params']['collapse_sources'] + + self.batch_count = 0 + self.costs = [] + self.t_costs = [] + self.v_costs = [] + self.last_saved = 0 + + + def build_graph(self, graph): + with graph.as_default(): + with tf.device(self.config['device']): + # Placeholder tensor for the input data + self.X = tf.placeholder("float", [None, None, self.F]) + # Placeholder tensor for the unscaled input data + self.X_clean = tf.placeholder("float", [None, None, self.F]) + + # Placeholder tensor for the labels/targets + self.y = tf.placeholder("float", [None, None, self.F, None]) + # Placeholder tensor for the unscaled labels/targets + self.y_clean = tf.placeholder("float", [None, None, self.F, None]) + + # Placeholder for the speaker indicies + self.I = tf.placeholder(tf.int32, [None, None]) + + # Define the speaker vectors to use during training + self.speaker_vectors = tf_utils.weight_variable( + [self.num_training_sources, self.embedding_size], + tf.sqrt(2/self.embedding_size)) + + # Model methods + self.network + self.cost + self.optimizer + + return graph + + + def learn_from_epoch(self, epoch_id, + validate_every, + stop_threshold, + training_mixer, + validation_mixer, + batch_formatter, + model_save_base): + + batch_count = self.batch_count + # Training epoch loop + for batch in iter(training_mixer): + unscaled_spectral_sum_batch, scaled_spectral_sum_batch, spectral_masks_batch, spectral_sources_batch = batch_formatter(batch[0], batch[1], batch[2]) + # should be dimensions of (batch size, source) + uids_batch = batch[3] + + # override ids for simply signal/noise + if self.collapse_sources: + uids_batch[:, 0] = 0 + uids_batch[:, 1] = 1 + + # Train the model on one batch and get the cost + c = self.train_on_batch(scaled_spectral_sum_batch, unscaled_spectral_sum_batch, + spectral_masks_batch, spectral_sources_batch, + uids_batch) + + # Store the training cost + self.costs.append(c) + + # Evaluate the model on the validation data + if (batch_count + 1) % 
validate_every == 0: + # Store the training cost + self.t_costs.append(np.mean(self.costs)) + # Reset the cost over the last 10 batches + self.costs = [] + + # Compute average validation score + all_c_v = [] + for vbatch in iter(validation_mixer): + unscaled_spectral_sum_batch, scaled_spectral_sum_batch, spectral_masks_batch, spectral_sources_batch = batch_formatter(vbatch[0], vbatch[1], vbatch[2]) + # dimensions of (batch size, source) + uids_batch = vbatch[3] + + # override ids for simply signal/noise + if self.collapse_sources: + uids_batch[:, 0] = 0 + uids_batch[:, 1] = 1 + + # Get the cost on the validation batch + c_v = self.get_cost(scaled_spectral_sum_batch, unscaled_spectral_sum_batch, + spectral_masks_batch, spectral_sources_batch, + uids_batch) + all_c_v.append(c_v) + + ave_c_v = np.mean(all_c_v) + + # Check if the validation cost is below the minimum validation cost, and if so, save it. + if len(self.v_costs) > 0 and ave_c_v < min(self.v_costs):# and len(self.nbatches) > 0: + logger.info("Saving the model because validation score is {} below the old minimum.".format(min(self.v_costs) - ave_c_v)) + + # Save the model to the specified path + self.save(model_save_base) + + # Record the batch that the model was last saved on + self.last_saved = batch_count#self.nbatches[-1] + + # Store the validation cost + self.v_costs.append(ave_c_v) + + # Store the current batch number + #self.nbatches.append(batch_count) + + logger.info("Training cost on batch {} is {}.".format(batch_count, self.t_costs[-1])) + logger.info("Validation cost on batch {} is {}.".format(batch_count, self.v_costs[-1])) + logger.info("Last saved {} batches ago.".format(batch_count - self.last_saved)) + + # Stop training if the number of iterations since the last save point exceeds the threshold + if batch_count - self.last_saved > stop_threshold: + logger.info("Early stopping criteria met!") + break + + batch_count += 1 + + self.batch_count = batch_count + + + def infer(self, **kw_args): + 
pass + + + @tf_utils.scope_decorator + def network(self): + """ + Construct the op for the network used in [1]. This consists of four + BLSTM layers followed by a dense layer giving a set of T-F vectors of + dimension embedding_size + """ + + # Get the shape of the input + shape = tf.shape(self.X) + + # BLSTM layer one + BLSTM_1 = tf_utils.BLSTM_(self.X, self.layer_size, 'one', + activation=self.nonlinearity) + + # BLSTM layer two + BLSTM_2 = tf_utils.BLSTM_(BLSTM_1, self.layer_size, 'two', + activation=self.nonlinearity) + + # BLSTM layer three + BLSTM_3 = tf_utils.BLSTM_(BLSTM_2, self.layer_size, 'three', + activation=self.nonlinearity) + + # BLSTM layer four + BLSTM_4 = tf_utils.BLSTM_(BLSTM_3, self.layer_size, 'four', + activation=self.nonlinearity) + + # Feedforward layer + feedforward = tf_utils.conv1d_layer(BLSTM_4, + [1, self.layer_size, self.embedding_size*self.F]) + + # Reshape the feedforward output to have shape (T,F,D) + z = tf.reshape(feedforward, + [shape[0], shape[1], self.F, self.embedding_size]) + + # SCE head + embedding = self.nonlinearity(z) + # Normalize the T-F vectors to get the network output + embedding = tf.nn.l2_normalize(embedding, 3) + + # MI head + # Feedforward layer + feedforward_fc = tf_utils.conv2d_layer(z, + [1, 1, self.embedding_size, self.num_reco_sources]) + # perform a softmax along the source dimension + mi_head = tf.nn.softmax(feedforward_fc, dim=3) + + return embedding, mi_head + + @tf_utils.scope_decorator + def cost(self): + """ + Constuct the cost function op for the cost function used in sce + and the mask inference head + """ + + # Get the shape of the input + shape = tf.shape(self.y) + + sce_output, mi_output = self.network + + # Reshape I so that it is of the correct dimension + I = tf.expand_dims( self.I, axis=2 ) + + # Normalize the speaker vectors and collect the speaker vectors + # correspinding to the speakers in batch + if self.normalize: + speaker_vectors = tf.nn.l2_normalize(self.speaker_vectors, 1) + else: 
+ speaker_vectors = self.speaker_vectors + Vspeakers = tf.gather_nd(speaker_vectors, I) + + # Expand the dimensions in preparation for broadcasting + Vspeakers_broad = tf.expand_dims(Vspeakers, 1) + Vspeakers_broad = tf.expand_dims(Vspeakers_broad, 1) + embedding_broad = tf.expand_dims(sce_output, 3) + + # Compute the dot product between the emebedding vectors and speaker + # vectors + dot = tf.reduce_sum(Vspeakers_broad * embedding_broad, 4) + + # Compute the cost for every element + sce_cost = -tf.log(tf.nn.sigmoid(self.y * dot)) + + # Average the cost over all speakers in the input + sce_cost = tf.reduce_mean(sce_cost, 3) + + # Average the cost over all batches + sce_cost = tf.reduce_mean(sce_cost, 0) + + # Average the cost over all T-F elements. Here is where weighting to + # account for gradient confidence can occur + sce_cost = tf.reduce_mean(sce_cost) + + # broadcast product along source dimension + mi_cost = tf.square(self.y_clean - mi_output*tf.expand_dims(self.X_clean, -1)) + + return self.alpha*sce_cost + (1.0 - self.alpha)*tf.reduce_mean(mi_cost) + + @tf_utils.scope_decorator + def optimizer(self): + """ + Constructs the optimizer op used to train the network + """ + opt = tf.train.AdamOptimizer() + return opt.minimize(self.cost) + + # def save(self, path): + # """ + # Saves the model to the specified path. + # """ + # self.saver.save(self.sess, path) + + # def load(self, path): + # """ + # Load the model from the specified path. + # """ + # self.saver.restore(self.sess, path) + + def train_on_batch(self, X_train, X_train_clean, y_train, y_train_clean, I_train): + """ + Train the model on a batch with input X and target y. Returns the cost + computed on this batch. 
+ """ + + cost, _ = self.sess.run([self.cost, self.optimizer], + {self.X: X_train, self.y: y_train, + self.X_clean: X_train_clean, + self.y_clean: y_train_clean, + self.I: I_train}) + + return cost + + def get_masks(self, X_in): + """ + Compute the masks for the input spectrograms + """ + + masks = self.sess.run(self.network, {self.X: X_in})[1] + return masks + + def get_vectors(self, X_in): + """ + Compute the embedding vectors for the input spectrograms + """ + + vectors = self.sess.run(self.network, {self.X: X_in})[0] + return vectors + + def get_cost(self, X_in, X_clean_in, y_in, y_clean_in, I_in): + """ + Computes the cost of a batch, but does not update any model parameters. + """ + cost = self.sess.run(self.cost, {self.X: X_in, self.y: y_in, + self.X_clean: X_clean_in, + self.y_clean: y_clean_in, + self.I: I_in}) + return cost diff --git a/magnolia/python/models/dnnseparate/jflec.py b/magnolia/python/models/dnnseparate/jflec.py new file mode 100644 index 0000000..62d4e7c --- /dev/null +++ b/magnolia/python/models/dnnseparate/jflec.py @@ -0,0 +1,351 @@ +import logging.config +import numpy as np +import tensorflow as tf + +from magnolia.models.model_base import ModelBase +from magnolia.utils import tf_utils + + +logger = logging.getLogger('model') + + +class JFLEC(ModelBase): + """ + """ + + def initialize(self): + self.num_sources = self.config['model_params']['num_sources'] + self.num_samples = self.config['model_params']['num_samples'] + self.num_encoding_layers = self.config['model_params']['num_encoding_layers'] + self.embedding_size = self.config['model_params']['embedding_size'] + self.num_decoding_layers = self.config['model_params']['num_decoding_layers'] + self.alpha_init = self.config['model_params']['alpha'] + nl = self.config['model_params']['nonlinearity'] + self.nonlinearity = eval('{}'.format(nl)) + + self.batch_count = 0 + self.nbatches = [] + self.costs = [] + self.t_costs = [] + self.v_costs = [] + self.last_saved = 0 + + + def 
build_graph(self, graph): + with graph.as_default(): + with tf.device(self.config['device']): + # Placeholder tensor for the input data + self.X = tf.placeholder(tf.float32, [None, self.num_samples]) + + # Placeholder tensor for UIDs for sources + self.X_uids = tf.placeholder(tf.int32, [None, 2]) + + # Placeholder scalar for relative cost factor + self.Alpha = tf.placeholder(tf.float32) + + # Placeholder tensor for the labels/targets + self.Y = tf.placeholder(tf.float32, [None, self.num_samples]) + + # Model methods + self.network + self.cost + self.optimizer + + return graph + + + def learn_from_epoch(self, epoch_id, + validate_every, + stop_threshold, + training_mixer, + validation_mixer, + batch_formatter, + model_save_base): + # FIXME: + # Find the number of batches already elapsed (Useful for resuming training) + start = 0 + if len(self.nbatches) != 0: + start = self.nbatches[-1] + + batch_count = self.batch_count + # Training epoch loop + for batch in iter(training_mixer): + unscaled_spectral_sum_batch, scaled_spectral_sum_batch, spectral_masks_batch, spectral_sources_batch = batch_formatter(batch[0], batch[1], batch[2]) + # should be dimensions of (batch size, source) + uids_batch = batch[3] + + # Train the model on one batch and get the cost + c = self.train_on_batch(scaled_spectral_sum_batch, unscaled_spectral_sum_batch, + spectral_masks_batch, spectral_sources_batch) + + # Store the training cost + self.costs.append(c) + + # Store the current batch_count number + + # Evaluate the model on the validation data + if (batch_count + 1) % validate_every == 0: + # Store the training cost + self.t_costs.append(np.mean(self.costs)) + # Reset the cost over the last 10 batches + self.costs = [] + + # Compute average validation score + all_c_v = [] + for vbatch in iter(validation_mixer): + unscaled_spectral_sum_batch, scaled_spectral_sum_batch, spectral_masks_batch, spectral_sources_batch = batch_formatter(vbatch[0], vbatch[1], vbatch[2]) + # dimensions of (batch 
size, source) + uids_batch = vbatch[3] + + # Get the cost on the validation batch + c_v = self.get_cost(scaled_spectral_sum_batch, unscaled_spectral_sum_batch, + spectral_masks_batch, spectral_sources_batch) + all_c_v.append(c_v) + + ave_c_v = np.mean(all_c_v) + + # Check if the validation cost is below the minimum validation cost, and if so, save it. + if len(self.v_costs) > 0 and ave_c_v < min(self.v_costs) and len(self.nbatches) > 0: + logger.info("Saving the model because validation score is {} below the old minimum.".format(min(self.v_costs) - ave_c_v)) + + # Save the model to the specified path + self.save(model_save_base) + + # Record the batch that the model was last saved on + self.last_saved = self.nbatches[-1] + + # Store the validation cost + self.v_costs.append(ave_c_v) + + # Store the current batch number + self.nbatches.append(batch_count + 1 + start) + + # Compute scale quantities for plotting + length = len(self.nbatches) + cutoff = int(0.5*length) + lowline = [min(self.v_costs)]*length + + logger.info("Training cost on batch {} is {}.".format(self.nbatches[-1], self.t_costs[-1])) + logger.info("Validation cost on batch {} is {}.".format(self.nbatches[-1], self.v_costs[-1])) + logger.info("Last saved {} batches ago.".format(self.nbatches[-1] - self.last_saved)) + + # Stop training if the number of iterations since the last save point exceeds the threshold + if self.nbatches[-1] - self.last_saved > stop_threshold: + logger.info("Early stopping criteria met!") + break + + batch_count += 1 + + self.batch_count = batch_count + + + def infer(self, **kw_args): + pass + + + @tf_utils.scope_decorator + def network(self): + """ + """ + + # Get the shape of the input + input_shape = tf.shape(self.X) + + # encoder + l = tf.expand_dims(self.X) + for i in range(self.num_encoding_layers): + if i == 0: + nfilters = 2**3 + filter_size = 2**3 + else: + nfilters *= 2 + layer_num = i + 1 + l = self.encoding_layer(l, filter_size, nfilters, layer_num) + + # feature + 
embeddings + embeddings = self.compute_embeddings(l) + + # clustering + cc = self.make_cluster_centers() + # compute fuzzy assignments + # batch, feature 1, feature 2, nsources + squared_diffs = tf.reduce_sum(tf.square(tf.expand_dims(embeddings, 3) - tf.expand_dims(tf.expand_dims(tf.expand_dims(cc, 0), 0), 0)), -1) + squared_diffs_pow = tf.pow(squared_diffs, 1./(m - 1.)) + W = tf.reciprocal(squared_diffs_pow*tf.expand_dims(tf.reduce_sum(tf.reciprocal(squared_diffs_pow), -1), -1)) + + WT = tf.transpose(W, perm=[0, 3, 1, 2]) + clustering_factors = tf.gather_nd(WT, self.X_uids) + + # NOTE: I'm making an explicit choice to multiply the embedding vectors + # by the fuzzy c-means coefficients + # batch, feature 1, feature 2, nsources, embedding_size + scaled_embeddings = tf.expand_dims(clustering_factors, -1)*embeddings + + # decoder + # collapse embedding dimension with convolution (should revisit later) + with tf.variable_scope('embedding_decoder', reuse=tf.AUTO_REUSE): + filters = tf.truncated_normal([1, 1, 1, self.embedding_size, 1], mean=0.0, stddev=0.1) + bias = tf.truncated_normal([1], mean=0.0, stddev=0.1) + l = tf.nn.convolution(input=scaled_embeddings, + filter=tf.get_variable(name='weights', + initializer=filters), + padding='VALID') + \ + tf.get_variable(name='bias', initializer=bias) + l = self.nonlinearity(tf.squeeze(l)) + + #for i in range(self.num_encoding_layers): + + return embeddings, W + + @tf_utils.scope_decorator + def cost(self): + """ + Constuct the cost function op for the cost function used in the deep + clusetering model and the mask inference head + """ + + # Get the shape of the input + shape = tf.shape(self.y) + + dc_output, mi_output = self.network + + # Reshape the targets to be of shape (batch, T*F, c) and the vectors to + # have shape (batch, T*F, K) + Y = tf.reshape(self.y, [shape[0], shape[1]*shape[2], shape[3]]) + V = tf.reshape(dc_output, + [shape[0], shape[1]*shape[2], self.embedding_size]) + + # Compute the partition size vectors + 
ones = tf.ones([shape[0], shape[1]*shape[2], 1]) + mul_ones = tf.matmul(tf.transpose(Y, perm=[0,2,1]), ones) + diagonal = tf.matmul(Y, mul_ones) + # D = 1/tf.sqrt(diagonal) + # D = tf.sqrt(1/diagonal) + D = tf.sqrt(tf.where(tf.is_inf(1/diagonal), tf.ones_like(diagonal) * 0, 1/diagonal)) + D = tf.reshape(D, [shape[0], shape[1]*shape[2]]) + + # Compute the matrix products needed for the cost function. Reshapes + # are to allow the diagonal to be multiplied across the correct + # dimensions without explicitly constructing the full diagonal matrix. + DV = D * tf.transpose(V, perm=[2,0,1]) + DV = tf.transpose(DV, perm=[1,2,0]) + VTV = tf.matmul(tf.transpose(V, perm=[0,2,1]), DV) + + DY = D * tf.transpose(Y, perm=[2,0,1]) + DY = tf.transpose(DY, perm=[1,2,0]) + VTY = tf.matmul(tf.transpose(V, perm=[0,2,1]), DY) + + YTY = tf.matmul(tf.transpose(Y, perm=[0,2,1]), DY) + + # Compute the cost by taking the Frobenius norm for each matrix + dc_cost = tf.norm(VTV, axis=[-2,-1]) -2*tf.norm(VTY, axis=[-2,-1]) + \ + tf.norm(YTY, axis=[-2,-1]) + + # broadcast product along source dimension + mi_cost = tf.square(self.y_clean - mi_output*tf.expand_dims(self.X_clean, -1)) + + return self.alpha*tf.reduce_mean(dc_cost) + (1.0 - self.alpha)*tf.reduce_mean(mi_cost) + + @tf_utils.scope_decorator + def optimizer(self): + """ + Constructs the optimizer op used to train the network + """ + opt = tf.train.AdamOptimizer() + return opt.minimize(self.cost) + + # def save(self, path): + # """ + # Saves the model to the specified path. + # """ + # self.saver.save(self.sess, path) + + # def load(self, path): + # """ + # Load the model from the specified path. + # """ + # self.saver.restore(self.sess, path) + + def train_on_batch(self, X_train, X_train_clean, y_train, y_train_clean): + """ + Train the model on a batch with input X and target y. Returns the cost + computed on this batch. 
+ """ + + cost, _ = self.sess.run([self.cost, self.optimizer], + {self.X: X_train, self.y: y_train, + self.X_clean: X_train_clean, + self.y_clean: y_train_clean}) + + return cost + + def get_vectors(self, X_in): + """ + Compute the embedding vectors for the input spectrograms + """ + + vectors = self.sess.run(self.network, {self.X: X_in})[0] + return vectors + + def get_cost(self, X_in, X_clean_in, y_in, y_clean_in): + """ + Computes the cost of a batch, but does not update any model parameters. + """ + cost = self.sess.run(self.cost, {self.X: X_in, self.y: y_in, + self.X_clean: X_clean_in, + self.y_clean: y_clean_in}) + return cost + + def encoding_layer(self, prev_layer, filter_size, nfilters, layer_num): + shape = tf.shape(prev_layer) + prev_nfilters = shape[-1] + + with tf.variable_scope('encoding_layer_{}'.format(layer_num), + reuse=tf.AUTO_REUSE): + filters = tf.truncated_normal([filter_size, prev_nfilters, nfilters], + mean=0.0, stddev=0.1) + bias = tf.truncated_normal([nfilters], + mean=0.0, stddev=0.1) + l = tf.nn.convolution(input=prev_layer, + filter=tf.get_variable(name='weights', + initializer=filters), + padding='VALID', + strides=[1, 1, 2]) + \ + tf.get_variable(name='bias', + initializer=bias) + + l = self.nonlinearity(l) + + return l + + def compute_embeddings(self, prev_layer): + feature_shape = tf.shape(prev_layer) + l = tf.expand_dims(prev_layer) + + with tf.variable_scope('embeddings', reuse=tf.AUTO_REUSE): + # NOTE: This convolution will scan over all "time bins" with + # different weights for each embedding dimension. 
+ # The filter window is divised such that the temporal + # correlations are preserved + filters = tf.truncated_normal([2*feature_shape[1] - 1, 1, + 1, self.embedding_size], mean=0.0, stddev=0.1) + bias = tf.truncated_normal([self.embedding_size], mean=0.0, stddev=0.1) + embeddings = tf.nn.convolution(input=l, + filter=tf.get_variable(name='weights', + initializer=filters), + padding='SAME') + \ + tf.get_variable(name='bias', + initializer=bias) + + embeddings = self.nonlinearity(embeddings) + + return embeddings + + def make_cluster_centers(self): + with tf.variable_scope('cluster_centers', reuse=tf.AUTO_REUSE): + init = tf.truncated_normal([self.num_sources, self.embedding_size], + mean=0.0, stddev=0.1) + + w = tf.get_variable(name='weights', initializer=init) + + return w diff --git a/magnolia/python/models/model_base.py b/magnolia/python/models/model_base.py index 29e3601..4fe7b62 100644 --- a/magnolia/python/models/model_base.py +++ b/magnolia/python/models/model_base.py @@ -92,7 +92,7 @@ def __init__(self, config): # Add all the other common code for the initialization here gpu_options = tf.GPUOptions(allow_growth=True) - sessConfig = tf.ConfigProto(gpu_options=gpu_options) + sessConfig = tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options) self.sess = tf.Session(config=sessConfig, graph=self.graph) self.make_summaries = 'summary_config' in self.config if self.make_summaries: @@ -140,11 +140,19 @@ def save(self, save_path): self.saver.save(self.sess, save_path) - def load(self, checkpoint_dir): - checkpoint = tf.train.get_checkpoint_state(checkpoint_dir) - if self.debug_flag: - logger.debug('Loading the model from folder: {}'.format(checkpoint_dir)) - self.saver.restore(self.sess, checkpoint.model_checkpoint_path) + def load(self, path): + self.saver.restore(self.sess, path) + # TODO: fix this + #model_name = checkpoint_dir.split(os.sep)[-1] + #dir_name = os.path.join(checkpoint_dir.split(os.sep)[:-1]) + #checkpoint = 
tf.train.get_checkpoint_state(dir_name, latest_filename=model_name) + #if checkpoint is None: + # raise RuntimeError("Couldn't find checkpoint files at {}".format(checkpoint_dir)) + #path = checkpoint.model_checkpoint_path + #step = int(path.split(os.sep)[-1].split('-')[-1]) + #if self.debug_flag: + # logger.debug('Loading the model (step {}) from folder: {}'.format(step, checkpoint_dir)) + #self.saver.restore(self.sess, path) ########################################################### diff --git a/magnolia/python/training/denoising/JFLEC/__init__.py b/magnolia/python/training/denoising/JFLEC/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/magnolia/python/training/denoising/JFLEC/training.py b/magnolia/python/training/denoising/JFLEC/training.py new file mode 100644 index 0000000..e69e070 --- /dev/null +++ b/magnolia/python/training/denoising/JFLEC/training.py @@ -0,0 +1,94 @@ +# Generic imports +import argparse +import logging.config +import json + +import numpy as np +import pandas as pd +import tensorflow as tf + +# Import the Chimera separation model +from magnolia.models import make_model + +# Import utilities for using the model +from magnolia.training.data_iteration.mix_iterator import MixIterator +# from magnolia.utils.training import preprocess_chimera_batch + + +def main(): + # parse command line arguments + parser = argparse.ArgumentParser(description='Train the JFLEC network.') + # parser.add_argument('--model_settings', '-s', + # default='../../../../data/models_settings/chimera_template.json', + # help='model settings JSON file') + # parser.add_argument('--_settings', '-s', + # default='../../../../data/models_settings/chimera_template.json', + # help='model settings JSON file') + parser.add_argument('--logger_settings', '-l', + default='../../../../data/logging_settings/logging.conf', + help='logging configuration file') + args = parser.parse_args() + + # Load logging configuration + 
logging.config.fileConfig(args.logger_settings) + logger = logging.getLogger('model') + + # Number of epochs + num_epochs = 2 + # Threshold for stopping if the model hasn't improved for this many consecutive batches + stop_threshold = 10000 + # validate every number of these batches + validate_every = 100 + train_batchsize = 256 + train_mixes = ['/local_data/magnolia/pipeline_data/date_2017_09_27_time_13_25/settings/mixing_LibriSpeech_UrbanSound8K_train.json'] + train_from_disk = False + validate_batchsize = 200 + validate_mixes = ['/local_data/magnolia/pipeline_data/date_2017_09_27_time_13_25/settings/mixing_LibriSpeech_UrbanSound8K_validate.json'] + validate_from_disk = False + model_params = { + 'layer_size': 500, + 'embedding_size': 10, + 'alpha': 0.1, + 'nonlinearity': 'tf.tanh', + } + model_location = '/gpu:0' + uid_settings = '/local_data/magnolia/pipeline_data/date_2017_09_27_time_13_25/settings/assign_uids_LibriSpeech_UrbanSound8K.json' + model_save_base = '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/model_saves/jflec' + + + training_mixer = MixIterator(mixes_settings_filenames=train_mixes, + batch_size=train_batchsize, + read_spectrogram=False, + from_disk=train_from_disk) + + validation_mixer = MixIterator(mixes_settings_filenames=validate_mixes, + batch_size=validate_batchsize, + read_spectrogram=False, + from_disk=validate_from_disk) + + # get frequency dimension + frequency_dim = training_mixer.sample_dimensions()[0] + # TODO: throw an exception + assert(frequency_dim == validation_mixer.sample_dimensions()[0]) + + # get number of sources + settings = json.load(open(uid_settings)) + uid_file = settings['output_file'] + uid_csv = pd.read_csv(uid_file) + number_of_sources = uid_csv['uid'].max() + 1 + + model_params['F'] = frequency_dim + config = {'model_params': model_params, + 'device': model_location} + model = make_model('JFLEC', config) + + model.train(validate_every=validate_every, + stop_threshold=stop_threshold, + 
def main():
    """Train the k-means clustering + ratio mask network.

    Parses the command line (only ``--logger_settings``/``-l`` is accepted),
    loads the logging configuration, builds the training and validation
    ``MixIterator`` objects, creates the ``'RatioMaskCluster'`` model via
    ``make_model`` and calls its ``train`` method.  All other settings are
    hard-coded below.

    Raises:
        ValueError: if the training and validation mixers report different
            frequency dimensions.
    """
    # parse command line arguments
    parser = argparse.ArgumentParser(
        description='Train the k-means clustering + ratio mask network.')
    parser.add_argument('--logger_settings', '-l',
                        default='../../../../data/logging_settings/logging.conf',
                        help='logging configuration file')
    args = parser.parse_args()

    # Load logging configuration
    logging.config.fileConfig(args.logger_settings)
    logger = logging.getLogger('model')

    # Number of epochs
    # NOTE(review): num_epochs is never forwarded to model.train() below.
    num_epochs = 20  # try 20
    # Threshold for stopping if the model hasn't improved for this many
    # consecutive batches
    stop_threshold = 10000
    # validate every number of these batches
    validate_every = 100
    train_batchsize = 512
    train_mixes = [
        '/local_data/magnolia/pipeline_data/date_2017_09_27_time_13_25/settings/mixing_LibriSpeech_UrbanSound8K_train.json']
    train_from_disk = False
    validate_batchsize = 500
    validate_mixes = [
        '/local_data/magnolia/pipeline_data/date_2017_09_27_time_13_25/settings/mixing_LibriSpeech_UrbanSound8K_validate.json']
    validate_from_disk = False
    model_params = {
        'layer_size': 500,
        'embedding_size': 10,
        'auxiliary_size': 0,
        'alpha': 0.9,  # try 0.9
        'nonlinearity': 'tf.tanh',
        'fuzzifier': 1.2,
        'num_reco_sources': 2,
        'normalize': False,
        'collapse_sources': False,
    }
    model_location = '/gpu:0'
    uid_settings = '/local_data/magnolia/pipeline_data/date_2017_09_27_time_13_25/settings/assign_uids_LibriSpeech_UrbanSound8K.json'
    # NOTE(review): the directory name says alpha0.99 while 'alpha' above is
    # 0.9 -- confirm which one is intended before relying on the save path.
    model_save_base = '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/model_saves/mask_cluster_tight_alpha0.99'

    training_mixer = MixIterator(mixes_settings_filenames=train_mixes,
                                 batch_size=train_batchsize,
                                 read_waveform=False,
                                 from_disk=train_from_disk)

    validation_mixer = MixIterator(mixes_settings_filenames=validate_mixes,
                                   batch_size=validate_batchsize,
                                   read_waveform=False,
                                   from_disk=validate_from_disk)

    # get frequency dimension; both mixers must agree on it.  An explicit
    # exception is used because assert statements are stripped under -O.
    frequency_dim = training_mixer.sample_dimensions()[0]
    validation_frequency_dim = validation_mixer.sample_dimensions()[0]
    if frequency_dim != validation_frequency_dim:
        raise ValueError(
            'Training and validation mixers disagree on the frequency '
            'dimension: {} != {}'.format(frequency_dim,
                                         validation_frequency_dim))

    # get number of sources (the settings file is closed deterministically)
    with open(uid_settings) as settings_file:
        settings = json.load(settings_file)
    uid_file = settings['output_file']
    uid_csv = pd.read_csv(uid_file)
    # largest value in the 'uid' column plus one
    number_of_sources = uid_csv['uid'].max() + 1

    model_params['F'] = frequency_dim
    model_params['num_training_sources'] = number_of_sources
    config = {'model_params': model_params,
              'device': model_location}
    model = make_model('RatioMaskCluster', config)

    model.train(validate_every=validate_every,
                stop_threshold=stop_threshold,
                training_mixer=training_mixer,
                validation_mixer=validation_mixer,
                batch_formatter=preprocess_chimera_batch,
                model_save_base=model_save_base)
def main():
    """Train the SCE + ratio mask network.

    Parses the command line (only ``--logger_settings``/``-l`` is accepted),
    loads the logging configuration, builds the training and validation
    ``MixIterator`` objects, creates the ``'RatioMaskSCE'`` model via
    ``make_model`` and calls its ``train`` method.  All other settings are
    hard-coded below.

    Raises:
        ValueError: if the training and validation mixers report different
            frequency dimensions.
    """
    # parse command line arguments
    parser = argparse.ArgumentParser(description='Train the SCE + ratio mask network.')
    parser.add_argument('--logger_settings', '-l',
                        default='../../../../data/logging_settings/logging.conf',
                        help='logging configuration file')
    args = parser.parse_args()

    # Load logging configuration
    logging.config.fileConfig(args.logger_settings)
    logger = logging.getLogger('model')

    # Number of epochs
    # NOTE(review): num_epochs is never forwarded to model.train() below.
    num_epochs = 20  # try 20
    # Threshold for stopping if the model hasn't improved for this many
    # consecutive batches
    stop_threshold = 10000
    # validate every number of these batches
    validate_every = 100
    train_batchsize = 256
    train_mixes = ['/local_data/magnolia/pipeline_data/date_2017_09_27_time_13_25/settings/mixing_LibriSpeech_UrbanSound8K_train.json']
    train_from_disk = False
    validate_batchsize = 200
    validate_mixes = ['/local_data/magnolia/pipeline_data/date_2017_09_27_time_13_25/settings/mixing_LibriSpeech_UrbanSound8K_validate.json']
    validate_from_disk = False
    model_params = {
        'layer_size': 500,
        'embedding_size': 10,
        'alpha': 0.9,  # try 0.9
        'nonlinearity': 'tf.tanh',
        'num_reco_sources': 2,
        'normalize': False,
        'collapse_sources': False,
    }
    model_location = '/gpu:0'
    uid_settings = '/local_data/magnolia/pipeline_data/date_2017_09_27_time_13_25/settings/assign_uids_LibriSpeech_UrbanSound8K.json'
    model_save_base = '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/model_saves/mask_sce'

    training_mixer = MixIterator(mixes_settings_filenames=train_mixes,
                                 batch_size=train_batchsize,
                                 read_waveform=False,
                                 from_disk=train_from_disk)

    validation_mixer = MixIterator(mixes_settings_filenames=validate_mixes,
                                   batch_size=validate_batchsize,
                                   read_waveform=False,
                                   from_disk=validate_from_disk)

    # get frequency dimension; both mixers must agree on it.  An explicit
    # exception is used because assert statements are stripped under -O.
    frequency_dim = training_mixer.sample_dimensions()[0]
    validation_frequency_dim = validation_mixer.sample_dimensions()[0]
    if frequency_dim != validation_frequency_dim:
        raise ValueError(
            'Training and validation mixers disagree on the frequency '
            'dimension: {} != {}'.format(frequency_dim,
                                         validation_frequency_dim))

    # get number of sources (the settings file is closed deterministically)
    with open(uid_settings) as settings_file:
        settings = json.load(settings_file)
    uid_file = settings['output_file']
    uid_csv = pd.read_csv(uid_file)
    # largest value in the 'uid' column plus one
    number_of_sources = uid_csv['uid'].max() + 1

    model_params['F'] = frequency_dim
    model_params['num_training_sources'] = number_of_sources
    config = {'model_params': model_params,
              'device': model_location}
    model = make_model('RatioMaskSCE', config)

    model.train(validate_every=validate_every,
                stop_threshold=stop_threshold,
                training_mixer=training_mixer,
                validation_mixer=validation_mixer,
                batch_formatter=preprocess_chimera_batch,
                model_save_base=model_save_base)
def mask_cluster_clustering_separate(spec, model, num_sources,
                                     binary_mask=True):
    """
    Separates a spectrogram into sources by k-means clustering the model's
    embedding vectors and masking the input with the resulting cluster masks.

    Inputs:
        spec: Spectrogram (in the format from a MixIterator) to separate.
            Only spec[0] is masked.
        model: Model instance providing a get_vectors method
        num_sources: Integer number of clusters / sources to separate into
        binary_mask: Forwarded to get_cluster_masks; if true a binary mask
            is computed, otherwise a soft mask

    Returns:
        sources: The masked spectrograms stacked along a new first axis with
            the last two axes swapped (per the original documentation, shape
            (num_sources, Spectrogram.shape))
    """
    # Element 1 of the preprocessed batch is what the model is fed
    embeddings = model.get_vectors(preprocess_chimera_batch(spec)[1])

    # Fixed random_state keeps the clustering reproducible
    kmeans = KMeans(n_clusters=num_sources, random_state=0)

    # Cluster the embeddings with k=num_sources to recover the signal masks
    cluster_masks = get_cluster_masks(embeddings, num_sources,
                                      binary_mask=binary_mask, algo=kmeans)

    # Mask the (transposed) first spectrogram and stack one result per source
    separated = np.stack(apply_masks(spec[0].T, cluster_masks))

    return separated.transpose(0, 2, 1)
def mask_cluster_mask(spec, model, num_sources, **kwd_args):
    """
    Masks a spectrogram with the masks returned by model.get_masks, using
    k-means cluster centers of the model's embedding vectors.

    Inputs:
        spec: Spectrogram batch (in the format from a MixIterator); it is
            transposed with axes (0, 2, 1) before masking
        model: Model instance providing get_vectors and get_masks methods
        num_sources: Integer number of k-means clusters to fit
        **kwd_args: Forwarded unchanged to model.get_masks

    Returns:
        The masks multiplied by the transposed spectrogram (with a trailing
        axis added), transposed with axes (0, 2, 1, 3)
    """
    model_input = preprocess_chimera_batch(spec)[1]

    # TODO: the original author marked this k-means step for removal; the
    # intended replacement is a direct call
    # model.get_masks(model_input, num_sources, **kwd_args)

    # T-F embedding vectors for this signal
    embeddings = model.get_vectors(model_input)

    kmeans = KMeans(n_clusters=num_sources, random_state=0)

    # Flatten axes 1 and 2 of the first batch element into rows of vectors
    dims = np.shape(embeddings)
    flat_embeddings = embeddings[0].reshape((dims[1]*dims[2], dims[3]))

    kmeans.fit(flat_embeddings)
    centers = kmeans.cluster_centers_

    masks = model.get_masks(model_input, centers, **kwd_args)
    spec_with_source_axis = np.expand_dims(spec.transpose(0, 2, 1), axis=-1)
    masked = masks*spec_with_source_axis

    return masked.transpose(0, 2, 1, 3)
def double_learnable_relu(preactivations, initial_alphas=(1.0, 0.1), name=None):
    """
    Relu-like activation with two learnable slopes.

    Creates two variables with tf.get_variable, initialised from
    tf.constant(initial_alphas[0]) and tf.constant(initial_alphas[1]), and
    returns alpha1*preactivations where preactivations >= 0.0 and
    alpha2*preactivations elsewhere.

    Inputs:
        preactivations: Tensor to apply the activation to
        initial_alphas: Two-element sequence of initial slopes; element 0 is
            used for non-negative inputs, element 1 for negative inputs.
            The default is an immutable tuple so it cannot be altered
            between calls.
        name: Optional prefix; the variables are named '<name>/alpha1' and
            '<name>/alpha2', or 'alpha1' and 'alpha2' when name is None

    Returns:
        The result of tf.where selecting between the two scaled tensors
    """
    prefix = '' if name is None else '{}/'.format(name)
    alpha1 = tf.get_variable(prefix + 'alpha1',
                             initializer=tf.constant(initial_alphas[0]))
    alpha2 = tf.get_variable(prefix + 'alpha2',
                             initializer=tf.constant(initial_alphas[1]))
    return tf.where(preactivations >= 0.0,
                    x=alpha1*preactivations,
                    y=alpha2*preactivations)