From 6bd441bee2a0c1e2f9ee7426db06c069b7531123 Mon Sep 17 00:00:00 2001 From: Hang Lyu Date: Tue, 29 Nov 2016 02:23:14 -0500 Subject: [PATCH 01/23] add a function--ApplyAddAdditiveNoise --- .../nnet3-xvector-signal-perturb-egs.cc | 108 ++++++++++++++++-- 1 file changed, 101 insertions(+), 7 deletions(-) diff --git a/src/xvectorbin/nnet3-xvector-signal-perturb-egs.cc b/src/xvectorbin/nnet3-xvector-signal-perturb-egs.cc index f78c9c6a03a..9b154490d8f 100644 --- a/src/xvectorbin/nnet3-xvector-signal-perturb-egs.cc +++ b/src/xvectorbin/nnet3-xvector-signal-perturb-egs.cc @@ -51,8 +51,8 @@ void ApplyPerturbation(XvectorPerturbOptions opts, // randomly generate an zero-phase FIR filter with no zeros. // In future, we can select trucated part of room impluse response // and convolve it with input_egs. - perturb_xvector.ComputeAndApplyRandDistortion(shifted_egs, - &rand_distort_shifted_egs); + ////perturb_xvector.ComputeAndApplyRandDistortion(shifted_egs, + //// &rand_distort_shifted_egs); } if (noise_egs) { @@ -73,16 +73,71 @@ void ApplyPerturbation(XvectorPerturbOptions opts, } // Perturb speed of signal egs Matrix warped_distorted_shifted_egs(rand_distort_shifted_egs); - if (opts.max_time_stretch != 0.0) - perturb_xvector.TimeStretch(rand_distort_shifted_egs, - &warped_distorted_shifted_egs); + ////if (opts.max_time_stretch != 0.0) + //// perturb_xvector.TimeStretch(rand_distort_shifted_egs, + //// &warped_distorted_shifted_egs); // If nagation is true, the sample values are randomly negated // with some probability. - if (opts.negation) { + ////if (opts.negation) { + ////} +} + +// add +// This function add the noise to the orginial signal. We should not normalize +// the signal level of the orginial signal. According to SNR, we rescale the noise +// and add it. So that the perturbed signal is created. 
+void ApplyAddAdditiveNoise(const int32 &SNR, + const Matrix &input_eg, + const Matrix &noise_eg, + Matrix *perturb_eg) { + // In the version, we ask the noise_cols >= input_cols. If mfcc, the cols are equal. + // If raw data, we ask the noise_cols > input_cols. + int32 input_rows = input_eg.NumRows(), input_cols = input_eg.NumCols(); + KALDI_ASSERT(noise_eg.NumCols() >= input_cols); + + // According to the rows of noise_eg, form the noise_mat + // repeat the noise_eg blocks to have a new block which is longer than input_eg + Matrix noise_mat; + if (noise_eg.NumRows() < input_rows) { + int32 repeat_times = (input_rows / noise_eg.NumRows()) + 1; + noise_mat.Resize(noise_eg.NumRows() * repeat_times, noise_eg.NumCols()); + for (int32 i = 0; i < repeat_times; ++i) { + noise_mat.Range(i*noise_eg.NumRows(), noise_eg.NumRows(), + 0, noise_eg.NumCols()).CopyFromMat(noise_eg); + } + } else { + noise_mat.Resize(noise_eg.NumRows(), noise_eg.NumCols()); + noise_mat.CopyFromMat(noise_eg); } + + // select the noise range + int32 noise_rows = noise_mat.NumRows(), noise_cols = noise_mat.NumCols(); + int32 start_row_ind = RandInt(0, noise_rows - input_rows), + start_col_ind = RandInt(0, noise_cols - input_cols); + Matrix selected_noise_mat(input_rows, input_cols); + selected_noise_mat.AddMat(1.0, noise_mat.Range(start_row_ind, input_rows, + start_col_ind, input_cols)); + // compute the energy of noise and input + Matrix input_energy_mat(input_rows, input_cols); + input_energy_mat.AddMatMatElements(1.0, input_eg, input_eg, 1.0); + double input_energy = input_energy_mat.Sum(); + Matrix noise_energy_mat(input_rows, input_cols); + noise_energy_mat.AddMatMatElements(1.0, selected_noise_mat, selected_noise_mat, 1.0); + double noise_energy = noise_energy_mat.Sum(); + + // In Energy domain, SNR=20log10(S/N). 
+ // 10^(SNR/20) = input_energy / (scale^2 * noise_energy) + double scale = input_energy / noise_energy / (pow(10,SNR/20)); + scale = sqrt(scale); + + // Add noise mat to input_eg mat + perturb_eg->Resize(input_rows, input_cols); + perturb_eg->CopyFromMat(input_eg); + perturb_eg->AddMat(scale, selected_noise_mat); } +// add-end } // end of namespace nnet3 } // end of namespace kaldi @@ -108,6 +163,14 @@ int main(int argc, char *argv[]) { XvectorPerturbOptions perturb_opts; perturb_opts.Register(&po); + // add + std::string add_noise_rspecifier; + po.Register("add-noise", &add_noise_rspecifier, "specify a file contains some noise egs"); + int32 snr; + po.Register("SNR",&snr,"specify a Signal to Noise Ration.We will scale the noise according \ + to the original signal and SNR. Normally, it's a non-zero number between -30 and 30"); + // add-end + po.Read(argc, argv); if (po.NumArgs() != 2) { po.PrintUsage(); @@ -121,6 +184,20 @@ int main(int argc, char *argv[]) { NnetExampleWriter example_writer(examples_wspecifier); + // add + // count the number of noise examples and record the key + std::vector list_noise_egs; + SequentialNnetExampleReader noise_seq_reader(add_noise_rspecifier); + for (; !noise_seq_reader.Done(); noise_seq_reader.Next()) { + std::string key = noise_seq_reader.Key(); + list_noise_egs.push_back(key); + } + noise_seq_reader.Close(); + int32 num_noise_egs = list_noise_egs.size(); + // initial a RandomAccessTableReader for noise egs + RandomAccessNnetExampleReader noise_random_reader(add_noise_rspecifier); + // add-end + int64 num_read = 0, num_written = 0; Matrix *noise_mat = NULL; @@ -141,7 +218,24 @@ int main(int argc, char *argv[]) { Matrix perturb_eg_mat, input_eg_mat; input_eg_io.features.CopyToMat(&input_eg_mat); - ApplyPerturbation(perturb_opts, input_eg_mat, noise_mat, &perturb_eg_mat); + + // add + if (!add_noise_rspecifier.empty()) { + // random choose a noise example + int32 index_noise_egs = RandInt(0, num_noise_egs - 1); + std::string 
key_noise_egs = list_noise_egs[index_noise_egs]; + const NnetExample &noise_eg = noise_random_reader.Value(key_noise_egs); + const NnetIo &noise_eg_io = noise_eg.io[0]; + + Matrix noise_eg_mat; + noise_eg_io.features.CopyToMat(&noise_eg_mat); + + // deal with add noise + ApplyAddAdditiveNoise(snr, input_eg_mat, noise_eg_mat, &perturb_eg_mat); + } else { + ApplyPerturbation(perturb_opts, input_eg_mat, noise_mat, &perturb_eg_mat); + } + // add-end perturb_eg->io.resize(1.0); perturb_eg->io[0].features.SwapFullMatrix(&perturb_eg_mat); example_writer.Write(key, *perturb_eg); From 0e73c48793276ab25ed80317541ffb48b57ada08 Mon Sep 17 00:00:00 2001 From: Hang Lyu Date: Tue, 29 Nov 2016 17:10:04 -0500 Subject: [PATCH 02/23] fix the noise_eg.NumCols() == input_eg.NumCols() --- src/xvectorbin/nnet3-xvector-signal-perturb-egs.cc | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/xvectorbin/nnet3-xvector-signal-perturb-egs.cc b/src/xvectorbin/nnet3-xvector-signal-perturb-egs.cc index 9b154490d8f..875051a1cc8 100644 --- a/src/xvectorbin/nnet3-xvector-signal-perturb-egs.cc +++ b/src/xvectorbin/nnet3-xvector-signal-perturb-egs.cc @@ -92,10 +92,9 @@ void ApplyAddAdditiveNoise(const int32 &SNR, const Matrix &input_eg, const Matrix &noise_eg, Matrix *perturb_eg) { - // In the version, we ask the noise_cols >= input_cols. If mfcc, the cols are equal. - // If raw data, we ask the noise_cols > input_cols. + // In the version, we ask the noise_cols == input_cols. 
int32 input_rows = input_eg.NumRows(), input_cols = input_eg.NumCols(); - KALDI_ASSERT(noise_eg.NumCols() >= input_cols); + KALDI_ASSERT(noise_eg.NumCols() == input_cols); // According to the rows of noise_eg, form the noise_mat // repeat the noise_eg blocks to have a new block which is longer than input_eg @@ -113,12 +112,11 @@ void ApplyAddAdditiveNoise(const int32 &SNR, } // select the noise range - int32 noise_rows = noise_mat.NumRows(), noise_cols = noise_mat.NumCols(); - int32 start_row_ind = RandInt(0, noise_rows - input_rows), - start_col_ind = RandInt(0, noise_cols - input_cols); + int32 noise_rows = noise_mat.NumRows(); + int32 start_row_ind = RandInt(0, noise_rows - input_rows); Matrix selected_noise_mat(input_rows, input_cols); selected_noise_mat.AddMat(1.0, noise_mat.Range(start_row_ind, input_rows, - start_col_ind, input_cols)); + 0, input_cols)); // compute the energy of noise and input Matrix input_energy_mat(input_rows, input_cols); input_energy_mat.AddMatMatElements(1.0, input_eg, input_eg, 1.0); double input_energy = input_energy_mat.Sum(); From f2e3119f4fee9c7f56ec6e57e05182599f0c02ab Mon Sep 17 00:00:00 2001 From: Hang Lyu Date: Sat, 3 Dec 2016 23:49:22 -0500 Subject: [PATCH 03/23] modify the design style --- src/feat/signal-distort.cc | 88 +++++++++++++++++ src/feat/signal-distort.h | 17 +++- .../nnet3-xvector-signal-perturb-egs.cc | 96 +++---------------- 3 files changed, 115 insertions(+), 86 deletions(-) diff --git a/src/feat/signal-distort.cc b/src/feat/signal-distort.cc index 788860ba769..25744f788a6 100644 --- a/src/feat/signal-distort.cc +++ b/src/feat/signal-distort.cc @@ -72,5 +72,93 @@ void TimeStretch(const MatrixBase &input_egs, perturb_egs->CopyFromMat(out_mat); } +// This function adds the noise to the original signal. We should not normalize +// the signal level of the original signal. According to SNR, we rescale the noise +// and add it. So that the perturbed signal is created. 
+void PerturbXvectorSignal::ApplyAdditiveNoise(const MatrixBase &input_eg, + const Matrix &noise_eg, + const int32 &SNR, + Matrix *perturb_eg) { + // In the version, we ask the noise_cols == input_cols. + int32 input_rows = input_eg.NumRows(), input_cols = input_eg.NumCols(); + KALDI_ASSERT(noise_eg.NumCols() == input_cols); + // According to the rows of noise_eg, form the noise_mat + // repeat the noise_eg blocks to have a new block which is longer than input_eg + + // As the noise_eg is very huge and the input_eg is small normally, + // so we'd better not reload the "noise_eg" matrix + // select the noise range + + Matrix selected_noise_mat; + selected_noise_mat.Resize(input_rows, input_cols); + + int32 noise_rows = noise_eg.NumRows(); + int32 start_row_ind = RandInt(0, noise_rows - input_rows); + + if (noise_eg.NumRows() < input_rows) { + int32 indices[input_rows]; + for (int32 i=0; i < input_rows; ++i) { + indices[i] = (start_row_ind + i) % noise_eg.NumRows(); + } + selected_noise_mat.CopyRows(noise_eg, indices); + } else { + selected_noise_mat.AddMat(1.0, noise_eg.Range(start_row_ind, input_rows, + 0, input_cols)); + } + + // compute the energy of noise and input + Matrix input_energy_mat(input_rows, input_cols); + input_energy_mat.AddMatMatElements(1.0, input_eg, input_eg, 0.0); + double input_energy = input_energy_mat.Sum(); + Matrix noise_energy_mat(input_rows, input_cols); + noise_energy_mat.AddMatMatElements(1.0, selected_noise_mat, selected_noise_mat, 0.0); + double noise_energy = noise_energy_mat.Sum(); + + // In Energy domain, SNR=20log10(S/N). 
+ // 10^(SNR/20) = input_energy / (scale^2 * noise_energy) + double scale = input_energy / noise_energy / (pow(10,SNR/20)); + scale = sqrt(scale); + + // Add noise mat to input_eg mat + perturb_eg->Resize(input_rows, input_cols); + perturb_eg->CopyFromMat(input_eg); + perturb_eg->AddMat(scale, selected_noise_mat); +} + +void PerturbXvectorSignal::ApplyDistortion(const MatrixBase &input_egs, + Matrix *perturb_egs) { + if (!opts_.add_noise_rspecifier.empty()) { // deal with the add_noise ark situdation + // count the number of noise examples and record the key + std::vector list_noise_egs; + list_noise_egs.clear(); + kaldi::nnet3::SequentialNnetExampleReader noise_seq_reader(opts_.add_noise_rspecifier); + for (; !noise_seq_reader.Done(); noise_seq_reader.Next()) { + std::string key = noise_seq_reader.Key(); + list_noise_egs.push_back(key); + } + noise_seq_reader.Close(); + + // random choose a noise_eg and use it. + int32 num_noise_egs = list_noise_egs.size(); + int32 index_noise_egs = RandInt(0, num_noise_egs - 1); + std::string key_noise_egs = list_noise_egs[index_noise_egs]; + + kaldi::nnet3::RandomAccessNnetExampleReader noise_random_reader(opts_.add_noise_rspecifier); + const kaldi::nnet3::NnetExample &noise_eg = noise_random_reader.Value(key_noise_egs); + const kaldi::nnet3::NnetIo &noise_eg_io = noise_eg.io[0]; + Matrix noise_eg_mat; + noise_eg_io.features.CopyToMat(&noise_eg_mat); + int32 SNR = opts_.snr; + + // conduct ApplyAdditiveNoise + ApplyAdditiveNoise(input_egs, noise_eg_mat, SNR, perturb_egs); + + // conduct others + // TODO + } else { // deal with the opts_.noise_egs situation + // TODO + } +} +// add-end } // end of namespace kaldi diff --git a/src/feat/signal-distort.h b/src/feat/signal-distort.h index b3faad96554..d6c0e3e84ff 100644 --- a/src/feat/signal-distort.h +++ b/src/feat/signal-distort.h @@ -32,6 +32,7 @@ #include "feat/resample.h" #include "matrix/matrix-functions.h" #include "cudamatrix/cu-matrix.h" +#include "nnet3/nnet-example.h" 
namespace kaldi { @@ -43,11 +44,15 @@ struct XvectorPerturbOptions { int32 negation_prop; bool rand_distort; std::string noise_egs; + std::string add_noise_rspecifier; + int32 snr; + XvectorPerturbOptions(): max_shift(0.2), max_time_stretch(0.2), frame_dim(80), negation_prop(0.0), - rand_distort(false) { } + rand_distort(false), + snr(10) { } void Register(OptionsItf *opts) { opts->Register("max-shift", &max_shift, "Maximum random shift relative" "to frame length applied to egs."); @@ -59,6 +64,10 @@ struct XvectorPerturbOptions { opts->Register("noise-egs", &noise_egs, "If supplied, the additive noise is added to input signal."); opts->Register("rand_distort", &rand_distort, "If true, the signal is slightly changes" "using some designed FIR filter with no zeros."); + opts->Register("add-noise", &add_noise_rspecifier, "specify a file contains some noise egs"); + opts->Register("SNR",&snr,"specify a Signal to Noise Ration. We will scale the noise according" + "to the original signal and SNR. Normally, it's a non-zero number between -30 and 30" + "default=10"); } }; @@ -70,8 +79,14 @@ class PerturbXvectorSignal { Matrix *perturb_egs); private: XvectorPerturbOptions opts_; + + void ApplyAdditiveNoise(const MatrixBase &input_eg, + const Matrix &noise_eg, + const int32 &SNR, + Matrix *perturb_eg); }; + // randomly disturb the input signal using a band-pass filter with no zeros. void ComputeAndApplyRandDistortion(const MatrixBase &input_egs, Matrix *perturb_egs); diff --git a/src/xvectorbin/nnet3-xvector-signal-perturb-egs.cc b/src/xvectorbin/nnet3-xvector-signal-perturb-egs.cc index 875051a1cc8..71c297ea18a 100644 --- a/src/xvectorbin/nnet3-xvector-signal-perturb-egs.cc +++ b/src/xvectorbin/nnet3-xvector-signal-perturb-egs.cc @@ -85,55 +85,14 @@ void ApplyPerturbation(XvectorPerturbOptions opts, } // add -// This function add the noise to the orginial signal. We should not normalize -// the signal level of the orginial signal. 
According to SNR, we rescale the noise -// and add it. So that the perturbed signal is created. -void ApplyAddAdditiveNoise(const int32 &SNR, - const Matrix &input_eg, - const Matrix &noise_eg, - Matrix *perturb_eg) { - // In the version, we ask the noise_cols == input_cols. - int32 input_rows = input_eg.NumRows(), input_cols = input_eg.NumCols(); - KALDI_ASSERT(noise_eg.NumCols() == input_cols); - - // According to the rows of noise_eg, form the noise_mat - // repeat the noise_eg blocks to have a new block which is longer than input_eg - Matrix noise_mat; - if (noise_eg.NumRows() < input_rows) { - int32 repeat_times = (input_rows / noise_eg.NumRows()) + 1; - noise_mat.Resize(noise_eg.NumRows() * repeat_times, noise_eg.NumCols()); - for (int32 i = 0; i < repeat_times; ++i) { - noise_mat.Range(i*noise_eg.NumRows(), noise_eg.NumRows(), - 0, noise_eg.NumCols()).CopyFromMat(noise_eg); - } - } else { - noise_mat.Resize(noise_eg.NumRows(), noise_eg.NumCols()); - noise_mat.CopyFromMat(noise_eg); - } - - // select the noise range - int32 noise_rows = noise_mat.NumRows(); - int32 start_row_ind = RandInt(0, noise_rows - input_rows); - Matrix selected_noise_mat(input_rows, input_cols); - selected_noise_mat.AddMat(1.0, noise_mat.Range(start_row_ind, input_rows, - 0, input_cols)); - // compute the energy of noise and input - Matrix input_energy_mat(input_rows, input_cols); - input_energy_mat.AddMatMatElements(1.0, input_eg, input_eg, 1.0); - double input_energy = input_energy_mat.Sum(); - Matrix noise_energy_mat(input_rows, input_cols); - noise_energy_mat.AddMatMatElements(1.0, selected_noise_mat, selected_noise_mat, 1.0); - double noise_energy = noise_energy_mat.Sum(); - - // In Energy domain, SNR=20log10(S/N). 
- // 10^(SNR/20) = input_energy / (scale^2 * noise_energy) - double scale = input_energy / noise_energy / (pow(10,SNR/20)); - scale = sqrt(scale); - - // Add noise mat to input_eg mat - perturb_eg->Resize(input_rows, input_cols); - perturb_eg->CopyFromMat(input_eg); - perturb_eg->AddMat(scale, selected_noise_mat); +// This function is a entrance. It calls ApplyDistortion to apply different +// type of distortions on input. +void PerturbExample(XvectorPerturbOptions opts, + const Matrix &input_egs, + Matrix *perturb_egs) { + //new a PerturbXvectorSignal object and call ApplyDistortion + PerturbXvectorSignal perturb_xvector(opts); + perturb_xvector.ApplyDistortion(input_egs, perturb_egs); } // add-end @@ -161,14 +120,6 @@ int main(int argc, char *argv[]) { XvectorPerturbOptions perturb_opts; perturb_opts.Register(&po); - // add - std::string add_noise_rspecifier; - po.Register("add-noise", &add_noise_rspecifier, "specify a file contains some noise egs"); - int32 snr; - po.Register("SNR",&snr,"specify a Signal to Noise Ration.We will scale the noise according \ - to the original signal and SNR. 
Normally, it's a non-zero number between -30 and 30"); - // add-end - po.Read(argc, argv); if (po.NumArgs() != 2) { po.PrintUsage(); @@ -182,20 +133,6 @@ int main(int argc, char *argv[]) { NnetExampleWriter example_writer(examples_wspecifier); - // add - // count the number of noise examples and record the key - std::vector list_noise_egs; - SequentialNnetExampleReader noise_seq_reader(add_noise_rspecifier); - for (; !noise_seq_reader.Done(); noise_seq_reader.Next()) { - std::string key = noise_seq_reader.Key(); - list_noise_egs.push_back(key); - } - noise_seq_reader.Close(); - int32 num_noise_egs = list_noise_egs.size(); - // initial a RandomAccessTableReader for noise egs - RandomAccessNnetExampleReader noise_random_reader(add_noise_rspecifier); - // add-end - int64 num_read = 0, num_written = 0; Matrix *noise_mat = NULL; @@ -215,21 +152,10 @@ int main(int argc, char *argv[]) { NnetExample *perturb_eg = new NnetExample(); Matrix perturb_eg_mat, input_eg_mat; - input_eg_io.features.CopyToMat(&input_eg_mat); - + input_eg_io.features.CopyToMat(&input_eg_mat); // add - if (!add_noise_rspecifier.empty()) { - // random choose a noise example - int32 index_noise_egs = RandInt(0, num_noise_egs - 1); - std::string key_noise_egs = list_noise_egs[index_noise_egs]; - const NnetExample &noise_eg = noise_random_reader.Value(key_noise_egs); - const NnetIo &noise_eg_io = noise_eg.io[0]; - - Matrix noise_eg_mat; - noise_eg_io.features.CopyToMat(&noise_eg_mat); - - // deal with add noise - ApplyAddAdditiveNoise(snr, input_eg_mat, noise_eg_mat, &perturb_eg_mat); + if (!perturb_opts.add_noise_rspecifier.empty()) { + PerturbExample(perturb_opts, input_eg_mat, &perturb_eg_mat); } else { ApplyPerturbation(perturb_opts, input_eg_mat, noise_mat, &perturb_eg_mat); } From e82966f9e41e462ec3b4e9b96687f2c3b8367aec Mon Sep 17 00:00:00 2001 From: Hang Lyu Date: Tue, 6 Dec 2016 00:18:09 -0500 Subject: [PATCH 04/23] modify the design style of ApplyAdditiveNoise --- src/feat/signal-distort.cc | 
52 +++++++------------ src/feat/signal-distort.h | 22 +++++--- .../nnet3-xvector-signal-perturb-egs.cc | 40 +++++++++----- 3 files changed, 63 insertions(+), 51 deletions(-) diff --git a/src/feat/signal-distort.cc b/src/feat/signal-distort.cc index 25744f788a6..c00acb8c5f7 100644 --- a/src/feat/signal-distort.cc +++ b/src/feat/signal-distort.cc @@ -77,8 +77,7 @@ void TimeStretch(const MatrixBase &input_egs, // and add it. So that the perturbed signal is created. void PerturbXvectorSignal::ApplyAdditiveNoise(const MatrixBase &input_eg, const Matrix &noise_eg, - const int32 &SNR, - Matrix *perturb_eg) { + Matrix *perturbed_eg) { // In the version, we ask the noise_cols == input_cols. int32 input_rows = input_eg.NumRows(), input_cols = input_eg.NumCols(); KALDI_ASSERT(noise_eg.NumCols() == input_cols); @@ -117,48 +116,37 @@ void PerturbXvectorSignal::ApplyAdditiveNoise(const MatrixBase &input // In Energy domain, SNR=20log10(S/N). // 10^(SNR/20) = input_energy / (scale^2 * noise_energy) - double scale = input_energy / noise_energy / (pow(10,SNR/20)); + double scale = input_energy / noise_energy / (pow(10,opts_.snr/20)); scale = sqrt(scale); // Add noise mat to input_eg mat - perturb_eg->Resize(input_rows, input_cols); - perturb_eg->CopyFromMat(input_eg); - perturb_eg->AddMat(scale, selected_noise_mat); + perturbed_eg->Resize(input_rows, input_cols); + perturbed_eg->CopyFromMat(input_eg); + perturbed_eg->AddMat(scale, selected_noise_mat); } void PerturbXvectorSignal::ApplyDistortion(const MatrixBase &input_egs, Matrix *perturb_egs) { - if (!opts_.add_noise_rspecifier.empty()) { // deal with the add_noise ark situdation - // count the number of noise examples and record the key - std::vector list_noise_egs; - list_noise_egs.clear(); - kaldi::nnet3::SequentialNnetExampleReader noise_seq_reader(opts_.add_noise_rspecifier); - for (; !noise_seq_reader.Done(); noise_seq_reader.Next()) { - std::string key = noise_seq_reader.Key(); - list_noise_egs.push_back(key); - } - 
noise_seq_reader.Close(); - - // random choose a noise_eg and use it. - int32 num_noise_egs = list_noise_egs.size(); - int32 index_noise_egs = RandInt(0, num_noise_egs - 1); - std::string key_noise_egs = list_noise_egs[index_noise_egs]; - - kaldi::nnet3::RandomAccessNnetExampleReader noise_random_reader(opts_.add_noise_rspecifier); - const kaldi::nnet3::NnetExample &noise_eg = noise_random_reader.Value(key_noise_egs); - const kaldi::nnet3::NnetIo &noise_eg_io = noise_eg.io[0]; - Matrix noise_eg_mat; - noise_eg_io.features.CopyToMat(&noise_eg_mat); - int32 SNR = opts_.snr; - // conduct ApplyAdditiveNoise - ApplyAdditiveNoise(input_egs, noise_eg_mat, SNR, perturb_egs); - + if (!opts_.add_noise_rspecifier.empty()) { + ApplyAdditiveNoise(input_egs, *noise_egs_, perturb_egs); // conduct others // TODO } else { // deal with the opts_.noise_egs situation // TODO } } -// add-end + +// This function is an entry point. It calls ApplyDistortion to apply different +// type of distortions on input. +void PerturbExample(XvectorPerturbOptions opts, + const Matrix &input_egs, + const Matrix &noise_egs, + Matrix *perturbed_egs) { + //new a PerturbXvectorSignal object and call ApplyDistortion + PerturbXvectorSignal perturb_egs(opts); + perturb_egs.SetNoiseEgs(noise_egs); + perturb_egs.ApplyDistortion(input_egs, perturbed_egs); +} + } // end of namespace kaldi diff --git a/src/feat/signal-distort.h b/src/feat/signal-distort.h index d6c0e3e84ff..af06c235ca7 100644 --- a/src/feat/signal-distort.h +++ b/src/feat/signal-distort.h @@ -32,7 +32,6 @@ #include "feat/resample.h" #include "matrix/matrix-functions.h" #include "cudamatrix/cu-matrix.h" -#include "nnet3/nnet-example.h" namespace kaldi { @@ -45,7 +44,7 @@ struct XvectorPerturbOptions { bool rand_distort; std::string noise_egs; std::string add_noise_rspecifier; - int32 snr; + BaseFloat snr; XvectorPerturbOptions(): max_shift(0.2), max_time_stretch(0.2), @@ -74,16 +73,22 @@ struct XvectorPerturbOptions { class PerturbXvectorSignal { 
public: PerturbXvectorSignal(XvectorPerturbOptions opts): opts_(opts) { }; - + inline void SetNoiseEgs(const Matrix &noise_egs) { + noise_egs_ = &noise_egs; + } void ApplyDistortion(const MatrixBase &input_egs, Matrix *perturb_egs); private: XvectorPerturbOptions opts_; - + // If we want to use many noise examples in one ApplyDistortion call, we can expand the pointer + // to a vector of pointers. + const Matrix *noise_egs_; + // We could use noise_egs_ instead of the noise_eg parameter in this function, + // but the parameter is kept because we may expand the pointer to a vector of pointers and choose + // one kind of noise to pass to ApplyAdditiveNoise. void ApplyAdditiveNoise(const MatrixBase &input_eg, const Matrix &noise_eg, - const int32 &SNR, - Matrix *perturb_eg); + Matrix *perturbed_eg); }; @@ -104,5 +109,10 @@ void TimeStretch(const MatrixBase &input_egs, BaseFloat max_time_stretch, Matrix *perturb_egs); +void PerturbExample(XvectorPerturbOptions opts, + const Matrix &input_egs, + const Matrix &noise_egs, + Matrix *perturbed_egs); + } // end of namespace kaldi #endif // KALDI_SIGNAL_DISTORT_H_ diff --git a/src/xvectorbin/nnet3-xvector-signal-perturb-egs.cc b/src/xvectorbin/nnet3-xvector-signal-perturb-egs.cc index 71c297ea18a..6ebfd8f9e4f 100644 --- a/src/xvectorbin/nnet3-xvector-signal-perturb-egs.cc +++ b/src/xvectorbin/nnet3-xvector-signal-perturb-egs.cc @@ -84,18 +84,6 @@ void ApplyPerturbation(XvectorPerturbOptions opts, ////} } -// add -// This function is a entrance. It calls ApplyDistortion to apply different -// type of distortions on input. 
-void PerturbExample(XvectorPerturbOptions opts, - const Matrix &input_egs, - Matrix *perturb_egs) { - //new a PerturbXvectorSignal object and call ApplyDistortion - PerturbXvectorSignal perturb_xvector(opts); - perturb_xvector.ApplyDistortion(input_egs, perturb_egs); -} -// add-end - } // end of namespace nnet3 } // end of namespace kaldi @@ -145,6 +133,19 @@ int main(int argc, char *argv[]) { } + // if we have the add_noise option, we need to record the keys of noise_egs. + // It will easy for us to choose a different noise example for each input_eg. + std::vector list_noise_egs; + if (!perturb_opts.add_noise_rspecifier.empty()) { + list_noise_egs.clear(); + SequentialNnetExampleReader noise_seq_reader(perturb_opts.add_noise_rspecifier); + for (; !noise_seq_reader.Done(); noise_seq_reader.Next()) { + std::string key = noise_seq_reader.Key(); + list_noise_egs.push_back(key); + } + noise_seq_reader.Close(); + } + for (; !example_reader.Done(); example_reader.Next(), num_read++) { std::string key = example_reader.Key(); const NnetExample &input_eg = example_reader.Value(); @@ -153,9 +154,22 @@ int main(int argc, char *argv[]) { Matrix perturb_eg_mat, input_eg_mat; input_eg_io.features.CopyToMat(&input_eg_mat); + // add if (!perturb_opts.add_noise_rspecifier.empty()) { - PerturbExample(perturb_opts, input_eg_mat, &perturb_eg_mat); + // For the input example, we firstly random choose an noise example. + int32 num_noise_egs = list_noise_egs.size(); + int32 index_noise_eg = RandInt(0, num_noise_egs - 1); + std::string key_noise_eg = list_noise_egs[index_noise_eg]; + + RandomAccessNnetExampleReader noise_random_reader(perturb_opts.add_noise_rspecifier); + const NnetExample &noise_eg = noise_random_reader.Value(key_noise_eg); + const NnetIo &noise_eg_io = noise_eg.io[0]; + Matrix noise_eg_mat; + noise_eg_io.features.CopyToMat(&noise_eg_mat); + + // We call the PerturbExample to implement adding distortion. 
+ PerturbExample(perturb_opts, input_eg_mat, noise_eg_mat, &perturb_eg_mat); } else { ApplyPerturbation(perturb_opts, input_eg_mat, noise_mat, &perturb_eg_mat); } From 486314d332b97e9e35d131fd016ba4ee7e3999c0 Mon Sep 17 00:00:00 2001 From: Hang Lyu Date: Tue, 6 Dec 2016 17:04:56 -0500 Subject: [PATCH 05/23] fix the add_noise from nnet3-examples to matrix --- src/feat/signal-distort.cc | 22 +++++++++++-- src/feat/signal-distort.h | 5 ++- .../nnet3-xvector-signal-perturb-egs.cc | 32 +++---------------- 3 files changed, 25 insertions(+), 34 deletions(-) diff --git a/src/feat/signal-distort.cc b/src/feat/signal-distort.cc index c00acb8c5f7..9cc1a19cb6b 100644 --- a/src/feat/signal-distort.cc +++ b/src/feat/signal-distort.cc @@ -128,7 +128,25 @@ void PerturbXvectorSignal::ApplyAdditiveNoise(const MatrixBase &input void PerturbXvectorSignal::ApplyDistortion(const MatrixBase &input_egs, Matrix *perturb_egs) { // conduct ApplyAdditiveNoise - if (!opts_.add_noise_rspecifier.empty()) { + if (!opts_.add_noise.empty()) { + // choose a noise from the noise.scp/ark + // 1) we need to record the keys of noise_egs + std::vector list_noise_egs; + SequentialBaseFloatMatrixReader noise_seq_reader(opts_.add_noise); + for (; !noise_seq_reader.Done(); noise_seq_reader.Next()) { + std::string key = noise_seq_reader.Key(); + list_noise_egs.push_back(key); + } + noise_seq_reader.Close(); + + // 2) we random choose an noise example + int32 num_noise_egs = list_noise_egs.size(); + int32 index_noise_eg = RandInt(0, num_noise_egs - 1); + std::string key_noise_eg = list_noise_egs[index_noise_eg]; + RandomAccessBaseFloatMatrixReader noise_random_reader(opts_.add_noise); + Matrix noise_eg_mat = noise_random_reader.Value(key_noise_eg); + SetNoiseEgs(noise_eg_mat); + ApplyAdditiveNoise(input_egs, *noise_egs_, perturb_egs); // conduct others // TODO @@ -141,11 +159,9 @@ void PerturbXvectorSignal::ApplyDistortion(const MatrixBase &input_eg // type of distortions on input. 
void PerturbExample(XvectorPerturbOptions opts, const Matrix &input_egs, - const Matrix &noise_egs, Matrix *perturbed_egs) { //new a PerturbXvectorSignal object and call ApplyDistortion PerturbXvectorSignal perturb_egs(opts); - perturb_egs.SetNoiseEgs(noise_egs); perturb_egs.ApplyDistortion(input_egs, perturbed_egs); } diff --git a/src/feat/signal-distort.h b/src/feat/signal-distort.h index af06c235ca7..fb7729da9f4 100644 --- a/src/feat/signal-distort.h +++ b/src/feat/signal-distort.h @@ -43,7 +43,7 @@ struct XvectorPerturbOptions { int32 negation_prop; bool rand_distort; std::string noise_egs; - std::string add_noise_rspecifier; + std::string add_noise; BaseFloat snr; XvectorPerturbOptions(): max_shift(0.2), @@ -63,7 +63,7 @@ struct XvectorPerturbOptions { opts->Register("noise-egs", &noise_egs, "If supplied, the additive noise is added to input signal."); opts->Register("rand_distort", &rand_distort, "If true, the signal is slightly changes" "using some designed FIR filter with no zeros."); - opts->Register("add-noise", &add_noise_rspecifier, "specify a file contains some noise egs"); + opts->Register("add-noise", &add_noise, "specify a file contains some noise egs"); opts->Register("SNR",&snr,"specify a Signal to Noise Ration. We will scale the noise according" "to the original signal and SNR. 
Normally, it's a non-zero number between -30 and 30" "default=10"); @@ -111,7 +111,6 @@ void TimeStretch(const MatrixBase &input_egs, void PerturbExample(XvectorPerturbOptions opts, const Matrix &input_egs, - const Matrix &noise_egs, Matrix *perturbed_egs); } // end of namespace kaldi diff --git a/src/xvectorbin/nnet3-xvector-signal-perturb-egs.cc b/src/xvectorbin/nnet3-xvector-signal-perturb-egs.cc index 6ebfd8f9e4f..59c08c23002 100644 --- a/src/xvectorbin/nnet3-xvector-signal-perturb-egs.cc +++ b/src/xvectorbin/nnet3-xvector-signal-perturb-egs.cc @@ -102,6 +102,7 @@ int main(int argc, char *argv[]) { "e.g.\n" "nnet3-xvector-signal-perturb-egs --noise-egs=noise.egs\n" "--max-shift=0.2 --max-speed-perturb=0.1 --negation=true\n" + "--add-noise=noise.scp --snr=10\n" "ark:input.egs akr:distorted.egs\n"; ParseOptions po(usage); @@ -129,21 +130,7 @@ int main(int argc, char *argv[]) { SequentialNnetExampleReader noise_reader(perturb_opts.noise_egs); const NnetExample &noise_egs = noise_reader.Value(); const NnetIo &noise_io = noise_egs.io[0]; - noise_io.features.CopyToMat(noise_mat); - - } - - // if we have the add_noise option, we need to record the keys of noise_egs. - // It will easy for us to choose a different noise example for each input_eg. - std::vector list_noise_egs; - if (!perturb_opts.add_noise_rspecifier.empty()) { - list_noise_egs.clear(); - SequentialNnetExampleReader noise_seq_reader(perturb_opts.add_noise_rspecifier); - for (; !noise_seq_reader.Done(); noise_seq_reader.Next()) { - std::string key = noise_seq_reader.Key(); - list_noise_egs.push_back(key); - } - noise_seq_reader.Close(); + noise_io.features.CopyToMat(noise_mat); } for (; !example_reader.Done(); example_reader.Next(), num_read++) { @@ -156,20 +143,9 @@ int main(int argc, char *argv[]) { input_eg_io.features.CopyToMat(&input_eg_mat); // add - if (!perturb_opts.add_noise_rspecifier.empty()) { - // For the input example, we firstly random choose an noise example. 
- int32 num_noise_egs = list_noise_egs.size(); - int32 index_noise_eg = RandInt(0, num_noise_egs - 1); - std::string key_noise_eg = list_noise_egs[index_noise_eg]; - - RandomAccessNnetExampleReader noise_random_reader(perturb_opts.add_noise_rspecifier); - const NnetExample &noise_eg = noise_random_reader.Value(key_noise_eg); - const NnetIo &noise_eg_io = noise_eg.io[0]; - Matrix noise_eg_mat; - noise_eg_io.features.CopyToMat(&noise_eg_mat); - + if (!perturb_opts.add_noise.empty()) { // We call the PerturbExample to implement adding distortion. - PerturbExample(perturb_opts, input_eg_mat, noise_eg_mat, &perturb_eg_mat); + PerturbExample(perturb_opts, input_eg_mat, &perturb_eg_mat); } else { ApplyPerturbation(perturb_opts, input_eg_mat, noise_mat, &perturb_eg_mat); } From 12ade04d2fb80d7670757b215cc13727575ec2bc Mon Sep 17 00:00:00 2001 From: Hang Lyu Date: Wed, 7 Dec 2016 17:55:58 -0500 Subject: [PATCH 06/23] remove private variable noise_egs_ from class and the corresponding change --- src/feat/signal-distort.cc | 47 ++++++++++++++++---------------------- src/feat/signal-distort.h | 22 ++++++------------ 2 files changed, 27 insertions(+), 42 deletions(-) diff --git a/src/feat/signal-distort.cc b/src/feat/signal-distort.cc index 9cc1a19cb6b..3803b904303 100644 --- a/src/feat/signal-distort.cc +++ b/src/feat/signal-distort.cc @@ -76,33 +76,30 @@ void TimeStretch(const MatrixBase &input_egs, // the signal level of the orginial signal. According to SNR, we rescale the noise // and add it. So that the perturbed signal is created. void PerturbXvectorSignal::ApplyAdditiveNoise(const MatrixBase &input_eg, - const Matrix &noise_eg, + const Matrix &noise_mat, Matrix *perturbed_eg) { // In the version, we ask the noise_cols == input_cols. 
int32 input_rows = input_eg.NumRows(), input_cols = input_eg.NumCols(); - KALDI_ASSERT(noise_eg.NumCols() == input_cols); + KALDI_ASSERT(noise_mat.NumCols() == input_cols); - // According to the rows of noise_eg, form the noise_mat - // repeat the noise_eg blocks to have a new block which is longer than input_eg - - // As the noise_eg is very huge and the input_eg is small normally, - // so we'd better not reload the "noise_eg" matrix + // As the noise_mat is very huge and the input_eg is small normally, + // so we'd better not reload the "noise_mat" matrix // select the noise range Matrix selected_noise_mat; selected_noise_mat.Resize(input_rows, input_cols); - int32 noise_rows = noise_eg.NumRows(); + int32 noise_rows = noise_mat.NumRows(); int32 start_row_ind = RandInt(0, noise_rows - input_rows); - if (noise_eg.NumRows() < input_rows) { + if (noise_mat.NumRows() < input_rows) { int32 indices[input_rows]; for (int32 i=0; i < input_rows; ++i) { - indices[i] = (start_row_ind + i) % noise_eg.NumRows(); + indices[i] = (start_row_ind + i) % noise_mat.NumRows(); } - selected_noise_mat.CopyRows(noise_eg, indices); + selected_noise_mat.CopyRows(noise_mat, indices); } else { - selected_noise_mat.AddMat(1.0, noise_eg.Range(start_row_ind, input_rows, + selected_noise_mat.AddMat(1.0, noise_mat.Range(start_row_ind, input_rows, 0, input_cols)); } @@ -127,40 +124,36 @@ void PerturbXvectorSignal::ApplyAdditiveNoise(const MatrixBase &input void PerturbXvectorSignal::ApplyDistortion(const MatrixBase &input_egs, Matrix *perturb_egs) { - // conduct ApplyAdditiveNoise if (!opts_.add_noise.empty()) { // choose a noise from the noise.scp/ark // 1) we need to record the keys of noise_egs - std::vector list_noise_egs; + std::vector noise_list; SequentialBaseFloatMatrixReader noise_seq_reader(opts_.add_noise); for (; !noise_seq_reader.Done(); noise_seq_reader.Next()) { std::string key = noise_seq_reader.Key(); - list_noise_egs.push_back(key); + noise_list.push_back(key); } 
noise_seq_reader.Close(); // 2) we random choose an noise example - int32 num_noise_egs = list_noise_egs.size(); - int32 index_noise_eg = RandInt(0, num_noise_egs - 1); - std::string key_noise_eg = list_noise_egs[index_noise_eg]; + int32 num_noises = noise_list.size(); + int32 noise_index = RandInt(0, num_noises - 1); + std::string noise_name = noise_list[noise_index]; RandomAccessBaseFloatMatrixReader noise_random_reader(opts_.add_noise); - Matrix noise_eg_mat = noise_random_reader.Value(key_noise_eg); - SetNoiseEgs(noise_eg_mat); + Matrix noise_mat = noise_random_reader.Value(noise_name); - ApplyAdditiveNoise(input_egs, *noise_egs_, perturb_egs); + // 3) conduct ApplyAdditiveNoise + ApplyAdditiveNoise(input_egs, noise_mat, perturb_egs); // conduct others // TODO - } else { // deal with the opts_.noise_egs situation - // TODO - } + } } -// This function is a entrance. It calls ApplyDistortion to apply different -// type of distortions on input. +// This function calls ApplyDistortion to apply different type of perturbations. 
void PerturbExample(XvectorPerturbOptions opts, const Matrix &input_egs, Matrix *perturbed_egs) { - //new a PerturbXvectorSignal object and call ApplyDistortion + // new a PerturbXvectorSignal object and call ApplyDistortion PerturbXvectorSignal perturb_egs(opts); perturb_egs.ApplyDistortion(input_egs, perturbed_egs); } diff --git a/src/feat/signal-distort.h b/src/feat/signal-distort.h index fb7729da9f4..8e45c066cd3 100644 --- a/src/feat/signal-distort.h +++ b/src/feat/signal-distort.h @@ -51,7 +51,7 @@ struct XvectorPerturbOptions { frame_dim(80), negation_prop(0.0), rand_distort(false), - snr(10) { } + snr(10.0) { } void Register(OptionsItf *opts) { opts->Register("max-shift", &max_shift, "Maximum random shift relative" "to frame length applied to egs."); @@ -63,31 +63,23 @@ struct XvectorPerturbOptions { opts->Register("noise-egs", &noise_egs, "If supplied, the additive noise is added to input signal."); opts->Register("rand_distort", &rand_distort, "If true, the signal is slightly changes" "using some designed FIR filter with no zeros."); - opts->Register("add-noise", &add_noise, "specify a file contains some noise egs"); - opts->Register("SNR",&snr,"specify a Signal to Noise Ration. We will scale the noise according" - "to the original signal and SNR. Normally, it's a non-zero number between -30 and 30" - "default=10"); + opts->Register("add-noise", &add_noise, "Noise rspecifier for additive noises, if " + "nonempty, the additive noise randomly selected and added to input egs."); + opts->Register("SNR",&snr,"Specify a Signal to Noise Ration. We will scale the noise according " + "to the original signal and SNR. 
Normally, it's a non-zero number between -30 and 30" + "default=10"); } }; class PerturbXvectorSignal { public: PerturbXvectorSignal(XvectorPerturbOptions opts): opts_(opts) { }; - inline void SetNoiseEgs(const Matrix &noise_egs) { - noise_egs_ = &noise_egs; - } void ApplyDistortion(const MatrixBase &input_egs, Matrix *perturb_egs); private: XvectorPerturbOptions opts_; - // if we want use many examples in once ApplyDistortion, we can expand the point - // to a point vector. - const Matrix *noise_egs_; - // I know we can use noise_egs_ instead of noise_eg parameter in this function, - // But I keep it. Because we may expand the point to a point vector and choose - // one kind noise to call ApplyAdditiveNoise. void ApplyAdditiveNoise(const MatrixBase &input_eg, - const Matrix &noise_eg, + const Matrix &noise_mat, Matrix *perturbed_eg); }; From 9bec26d6f932cac52e79cb4cd417c046e53c16b4 Mon Sep 17 00:00:00 2001 From: Hang Lyu Date: Wed, 7 Dec 2016 23:04:20 -0500 Subject: [PATCH 07/23] modify the main program --- src/feat/signal-distort.h | 6 +- .../nnet3-xvector-signal-perturb-egs.cc | 90 ++----------------- 2 files changed, 7 insertions(+), 89 deletions(-) diff --git a/src/feat/signal-distort.h b/src/feat/signal-distort.h index 8e45c066cd3..ccbaa3241cc 100644 --- a/src/feat/signal-distort.h +++ b/src/feat/signal-distort.h @@ -42,7 +42,6 @@ struct XvectorPerturbOptions { int32 frame_dim; int32 negation_prop; bool rand_distort; - std::string noise_egs; std::string add_noise; BaseFloat snr; @@ -53,14 +52,13 @@ struct XvectorPerturbOptions { rand_distort(false), snr(10.0) { } void Register(OptionsItf *opts) { - opts->Register("max-shift", &max_shift, "Maximum random shift relative" - "to frame length applied to egs."); + opts->Register("max-shift", &max_shift, "Maximum random shift relative " + "to frame length applied to egs."); opts->Register("max-speed-perturb", &max_time_stretch, "Max speed perturbation applied on egs."); opts->Register("frame-dim", &frame_dim, "The 
numebr of samples in input frame as product of frame_length by samp_freq."); opts->Register("negation-prop", &negation_prop, "This proportion of the input value is randomly negated."); - opts->Register("noise-egs", &noise_egs, "If supplied, the additive noise is added to input signal."); opts->Register("rand_distort", &rand_distort, "If true, the signal is slightly changes" "using some designed FIR filter with no zeros."); opts->Register("add-noise", &add_noise, "Noise rspecifier for additive noises, if " diff --git a/src/xvectorbin/nnet3-xvector-signal-perturb-egs.cc b/src/xvectorbin/nnet3-xvector-signal-perturb-egs.cc index 59c08c23002..351277482a8 100644 --- a/src/xvectorbin/nnet3-xvector-signal-perturb-egs.cc +++ b/src/xvectorbin/nnet3-xvector-signal-perturb-egs.cc @@ -22,70 +22,6 @@ #include "feat/signal-distort.h" #include "nnet3/nnet-example.h" #include "nnet3/nnet-example-utils.h" -namespace kaldi { -namespace nnet3 { - -// This function applies different type of perturbation to input_egs. -// random distortion of inputs, random shifts, adding additive noise, -// random time stretch and random negations are different type of -// distortions used in this function. -void ApplyPerturbation(XvectorPerturbOptions opts, - const Matrix &input_egs, - Matrix *noise_egs, - Matrix *perturb_egs) { - - PerturbXvectorSignal perturb_xvector(opts); - - Matrix shifted_egs(input_egs); - // Generate random shift samples to shift egs. - if (opts.max_shift != 0.0) { - int32 max_shift_int = static_cast(opts.max_shift * opts.frame_dim); - // shift input_egs using random shift. - int32 eg_dim = input_egs.NumCols() - opts.frame_dim, - shift = RandInt(0, max_shift_int); - shifted_egs.CopyFromMat(input_egs.Range(0, input_egs.NumRows(), shift, eg_dim)); - } - - Matrix rand_distort_shifted_egs(shifted_egs); - if (opts.rand_distort) { - // randomly generate an zero-phase FIR filter with no zeros. 
- // In future, we can select trucated part of room impluse response - // and convolve it with input_egs. - ////perturb_xvector.ComputeAndApplyRandDistortion(shifted_egs, - //// &rand_distort_shifted_egs); - } - - if (noise_egs) { - // select random block of noise egs and add to input_egs - // number of additive noises should be larger than number of input-egs. - KALDI_ASSERT(noise_egs->NumRows() >= input_egs.NumRows()); - if (noise_egs->NumRows() < input_egs.NumRows()) { - // repeat the noise_egs_mat blocks to have same length block - // and randomly perturb the rows. - } else { - // Select random submatrix out of noise_egs and add it to perturb_egs. - // we should shuffle noise_egs before passing them to this binary. - int32 start_row_ind = RandInt(0, noise_egs->NumRows() - input_egs.NumRows()), - start_col_ind = RandInt(0, noise_egs->NumCols() - input_egs.NumCols()); - rand_distort_shifted_egs.AddMat(1.0, noise_egs->Range(start_row_ind, input_egs.NumRows(), - start_col_ind, input_egs.NumCols())); - } - } - // Perturb speed of signal egs - Matrix warped_distorted_shifted_egs(rand_distort_shifted_egs); - ////if (opts.max_time_stretch != 0.0) - //// perturb_xvector.TimeStretch(rand_distort_shifted_egs, - //// &warped_distorted_shifted_egs); - - // If nagation is true, the sample values are randomly negated - // with some probability. - ////if (opts.negation) { - - ////} -} - -} // end of namespace nnet3 -} // end of namespace kaldi int main(int argc, char *argv[]) { try { @@ -100,12 +36,11 @@ int main(int argc, char *argv[]) { "such as additive noise, negation, random time shifts or random distortion.\n" "Usage: nnet3-xvector-signal-perturb-egs [options...] 
\n" "e.g.\n" - "nnet3-xvector-signal-perturb-egs --noise-egs=noise.egs\n" - "--max-shift=0.2 --max-speed-perturb=0.1 --negation=true\n" - "--add-noise=noise.scp --snr=10\n" + "nnet3-xvector-signal-perturb-egs --max-shift=0.2" + " --max-speed-perturb=0.1 --negation=true --add-noise=noise.scp --snr=10\n" "ark:input.egs akr:distorted.egs\n"; - ParseOptions po(usage); + ParseOptions po(usage); XvectorPerturbOptions perturb_opts; perturb_opts.Register(&po); @@ -124,15 +59,6 @@ int main(int argc, char *argv[]) { int64 num_read = 0, num_written = 0; - Matrix *noise_mat = NULL; - // read additive noise egs if it is specified. - if (!perturb_opts.noise_egs.empty()) { - SequentialNnetExampleReader noise_reader(perturb_opts.noise_egs); - const NnetExample &noise_egs = noise_reader.Value(); - const NnetIo &noise_io = noise_egs.io[0]; - noise_io.features.CopyToMat(noise_mat); - } - for (; !example_reader.Done(); example_reader.Next(), num_read++) { std::string key = example_reader.Key(); const NnetExample &input_eg = example_reader.Value(); @@ -142,14 +68,8 @@ int main(int argc, char *argv[]) { input_eg_mat; input_eg_io.features.CopyToMat(&input_eg_mat); - // add - if (!perturb_opts.add_noise.empty()) { - // We call the PerturbExample to implement adding distortion. 
- PerturbExample(perturb_opts, input_eg_mat, &perturb_eg_mat); - } else { - ApplyPerturbation(perturb_opts, input_eg_mat, noise_mat, &perturb_eg_mat); - } - // add-end + PerturbExample(perturb_opts, input_eg_mat, &perturb_eg_mat); + perturb_eg->io.resize(1.0); perturb_eg->io[0].features.SwapFullMatrix(&perturb_eg_mat); example_writer.Write(key, *perturb_eg); From fb78d055adef7e7bb1553a1f117907caabf93ba8 Mon Sep 17 00:00:00 2001 From: Hang Lyu Date: Thu, 8 Dec 2016 18:08:52 -0500 Subject: [PATCH 08/23] fix the signal distortion --- src/feat/signal-distort.cc | 59 +++++++++---------- src/feat/signal-distort.h | 17 +++--- .../nnet3-xvector-signal-perturb-egs.cc | 3 +- 3 files changed, 39 insertions(+), 40 deletions(-) diff --git a/src/feat/signal-distort.cc b/src/feat/signal-distort.cc index 3803b904303..c71f8e967db 100644 --- a/src/feat/signal-distort.cc +++ b/src/feat/signal-distort.cc @@ -72,6 +72,17 @@ void TimeStretch(const MatrixBase &input_egs, perturb_egs->CopyFromMat(out_mat); } +PerturbXvectorSignal::PerturbXvectorSignal(XvectorPerturbOptions opts): opts_(opts) { + if (!opts_.add_noise.empty()) { + // initialize the noise_list_ + SequentialBaseFloatMatrixReader noise_seq_reader(opts_.add_noise); + for (; !noise_seq_reader.Done(); noise_seq_reader.Next()) { + std::string key = noise_seq_reader.Key(); + noise_list_.push_back(key); + } + noise_seq_reader.Close(); + } +} // This function add the noise to the orginial signal. We should not normalize // the signal level of the orginial signal. According to SNR, we rescale the noise // and add it. So that the perturbed signal is created. 
@@ -122,40 +133,26 @@ void PerturbXvectorSignal::ApplyAdditiveNoise(const MatrixBase &input perturbed_eg->AddMat(scale, selected_noise_mat); } -void PerturbXvectorSignal::ApplyDistortion(const MatrixBase &input_egs, - Matrix *perturb_egs) { - if (!opts_.add_noise.empty()) { - // choose a noise from the noise.scp/ark - // 1) we need to record the keys of noise_egs - std::vector noise_list; - SequentialBaseFloatMatrixReader noise_seq_reader(opts_.add_noise); - for (; !noise_seq_reader.Done(); noise_seq_reader.Next()) { - std::string key = noise_seq_reader.Key(); - noise_list.push_back(key); - } - noise_seq_reader.Close(); - - // 2) we random choose an noise example - int32 num_noises = noise_list.size(); - int32 noise_index = RandInt(0, num_noises - 1); - std::string noise_name = noise_list[noise_index]; - RandomAccessBaseFloatMatrixReader noise_random_reader(opts_.add_noise); - Matrix noise_mat = noise_random_reader.Value(noise_name); - - // 3) conduct ApplyAdditiveNoise - ApplyAdditiveNoise(input_egs, noise_mat, perturb_egs); - // conduct others - // TODO - } +void PerturbXvectorSignal::ApplyDistortion(const MatrixBase &input_eg, + Matrix *perturbed_eg) { + // we random choose an noise example + int32 num_noises = noise_list_.size(); + int32 noise_index = RandInt(0, num_noises - 1); + std::string noise_name = noise_list_[noise_index]; + RandomAccessBaseFloatMatrixReader noise_random_reader(opts_.add_noise); + Matrix noise_mat = noise_random_reader.Value(noise_name); + + // conduct ApplyAdditiveNoise + ApplyAdditiveNoise(input_eg, noise_mat, perturbed_eg); + // conduct others + // TODO } // This function calls ApplyDistortion to apply different type of perturbations. 
-void PerturbExample(XvectorPerturbOptions opts, - const Matrix &input_egs, - Matrix *perturbed_egs) { - // new a PerturbXvectorSignal object and call ApplyDistortion - PerturbXvectorSignal perturb_egs(opts); - perturb_egs.ApplyDistortion(input_egs, perturbed_egs); +void PerturbExample(PerturbXvectorSignal &eg_perturber, + const Matrix &input_eg, + Matrix *perturbed_eg) { + eg_perturber.ApplyDistortion(input_eg, perturbed_eg); } } // end of namespace kaldi diff --git a/src/feat/signal-distort.h b/src/feat/signal-distort.h index ccbaa3241cc..b8fc1542b4d 100644 --- a/src/feat/signal-distort.h +++ b/src/feat/signal-distort.h @@ -71,14 +71,15 @@ struct XvectorPerturbOptions { class PerturbXvectorSignal { public: - PerturbXvectorSignal(XvectorPerturbOptions opts): opts_(opts) { }; - void ApplyDistortion(const MatrixBase &input_egs, - Matrix *perturb_egs); - private: - XvectorPerturbOptions opts_; + PerturbXvectorSignal(XvectorPerturbOptions opts); + void ApplyDistortion(const MatrixBase &input_eg, + Matrix *perturbed_eg); void ApplyAdditiveNoise(const MatrixBase &input_eg, const Matrix &noise_mat, Matrix *perturbed_eg); + private: + XvectorPerturbOptions opts_; + std::vector noise_list_; }; @@ -99,9 +100,9 @@ void TimeStretch(const MatrixBase &input_egs, BaseFloat max_time_stretch, Matrix *perturb_egs); -void PerturbExample(XvectorPerturbOptions opts, - const Matrix &input_egs, - Matrix *perturbed_egs); +void PerturbExample(PerturbXvectorSignal &eg_perturber, + const Matrix &input_eg, + Matrix *perturbed_eg); } // end of namespace kaldi #endif // KALDI_SIGNAL_DISTORT_H_ diff --git a/src/xvectorbin/nnet3-xvector-signal-perturb-egs.cc b/src/xvectorbin/nnet3-xvector-signal-perturb-egs.cc index 351277482a8..d32c9a66d60 100644 --- a/src/xvectorbin/nnet3-xvector-signal-perturb-egs.cc +++ b/src/xvectorbin/nnet3-xvector-signal-perturb-egs.cc @@ -59,6 +59,7 @@ int main(int argc, char *argv[]) { int64 num_read = 0, num_written = 0; + PerturbXvectorSignal 
eg_perturber(perturb_opts); for (; !example_reader.Done(); example_reader.Next(), num_read++) { std::string key = example_reader.Key(); const NnetExample &input_eg = example_reader.Value(); @@ -68,7 +69,7 @@ int main(int argc, char *argv[]) { input_eg_mat; input_eg_io.features.CopyToMat(&input_eg_mat); - PerturbExample(perturb_opts, input_eg_mat, &perturb_eg_mat); + PerturbExample(eg_perturber, input_eg_mat, &perturb_eg_mat); perturb_eg->io.resize(1.0); perturb_eg->io[0].features.SwapFullMatrix(&perturb_eg_mat); From 48d51aae47d41e57f3bad8bb146fe018d5a00e9c Mon Sep 17 00:00:00 2001 From: Hang Lyu Date: Tue, 13 Dec 2016 20:24:10 -0500 Subject: [PATCH 09/23] write nnet3-fvector-get-egs.cc --- src/fvectorbin/Makefile | 25 ++++ src/fvectorbin/nnet3-fvector-get-egs.cc | 155 ++++++++++++++++++++++++ 2 files changed, 180 insertions(+) create mode 100644 src/fvectorbin/Makefile create mode 100644 src/fvectorbin/nnet3-fvector-get-egs.cc diff --git a/src/fvectorbin/Makefile b/src/fvectorbin/Makefile new file mode 100644 index 00000000000..7d826881cf6 --- /dev/null +++ b/src/fvectorbin/Makefile @@ -0,0 +1,25 @@ + +all: +EXTRA_CXXFLAGS = -Wno-sign-compare +include ../kaldi.mk + +LDFLAGS += $(CUDA_LDFLAGS) +LDLIBS += $(CUDA_LDLIBS) + +BINFILES = nnet3-fvector-get-egs + +OBJFILES = + +# Add this dependency to force cuda-compiled.o to be rebuilt when we reconfigure. 
+cuda-compiled.o: ../kaldi.mk + +TESTFILES = + +ADDLIBS = ../xvector/kaldi-xvector.a ../nnet3/kaldi-nnet3.a ../gmm/kaldi-gmm.a \ + ../decoder/kaldi-decoder.a ../lat/kaldi-lat.a ../hmm/kaldi-hmm.a \ + ../transform/kaldi-transform.a ../tree/kaldi-tree.a \ + ../thread/kaldi-thread.a ../feat/kaldi-feat.a ../cudamatrix/kaldi-cudamatrix.a \ + ../matrix/kaldi-matrix.a ../fstext/kaldi-fstext.a \ + ../util/kaldi-util.a ../base/kaldi-base.a + +include ../makefiles/default_rules.mk diff --git a/src/fvectorbin/nnet3-fvector-get-egs.cc b/src/fvectorbin/nnet3-fvector-get-egs.cc new file mode 100644 index 00000000000..2f7fdbfa748 --- /dev/null +++ b/src/fvectorbin/nnet3-fvector-get-egs.cc @@ -0,0 +1,155 @@ +// fvectorbin/nnet3-fvector-get-egs.cc + +// Copyright 2016 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "hmm/transition-model.h" +#include "hmm/posterior.h" +#include "nnet3/nnet-example.h" + +namespace kaldi { +namespace nnet3 { + + +static void ProcessFile(const MatrixBase &feats, + const std::string &utt_id, + bool compress, + int32 left_context, + int32 right_context, + int32 frames_per_eg, + int64 *num_frames_written, + int64 *num_egs_written, + NnetExampleWriter *example_writer) { + for (int32 t = 0; t < feats.NumRows(); t += frames_per_eg) { + + // actual_frames_per_eg is the number of frames in center. + // At the end of the file we pad with zero posteriors + // so that all examples have the same structure (prevents the need + // for recompilations). + int32 actual_frames_per_eg = std::min(frames_per_eg, + feats.NumRows() - t); + + int32 tot_frames = left_context + frames_per_eg + right_context; + + Matrix input_frames(tot_frames, feats.NumCols(), kUndefined); + + // Set up "input_frames". + for (int32 j = -left_context; j < frames_per_eg + right_context; j++) { + int32 t2 = j + t; + if (t2 < 0) t2 = 0; + if (t2 >= feats.NumRows()) t2 = feats.NumRows() - 1; + SubVector src(feats, t2), + dest(input_frames, j + left_context); + dest.CopyFromVec(src); + } + + NnetExample eg; + + // call the regular input "input". 
+ eg.io.push_back(NnetIo("input", -left_context, input_frames)); + + if (compress) { eg.Compress();} + + std::ostringstream os; + os << utt_id << "-" << t; + + std::string key = os.str(); // key is - + + *num_frames_written += actual_frames_per_eg; + *num_egs_written += 1; + + example_writer->Write(key, eg); + } +} + + +} // namespace nnet3 +} // namespace kaldi + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "Get frame-by-frame examples of data for nnet3 neural network training.\n" + "Essentially this is a format change from features into a special frame-by-frame format.\n" + "This program handles the common case where you have some input features\n" + "and convert them to fvector examples format\n" + "Note: In fvector version, there is no need for iVectors, posterior and labels.\n" + "\n" + "Usage: nnet3-fvector-get-egs [options] \n" + "\n" + "An example [where $feats expands to the actual features]:\n" + "nnet3-fvector-get-egs --left-context=12 --right-context=9 --compress=true \"$feats\" \\\n" + "\"ark:train.egs\"\n"; + + + bool compress = true; + int32 left_context = 0, right_context = 0, num_frames = 1; + + ParseOptions po(usage); + po.Register("compress", &compress, "If true, write egs in " + "compressed format."); + po.Register("left-context", &left_context, "Number of frames of left " + "context the neural net requires."); + po.Register("right-context", &right_context, "Number of frames of right " + "context the neural net requires."); + po.Register("num-frames", &num_frames, "Number of frames is central"); + + po.Read(argc, argv); + + if (po.NumArgs() != 2) { + po.PrintUsage(); + exit(1); + } + + std::string feature_rspecifier = po.GetArg(1), + examples_wspecifier = po.GetArg(2); + + // Read in all the training files. 
+ SequentialBaseFloatMatrixReader feat_reader(feature_rspecifier); + NnetExampleWriter example_writer(examples_wspecifier); + + int32 num_done = 0; + int64 num_frames_written = 0, num_egs_written = 0; + + for (; !feat_reader.Done(); feat_reader.Next()) { + std::string key = feat_reader.Key(); + const Matrix &feats = feat_reader.Value(); + ProcessFile(feats, key, compress, left_context, right_context, + num_frames, &num_frames_written, &num_egs_written, + &example_writer); + num_done++; + } + + KALDI_LOG << "Finished generating examples, " + << "successfully processed " << num_done + << " feature files, wrote " << num_egs_written << " examples, " + << " with " << num_frames_written << " egs in total."; + return (num_egs_written == 0 || num_done == 0 ? 1 : 0); + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} From 25f3e1b443a2f70800c788e0869ca4ff4287c461 Mon Sep 17 00:00:00 2001 From: Hang Lyu Date: Sun, 18 Dec 2016 23:26:36 -0500 Subject: [PATCH 10/23] about generate noise range --- .../nnet3/fvector/generate_noise_range.py | 127 ++++++++++++++++++ .../s5/steps/nnet3/fvector/lh_add_noise.sh | 41 ++++++ 2 files changed, 168 insertions(+) create mode 100644 egs/wsj/s5/steps/nnet3/fvector/generate_noise_range.py create mode 100644 egs/wsj/s5/steps/nnet3/fvector/lh_add_noise.sh diff --git a/egs/wsj/s5/steps/nnet3/fvector/generate_noise_range.py b/egs/wsj/s5/steps/nnet3/fvector/generate_noise_range.py new file mode 100644 index 00000000000..db4d35a4325 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/fvector/generate_noise_range.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python + +from __future__ import print_function +import re, os, argparse, sys, math, warnings, random + +parser = argparse.ArgumentParser(description="Generate n kinds of noise range for each original wav" + epilog="Called by steps/nnet3/fvector/lh_add_noise.sh") +parser.add_argument("--num-kind-range", type=int, default=4, + help="the number of kinds of noise ranges") 
+parser.add_argument("--min-additive-noise-len", type=float, default=2.0, + help="the minimum duration of each noise file") +parser.add_argument("--min-snr", type=int, default=0, + help="the minimum Signal-to-Noise Rate, the default=0") +parser.add_argument("--max-snr", type=int, default=-10, + help="the maximum Signal-to-Noise Rate, the default=-10") +parser.add_argument("--seed", type=int, default=-1, + help="Seed for random number generator") + +# now the positional arguments +parser.add_argument("wav_utt2dur", + help="utt2dur file of the original wav to be used as input (format is: " + " ") +parser.add_argument("noise_utt2dur", + help="utt2dur file of the noise wav to be used as input (format is: " + " ") +parser.add_argument("rangs_dir", + help="Name of ranges directory, exp/fxvector/ranges") + +print(' '.join(sys.argv)) + +args = parser.parse_args() + +## Check arguments +if args.min_snr < args.max_snr: + sys.exit("For SNR, the less numerical value is, the larger noise is. So --min-snr bigger " + "than --max-snr in numerical value.") + +random.seed(args.seed) + +# deal with the original wav utt2dur +f = open(args.wav_utt2dur, "r") +if f is None: + sys.exit("Error opening wav_utt2dur file " + str(args.wav_utt2dur)) +wav_utt_ids = [] +wav_lengths = [] +for line in f: + a = line.split() + if len(a) != 2: + sys.exit("bad line in wav_utt2dur file " + line) + wav_utt_ids.append(a[0]) + wav_lengths.append(a[1]) +f.close() + +wav_num_utts = len(wav_utt_ids) + +# deal with the noise wav utt2dur +f = open(args.noise_utt2dur, "r") +if f is None: + sys.exit("Error opening wav_utt2dur file " + str(args.wav_utt2dur)) +noise_utt_ids = [] +noise_lengths = [] +for line in f: + a = line.split() + if len(a) != 2: + sys.exit("bad line in noise_utt2dur file " + line); + if a[1] < args.min_additive_noise_len: + sys.exit("bad line in noise_utt2dur file " + line); + noise_utt_ids.append(a[0]) + noise_lengths.append(a[1]) +f.close() + +noise_num_utts = len(noise_utt_ids) + +# 
generate the range file for each original wav file +for i in range(0, wav_num_utts): + + # decide the number of noises which will be add to + current_wav_len = wav_lengths[i] + max_num_additive_noise = int(current_wav_len / args.min_additive_noise_len) + upperbound_num_additive_noise = min(max_num_additive_noise, noise_num_utts) + + # select a number from [1 ... upperbound_num_additive_noise] + num_additive_noise = random.randrange(1, upperbound_num_additive_noise + 1) + + # decide the length of each noise, minus 0.01 to prevent overstep + len_additive_noise = float('{:.2f}'.format(current_wav_len / num_additive)) - 0.01 + + # We generate $num_kind_range ranges + for j in range(0, args.num_kind_range): + + # create a file to record the ranges + f = open(args.rangs_dir + "/" + str(wav_utt_ids[i]) + ".range." + str(j), "w") + if f is None: + sys.exit("Error open file " + args.rangs_dir + "/" + str(wav_utt_ids[i]) + ".ranges." + str(j)) + + # generate range file + # format: wav_t_start, wav_t_end, noise_name, noise_t_start, noise_t_end, snr + for k in range(0, num_additive_noise): + wav_t_start = flat('{:.2f}'.format(k * len_additive_noise)) + + noise_index = random.randrange(0, noise_num_utts) + current_noise_name = noise_utt_ids[noise_index] + current_noise_len = noise_lengths[noise_index] + + upperbound_add_len = min(len_additive_noise, current_noise_len) + current_add_len = float('{:.2f}'.format(random.randrange(0, upperbound_add_len, 0.01))) + + noise_start_bound = float('{:.2f}'.format(current_noise_len - current_add_len)) + noise_t_start = float('{:.2f}'.format(random.randrange(0, noise_start_bound))) + noise_t_end = noise_t_start + current_add_len + + wav_t_end = wav_t_start + current_add_len + + current_snr = random.randrange(args.max_snr, args.min_snr) + + print("{0} {1} {2} {3} {4} {5} {6}".format(wav_t_start, + wav_t_end, + current_noise_name, + noise_t_start, + noise_t_end, + current_snr, + file=f) + f.close() + +print("generate_noise_range.py: finished 
generate the range files for all wav") + + diff --git a/egs/wsj/s5/steps/nnet3/fvector/lh_add_noise.sh b/egs/wsj/s5/steps/nnet3/fvector/lh_add_noise.sh new file mode 100644 index 00000000000..6b81af6c6ec --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/fvector/lh_add_noise.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +# Begin Configuration section +stage=0 +min_additive_noise_len=2 # the minimum duration of each noise file +num_kind_range=4 # the number of kinds of noise ranges +min_snr=0 # the minimum snr value +max_snr=0 # the maximum snr value +seed=-1 # set the random seed + +# End Configuration section + +data=$1 # contain wav.scp +noise=$2 # contain noise.scp +dir=$3 # eg: ranges/ + + +if [ ! -f $data/utt2dur ]; then + # get original clean wav's duration + utils/data/get_utt2dur.sh $data +fi + +if [ ! -f $noise/utt2dur ]; then + # get the duration of each noise file + utils/data/get_utt2dur.sh $noise +fi + +mkdir -p $dir/log +if [ $stage -le 0 ]; then + echo "$0: generate $num_kind_rage kinds of noise range for each original wav" + $cmd $dir/log/generate_noise_range.log \ + steps/nnet3/fvector/generate_noise_range.py \ + --num-kind-range=$num_kind_range \ + --min-additive-noise-len=$min_additive_noise_len \ + --min-snr=$min_snr \ + --max-snr=$max_snr \ + --seed=$seed \ + $data/utt2dur $noise/utt2dur $dir +fi + +exit 0 From 969d31fddf154630aa9b7a7ec5df50b9a37a6baa Mon Sep 17 00:00:00 2001 From: Hang Lyu Date: Thu, 22 Dec 2016 13:01:41 -0500 Subject: [PATCH 11/23] two version perturb signal --- .../fvector/generate_fixed_length_range.py | 185 +++++++++++++++++ .../fvector/generate_variable_length_range.py | 188 ++++++++++++++++++ .../nnet3-fvector-perturb-signal.cc | 179 +++++++++++++++++ 3 files changed, 552 insertions(+) create mode 100644 egs/wsj/s5/steps/nnet3/fvector/generate_fixed_length_range.py create mode 100644 egs/wsj/s5/steps/nnet3/fvector/generate_variable_length_range.py create mode 100644 src/fvectorbin/nnet3-fvector-perturb-signal.cc diff --git 
a/egs/wsj/s5/steps/nnet3/fvector/generate_fixed_length_range.py b/egs/wsj/s5/steps/nnet3/fvector/generate_fixed_length_range.py new file mode 100644 index 00000000000..90072c98a6a --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/fvector/generate_fixed_length_range.py @@ -0,0 +1,185 @@ +#!/usr/bin/env python + +# The function use to generate range file for fvector +# This is the fixed-length version +# The format is + +# For +# We except the last fragement, the length will be a fixed value T. + +# For +# It is randomly selected from noise list, which is longer than --min-additive-noise-len + +# For +# If the noise file is longer than fixed value. We randomly select the start point and +# the length will be fixed value T. +# If the noise file is shorter than T. We select the whole noise. + +# The control the rate of signal and noise. In the other word, scale the amplitude of noise. + +from __future__ import print_function +import re, os, argparse, sys, math, warnings, random + +parser = argparse.ArgumentParser(description="Generate N noise range files for each original wav. 
The file" + "which created by this python code will be supplied to fixed-length add " + "additive noise program.", + epilog="Called by steps/nnet3/fvector/add_noise.sh") +parser.add_argument("--num-kind-range", type=int, default=4, + help="the number of noise range files") +parser.add_argument("--min-additive-noise-len", type=float, default=2.0, + help="the minimum duration/length of each noise file") +parser.add_argument("--min-snr", type=int, default=0, + help="the minimum Signal-to-Noise Rate, the default=0") +parser.add_argument("--max-snr", type=int, default=-10, + help="the maximum Signal-to-Noise Rate, the default=-10") +parser.add_argument("--seed", type=int, default=-1, + help="Seed for random number generator") + +# now the positional arguments +parser.add_argument("wav_utt2dur", + help="utt2dur file of the original wav to be used as input (format is: " + " ") +parser.add_argument("noise_utt2dur", + help="utt2dur file of the noise wav to be used as input (format is: " + " ") +parser.add_argument("rangs_dir", + help="Name of ranges directory, exp/fxvector/ranges") + +print(' '.join(sys.argv)) + +args = parser.parse_args() + +## Check arguments +if args.min_snr < args.max_snr: + sys.exit("For SNR, the less numerical value is, the larger noise is. 
So --min-snr bigger " + "than --max-snr in numerical value.") + +random.seed(args.seed) + +# deal with the original wav utt2dur +# the information was stored in wav_utt_ids[], wav_lengths[] and wav_num_utts +f = open(args.wav_utt2dur, "r") +if f is None: + sys.exit("Error opening wav_utt2dur file " + str(args.wav_utt2dur)) +wav_utt_ids = [] +wav_lengths = [] +for line in f: + a = line.split() + if len(a) != 2: + sys.exit("Bad line in wav_utt2dur file " + line) + if float(a[1]) < args.min_additive_noise_len: + sys.exit("ERROR: The wav length is shorter than --min-additive-noise-len") + wav_utt_ids.append(a[0]) + wav_lengths.append(float(a[1])) +f.close() + +wav_num_utts = len(wav_utt_ids) + +# deal with the noise wav utt2dur +# remove the noise whose length < --min-additive-noise-len +num_error = 0 +num_done = 0 +f = open(args.noise_utt2dur, "r") +if f is None: + sys.exit("Error opening wav_utt2dur file " + str(args.wav_utt2dur)) +noise_utt_ids = [] +noise_lengths = [] +for line in f: + a = line.split() + if len(a) != 2: + sys.exit("bad line in noise_utt2dur file " + line); + if float(a[1]) < args.min_additive_noise_len: + num_error += 1 + continue + noise_utt_ids.append(a[0]) + noise_lengths.append(float(a[1])) + num_done += 1 +f.close() +noise_num_utts = len(noise_utt_ids) +noise_str = "Warning: There are " + str(num_error) + " noise files length smaller than " + \ + "--min-additive-noise-len, we remove it from the noise list. Now, there are " + \ + str(num_done) + " noise file." 
+sys.stdout.write( noise_str + '\n') + +num_error = 0 +num_done = 0 +# generate the range file for each original wav file +for i in range(0, wav_num_utts): + # decide the number of noises which will be add to + current_wav_len = wav_lengths[i] + max_num_additive_noise = int(current_wav_len / args.min_additive_noise_len) + + if max_num_additive_noise > noise_num_utts: + print( "Warning: The number of noise files or the --min-additive-noise-len is too small" ) + num_error += 1 + continue + + # We generate $num_kind_range ranges + for j in range(0, args.num_kind_range): + # select a number from [1 ... max_num_additive_noise] + num_additive_noise = random.randint(1, max_num_additive_noise) + + # decide the length of each noise, minus 0.01 to prevent overstep + len_additive_noise = float('{:.2f}'.format(current_wav_len / num_additive_noise)) - 0.01 + + # create a file to record the ranges + f = open(args.rangs_dir + "/" + str(wav_utt_ids[i]) + ".range." + str(j), "w") + if f is None: + sys.exit("Error open file " + args.rangs_dir + "/" + str(wav_utt_ids[i]) + ".ranges." 
+ str(j)) + # generate range file + # format: wav_t_start, wav_t_end, noise_name, noise_t_start, noise_t_end, snr + for k in range(0, num_additive_noise - 1): + wav_t_start = float('{:.2f}'.format(k * len_additive_noise)) + wav_t_end = wav_t_start + len_additive_noise + + noise_index = random.randrange(0, noise_num_utts) + current_noise_name = noise_utt_ids[noise_index] + current_noise_len = noise_lengths[noise_index] + if current_noise_len <= len_additive_noise: + noise_t_start = 0.0 + noise_t_end = current_noise_len + else : + noise_start_bound = float('{:.2f}'.format(current_noise_len - len_additive_noise)) + noise_t_start = float('{:.2f}'.format(random.uniform(0, noise_start_bound))) + noise_t_end = noise_t_start + len_additive_noise + + current_snr = random.randrange(args.max_snr, args.min_snr) + + print("{0} {1} {2} {3} {4} {5}".format(wav_t_start, + wav_t_end, + current_noise_name, + noise_t_start, + noise_t_end, + current_snr), + file=f) + # deal with the last noise, which cover the rest + k = num_additive_noise - 1 + wav_t_start = float('{:.2f}'.format(k * len_additive_noise)) + wav_t_end = current_wav_len + + noise_index = random.randrange(0, noise_num_utts) + current_noise_name = noise_utt_ids[noise_index] + current_noise_len = noise_lengths[noise_index] + + if current_noise_len <= (wav_t_end - wav_t_start): + noise_t_start = 0.0 + noise_t_end = current_noise_len + else : + noise_start_bound = float('{:.2f}'.format(current_noise_len - wav_t_end + wav_t_start)) + noise_t_start = float('{:.2f}'.format(random.uniform(0, noise_start_bound))) + noise_t_end = noise_t_start + wav_t_end - wav_t_start + + current_snr = random.randrange(args.max_snr, args.min_snr) + + print("{0} {1} {2} {3} {4} {5}".format(wav_t_start, + wav_t_end, + current_noise_name, + noise_t_start, + noise_t_end, + current_snr), + file=f) + + f.close() + num_done += 1 + +print('''generate_fixed_length_range.py: finished generate the range files for all wav. 
Compare with our expect, it lacks %d files. Now we totally have %d noise range files.''' %(num_error, num_done) ) diff --git a/egs/wsj/s5/steps/nnet3/fvector/generate_variable_length_range.py b/egs/wsj/s5/steps/nnet3/fvector/generate_variable_length_range.py new file mode 100644 index 00000000000..67d0fd0d5ad --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/fvector/generate_variable_length_range.py @@ -0,0 +1,188 @@ +#!/usr/bin/env python + +# The function use to generate range file for fvector +# This is the variable-length version +# The format is + +# For +# We except the last fragement, the length will be random. + +# For +# It is randomly selected from noise list, which is longer than --min-additive-noise-len + +# For +# If the noise file is longer than wav length. We randomly select the start point and +# the length will be the same as wav length. +# If the noise file is shorter than T. We select the whole noise. + +# For , it was used to control the amplitude of noise + +from __future__ import print_function +import re, os, argparse, sys, math, warnings, random + +parser = argparse.ArgumentParser(description="Generate N noise range files for each original wav. 
The file" + "which created by this python code will be supplied to variable-length " + "and additive noise program.", + epilog="Called by steps/nnet3/fvector/add_noise.sh") +parser.add_argument("--num-kind-range", type=int, default=4, + help="the number of noise range files") +parser.add_argument("--min-additive-noise-len", type=float, default=2.0, + help="the minimum duration/length of each noise file") +parser.add_argument("--min-snr", type=int, default=0, + help="the minimum Signal-to-Noise Rate, the default=0") +parser.add_argument("--max-snr", type=int, default=-10, + help="the maximum Signal-to-Noise Rate, the default=-10") +parser.add_argument("--seed", type=int, default=-1, + help="Seed for random number generator") + +# now the positional arguments +parser.add_argument("wav_utt2dur", + help="utt2dur file of the original wav to be used as input (format is: " + " ") +parser.add_argument("noise_utt2dur", + help="utt2dur file of the noise wav to be used as input (format is: " + " ") +parser.add_argument("rangs_dir", + help="Name of ranges directory, exp/fxvector/ranges") + +print(' '.join(sys.argv)) + +args = parser.parse_args() + +## Check arguments +if args.min_snr < args.max_snr: + sys.exit("For SNR, the less numerical value is, the larger noise is. 
So --min-snr bigger " + "than --max-snr in numerical value.") + +random.seed(args.seed) + +# deal with the original wav utt2dur +# the information was stored in wav_utt_ids[], wav_lengths[] and wav_num_utts +f = open(args.wav_utt2dur, "r") +if f is None: + sys.exit("Error opening wav_utt2dur file " + str(args.wav_utt2dur)) +wav_utt_ids = [] +wav_lengths = [] +for line in f: + a = line.split() + if len(a) != 2: + sys.exit("bad line in wav_utt2dur file " + line) + if float(a[1]) < args.min_additive_noise_len: + sys.exit("ERROR: The wav length is shorter than --min-additive-noise-len") + wav_utt_ids.append(a[0]) + wav_lengths.append(float(a[1])) +f.close() + +wav_num_utts = len(wav_utt_ids) + +# deal with the noise wav utt2dur +# remove the noise whose length < --min-additive-noise-len +num_error = 0 +num_done = 0 +f = open(args.noise_utt2dur, "r") +if f is None: + sys.exit("Error opening wav_utt2dur file " + str(args.wav_utt2dur)) +noise_utt_ids = [] +noise_lengths = [] +for line in f: + a = line.split() + if len(a) != 2: + sys.exit("bad line in noise_utt2dur file " + line); + if float(a[1]) < args.min_additive_noise_len: + num_error += 1 + continue + noise_utt_ids.append(a[0]) + noise_lengths.append(float(a[1])) + num_done += 1 +f.close() +noise_num_utts = len(noise_utt_ids) +noise_str = "Warning: There are " + str(num_error) + " noise files length smaller than " + \ + "--min-additive-noise-len, we remove it from the noise list. Now, there are " + \ + str(num_done) + " noise file." 
+sys.stdout.write( noise_str + '\n') + +num_error = 0 +num_done = 0 +# generate the range file for each original wav file +for i in range(0, wav_num_utts): + + # check the noise list has enough sample or not + current_wav_len = wav_lengths[i] + max_num_additive_noise = int(current_wav_len / args.min_additive_noise_len) + + if max_num_additive_noise > noise_num_utts: + print( "Warning: The number of noise files or the --min-additive-noise-len is too small" ) + num_error += 1 + continue + + # We generate $num_kind_range ranges + for j in range(0, args.num_kind_range): + + # create a file to record the ranges + f = open(args.rangs_dir + "/" + str(wav_utt_ids[i]) + ".range." + str(j), "w") + if f is None: + sys.exit("Error open file " + args.rangs_dir + "/" + str(wav_utt_ids[i]) + ".ranges." + str(j)) + # generate range file + # format: wav_t_start, wav_t_end, noise_name, noise_t_start, noise_t_end, snr + the_rest = current_wav_len + wav_t_start = 0.0 + wav_t_end = 0.0 + while (the_rest > float(args.min_additive_noise_len)): + # firstly, we randomly choose a kind of noise and snr + noise_index = random.randrange(0, noise_num_utts) + current_noise_name = noise_utt_ids[noise_index] + current_noise_len = noise_lengths[noise_index] + current_snr = random.randrange(args.max_snr, args.min_snr) + + # Secondly, we randomly select a fragement of the noise file. 
+ noise_start_bound = float('{:.2f}'.format(current_noise_len - float(args.min_additive_noise_len))) + noise_t_start = float('{:.2f}'.format(random.uniform(0, noise_start_bound))) + noise_end_upperbound = float('{:.2f}'.format(noise_t_start + float(args.min_additive_noise_len))) + noise_end_lowerbound = float('{:.2f}'.format(min((noise_t_start + the_rest), current_noise_len))) + noise_t_end = float('{:.2f}'.format(random.uniform(noise_end_upperbound, noise_end_lowerbound))) + current_noise_length = noise_t_end - noise_t_start + + # Thirdly, we generate the start and end point of wav + wav_t_start = wav_t_end #the new start is the end of the last. + wav_t_end = wav_t_start + current_noise_length + + # Forthly, update the_rest + the_rest = the_rest - current_noise_length + + # Fifthly, print + print("{0} {1} {2} {3} {4} {5}".format(wav_t_start, + wav_t_end, + current_noise_name, + noise_t_start, + noise_t_end, + current_snr), + file=f) + # deal with the bit of wav + # firstly, we randomly choose a kind of noise and snr + noise_index = random.randrange(0, noise_num_utts) + current_noise_name = noise_utt_ids[noise_index] + current_noise_len = noise_lengths[noise_index] + current_snr = random.randrange(args.max_snr, args.min_snr) + + # Secondly, we randomly select a fragement of the noise file. + noise_start_bound = float('{:.2f}'.format(current_noise_len - the_rest)) + noise_t_start = float('{:.2f}'.format(random.uniform(0, noise_start_bound))) + noise_t_end = noise_t_start + the_rest + current_noise_length = noise_t_end - noise_t_start + + # Thirdly, we generate the start and end point of wav + wav_t_start = wav_t_end #the new start is the end of the last. 
+ wav_t_end = wav_t_start + current_noise_length + + # Forthly, print + print("{0} {1} {2} {3} {4} {5}".format(wav_t_start, + wav_t_end, + current_noise_name, + noise_t_start, + noise_t_end, + current_snr), + file=f) + f.close() + num_done += 1 + +print('''generate_fixed_length_range.py: finished generate the range files for all wav. Compare with our expect, it lacks %d files. Now we totally have %d noise range files.''' %(num_error, num_done) ) diff --git a/src/fvectorbin/nnet3-fvector-perturb-signal.cc b/src/fvectorbin/nnet3-fvector-perturb-signal.cc new file mode 100644 index 00000000000..02e13656b74 --- /dev/null +++ b/src/fvectorbin/nnet3-fvector-perturb-signal.cc @@ -0,0 +1,179 @@ +// fvector/nnet3-fvector-perturb-signal.cc + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "feat/wave-reader.h" +#include "feat/signal.h" + +namespace kaldi { + +struct NoiseController{ + BaseFloat wav_t_start_; + BaseFloat wav_t_end_; + std::string noise_uttid_; + BaseFloat noise_t_start_; + BaseFloat noise_t_end_; + BaseFloat snr_; + + NoiseController(BaseFloat wav_t_start, BaseFloat wav_t_end, std::string noise_uttid, + BaseFloat noise_t_start, BaseFloat noise_t_end, BaseFloat snr): + wav_t_start_(wav_t_start), wav_t_end_(wav_t_end), noise_uttid_(noise_uttid), + noise_t_start_(noise_t_start), noise_t_end_(noise_t_end), snr_(snr) { } +}; + + +} + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + + const char *usage = + "Perturb the wave files supplied via the specified noise-range file\n" + "Usage: nnet3-fvector-perturb-signal [options...] " + "\n" + "e.g.\n" + "nnet3-fvector-perturb-signal --noise-range-file=uttid.range.n --add-noise-list=" + "scp:noise.scp --input-channel=0 input.wav output.wav\n"; + + ParseOptions po(usage); + + std::string noise_range_file; + std::string noise_list_rspecifier; + int32 input_channel = 0; + + po.Register("noise-range-file",&noise_range_file, + "Provide a range file. We use the content in this file to control " + "the process of adding noise. The format of each line in this file " + ": " + " "); + po.Register("add-noise-list",&noise_list_rspecifier, + "There is a list of optional noise. 
It need to match the " + "--noise-range-file."); + po.Register("input-channel",&input_channel, + "Specifies the channel to be used in input file"); + + po.Read(argc, argv); + if (po.NumArgs() != 2) { + po.PrintUsage(); + exit(1); + } + + std::string input_wave_file = po.GetArg(1); + std::string output_wave_file = po.GetArg(2); + + // Genterate the Noise Controller list + std::vector controller; + if (noise_range_file != "") { + std::ifstream fi(noise_range_file.c_str()); + if (!fi) { + KALDI_ERR << "failed to open file " << noise_range_file; + } + BaseFloat wav_t_start; + BaseFloat wav_t_end; + std::string noise_uttid; + BaseFloat noise_t_start; + BaseFloat noise_t_end; + BaseFloat snr; + while (fi >> wav_t_start >> wav_t_end >> noise_uttid >> noise_t_start >> noise_t_end >> snr) { + controller.push_back(NoiseController(wav_t_start, wav_t_end, noise_uttid, + noise_t_start, noise_t_end, snr)); + } + } + + WaveData input_wave; + { + WaveHolder waveholder; + Input ki(input_wave_file); + waveholder.Read(ki.Stream()); + input_wave = waveholder.Value(); + } + + // about input wav + const Matrix &input_matrix = input_wave.Data(); + BaseFloat samp_freq_input = input_wave.SampFreq(); + int32 num_samp_input = input_matrix.NumCols(), // #samples in the input + num_input_channel = input_matrix.NumRows(); // #channels in the input + KALDI_VLOG(1) << "Sampling frequency of input: " << samp_freq_input + << "the number of samples: " << num_samp_input + << "the number of channels: " << num_input_channel; + KALDI_ASSERT(input_channel < num_input_channel); + Vector input(num_samp_input); + input.CopyRowFromMat(input_matrix, input_channel); + + // new output vector + Vector output(input); + + // about noise list + RandomAccessTableReader noise_reader(noise_list_rspecifier); + + // add noise + for (int i=0; i < controller.size(); ++i) { + const WaveData &noise_wav = noise_reader.Value(controller[i].noise_uttid_); + BaseFloat samp_freq_noise = noise_wav.SampFreq(); + 
KALDI_ASSERT(samp_freq_input == samp_freq_noise); + + const Matrix &noise_matrix = noise_wav.Data(); + int32 num_samp_noise = noise_matrix.NumCols(); + Vector noise(num_samp_noise); + noise.CopyRowFromMat(noise_matrix, 0); + + int32 input_start_point = samp_freq_input * controller[i].wav_t_start_; + int32 input_end_point = samp_freq_input * controller[i].wav_t_end_ - 1; + int32 noise_start_point = samp_freq_noise * controller[i].noise_t_start_; + int32 noise_end_point = samp_freq_noise * controller[i].noise_t_end_ - 1; + BaseFloat snr = controller[i].snr_; + + SubVector input_part(input, input_start_point, + input_end_point - input_start_point + 1); + SubVector noise_part(noise, noise_start_point, + noise_end_point - noise_start_point + 1); + Vector selected_noise(input_part.Dim()); + if (noise_part.Dim() < input_part.Dim()) { + int32 the_rest = selected_noise.Dim(); + while (the_rest > noise_part.Dim()) { + selected_noise.Range(selected_noise.Dim()-the_rest, + noise_part.Dim()).CopyFromVec(noise_part); + the_rest = the_rest - noise_part.Dim(); + } + selected_noise.Range(selected_noise.Dim()-the_rest, the_rest).CopyFromVec( + noise_part.Range(0, the_rest)); + } else { + selected_noise.CopyFromVec(noise_part); + } + + BaseFloat input_energy = VecVec(input_part, input_part); + BaseFloat noise_energy = VecVec(selected_noise, selected_noise); + BaseFloat scale_factor = sqrt(input_energy/ noise_energy/ (pow(10, snr/20)) ); + output.Range(input_start_point, input_part.Dim()).AddVec(scale_factor, selected_noise); + } + + Matrix out_matrix(1, num_samp_input); + out_matrix.CopyRowsFromVec(output); + + WaveData out_wave(samp_freq_input, out_matrix); + Output ko(output_wave_file, false); + out_wave.Write(ko.Stream()); + + return 0; + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} + From 51856f78bbddebf11ad0c3573d35adceaf7a7482 Mon Sep 17 00:00:00 2001 From: Hang Lyu Date: Wed, 28 Dec 2016 22:35:09 -0500 Subject: [PATCH 12/23] modify the python 
files to generate noise_range; fix the binary of perturb-signal; make a simple bash script --- egs/wsj/s5/steps/nnet3/fvector/add_noise.sh | 64 ++++ .../fvector/generate_fixed_length_range.py | 116 +++---- .../nnet3/fvector/generate_noise_range.py | 304 ++++++++++++++---- .../fvector/generate_variable_length_range.py | 112 ++++--- .../nnet3-fvector-perturb-signal.cc | 167 +++++----- 5 files changed, 520 insertions(+), 243 deletions(-) create mode 100644 egs/wsj/s5/steps/nnet3/fvector/add_noise.sh diff --git a/egs/wsj/s5/steps/nnet3/fvector/add_noise.sh b/egs/wsj/s5/steps/nnet3/fvector/add_noise.sh new file mode 100644 index 00000000000..4a7169a3b42 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/fvector/add_noise.sh @@ -0,0 +1,64 @@ +#!/bin/bash +# Copyright 2016 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 + +# Begin Configuration section. +stage=0 +cmd=run.pl +nj=4 +# Begain Configuration. +min_additive_noise_len=2.0 # the minimum duration of each noise file +num_kind_range=4 # the number of kinds of noise ranges +min_snr=-5 # the minimum snr value +max_snr=-15 # the maximum snr value +seed=-1 # set the random seed +variable_len_additive_noise=true #If true, generate the variable-length range files + #If false, generate the fixed-length range files +# End Configuration options. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh # source the path. +. 
parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "usage: steps/nnet3/fvector/add_noise.sh " + echo "e.g.: steps/nnet3/fvector/add_noise.sh data/train data/noise ranges" + echo "main options (for others, see top of script file)" + echo " --min-additive-noise-len # limit the minimum length of noise" + echo " --num-kind-range # number of noise range kinds" + echo " --variable-len-additive-noise (true|false) # decide fixed/variable version" + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs" +fi + +data=$1 # contain wav.scp +noise=$2 # contain noise.scp +dir=$3 # eg: ranges/ + + +if [ ! -f $data/utt2dur ]; then + # get original clean wav's duration + utils/data/get_utt2dur.sh $data +fi + +if [ ! -f $noise/utt2dur ]; then + # get the duration of each noise file + utils/data/get_utt2dur.sh $noise +fi + +mkdir -p $dir/log +if [ $stage -le 0 ]; then + echo "$0: generate $num_kind_rage kinds of noise range for each original wav" + $cmd $dir/log/generate_noise_range.log \ + steps/nnet3/fvector/generate_noise_range.py \ + --num-kind-range=$num_kind_range \ + --min-additive-noise-len=$min_additive_noise_len \ + --min-snr=$min_snr \ + --max-snr=$max_snr \ + --variable-len-additive-noise $variable_len_additive_noise \ + --seed=$seed \ + $data/utt2dur $noise/utt2dur $dir +fi + +exit 0 diff --git a/egs/wsj/s5/steps/nnet3/fvector/generate_fixed_length_range.py b/egs/wsj/s5/steps/nnet3/fvector/generate_fixed_length_range.py index 90072c98a6a..84a41541163 100644 --- a/egs/wsj/s5/steps/nnet3/fvector/generate_fixed_length_range.py +++ b/egs/wsj/s5/steps/nnet3/fvector/generate_fixed_length_range.py @@ -2,10 +2,12 @@ # The function use to generate range file for fvector # This is the fixed-length version -# The format is +# Each line of the range-file corrsponds to a kind of perturbed wav. In each line, +# we use comma to seperate different addnoise range. 
The format of each addnoise +# range is ::::: # For -# We except the last fragement, the length will be a fixed value T. +# Except the last fragement, the length will be a fixed value T. # For # It is randomly selected from noise list, which is longer than --min-additive-noise-len @@ -16,16 +18,18 @@ # If the noise file is shorter than T. We select the whole noise. # The control the rate of signal and noise. In the other word, scale the amplitude of noise. +# The snr will be randomly selected form the range (max-snr, min-snr). from __future__ import print_function import re, os, argparse, sys, math, warnings, random -parser = argparse.ArgumentParser(description="Generate N noise range files for each original wav. The file" - "which created by this python code will be supplied to fixed-length add " +parser = argparse.ArgumentParser(description="Generate a noise range files which contains " + "N lines corresponding to the number of kinds for each original wav. " + "The file created by this python code will be supplied to fixed-length add " "additive noise program.", epilog="Called by steps/nnet3/fvector/add_noise.sh") parser.add_argument("--num-kind-range", type=int, default=4, - help="the number of noise range files") + help="the number of expected addnoise kinds") parser.add_argument("--min-additive-noise-len", type=float, default=2.0, help="the minimum duration/length of each noise file") parser.add_argument("--min-snr", type=int, default=0, @@ -36,13 +40,13 @@ help="Seed for random number generator") # now the positional arguments -parser.add_argument("wav_utt2dur", - help="utt2dur file of the original wav to be used as input (format is: " +parser.add_argument("wav2dur", + help="wav2dur file of the original wav to be used as input (format is: " " ") -parser.add_argument("noise_utt2dur", - help="utt2dur file of the noise wav to be used as input (format is: " +parser.add_argument("noise2dur", + help="noise2dur file of the noise wav to be used as input (format is: " 
" ") -parser.add_argument("rangs_dir", +parser.add_argument("range_dir", help="Name of ranges directory, exp/fxvector/ranges") print(' '.join(sys.argv)) @@ -57,108 +61,112 @@ random.seed(args.seed) # deal with the original wav utt2dur -# the information was stored in wav_utt_ids[], wav_lengths[] and wav_num_utts -f = open(args.wav_utt2dur, "r") +# the information was stored in wav_ids[], wav_lengths[] and wav_num_utts +f = open(args.wav2dur, "r") if f is None: - sys.exit("Error opening wav_utt2dur file " + str(args.wav_utt2dur)) -wav_utt_ids = [] + sys.exit("Error opening wav2dur file " + str(args.wav2dur)) +wav_ids = [] wav_lengths = [] for line in f: a = line.split() if len(a) != 2: - sys.exit("Bad line in wav_utt2dur file " + line) + sys.exit("Bad line in wav2dur file " + line) if float(a[1]) < args.min_additive_noise_len: sys.exit("ERROR: The wav length is shorter than --min-additive-noise-len") - wav_utt_ids.append(a[0]) + wav_ids.append(a[0]) wav_lengths.append(float(a[1])) f.close() -wav_num_utts = len(wav_utt_ids) +wav_num_utts = len(wav_ids) # deal with the noise wav utt2dur # remove the noise whose length < --min-additive-noise-len num_error = 0 num_done = 0 -f = open(args.noise_utt2dur, "r") +f = open(args.noise2dur, "r") if f is None: - sys.exit("Error opening wav_utt2dur file " + str(args.wav_utt2dur)) -noise_utt_ids = [] + sys.exit("Error opening noise2dur file " + str(args.noise2dur)) +noise_ids = [] noise_lengths = [] for line in f: a = line.split() if len(a) != 2: - sys.exit("bad line in noise_utt2dur file " + line); + sys.exit("Bad line in noise2dur file " + line); if float(a[1]) < args.min_additive_noise_len: num_error += 1 continue - noise_utt_ids.append(a[0]) + noise_ids.append(a[0]) noise_lengths.append(float(a[1])) num_done += 1 f.close() -noise_num_utts = len(noise_utt_ids) +noise_num_utts = len(noise_ids) noise_str = "Warning: There are " + str(num_error) + " noise files length smaller than " + \ "--min-additive-noise-len, we remove it from 
the noise list. Now, there are " + \ str(num_done) + " noise file." sys.stdout.write( noise_str + '\n') -num_error = 0 -num_done = 0 -# generate the range file for each original wav file +num_fixed_error = 0 +num_fixed_done = 0 + for i in range(0, wav_num_utts): # decide the number of noises which will be add to current_wav_len = wav_lengths[i] max_num_additive_noise = int(current_wav_len / args.min_additive_noise_len) - + if max_num_additive_noise > noise_num_utts: print( "Warning: The number of noise files or the --min-additive-noise-len is too small" ) - num_error += 1 + num_fixed_error += 1 continue + # create a file to record the ranges + f = open(args.range_dir + "/" + str(wav_ids[i]) + ".noiserange", "w") + if f is None: + sys.exit("Error open file " + args.range_dir + "/" + str(wav_ids[i]) + ".noiserange") + # We generate $num_kind_range ranges for j in range(0, args.num_kind_range): + # print the perturbed wav id in the beginning of line + print("{0}-{1}".format(wav_ids[i], "perturbed-"+str(j+1)), end=" ", file=f) + # select a number from [1 ... max_num_additive_noise] num_additive_noise = random.randint(1, max_num_additive_noise) # decide the length of each noise, minus 0.01 to prevent overstep - len_additive_noise = float('{:.2f}'.format(current_wav_len / num_additive_noise)) - 0.01 - - # create a file to record the ranges - f = open(args.rangs_dir + "/" + str(wav_utt_ids[i]) + ".range." + str(j), "w") - if f is None: - sys.exit("Error open file " + args.rangs_dir + "/" + str(wav_utt_ids[i]) + ".ranges." 
+ str(j)) - # generate range file - # format: wav_t_start, wav_t_end, noise_name, noise_t_start, noise_t_end, snr + additive_noise_len = float('{:.2f}'.format(current_wav_len / num_additive_noise)) - 0.01 + + # generate one line of file + # format: wav_t_start:wav_t_end:noise_name:noise_t_start:noise_t_end:snr, for k in range(0, num_additive_noise - 1): - wav_t_start = float('{:.2f}'.format(k * len_additive_noise)) - wav_t_end = wav_t_start + len_additive_noise + wav_t_start = float('{:.2f}'.format(k * additive_noise_len)) + wav_t_end = wav_t_start + additive_noise_len noise_index = random.randrange(0, noise_num_utts) - current_noise_name = noise_utt_ids[noise_index] + current_noise_name = noise_ids[noise_index] current_noise_len = noise_lengths[noise_index] - if current_noise_len <= len_additive_noise: - noise_t_start = 0.0 - noise_t_end = current_noise_len + if current_noise_len <= additive_noise_len: + noise_t_start = 0.0 + noise_t_end = current_noise_len else : - noise_start_bound = float('{:.2f}'.format(current_noise_len - len_additive_noise)) + noise_start_bound = float('{:.2f}'.format(current_noise_len - additive_noise_len)) noise_t_start = float('{:.2f}'.format(random.uniform(0, noise_start_bound))) - noise_t_end = noise_t_start + len_additive_noise + noise_t_end = noise_t_start + additive_noise_len current_snr = random.randrange(args.max_snr, args.min_snr) - print("{0} {1} {2} {3} {4} {5}".format(wav_t_start, + print("{0}:{1}:{2}:{3}:{4}:{5}".format(wav_t_start, wav_t_end, current_noise_name, noise_t_start, noise_t_end, current_snr), - file=f) - # deal with the last noise, which cover the rest + end=",",file=f) + # deal with the last noise, which cover the rest k = num_additive_noise - 1 - wav_t_start = float('{:.2f}'.format(k * len_additive_noise)) + wav_t_start = float('{:.2f}'.format(k * additive_noise_len)) wav_t_end = current_wav_len noise_index = random.randrange(0, noise_num_utts) - current_noise_name = noise_utt_ids[noise_index] + 
current_noise_name = noise_ids[noise_index] current_noise_len = noise_lengths[noise_index] if current_noise_len <= (wav_t_end - wav_t_start): @@ -171,15 +179,13 @@ current_snr = random.randrange(args.max_snr, args.min_snr) - print("{0} {1} {2} {3} {4} {5}".format(wav_t_start, + print("{0}:{1}:{2}:{3}:{4}:{5}".format(wav_t_start, wav_t_end, current_noise_name, noise_t_start, noise_t_end, current_snr), file=f) - - f.close() - num_done += 1 - -print('''generate_fixed_length_range.py: finished generate the range files for all wav. Compare with our expect, it lacks %d files. Now we totally have %d noise range files.''' %(num_error, num_done) ) + num_fixed_done += 1 + f.close() +print('''generate_fixed_length_range.py: finished generate the range files for all wav. Compare with our expect, it lacks %d files. Now we totally have %d noise range files.''' %(num_fixed_error, num_fixed_done) ) diff --git a/egs/wsj/s5/steps/nnet3/fvector/generate_noise_range.py b/egs/wsj/s5/steps/nnet3/fvector/generate_noise_range.py index db4d35a4325..fa42e030e85 100644 --- a/egs/wsj/s5/steps/nnet3/fvector/generate_noise_range.py +++ b/egs/wsj/s5/steps/nnet3/fvector/generate_noise_range.py @@ -1,29 +1,67 @@ #!/usr/bin/env python +# The function use to generate range-file for fvector +# Each line of the range-file corrsponds to a kind of perturbed wav. In each line, +# we use comma to seperate different addnoise range. The format of each addnoise +# range is ::::: +# The line which starts with the asterisk(*) is the differences between two versions. + +# For the fixed-length version: +# *For +# *Except the last fragement, the length will be a fixed value T. +# For +# It is randomly selected from noise list, which is longer than --min-additive-noise-len +# *For +# *If the noise file is longer than fixed value. We randomly select the start point and +# *the length will be fixed value T. +# *If the noise file is shorter than T. We select the whole noise. 
+# The control the rate of signal and noise. In the other word, scale the amplitude of noise. +# The snr will be randomly selected form the range (max-snr, min-snr). + +# For the variable-length version: +# *For +# *Except the last fragement, the length will be random. +# For +# It is randomly selected from noise list, which is longer than --min-additive-noise-len +# *For +# *If the noise file is longer than wav length. We randomly select the start point and +# *the length will be the same as wav length. +# *If the noise file is shorter than T. We select the whole noise. +# For , it was used to control the amplitude of noise +# It will be randomly selected from the range (max-snr, min-snr) + + from __future__ import print_function import re, os, argparse, sys, math, warnings, random -parser = argparse.ArgumentParser(description="Generate n kinds of noise range for each original wav" - epilog="Called by steps/nnet3/fvector/lh_add_noise.sh") +parser = argparse.ArgumentParser(description="Generate a noise range-file which contains " + "N lines corresponding to the number of kinds for each original wav. 
" + "The file which created by this python code will be supplied to " + "add additive noise program.", + epilog="Called by steps/nnet3/fvector/add_noise.sh") parser.add_argument("--num-kind-range", type=int, default=4, - help="the number of kinds of noise ranges") + help="the number of expected addnoise kinds") parser.add_argument("--min-additive-noise-len", type=float, default=2.0, - help="the minimum duration of each noise file") -parser.add_argument("--min-snr", type=int, default=0, + help="the minimum duration/length of each noise file") +parser.add_argument("--min-snr", type=int, default=-5, help="the minimum Signal-to-Noise Rate, the default=0") -parser.add_argument("--max-snr", type=int, default=-10, +parser.add_argument("--max-snr", type=int, default=-15, help="the maximum Signal-to-Noise Rate, the default=-10") parser.add_argument("--seed", type=int, default=-1, help="Seed for random number generator") +parser.add_argument("--variable-len-additive-noise", type=str, + help="If true, generate the variable-length range files for each original wavform file." 
+ "If false, generate the fixed-length range files for each original wavform file.", + default="false", choices = ["false", "true"]) # now the positional arguments -parser.add_argument("wav_utt2dur", - help="utt2dur file of the original wav to be used as input (format is: " +parser.add_argument("wav2dur", + help="wav2dur file of the original wav to be used as input (format is: " " ") -parser.add_argument("noise_utt2dur", - help="utt2dur file of the noise wav to be used as input (format is: " +parser.add_argument("noise2dur", + help="noise2dur file of the noise wav to be used as input (format is: " " ") -parser.add_argument("rangs_dir", +parser.add_argument("range_dir", help="Name of ranges directory, exp/fxvector/ranges") print(' '.join(sys.argv)) @@ -38,90 +76,228 @@ random.seed(args.seed) # deal with the original wav utt2dur -f = open(args.wav_utt2dur, "r") +# the information was stored in wav_ids[], wav_lengths[] and wav_num_utts +f = open(args.wav2dur, "r") if f is None: - sys.exit("Error opening wav_utt2dur file " + str(args.wav_utt2dur)) -wav_utt_ids = [] + sys.exit("Error opening wav2dur file " + str(args.wav2dur)) +wav_ids = [] wav_lengths = [] for line in f: a = line.split() if len(a) != 2: - sys.exit("bad line in wav_utt2dur file " + line) - wav_utt_ids.append(a[0]) - wav_lengths.append(a[1]) + sys.exit("Bad line in wav2dur file " + line) + if float(a[1]) < args.min_additive_noise_len: + sys.exit("ERROR: The wav length is shorter than --min-additive-noise-len") + wav_ids.append(a[0]) + wav_lengths.append(float(a[1])) f.close() -wav_num_utts = len(wav_utt_ids) +wav_num_utts = len(wav_ids) # deal with the noise wav utt2dur -f = open(args.noise_utt2dur, "r") +# remove the noise whose length < --min-additive-noise-len +num_error = 0 +num_done = 0 +f = open(args.noise2dur, "r") if f is None: - sys.exit("Error opening wav_utt2dur file " + str(args.wav_utt2dur)) -noise_utt_ids = [] + sys.exit("Error opening noise2dur file " + str(args.noise2dur)) +noise_ids = [] 
noise_lengths = [] for line in f: a = line.split() if len(a) != 2: - sys.exit("bad line in noise_utt2dur file " + line); - if a[1] < args.min_additive_noise_len: - sys.exit("bad line in noise_utt2dur file " + line); - noise_utt_ids.append(a[0]) - noise_lengths.append(a[1]) + sys.exit("Bad line in noise2dur file " + line); + if float(a[1]) < args.min_additive_noise_len: + num_error += 1 + continue + noise_ids.append(a[0]) + noise_lengths.append(float(a[1])) + num_done += 1 f.close() +noise_num_utts = len(noise_ids) +noise_str = "Warning: There are " + str(num_error) + " noise files length smaller than " + \ + "--min-additive-noise-len, we remove it from the noise list. Now, there are " + \ + str(num_done) + " noise file." +sys.stdout.write( noise_str + '\n') -noise_num_utts = len(noise_utt_ids) +# This function generates the fixed-length range files +def GenerateFixedLengthRangeFile(): + num_fixed_error = 0 + num_fixed_done = 0 -# generate the range file for each original wav file -for i in range(0, wav_num_utts): - - # decide the number of noises which will be add to - current_wav_len = wav_lengths[i] - max_num_additive_noise = int(current_wav_len / args.min_additive_noise_len) - upperbound_num_additive_noise = min(max_num_additive_noise, noise_num_utts) - - # select a number from [1 ... 
upperbound_num_additive_noise] - num_additive_noise = random.randrange(1, upperbound_num_additive_noise + 1) + for i in range(0, wav_num_utts): + # decide the number of noises which will be add to + current_wav_len = wav_lengths[i] + max_num_additive_noise = int(current_wav_len / args.min_additive_noise_len) - # decide the length of each noise, minus 0.01 to prevent overstep - len_additive_noise = float('{:.2f}'.format(current_wav_len / num_additive)) - 0.01 + if max_num_additive_noise > noise_num_utts: + print( "Warning: The number of noise files or the --min-additive-noise-len is too small" ) + num_fixed_error += 1 + continue - # We generate $num_kind_range ranges - for j in range(0, args.num_kind_range): - # create a file to record the ranges - f = open(args.rangs_dir + "/" + str(wav_utt_ids[i]) + ".range." + str(j), "w") + f = open(args.range_dir + "/" + str(wav_ids[i]) + ".noiserange", "w") if f is None: - sys.exit("Error open file " + args.rangs_dir + "/" + str(wav_utt_ids[i]) + ".ranges." 
+ str(j)) - - # generate range file - # format: wav_t_start, wav_t_end, noise_name, noise_t_start, noise_t_end, snr - for k in range(0, num_additive_noise): - wav_t_start = flat('{:.2f}'.format(k * len_additive_noise)) - - noise_index = random.randrange(0, noise_num_utts) - current_noise_name = noise_utt_ids[noise_index] - current_noise_len = noise_lengths[noise_index] - - upperbound_add_len = min(len_additive_noise, current_noise_len) - current_add_len = float('{:.2f}'.format(random.randrange(0, upperbound_add_len, 0.01))) + sys.exit("Error open file " + args.range_dir + "/" + str(wav_ids[i]) + ".noiserange") + + # We generate $num_kind_range ranges + for j in range(0, args.num_kind_range): + # print the perturbed wav id in the beginning of line + print("{0}-{1}".format(wav_ids[i], "perturbed-"+str(j+1)), end=" ", file=f) - noise_start_bound = float('{:.2f}'.format(current_noise_len - current_add_len)) - noise_t_start = float('{:.2f}'.format(random.randrange(0, noise_start_bound))) - noise_t_end = noise_t_start + current_add_len + # select a number from [1 ... 
max_num_additive_noise] + num_additive_noise = random.randint(1, max_num_additive_noise) + + # decide the length of each noise, minus 0.01 to prevent overstep + additive_noise_len = float('{:.2f}'.format(current_wav_len / num_additive_noise)) - 0.01 - wav_t_end = wav_t_start + current_add_len + # generate one line of file + # format: wav_t_start:wav_t_end:noise_name:noise_t_start:noise_t_end:snr, + for k in range(0, num_additive_noise - 1): + wav_t_start = float('{:.2f}'.format(k * additive_noise_len)) + wav_t_end = wav_t_start + additive_noise_len + + noise_index = random.randrange(0, noise_num_utts) + current_noise_name = noise_ids[noise_index] + current_noise_len = noise_lengths[noise_index] + if current_noise_len <= additive_noise_len: + noise_t_start = 0.0 + noise_t_end = current_noise_len + else : + noise_start_bound = float('{:.2f}'.format(current_noise_len - additive_noise_len)) + noise_t_start = float('{:.2f}'.format(random.uniform(0, noise_start_bound))) + noise_t_end = noise_t_start + additive_noise_len - current_snr = random.randrange(args.max_snr, args.min_snr) + current_snr = random.randrange(args.max_snr, args.min_snr) - print("{0} {1} {2} {3} {4} {5} {6}".format(wav_t_start, + print("{0}:{1}:{2}:{3}:{4}:{5}".format(wav_t_start, wav_t_end, current_noise_name, noise_t_start, noise_t_end, - current_snr, + current_snr), + end=",",file=f) + # deal with the last noise, which cover the rest + k = num_additive_noise - 1 + wav_t_start = float('{:.2f}'.format(k * additive_noise_len)) + wav_t_end = current_wav_len + + noise_index = random.randrange(0, noise_num_utts) + current_noise_name = noise_ids[noise_index] + current_noise_len = noise_lengths[noise_index] + + if current_noise_len <= (wav_t_end - wav_t_start): + noise_t_start = 0.0 + noise_t_end = current_noise_len + else : + noise_start_bound = float('{:.2f}'.format(current_noise_len - wav_t_end + wav_t_start)) + noise_t_start = float('{:.2f}'.format(random.uniform(0, noise_start_bound))) + noise_t_end = 
noise_t_start + wav_t_end - wav_t_start + + current_snr = random.randrange(args.max_snr, args.min_snr) + + print("{0}:{1}:{2}:{3}:{4}:{5}".format(wav_t_start, + wav_t_end, + current_noise_name, + noise_t_start, + noise_t_end, + current_snr), file=f) + num_fixed_done += 1 f.close() + print('''generate_fixed_length_range.py: finished generate the range files for all wav. Compare with our expect, it lacks %d files. Now we totally have %d noise range files.''' %(num_fixed_error, num_fixed_done) ) + +# This function generates the variable-length range files +def GenerateVariableLengthRangeFile(): + num_variable_error = 0 + num_variable_done = 0 + + for i in range(0, wav_num_utts): + + # check the noise list has enough sample or not + current_wav_len = wav_lengths[i] + max_num_additive_noise = int(current_wav_len / args.min_additive_noise_len) + + if max_num_additive_noise > noise_num_utts: + print( "Warning: The number of noise files or the --min-additive-noise-len is too small" ) + num_variable_error += 1 + continue + + # create a file to record the ranges + f = open(args.range_dir + "/" + str(wav_ids[i]) + ".noiserange", "w") + if f is None: + sys.exit("Error open file " + args.range_dir + "/" + str(wav_ids[i]) + ".noiserange") -print("generate_noise_range.py: finished generate the range files for all wav") + # We generate $num_kind_range ranges + for j in range(0, args.num_kind_range): + # print the perturbed wav id in the beginning of line + print("{0}-{1}".format(wav_ids[i], "perturbed-"+str(j+1)), end=" ", file=f) + # generate range file + # format: wav_t_start:wav_t_end:noise_name:noise_t_start:noise_t_end:snr, + the_rest = current_wav_len + wav_t_start = 0.0 + wav_t_end = 0.0 + while (the_rest > float(args.min_additive_noise_len)): + # firstly, we randomly choose a kind of noise and snr + noise_index = random.randrange(0, noise_num_utts) + current_noise_name = noise_ids[noise_index] + current_noise_len = noise_lengths[noise_index] + current_snr = 
random.randrange(args.max_snr, args.min_snr) + + # Secondly, we randomly select a fragement of the noise file. + noise_start_bound = float('{:.2f}'.format(current_noise_len - float(args.min_additive_noise_len))) + noise_t_start = float('{:.2f}'.format(random.uniform(0, noise_start_bound))) + noise_end_upperbound = float('{:.2f}'.format(noise_t_start + float(args.min_additive_noise_len))) + noise_end_lowerbound = float('{:.2f}'.format(min((noise_t_start + the_rest), current_noise_len))) + noise_t_end = float('{:.2f}'.format(random.uniform(noise_end_upperbound, noise_end_lowerbound))) + current_noise_length = noise_t_end - noise_t_start + + # Thirdly, we generate the start and end point of wav + wav_t_start = wav_t_end #the new start is the end of the last. + wav_t_end = wav_t_start + current_noise_length + + # Forthly, update the_rest + the_rest = the_rest - current_noise_length + + # Fifthly, print + print("{0}:{1}:{2}:{3}:{4}:{5}".format(wav_t_start, + wav_t_end, + current_noise_name, + noise_t_start, + noise_t_end, + current_snr), + end=",",file=f) + # deal with the bit of wav + # firstly, we randomly choose a kind of noise and snr + noise_index = random.randrange(0, noise_num_utts) + current_noise_name = noise_ids[noise_index] + current_noise_len = noise_lengths[noise_index] + current_snr = random.randrange(args.max_snr, args.min_snr) + + # Secondly, we randomly select a fragement of the noise file. + noise_start_bound = float('{:.2f}'.format(current_noise_len - the_rest)) + noise_t_start = float('{:.2f}'.format(random.uniform(0, noise_start_bound))) + noise_t_end = noise_t_start + the_rest + current_noise_length = noise_t_end - noise_t_start + + # Thirdly, we generate the start and end point of wav + wav_t_start = wav_t_end #the new start is the end of the last. 
+ wav_t_end = wav_t_start + current_noise_length + + # Forthly, print + print("{0}:{1}:{2}:{3}:{4}:{5}".format(wav_t_start, + wav_t_end, + current_noise_name, + noise_t_start, + noise_t_end, + current_snr), + file=f) + num_variable_done += 1 + f.close() + print('''generate_fixed_length_range.py: finished generate the range files for all wav. Compare with our expect, it lacks %d files. Now we totally have %d noise range files.''' %(num_variable_error, num_variable_done) ) +if args.variable_len_additive_noise == "true": + GenerateVariableLengthRangeFile() +else: + GenerateFixedLengthRangeFile() diff --git a/egs/wsj/s5/steps/nnet3/fvector/generate_variable_length_range.py b/egs/wsj/s5/steps/nnet3/fvector/generate_variable_length_range.py index 67d0fd0d5ad..ef0fe9e02a2 100644 --- a/egs/wsj/s5/steps/nnet3/fvector/generate_variable_length_range.py +++ b/egs/wsj/s5/steps/nnet3/fvector/generate_variable_length_range.py @@ -2,10 +2,12 @@ # The function use to generate range file for fvector # This is the variable-length version -# The format is +# Each line of the range-file corrsponds to a kind of perturbed wav. In each line, +# we use comma to seperate different addnoise range. The format of each addnoise +# range is ::::: # For -# We except the last fragement, the length will be random. +# Except the last fragement, the length will be random. # For # It is randomly selected from noise list, which is longer than --min-additive-noise-len @@ -16,12 +18,14 @@ # If the noise file is shorter than T. We select the whole noise. # For , it was used to control the amplitude of noise +# It will be randomly selected from the range (max-snr, min-snr) from __future__ import print_function import re, os, argparse, sys, math, warnings, random -parser = argparse.ArgumentParser(description="Generate N noise range files for each original wav. 
The file" - "which created by this python code will be supplied to variable-length " +parser = argparse.ArgumentParser(description="Generate a noise range-file which cotains " + "N lines corresponding to the number of kinds for each original wav. " + "The file created by this python code will be supplied to variable-length " "and additive noise program.", epilog="Called by steps/nnet3/fvector/add_noise.sh") parser.add_argument("--num-kind-range", type=int, default=4, @@ -36,13 +40,13 @@ help="Seed for random number generator") # now the positional arguments -parser.add_argument("wav_utt2dur", - help="utt2dur file of the original wav to be used as input (format is: " +parser.add_argument("wav2dur", + help="wav2dur file of the original wav to be used as input (format is: " " ") -parser.add_argument("noise_utt2dur", - help="utt2dur file of the noise wav to be used as input (format is: " +parser.add_argument("noise2dur", + help="noise2dur file of the noise wav to be used as input (format is: " " ") -parser.add_argument("rangs_dir", +parser.add_argument("range_dir", help="Name of ranges directory, exp/fxvector/ranges") print(' '.join(sys.argv)) @@ -57,45 +61,45 @@ random.seed(args.seed) # deal with the original wav utt2dur -# the information was stored in wav_utt_ids[], wav_lengths[] and wav_num_utts -f = open(args.wav_utt2dur, "r") +# the information was stored in wav_ids[], wav_lengths[] and wav_num_utts +f = open(args.wav2dur, "r") if f is None: - sys.exit("Error opening wav_utt2dur file " + str(args.wav_utt2dur)) -wav_utt_ids = [] + sys.exit("Error opening wav2dur file " + str(args.wav2dur)) +wav_ids = [] wav_lengths = [] for line in f: a = line.split() if len(a) != 2: - sys.exit("bad line in wav_utt2dur file " + line) + sys.exit("Bad line in wav2dur file " + line) if float(a[1]) < args.min_additive_noise_len: sys.exit("ERROR: The wav length is shorter than --min-additive-noise-len") - wav_utt_ids.append(a[0]) + wav_ids.append(a[0]) wav_lengths.append(float(a[1])) 
f.close() -wav_num_utts = len(wav_utt_ids) +wav_num_utts = len(wav_ids) # deal with the noise wav utt2dur # remove the noise whose length < --min-additive-noise-len num_error = 0 num_done = 0 -f = open(args.noise_utt2dur, "r") +f = open(args.noise2dur, "r") if f is None: - sys.exit("Error opening wav_utt2dur file " + str(args.wav_utt2dur)) -noise_utt_ids = [] + sys.exit("Error opening wav2dur file " + str(args.noise2dur)) +noise_ids = [] noise_lengths = [] for line in f: a = line.split() if len(a) != 2: - sys.exit("bad line in noise_utt2dur file " + line); + sys.exit("bad line in noise2dur file " + line); if float(a[1]) < args.min_additive_noise_len: num_error += 1 continue - noise_utt_ids.append(a[0]) + noise_ids.append(a[0]) noise_lengths.append(float(a[1])) num_done += 1 f.close() -noise_num_utts = len(noise_utt_ids) +noise_num_utts = len(noise_ids) noise_str = "Warning: There are " + str(num_error) + " noise files length smaller than " + \ "--min-additive-noise-len, we remove it from the noise list. Now, there are " + \ str(num_done) + " noise file." 
@@ -104,6 +108,9 @@ num_error = 0 num_done = 0 # generate the range file for each original wav file +num_variable_error = 0 +num_variable_done = 0 + for i in range(0, wav_num_utts): # check the noise list has enough sample or not @@ -112,55 +119,58 @@ if max_num_additive_noise > noise_num_utts: print( "Warning: The number of noise files or the --min-additive-noise-len is too small" ) - num_error += 1 + num_variable_error += 1 continue + # create a file to record the ranges + f = open(args.range_dir + "/" + str(wav_ids[i]) + ".noiserange", "w") + if f is None: + sys.exit("Error open file " + args.range_dir + "/" + str(wav_ids[i]) + ".noiserange") + # We generate $num_kind_range ranges for j in range(0, args.num_kind_range): + # print the perturbed wav id in the beginning of line + print("{0}-{1}".format(wav_ids[i], "perturbed-"+str(j+1)), end=" ", file=f) - # create a file to record the ranges - f = open(args.rangs_dir + "/" + str(wav_utt_ids[i]) + ".range." + str(j), "w") - if f is None: - sys.exit("Error open file " + args.rangs_dir + "/" + str(wav_utt_ids[i]) + ".ranges." + str(j)) # generate range file - # format: wav_t_start, wav_t_end, noise_name, noise_t_start, noise_t_end, snr + # format: wav_t_start:wav_t_end:noise_name:noise_t_start:noise_t_end:snr, the_rest = current_wav_len wav_t_start = 0.0 wav_t_end = 0.0 while (the_rest > float(args.min_additive_noise_len)): - # firstly, we randomly choose a kind of noise and snr - noise_index = random.randrange(0, noise_num_utts) - current_noise_name = noise_utt_ids[noise_index] + # firstly, we randomly choose a kind of noise and snr + noise_index = random.randrange(0, noise_num_utts) + current_noise_name = noise_ids[noise_index] current_noise_len = noise_lengths[noise_index] current_snr = random.randrange(args.max_snr, args.min_snr) # Secondly, we randomly select a fragement of the noise file. 
noise_start_bound = float('{:.2f}'.format(current_noise_len - float(args.min_additive_noise_len))) noise_t_start = float('{:.2f}'.format(random.uniform(0, noise_start_bound))) - noise_end_upperbound = float('{:.2f}'.format(noise_t_start + float(args.min_additive_noise_len))) - noise_end_lowerbound = float('{:.2f}'.format(min((noise_t_start + the_rest), current_noise_len))) + noise_end_upperbound = float('{:.2f}'.format(noise_t_start + float(args.min_additive_noise_len))) + noise_end_lowerbound = float('{:.2f}'.format(min((noise_t_start + the_rest), current_noise_len))) noise_t_end = float('{:.2f}'.format(random.uniform(noise_end_upperbound, noise_end_lowerbound))) - current_noise_length = noise_t_end - noise_t_start + current_noise_length = noise_t_end - noise_t_start - # Thirdly, we generate the start and end point of wav + # Thirdly, we generate the start and end point of wav wav_t_start = wav_t_end #the new start is the end of the last. - wav_t_end = wav_t_start + current_noise_length + wav_t_end = wav_t_start + current_noise_length - # Forthly, update the_rest - the_rest = the_rest - current_noise_length + # Forthly, update the_rest + the_rest = the_rest - current_noise_length # Fifthly, print - print("{0} {1} {2} {3} {4} {5}".format(wav_t_start, + print("{0}:{1}:{2}:{3}:{4}:{5}".format(wav_t_start, wav_t_end, current_noise_name, noise_t_start, noise_t_end, current_snr), - file=f) - # deal with the bit of wav - # firstly, we randomly choose a kind of noise and snr + end=",",file=f) + # deal with the bit of wav + # firstly, we randomly choose a kind of noise and snr noise_index = random.randrange(0, noise_num_utts) - current_noise_name = noise_utt_ids[noise_index] + current_noise_name = noise_ids[noise_index] current_noise_len = noise_lengths[noise_index] current_snr = random.randrange(args.max_snr, args.min_snr) @@ -169,20 +179,20 @@ noise_t_start = float('{:.2f}'.format(random.uniform(0, noise_start_bound))) noise_t_end = noise_t_start + the_rest 
current_noise_length = noise_t_end - noise_t_start - + # Thirdly, we generate the start and end point of wav wav_t_start = wav_t_end #the new start is the end of the last. wav_t_end = wav_t_start + current_noise_length - + # Forthly, print - print("{0} {1} {2} {3} {4} {5}".format(wav_t_start, + print("{0}:{1}:{2}:{3}:{4}:{5}".format(wav_t_start, wav_t_end, current_noise_name, noise_t_start, noise_t_end, current_snr), - file=f) - f.close() - num_done += 1 - -print('''generate_fixed_length_range.py: finished generate the range files for all wav. Compare with our expect, it lacks %d files. Now we totally have %d noise range files.''' %(num_error, num_done) ) + file=f) + num_variable_done += 1 + f.close() +print('''generate_fixed_length_range.py: finished generate the range files for all wav. Compare with our expect, it lacks %d files. Now we totally have %d noise range files.''' %(num_variable_error, num_variable_done) ) + diff --git a/src/fvectorbin/nnet3-fvector-perturb-signal.cc b/src/fvectorbin/nnet3-fvector-perturb-signal.cc index 02e13656b74..09962e96a7b 100644 --- a/src/fvectorbin/nnet3-fvector-perturb-signal.cc +++ b/src/fvectorbin/nnet3-fvector-perturb-signal.cc @@ -36,6 +36,78 @@ struct NoiseController{ noise_t_start_(noise_t_start), noise_t_end_(noise_t_end), snr_(snr) { } }; +void GenerateController(std::vector &segments, + std::vector *controller) { + BaseFloat wav_t_start; + BaseFloat wav_t_end; + std::string noise_uttid; + BaseFloat noise_t_start; + BaseFloat noise_t_end; + BaseFloat snr; + for(int i=0; i < segments.size(); ++i) { + std::vector split_string; + SplitStringToVector(segments[i], ":", true, &split_string); + KALDI_ASSERT(split_string.size() == 6); + ConvertStringToReal(split_string[0], &wav_t_start); + ConvertStringToReal(split_string[1], &wav_t_end); + noise_uttid = split_string[2]; + ConvertStringToReal(split_string[3], &noise_t_start); + ConvertStringToReal(split_string[4], &noise_t_end); + ConvertStringToReal(split_string[5], &snr); + + 
controller->push_back(NoiseController(wav_t_start, wav_t_end, noise_uttid, + noise_t_start, noise_t_end, snr)); + } +} + +void ApplyNoise(std::string &noise_scp, const std::vector &controller, + const VectorBase &input_wav, VectorBase *perturbed_wav) { + // about noise list + RandomAccessTableReader noise_reader(noise_scp); + int samp_freq_input = input_wav.Dim(); + + // add noise + + for (int i=0; i < controller.size(); ++i) { + const WaveData &noise_wav = noise_reader.Value(controller[i].noise_uttid_); + BaseFloat samp_freq_noise = noise_wav.SampFreq(); + KALDI_ASSERT(samp_freq_input == samp_freq_noise); + + const Matrix &noise_matrix = noise_wav.Data(); + int32 num_samp_noise = noise_matrix.NumCols(); + Vector noise(num_samp_noise); + noise.CopyRowFromMat(noise_matrix, 0); + + int32 input_start_point = samp_freq_input * controller[i].wav_t_start_; + int32 input_end_point = samp_freq_input * controller[i].wav_t_end_ - 1; + int32 noise_start_point = samp_freq_noise * controller[i].noise_t_start_; + int32 noise_end_point = samp_freq_noise * controller[i].noise_t_end_ - 1; + BaseFloat snr = controller[i].snr_; + + SubVector input_part(input_wav, input_start_point, + input_end_point - input_start_point + 1); + SubVector noise_part(noise, noise_start_point, + noise_end_point - noise_start_point + 1); + Vector selected_noise(input_part.Dim()); + if (noise_part.Dim() < input_part.Dim()) { + int32 the_rest = selected_noise.Dim(); + while (the_rest > noise_part.Dim()) { + selected_noise.Range(selected_noise.Dim()-the_rest, + noise_part.Dim()).CopyFromVec(noise_part); + the_rest = the_rest - noise_part.Dim(); + } + selected_noise.Range(selected_noise.Dim()-the_rest, the_rest).CopyFromVec( + noise_part.Range(0, the_rest)); + } else { + selected_noise.CopyFromVec(noise_part); + } + + BaseFloat input_energy = VecVec(input_part, input_part); + BaseFloat noise_energy = VecVec(selected_noise, selected_noise); + BaseFloat scale_factor = sqrt(input_energy/ noise_energy/ (pow(10, 
snr/20)) ); + perturbed_wav->Range(input_start_point, input_part.Dim()).AddVec(scale_factor, selected_noise); + } +} } @@ -44,27 +116,27 @@ int main(int argc, char *argv[]) { using namespace kaldi; const char *usage = - "Perturb the wave files supplied via the specified noise-range file\n" + "Perturb the wave files supplied via the specified noise-range\n" "Usage: nnet3-fvector-perturb-signal [options...] " "\n" "e.g.\n" - "nnet3-fvector-perturb-signal --noise-range-file=uttid.range.n --add-noise-list=" - "scp:noise.scp --input-channel=0 input.wav output.wav\n"; + "nnet3-fvector-perturb-signal --noise=scp:noise.scp --noise-range=" + "\"head -n 5 a.noiserange | tail -n 1\" --input-channel=0 input.wav " + "perturbed_input.wav\n"; ParseOptions po(usage); - std::string noise_range_file; - std::string noise_list_rspecifier; + std::string noise; + std::string noise_range; int32 input_channel = 0; - po.Register("noise-range-file",&noise_range_file, + po.Register("noise",&noise, + "There is a list of optional noise. It need to match the --noise-range."); + po.Register("noise-range",&noise_range, "Provide a range file. We use the content in this file to control " - "the process of adding noise. The format of each line in this file " - ": " - " "); - po.Register("add-noise-list",&noise_list_rspecifier, - "There is a list of optional noise. It need to match the " - "--noise-range-file."); + "the process of adding noise. 
For each line, the format is " + ":::::,...," + ":::::"); po.Register("input-channel",&input_channel, "Specifies the channel to be used in input file"); @@ -77,23 +149,15 @@ int main(int argc, char *argv[]) { std::string input_wave_file = po.GetArg(1); std::string output_wave_file = po.GetArg(2); - // Genterate the Noise Controller list + // Generate the Noise Controller list std::vector controller; - if (noise_range_file != "") { - std::ifstream fi(noise_range_file.c_str()); - if (!fi) { - KALDI_ERR << "failed to open file " << noise_range_file; - } - BaseFloat wav_t_start; - BaseFloat wav_t_end; - std::string noise_uttid; - BaseFloat noise_t_start; - BaseFloat noise_t_end; - BaseFloat snr; - while (fi >> wav_t_start >> wav_t_end >> noise_uttid >> noise_t_start >> noise_t_end >> snr) { - controller.push_back(NoiseController(wav_t_start, wav_t_end, noise_uttid, - noise_t_start, noise_t_end, snr)); - } + if (noise_range != "") { + int index = noise_range.find_first_of(" "); + std::string perturbed_utt_id = noise_range.substr(0, index); + std::string noise_range_content = noise_range.substr(index+1); + std::vector segments; + SplitStringToVector(noise_range_content, ",", true, &segments); + GenerateController(segments, &controller); } WaveData input_wave; @@ -116,52 +180,9 @@ int main(int argc, char *argv[]) { Vector input(num_samp_input); input.CopyRowFromMat(input_matrix, input_channel); - // new output vector + // new output vector and add noise Vector output(input); - - // about noise list - RandomAccessTableReader noise_reader(noise_list_rspecifier); - - // add noise - for (int i=0; i < controller.size(); ++i) { - const WaveData &noise_wav = noise_reader.Value(controller[i].noise_uttid_); - BaseFloat samp_freq_noise = noise_wav.SampFreq(); - KALDI_ASSERT(samp_freq_input == samp_freq_noise); - - const Matrix &noise_matrix = noise_wav.Data(); - int32 num_samp_noise = noise_matrix.NumCols(); - Vector noise(num_samp_noise); - noise.CopyRowFromMat(noise_matrix, 0); - 
- int32 input_start_point = samp_freq_input * controller[i].wav_t_start_; - int32 input_end_point = samp_freq_input * controller[i].wav_t_end_ - 1; - int32 noise_start_point = samp_freq_noise * controller[i].noise_t_start_; - int32 noise_end_point = samp_freq_noise * controller[i].noise_t_end_ - 1; - BaseFloat snr = controller[i].snr_; - - SubVector input_part(input, input_start_point, - input_end_point - input_start_point + 1); - SubVector noise_part(noise, noise_start_point, - noise_end_point - noise_start_point + 1); - Vector selected_noise(input_part.Dim()); - if (noise_part.Dim() < input_part.Dim()) { - int32 the_rest = selected_noise.Dim(); - while (the_rest > noise_part.Dim()) { - selected_noise.Range(selected_noise.Dim()-the_rest, - noise_part.Dim()).CopyFromVec(noise_part); - the_rest = the_rest - noise_part.Dim(); - } - selected_noise.Range(selected_noise.Dim()-the_rest, the_rest).CopyFromVec( - noise_part.Range(0, the_rest)); - } else { - selected_noise.CopyFromVec(noise_part); - } - - BaseFloat input_energy = VecVec(input_part, input_part); - BaseFloat noise_energy = VecVec(selected_noise, selected_noise); - BaseFloat scale_factor = sqrt(input_energy/ noise_energy/ (pow(10, snr/20)) ); - output.Range(input_start_point, input_part.Dim()).AddVec(scale_factor, selected_noise); - } + ApplyNoise(noise, controller, input, &output); Matrix out_matrix(1, num_samp_input); out_matrix.CopyRowsFromVec(output); From 8574d3e0625c07b389e67c890781432fc1a9c29f Mon Sep 17 00:00:00 2001 From: Hang Lyu Date: Fri, 30 Dec 2016 17:48:41 -0500 Subject: [PATCH 13/23] modify generate_noise_range.py, add_noise.sh and nnet3-fvector-perturb-signal.cc --- egs/wsj/s5/steps/nnet3/fvector/add_noise.sh | 25 +-- .../nnet3/fvector/generate_noise_range.py | 166 +++++++++--------- .../nnet3-fvector-perturb-signal.cc | 71 ++++---- 3 files changed, 139 insertions(+), 123 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/fvector/add_noise.sh b/egs/wsj/s5/steps/nnet3/fvector/add_noise.sh index 
4a7169a3b42..03dd451d064 100644 --- a/egs/wsj/s5/steps/nnet3/fvector/add_noise.sh +++ b/egs/wsj/s5/steps/nnet3/fvector/add_noise.sh @@ -2,18 +2,23 @@ # Copyright 2016 Johns Hopkins University (Author: Daniel Povey) # Apache 2.0 +# The script is used to generate the egs which will be used in fvector framework. +# So far, the script achieves the duration files of train dataset and noise +# dataset seperately. Then, with the duration files, it will generate the range +# file which is used to control the process about adding additive noise. + # Begin Configuration section. stage=0 cmd=run.pl nj=4 # Begain Configuration. -min_additive_noise_len=2.0 # the minimum duration of each noise file -num_kind_range=4 # the number of kinds of noise ranges -min_snr=-5 # the minimum snr value -max_snr=-15 # the maximum snr value -seed=-1 # set the random seed -variable_len_additive_noise=true #If true, generate the variable-length range files - #If false, generate the fixed-length range files +min_additive_noise_len=2.0 # the minimum duration of each noise file in seconds. +num_ranges_per_wav=4 # the number of noise ranges for each wav. +min_snr=-5 # the minimum snr value in dB. +max_snr=-15 # the maximum snr value in dB. +seed=-1 # set the random seed. +variable_len_additive_noise=true #If true, generate the variable-length range files. + #If false, generate the fixed-length range files. # End Configuration options. 
echo "$0 $@" # Print the command line for logging @@ -26,7 +31,7 @@ if [ $# != 3 ]; then echo "e.g.: steps/nnet3/fvector/add_noise.sh data/train data/noise ranges" echo "main options (for others, see top of script file)" echo " --min-additive-noise-len # limit the minimum length of noise" - echo " --num-kind-range # number of noise range kinds" + echo " --num-ranges-per-wav # number of noise range kinds" echo " --variable-len-additive-noise (true|false) # decide fixed/variable version" echo " --nj # number of parallel jobs" echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs" @@ -52,13 +57,13 @@ if [ $stage -le 0 ]; then echo "$0: generate $num_kind_rage kinds of noise range for each original wav" $cmd $dir/log/generate_noise_range.log \ steps/nnet3/fvector/generate_noise_range.py \ - --num-kind-range=$num_kind_range \ + --num-ranges-per-wav=$num_ranges_per_wav \ --min-additive-noise-len=$min_additive_noise_len \ --min-snr=$min_snr \ --max-snr=$max_snr \ --variable-len-additive-noise $variable_len_additive_noise \ --seed=$seed \ - $data/utt2dur $noise/utt2dur $dir + $data/utt2dur $noise/utt2dur $dir/ranges fi exit 0 diff --git a/egs/wsj/s5/steps/nnet3/fvector/generate_noise_range.py b/egs/wsj/s5/steps/nnet3/fvector/generate_noise_range.py index fa42e030e85..f55af9e33e9 100644 --- a/egs/wsj/s5/steps/nnet3/fvector/generate_noise_range.py +++ b/egs/wsj/s5/steps/nnet3/fvector/generate_noise_range.py @@ -1,12 +1,14 @@ #!/usr/bin/env python # The function use to generate range-file for fvector -# Each line of the range-file corrsponds to a kind of perturbed wav. In each line, +# Each line of the range-file corrsponds to a kind of perturbed wav. In each +# line, there is a in the beginning of the line and then # we use comma to seperate different addnoise range. The format of each addnoise # range is ::::: # The line which starts with the asterisk(*) is the differences between two versions. 
# For the fixed-length version: +# In the beginning of the line, there is a # *For # *Except the last fragement, the length will be a fixed value T. # For @@ -19,6 +21,7 @@ # The snr will be randomly selected form the range (max-snr, min-snr). # For the variable-length version: +# In the beginning of the line, there is a # *For # *Except the last fragement, the length will be random. # For @@ -39,10 +42,10 @@ "The file which created by this python code will be supplied to " "add additive noise program.", epilog="Called by steps/nnet3/fvector/add_noise.sh") -parser.add_argument("--num-kind-range", type=int, default=4, +parser.add_argument("--num-ranges-per-wav", type=int, default=4, help="the number of expected addnoise kinds") parser.add_argument("--min-additive-noise-len", type=float, default=2.0, - help="the minimum duration/length of each noise file") + help="the minimum duration/length of each noise file in seconds") parser.add_argument("--min-snr", type=int, default=-5, help="the minimum Signal-to-Noise Rate, the default=0") parser.add_argument("--max-snr", type=int, default=-15, @@ -61,8 +64,8 @@ parser.add_argument("noise2dur", help="noise2dur file of the noise wav to be used as input (format is: " " ") -parser.add_argument("range_dir", - help="Name of ranges directory, exp/fxvector/ranges") +parser.add_argument("range_file", + help="Name of range file, e.g.: exp/fxvector/ranges") print(' '.join(sys.argv)) @@ -75,73 +78,58 @@ random.seed(args.seed) -# deal with the original wav utt2dur -# the information was stored in wav_ids[], wav_lengths[] and wav_num_utts -f = open(args.wav2dur, "r") -if f is None: - sys.exit("Error opening wav2dur file " + str(args.wav2dur)) -wav_ids = [] -wav_lengths = [] -for line in f: - a = line.split() - if len(a) != 2: - sys.exit("Bad line in wav2dur file " + line) - if float(a[1]) < args.min_additive_noise_len: - sys.exit("ERROR: The wav length is shorter than --min-additive-noise-len") - wav_ids.append(a[0]) - 
wav_lengths.append(float(a[1])) -f.close() - -wav_num_utts = len(wav_ids) - -# deal with the noise wav utt2dur -# remove the noise whose length < --min-additive-noise-len -num_error = 0 -num_done = 0 -f = open(args.noise2dur, "r") -if f is None: - sys.exit("Error opening noise2dur file " + str(args.noise2dur)) -noise_ids = [] -noise_lengths = [] -for line in f: - a = line.split() - if len(a) != 2: - sys.exit("Bad line in noise2dur file " + line); - if float(a[1]) < args.min_additive_noise_len: - num_error += 1 - continue - noise_ids.append(a[0]) - noise_lengths.append(float(a[1])) - num_done += 1 -f.close() -noise_num_utts = len(noise_ids) -noise_str = "Warning: There are " + str(num_error) + " noise files length smaller than " + \ - "--min-additive-noise-len, we remove it from the noise list. Now, there are " + \ - str(num_done) + " noise file." -sys.stdout.write( noise_str + '\n') +# This function extract the information from the file--wav2dur. Its outputs will +# be ids[] and lengths[] +def WavToDuration(duration_file, ids, lengths, strict): + f = open(duration_file, "r") + if f is None: + sys.exit("Error opening wav2dur file " + str(duration_file)) + num_error = 0 + num_done = 0 + for line in f: + a = line.split() + if len(a) != 2: + sys.exit("Bad line \"" + line.strip() +"\" in file: " + str(duration_file)) + if float(a[1]) < args.min_additive_noise_len: + if strict: + sys.exit("ERROR: The wav length \"" + line.strip()+ "\" is shorter than --min-additive-noise-len") + else: + num_error += 1 + continue + ids.append(a[0]) + lengths.append(float(a[1])) + num_done += 1 + f.close() + if num_error is not 0: + warning_str ="Warning: There are " + str(num_error) + " utterances whose length smaller than " + \ + "--min-additive-noise-len, we remove it from the list. Now, there are " + \ + str(num_done) + " utterances in the list." 
+ sys.stdout.write( warning_str + '\n') + return # This function generates the fixed-length range files def GenerateFixedLengthRangeFile(): num_fixed_error = 0 num_fixed_done = 0 + num_wav = len(wav_ids) + num_noise = len(noise_ids) + # create a file to record the ranges + f = open(args.range_file, "w") + if f is None: + sys.exit("Error open file " + args.range_file) - for i in range(0, wav_num_utts): + for i in range(0, num_wav): # decide the number of noises which will be add to current_wav_len = wav_lengths[i] max_num_additive_noise = int(current_wav_len / args.min_additive_noise_len) - if max_num_additive_noise > noise_num_utts: + if max_num_additive_noise > num_noise: print( "Warning: The number of noise files or the --min-additive-noise-len is too small" ) num_fixed_error += 1 continue - - # create a file to record the ranges - f = open(args.range_dir + "/" + str(wav_ids[i]) + ".noiserange", "w") - if f is None: - sys.exit("Error open file " + args.range_dir + "/" + str(wav_ids[i]) + ".noiserange") - - # We generate $num_kind_range ranges - for j in range(0, args.num_kind_range): + + # We generate $num_ranges_per_wav ranges + for j in range(0, args.num_ranges_per_wav): # print the perturbed wav id in the beginning of line print("{0}-{1}".format(wav_ids[i], "perturbed-"+str(j+1)), end=" ", file=f) @@ -157,7 +145,7 @@ def GenerateFixedLengthRangeFile(): wav_t_start = float('{:.2f}'.format(k * additive_noise_len)) wav_t_end = wav_t_start + additive_noise_len - noise_index = random.randrange(0, noise_num_utts) + noise_index = random.randrange(0, num_noise) current_noise_name = noise_ids[noise_index] current_noise_len = noise_lengths[noise_index] if current_noise_len <= additive_noise_len: @@ -182,7 +170,7 @@ def GenerateFixedLengthRangeFile(): wav_t_start = float('{:.2f}'.format(k * additive_noise_len)) wav_t_end = current_wav_len - noise_index = random.randrange(0, noise_num_utts) + noise_index = random.randrange(0, num_noise) current_noise_name = 
noise_ids[noise_index] current_noise_len = noise_lengths[noise_index] @@ -204,32 +192,32 @@ def GenerateFixedLengthRangeFile(): current_snr), file=f) num_fixed_done += 1 - f.close() - print('''generate_fixed_length_range.py: finished generate the range files for all wav. Compare with our expect, it lacks %d files. Now we totally have %d noise range files.''' %(num_fixed_error, num_fixed_done) ) + f.close() + print('''Finished generating fixed_length range-file for all wav. Compare with our expect, it lacks %d ranges. Now we totally have %d noise ranges in the range-file.''' %(num_fixed_error, num_fixed_done) ) # This function generates the variable-length range files def GenerateVariableLengthRangeFile(): num_variable_error = 0 num_variable_done = 0 - for i in range(0, wav_num_utts): + # create a file to record the ranges + f = open(args.range_file, "w") + if f is None: + sys.exit("Error open file " + args.range_file) + + for i in range(0, num_wav): # check the noise list has enough sample or not current_wav_len = wav_lengths[i] max_num_additive_noise = int(current_wav_len / args.min_additive_noise_len) - if max_num_additive_noise > noise_num_utts: + if max_num_additive_noise > num_noise: print( "Warning: The number of noise files or the --min-additive-noise-len is too small" ) num_variable_error += 1 continue - - # create a file to record the ranges - f = open(args.range_dir + "/" + str(wav_ids[i]) + ".noiserange", "w") - if f is None: - sys.exit("Error open file " + args.range_dir + "/" + str(wav_ids[i]) + ".noiserange") - - # We generate $num_kind_range ranges - for j in range(0, args.num_kind_range): + + # We generate $num_ranges_per_wav ranges + for j in range(0, args.num_ranges_per_wav): # print the perturbed wav id in the beginning of line print("{0}-{1}".format(wav_ids[i], "perturbed-"+str(j+1)), end=" ", file=f) @@ -240,7 +228,7 @@ def GenerateVariableLengthRangeFile(): wav_t_end = 0.0 while (the_rest > float(args.min_additive_noise_len)): # firstly, we 
randomly choose a kind of noise and snr - noise_index = random.randrange(0, noise_num_utts) + noise_index = random.randrange(0, num_noise) current_noise_name = noise_ids[noise_index] current_noise_len = noise_lengths[noise_index] current_snr = random.randrange(args.max_snr, args.min_snr) @@ -270,7 +258,7 @@ def GenerateVariableLengthRangeFile(): end=",",file=f) # deal with the bit of wav # firstly, we randomly choose a kind of noise and snr - noise_index = random.randrange(0, noise_num_utts) + noise_index = random.randrange(0, num_noise) current_noise_name = noise_ids[noise_index] current_noise_len = noise_lengths[noise_index] current_snr = random.randrange(args.max_snr, args.min_snr) @@ -294,10 +282,26 @@ def GenerateVariableLengthRangeFile(): current_snr), file=f) num_variable_done += 1 - f.close() - print('''generate_fixed_length_range.py: finished generate the range files for all wav. Compare with our expect, it lacks %d files. Now we totally have %d noise range files.''' %(num_variable_error, num_variable_done) ) + f.close() + print('''Finished generating variable_length range-file for all wav. Compare with our expect, it lacks %d ranges. 
Now we totally have %d noise ranges in the range-file.''' %(num_variable_error, num_variable_done) ) + +if __name__ == "__main__": + # deal with the original wav utt2dur + # the information was stored in wav_ids[], wav_lengths[] and num_wav + wav_ids = [] + wav_lengths = [] + WavToDuration(args.wav2dur, wav_ids, wav_lengths, True) + num_wav = len(wav_ids) + + # deal with the noise wav utt2dur + # remove the noise whose length < --min-additive-noise-len + noise_ids = [] + noise_lengths = [] + WavToDuration(args.noise2dur, noise_ids, noise_lengths, False) + num_noise = len(noise_ids) -if args.variable_len_additive_noise == "true": - GenerateVariableLengthRangeFile() -else: - GenerateFixedLengthRangeFile() + # generate the range file + if args.variable_len_additive_noise == "true": + GenerateVariableLengthRangeFile() + else: + GenerateFixedLengthRangeFile() diff --git a/src/fvectorbin/nnet3-fvector-perturb-signal.cc b/src/fvectorbin/nnet3-fvector-perturb-signal.cc index 09962e96a7b..52992c173c0 100644 --- a/src/fvectorbin/nnet3-fvector-perturb-signal.cc +++ b/src/fvectorbin/nnet3-fvector-perturb-signal.cc @@ -22,22 +22,22 @@ namespace kaldi { -struct NoiseController{ - BaseFloat wav_t_start_; - BaseFloat wav_t_end_; - std::string noise_uttid_; - BaseFloat noise_t_start_; - BaseFloat noise_t_end_; - BaseFloat snr_; - - NoiseController(BaseFloat wav_t_start, BaseFloat wav_t_end, std::string noise_uttid, - BaseFloat noise_t_start, BaseFloat noise_t_end, BaseFloat snr): - wav_t_start_(wav_t_start), wav_t_end_(wav_t_end), noise_uttid_(noise_uttid), - noise_t_start_(noise_t_start), noise_t_end_(noise_t_end), snr_(snr) { } +struct AdditiveNoiseRange{ + BaseFloat wav_t_start; + BaseFloat wav_t_end; + std::string noise_uttid; + BaseFloat noise_t_start; + BaseFloat noise_t_end; + BaseFloat snr; + + AdditiveNoiseRange(BaseFloat wav_t_start, BaseFloat wav_t_end, std::string noise_uttid, + BaseFloat noise_t_start, BaseFloat noise_t_end, BaseFloat snr): + wav_t_start(wav_t_start), 
wav_t_end(wav_t_end), noise_uttid(noise_uttid), + noise_t_start(noise_t_start), noise_t_end(noise_t_end), snr(snr) { } }; void GenerateController(std::vector &segments, - std::vector *controller) { + std::vector *controller) { BaseFloat wav_t_start; BaseFloat wav_t_end; std::string noise_uttid; @@ -55,12 +55,12 @@ void GenerateController(std::vector &segments, ConvertStringToReal(split_string[4], &noise_t_end); ConvertStringToReal(split_string[5], &snr); - controller->push_back(NoiseController(wav_t_start, wav_t_end, noise_uttid, - noise_t_start, noise_t_end, snr)); + controller->push_back(AdditiveNoiseRange(wav_t_start, wav_t_end, noise_uttid, + noise_t_start, noise_t_end, snr)); } } -void ApplyNoise(std::string &noise_scp, const std::vector &controller, +void ApplyNoise(std::string &noise_scp, const std::vector &controller, const VectorBase &input_wav, VectorBase *perturbed_wav) { // about noise list RandomAccessTableReader noise_reader(noise_scp); @@ -69,7 +69,7 @@ void ApplyNoise(std::string &noise_scp, const std::vector &cont // add noise for (int i=0; i < controller.size(); ++i) { - const WaveData &noise_wav = noise_reader.Value(controller[i].noise_uttid_); + const WaveData &noise_wav = noise_reader.Value(controller[i].noise_uttid); BaseFloat samp_freq_noise = noise_wav.SampFreq(); KALDI_ASSERT(samp_freq_input == samp_freq_noise); @@ -78,26 +78,33 @@ void ApplyNoise(std::string &noise_scp, const std::vector &cont Vector noise(num_samp_noise); noise.CopyRowFromMat(noise_matrix, 0); - int32 input_start_point = samp_freq_input * controller[i].wav_t_start_; - int32 input_end_point = samp_freq_input * controller[i].wav_t_end_ - 1; - int32 noise_start_point = samp_freq_noise * controller[i].noise_t_start_; - int32 noise_end_point = samp_freq_noise * controller[i].noise_t_end_ - 1; - BaseFloat snr = controller[i].snr_; + int32 input_start_point = samp_freq_input * controller[i].wav_t_start; + int32 input_end_point = samp_freq_input * controller[i].wav_t_end - 1; + 
int32 noise_start_point = samp_freq_noise * controller[i].noise_t_start; + int32 noise_end_point = samp_freq_noise * controller[i].noise_t_end - 1; + BaseFloat snr = controller[i].snr; + // The input vector and noise vector contain the whole content of utt seperately. + // According to the AdditiveNoiseRange, we stepwise add the additive noise to input. + // To save the space, we use Subvector, because it returns the pointer. SubVector input_part(input_wav, input_start_point, input_end_point - input_start_point + 1); SubVector noise_part(noise, noise_start_point, noise_end_point - noise_start_point + 1); Vector selected_noise(input_part.Dim()); + + // When encounter the situation where noise_part_length is shorter than input_part_length, + // We pad recursively until the selected_noise_length equal to input_part_length. + // Otherwise, selected_noise = noise_part if (noise_part.Dim() < input_part.Dim()) { - int32 the_rest = selected_noise.Dim(); - while (the_rest > noise_part.Dim()) { - selected_noise.Range(selected_noise.Dim()-the_rest, + int32 the_rest_length = selected_noise.Dim(); + while (the_rest_length > noise_part.Dim()) { + selected_noise.Range(selected_noise.Dim()-the_rest_length, noise_part.Dim()).CopyFromVec(noise_part); - the_rest = the_rest - noise_part.Dim(); + the_rest_length = the_rest_length - noise_part.Dim(); } - selected_noise.Range(selected_noise.Dim()-the_rest, the_rest).CopyFromVec( - noise_part.Range(0, the_rest)); + selected_noise.Range(selected_noise.Dim()-the_rest_length, the_rest_length).CopyFromVec( + noise_part.Range(0, the_rest_length)); } else { selected_noise.CopyFromVec(noise_part); } @@ -121,8 +128,8 @@ int main(int argc, char *argv[]) { "\n" "e.g.\n" "nnet3-fvector-perturb-signal --noise=scp:noise.scp --noise-range=" - "\"head -n 5 a.noiserange | tail -n 1\" --input-channel=0 input.wav " - "perturbed_input.wav\n"; + "wav1-perturbed-1 0.0:1.0:noise1:3.5:4.5:-8,... 
--input-channel=0 " + "input.wav perturbed_input.wav\n"; ParseOptions po(usage); @@ -150,8 +157,8 @@ int main(int argc, char *argv[]) { std::string output_wave_file = po.GetArg(2); // Generate the Noise Controller list - std::vector controller; - if (noise_range != "") { + std::vector controller; + if (!noise_range.empty()) { int index = noise_range.find_first_of(" "); std::string perturbed_utt_id = noise_range.substr(0, index); std::string noise_range_content = noise_range.substr(index+1); From c6ec39e327dda63122c85dc98cfee13e4f91b0ef Mon Sep 17 00:00:00 2001 From: Hang Lyu Date: Sat, 31 Dec 2016 01:16:56 -0500 Subject: [PATCH 14/23] finished the two steps --- egs/wsj/s5/steps/nnet3/fvector/add_noise.sh | 3 +- .../nnet3/fvector/generate_noise_range.py | 38 ++++++- .../fvector/generate_perturb_wav_specifier.py | 99 +++++++++++++++++++ 3 files changed, 136 insertions(+), 4 deletions(-) create mode 100644 egs/wsj/s5/steps/nnet3/fvector/generate_perturb_wav_specifier.py diff --git a/egs/wsj/s5/steps/nnet3/fvector/add_noise.sh b/egs/wsj/s5/steps/nnet3/fvector/add_noise.sh index 03dd451d064..efd648bcbe0 100644 --- a/egs/wsj/s5/steps/nnet3/fvector/add_noise.sh +++ b/egs/wsj/s5/steps/nnet3/fvector/add_noise.sh @@ -6,6 +6,7 @@ # So far, the script achieves the duration files of train dataset and noise # dataset seperately. Then, with the duration files, it will generate the range # file which is used to control the process about adding additive noise. +# At the same time, it will generate the mapping between wav and perturbedwav. # Begin Configuration section. 
stage=0 @@ -63,7 +64,7 @@ if [ $stage -le 0 ]; then --max-snr=$max_snr \ --variable-len-additive-noise $variable_len_additive_noise \ --seed=$seed \ - $data/utt2dur $noise/utt2dur $dir/ranges + $data/utt2dur $noise/utt2dur $dir/ranges $dir/wav2perturbedwav fi exit 0 diff --git a/egs/wsj/s5/steps/nnet3/fvector/generate_noise_range.py b/egs/wsj/s5/steps/nnet3/fvector/generate_noise_range.py index f55af9e33e9..714f187b896 100644 --- a/egs/wsj/s5/steps/nnet3/fvector/generate_noise_range.py +++ b/egs/wsj/s5/steps/nnet3/fvector/generate_noise_range.py @@ -33,7 +33,8 @@ # For , it was used to control the amplitude of noise # It will be randomly selected from the range (max-snr, min-snr) - +# At the same time, the function will generate the mapping of wav and perturbedwav +# Each line contains a mapping. (e.g.: wav1 wav1-perturbed-1 wav1-perturbed-2 ...) from __future__ import print_function import re, os, argparse, sys, math, warnings, random @@ -66,6 +67,9 @@ " ") parser.add_argument("range_file", help="Name of range file, e.g.: exp/fxvector/ranges") +parser.add_argument("wav2perturbedwav", + help="This file is used to store the mapping between wav and perturbedwav" + "(e.g.: wav1 wav1-perturbed-1 wav1-perturbed-2 ...") print(' '.join(sys.argv)) @@ -113,10 +117,16 @@ def GenerateFixedLengthRangeFile(): num_fixed_done = 0 num_wav = len(wav_ids) num_noise = len(noise_ids) + # create a file to record the ranges f = open(args.range_file, "w") if f is None: sys.exit("Error open file " + args.range_file) + + # create a file to record the wav2perturbedwav + g = open(args.wav2perturbedwav, "w") + if g is None: + sys.exit("Error open file " + args.wav2perturbedwav) for i in range(0, num_wav): # decide the number of noises which will be add to @@ -127,12 +137,18 @@ def GenerateFixedLengthRangeFile(): print( "Warning: The number of noise files or the --min-additive-noise-len is too small" ) num_fixed_error += 1 continue - + + # print the wav_id + print("{0}".format(wav_ids[i]), 
end="", file=g) + # We generate $num_ranges_per_wav ranges for j in range(0, args.num_ranges_per_wav): # print the perturbed wav id in the beginning of line print("{0}-{1}".format(wav_ids[i], "perturbed-"+str(j+1)), end=" ", file=f) - + + # print the perturbedwav_id + print(" {0}-{1}".format(wav_ids[i], "perturbed-"+str(j+1)), end="", file=g) + # select a number from [1 ... max_num_additive_noise] num_additive_noise = random.randint(1, max_num_additive_noise) @@ -192,7 +208,10 @@ def GenerateFixedLengthRangeFile(): current_snr), file=f) num_fixed_done += 1 + # print the "\n" + print("\n", end="", file=g) f.close() + g.close() print('''Finished generating fixed_length range-file for all wav. Compare with our expect, it lacks %d ranges. Now we totally have %d noise ranges in the range-file.''' %(num_fixed_error, num_fixed_done) ) # This function generates the variable-length range files @@ -205,6 +224,11 @@ def GenerateVariableLengthRangeFile(): if f is None: sys.exit("Error open file " + args.range_file) + # create a file to record the wav2perturbedwav + g = open(args.wav2perturbedwav, "w") + if g is None: + sys.exit("Error open file " + args.wav2perturbedwav) + for i in range(0, num_wav): # check the noise list has enough sample or not @@ -216,10 +240,16 @@ def GenerateVariableLengthRangeFile(): num_variable_error += 1 continue + # print the wav_id + print("{0}".format(wav_ids[i]), end="", file=g) + # We generate $num_ranges_per_wav ranges for j in range(0, args.num_ranges_per_wav): # print the perturbed wav id in the beginning of line print("{0}-{1}".format(wav_ids[i], "perturbed-"+str(j+1)), end=" ", file=f) + + # print the perturbedwav_id + print(" {0}-{1}".format(wav_ids[i], "perturbed-"+str(j+1)), end="", file=g) # generate range file # format: wav_t_start:wav_t_end:noise_name:noise_t_start:noise_t_end:snr, @@ -282,7 +312,9 @@ def GenerateVariableLengthRangeFile(): current_snr), file=f) num_variable_done += 1 + print("\n", end="", file=g) f.close() + g.close() 
print('''Finished generating variable_length range-file for all wav. Compare with our expect, it lacks %d ranges. Now we totally have %d noise ranges in the range-file.''' %(num_variable_error, num_variable_done) ) if __name__ == "__main__": diff --git a/egs/wsj/s5/steps/nnet3/fvector/generate_perturb_wav_specifier.py b/egs/wsj/s5/steps/nnet3/fvector/generate_perturb_wav_specifier.py new file mode 100644 index 00000000000..1e4f0775bea --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/fvector/generate_perturb_wav_specifier.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python + +# This function is used to generate the perturbed_wav.scp with the inputs as +# wav.scp, wav2perturbedwav, ranges + +# The final format is : +# wav1 sph2pipe -f wav -p -c 1 $path/wav1.sph | +# wav1-p1 sph2pipe -f wav -p -c 1 $path/wav1.sph | nnet3-fvector-perturb-signal +# --noise-scp=scp:noise.scp noise-range="range-p1-for-wav1" - | + +from __future__ import print_function +import re, os, argparse, sys, math, warnings, random + +parser = argparse.ArgumentParser(description="Generate a mapping file which use to map the wav to " + "Corresponding pertrubedwav", + epilog="Called by steps/nnet3/fvector/add_noise.sh") +parser.add_argument("--noise", type=str, + help="To assign the noise.scp. You must make sure it is same with " + "the noise.scp which is used to generate range_file.") +# now the positional arguments +parser.add_argument("wav_scp", + help="The orginial wav.scp which contains all the original wav " + "The format is: .") +parser.add_argument("range_file", + help="The file contains the range information which is used to " + "control the process of adding noise. 
The format is : " + " .") +parser.add_argument("wav2perturbedwav", + help="This file contains the mapping between wav and perturbedwav.") +parser.add_argument("perturbed_wav_scp", + help="The file is used to store the perturbed wav sperifier.") + +print(' '.join(sys.argv)) + +args = parser.parse_args() + +# Extract the information form the wav_scprding_ids = [] +wav_recording_ids = [] +wav_extended_files = [] +f = open(args.wav_scp, "r") +if f is None: + sys.exit("Error opening wav.scp file") +for line in f: + # remove the "\n" in the end of each line + line.split("\n") + a = line.split() + wav_recording_ids.append(a[0]) + del a[0] + wav_extended_files.append(' '.join(a)) +f.close() + +# Extract the infromation from the range_file +perturbed_range_ids = [] +perturbed_range_contents = [] +f = open(args.range_file, "r") +if f is None: + sys.exit("Error opening range_file") +for line in f: + # remove the "\n" in the end of each line + line.split("\n") + a = line.split() + if len(a) != 2: + sys.exit("Bad line \"" + line + "\" in file: " + str(args.range_file)) + perturbed_range_ids.append(a[0]) + perturbed_range_contents.append(a[1]) +f.close() + +# generate the mapping file through iterating all terms in the wav2perturbedwav +f = open(args.wav2perturbedwav, "r") +if f is None: + sys.exit("Error opening wav2perturbedwav") +# make a store file. 
+g = open(args.perturbed_wav_scp, "w") +if g is None: + sys.exit("Error opening perturbed_wav_specifier") + +# start the loop +for line in f: + # remove the "\n" in the end of each line + line.split("\n") + wav_list = line.split() + current_wav_id = wav_list[0] + current_wav_index = wav_recording_ids.index(current_wav_id) + + # print the original wav + print("{0} {1}".format(current_wav_id, wav_extended_files[current_wav_index]), file=g) + + for i in range(1, len(wav_list)): + current_perturbed_wav_id = wav_list[i] + current_perturbed_wav_index = perturbed_range_ids.index(current_perturbed_wav_id) + print('''{0} {1} nnet3-fvector-perturb-signal --noise-scp=scp:{3} --noise=\" + {4}\" - |'''.format(current_perturbed_wav_id, + wav_extend_files[current_wav_index], + args.noise, + perturbed_range_contents[current_perturbed_wav_index]), + file=g) +g.close() +f.close() +print("Finished generating the perturb_wav.scp") From 429c1461bc8c922802cddec1d088cd22c0eb6d03 Mon Sep 17 00:00:00 2001 From: LvHang Date: Sat, 31 Dec 2016 01:18:31 -0500 Subject: [PATCH 15/23] Delete generate_fixed_length_range.py --- .../fvector/generate_fixed_length_range.py | 191 ------------------ 1 file changed, 191 deletions(-) delete mode 100644 egs/wsj/s5/steps/nnet3/fvector/generate_fixed_length_range.py diff --git a/egs/wsj/s5/steps/nnet3/fvector/generate_fixed_length_range.py b/egs/wsj/s5/steps/nnet3/fvector/generate_fixed_length_range.py deleted file mode 100644 index 84a41541163..00000000000 --- a/egs/wsj/s5/steps/nnet3/fvector/generate_fixed_length_range.py +++ /dev/null @@ -1,191 +0,0 @@ -#!/usr/bin/env python - -# The function use to generate range file for fvector -# This is the fixed-length version -# Each line of the range-file corrsponds to a kind of perturbed wav. In each line, -# we use comma to seperate different addnoise range. The format of each addnoise -# range is ::::: - -# For -# Except the last fragement, the length will be a fixed value T. 
- -# For -# It is randomly selected from noise list, which is longer than --min-additive-noise-len - -# For -# If the noise file is longer than fixed value. We randomly select the start point and -# the length will be fixed value T. -# If the noise file is shorter than T. We select the whole noise. - -# The control the rate of signal and noise. In the other word, scale the amplitude of noise. -# The snr will be randomly selected form the range (max-snr, min-snr). - -from __future__ import print_function -import re, os, argparse, sys, math, warnings, random - -parser = argparse.ArgumentParser(description="Generate a noise range files which contains " - "N lines corresponding to the number of kinds for each original wav. " - "The file created by this python code will be supplied to fixed-length add " - "additive noise program.", - epilog="Called by steps/nnet3/fvector/add_noise.sh") -parser.add_argument("--num-kind-range", type=int, default=4, - help="the number of expected addnoise kinds") -parser.add_argument("--min-additive-noise-len", type=float, default=2.0, - help="the minimum duration/length of each noise file") -parser.add_argument("--min-snr", type=int, default=0, - help="the minimum Signal-to-Noise Rate, the default=0") -parser.add_argument("--max-snr", type=int, default=-10, - help="the maximum Signal-to-Noise Rate, the default=-10") -parser.add_argument("--seed", type=int, default=-1, - help="Seed for random number generator") - -# now the positional arguments -parser.add_argument("wav2dur", - help="wav2dur file of the original wav to be used as input (format is: " - " ") -parser.add_argument("noise2dur", - help="noise2dur file of the noise wav to be used as input (format is: " - " ") -parser.add_argument("range_dir", - help="Name of ranges directory, exp/fxvector/ranges") - -print(' '.join(sys.argv)) - -args = parser.parse_args() - -## Check arguments -if args.min_snr < args.max_snr: - sys.exit("For SNR, the less numerical value is, the larger noise is. 
So --min-snr bigger " - "than --max-snr in numerical value.") - -random.seed(args.seed) - -# deal with the original wav utt2dur -# the information was stored in wav_ids[], wav_lengths[] and wav_num_utts -f = open(args.wav2dur, "r") -if f is None: - sys.exit("Error opening wav2dur file " + str(args.wav2dur)) -wav_ids = [] -wav_lengths = [] -for line in f: - a = line.split() - if len(a) != 2: - sys.exit("Bad line in wav2dur file " + line) - if float(a[1]) < args.min_additive_noise_len: - sys.exit("ERROR: The wav length is shorter than --min-additive-noise-len") - wav_ids.append(a[0]) - wav_lengths.append(float(a[1])) -f.close() - -wav_num_utts = len(wav_ids) - -# deal with the noise wav utt2dur -# remove the noise whose length < --min-additive-noise-len -num_error = 0 -num_done = 0 -f = open(args.noise2dur, "r") -if f is None: - sys.exit("Error opening noise2dur file " + str(args.noise2dur)) -noise_ids = [] -noise_lengths = [] -for line in f: - a = line.split() - if len(a) != 2: - sys.exit("Bad line in noise2dur file " + line); - if float(a[1]) < args.min_additive_noise_len: - num_error += 1 - continue - noise_ids.append(a[0]) - noise_lengths.append(float(a[1])) - num_done += 1 -f.close() -noise_num_utts = len(noise_ids) -noise_str = "Warning: There are " + str(num_error) + " noise files length smaller than " + \ - "--min-additive-noise-len, we remove it from the noise list. Now, there are " + \ - str(num_done) + " noise file." 
-sys.stdout.write( noise_str + '\n') - -num_fixed_error = 0 -num_fixed_done = 0 - -for i in range(0, wav_num_utts): - # decide the number of noises which will be add to - current_wav_len = wav_lengths[i] - max_num_additive_noise = int(current_wav_len / args.min_additive_noise_len) - - if max_num_additive_noise > noise_num_utts: - print( "Warning: The number of noise files or the --min-additive-noise-len is too small" ) - num_fixed_error += 1 - continue - - # create a file to record the ranges - f = open(args.range_dir + "/" + str(wav_ids[i]) + ".noiserange", "w") - if f is None: - sys.exit("Error open file " + args.range_dir + "/" + str(wav_ids[i]) + ".noiserange") - - # We generate $num_kind_range ranges - for j in range(0, args.num_kind_range): - # print the perturbed wav id in the beginning of line - print("{0}-{1}".format(wav_ids[i], "perturbed-"+str(j+1)), end=" ", file=f) - - # select a number from [1 ... max_num_additive_noise] - num_additive_noise = random.randint(1, max_num_additive_noise) - - # decide the length of each noise, minus 0.01 to prevent overstep - additive_noise_len = float('{:.2f}'.format(current_wav_len / num_additive_noise)) - 0.01 - - # generate one line of file - # format: wav_t_start:wav_t_end:noise_name:noise_t_start:noise_t_end:snr, - for k in range(0, num_additive_noise - 1): - wav_t_start = float('{:.2f}'.format(k * additive_noise_len)) - wav_t_end = wav_t_start + additive_noise_len - - noise_index = random.randrange(0, noise_num_utts) - current_noise_name = noise_ids[noise_index] - current_noise_len = noise_lengths[noise_index] - if current_noise_len <= additive_noise_len: - noise_t_start = 0.0 - noise_t_end = current_noise_len - else : - noise_start_bound = float('{:.2f}'.format(current_noise_len - additive_noise_len)) - noise_t_start = float('{:.2f}'.format(random.uniform(0, noise_start_bound))) - noise_t_end = noise_t_start + additive_noise_len - - current_snr = random.randrange(args.max_snr, args.min_snr) - - 
print("{0}:{1}:{2}:{3}:{4}:{5}".format(wav_t_start, - wav_t_end, - current_noise_name, - noise_t_start, - noise_t_end, - current_snr), - end=",",file=f) - # deal with the last noise, which cover the rest - k = num_additive_noise - 1 - wav_t_start = float('{:.2f}'.format(k * additive_noise_len)) - wav_t_end = current_wav_len - - noise_index = random.randrange(0, noise_num_utts) - current_noise_name = noise_ids[noise_index] - current_noise_len = noise_lengths[noise_index] - - if current_noise_len <= (wav_t_end - wav_t_start): - noise_t_start = 0.0 - noise_t_end = current_noise_len - else : - noise_start_bound = float('{:.2f}'.format(current_noise_len - wav_t_end + wav_t_start)) - noise_t_start = float('{:.2f}'.format(random.uniform(0, noise_start_bound))) - noise_t_end = noise_t_start + wav_t_end - wav_t_start - - current_snr = random.randrange(args.max_snr, args.min_snr) - - print("{0}:{1}:{2}:{3}:{4}:{5}".format(wav_t_start, - wav_t_end, - current_noise_name, - noise_t_start, - noise_t_end, - current_snr), - file=f) - num_fixed_done += 1 - f.close() -print('''generate_fixed_length_range.py: finished generate the range files for all wav. Compare with our expect, it lacks %d files. 
Now we totally have %d noise range files.''' %(num_fixed_error, num_fixed_done) ) From ffca370f2ef1d1cc9552197877e625b13ae05adb Mon Sep 17 00:00:00 2001 From: LvHang Date: Sat, 31 Dec 2016 01:18:40 -0500 Subject: [PATCH 16/23] Delete generate_variable_length_range.py --- .../fvector/generate_variable_length_range.py | 198 ------------------ 1 file changed, 198 deletions(-) delete mode 100644 egs/wsj/s5/steps/nnet3/fvector/generate_variable_length_range.py diff --git a/egs/wsj/s5/steps/nnet3/fvector/generate_variable_length_range.py b/egs/wsj/s5/steps/nnet3/fvector/generate_variable_length_range.py deleted file mode 100644 index ef0fe9e02a2..00000000000 --- a/egs/wsj/s5/steps/nnet3/fvector/generate_variable_length_range.py +++ /dev/null @@ -1,198 +0,0 @@ -#!/usr/bin/env python - -# The function use to generate range file for fvector -# This is the variable-length version -# Each line of the range-file corrsponds to a kind of perturbed wav. In each line, -# we use comma to seperate different addnoise range. The format of each addnoise -# range is ::::: - -# For -# Except the last fragement, the length will be random. - -# For -# It is randomly selected from noise list, which is longer than --min-additive-noise-len - -# For -# If the noise file is longer than wav length. We randomly select the start point and -# the length will be the same as wav length. -# If the noise file is shorter than T. We select the whole noise. - -# For , it was used to control the amplitude of noise -# It will be randomly selected from the range (max-snr, min-snr) - -from __future__ import print_function -import re, os, argparse, sys, math, warnings, random - -parser = argparse.ArgumentParser(description="Generate a noise range-file which cotains " - "N lines corresponding to the number of kinds for each original wav. 
" - "The file created by this python code will be supplied to variable-length " - "and additive noise program.", - epilog="Called by steps/nnet3/fvector/add_noise.sh") -parser.add_argument("--num-kind-range", type=int, default=4, - help="the number of noise range files") -parser.add_argument("--min-additive-noise-len", type=float, default=2.0, - help="the minimum duration/length of each noise file") -parser.add_argument("--min-snr", type=int, default=0, - help="the minimum Signal-to-Noise Rate, the default=0") -parser.add_argument("--max-snr", type=int, default=-10, - help="the maximum Signal-to-Noise Rate, the default=-10") -parser.add_argument("--seed", type=int, default=-1, - help="Seed for random number generator") - -# now the positional arguments -parser.add_argument("wav2dur", - help="wav2dur file of the original wav to be used as input (format is: " - " ") -parser.add_argument("noise2dur", - help="noise2dur file of the noise wav to be used as input (format is: " - " ") -parser.add_argument("range_dir", - help="Name of ranges directory, exp/fxvector/ranges") - -print(' '.join(sys.argv)) - -args = parser.parse_args() - -## Check arguments -if args.min_snr < args.max_snr: - sys.exit("For SNR, the less numerical value is, the larger noise is. 
So --min-snr bigger " - "than --max-snr in numerical value.") - -random.seed(args.seed) - -# deal with the original wav utt2dur -# the information was stored in wav_ids[], wav_lengths[] and wav_num_utts -f = open(args.wav2dur, "r") -if f is None: - sys.exit("Error opening wav2dur file " + str(args.wav2dur)) -wav_ids = [] -wav_lengths = [] -for line in f: - a = line.split() - if len(a) != 2: - sys.exit("Bad line in wav2dur file " + line) - if float(a[1]) < args.min_additive_noise_len: - sys.exit("ERROR: The wav length is shorter than --min-additive-noise-len") - wav_ids.append(a[0]) - wav_lengths.append(float(a[1])) -f.close() - -wav_num_utts = len(wav_ids) - -# deal with the noise wav utt2dur -# remove the noise whose length < --min-additive-noise-len -num_error = 0 -num_done = 0 -f = open(args.noise2dur, "r") -if f is None: - sys.exit("Error opening wav2dur file " + str(args.noise2dur)) -noise_ids = [] -noise_lengths = [] -for line in f: - a = line.split() - if len(a) != 2: - sys.exit("bad line in noise2dur file " + line); - if float(a[1]) < args.min_additive_noise_len: - num_error += 1 - continue - noise_ids.append(a[0]) - noise_lengths.append(float(a[1])) - num_done += 1 -f.close() -noise_num_utts = len(noise_ids) -noise_str = "Warning: There are " + str(num_error) + " noise files length smaller than " + \ - "--min-additive-noise-len, we remove it from the noise list. Now, there are " + \ - str(num_done) + " noise file." 
-sys.stdout.write( noise_str + '\n') - -num_error = 0 -num_done = 0 -# generate the range file for each original wav file -num_variable_error = 0 -num_variable_done = 0 - -for i in range(0, wav_num_utts): - - # check the noise list has enough sample or not - current_wav_len = wav_lengths[i] - max_num_additive_noise = int(current_wav_len / args.min_additive_noise_len) - - if max_num_additive_noise > noise_num_utts: - print( "Warning: The number of noise files or the --min-additive-noise-len is too small" ) - num_variable_error += 1 - continue - - # create a file to record the ranges - f = open(args.range_dir + "/" + str(wav_ids[i]) + ".noiserange", "w") - if f is None: - sys.exit("Error open file " + args.range_dir + "/" + str(wav_ids[i]) + ".noiserange") - - # We generate $num_kind_range ranges - for j in range(0, args.num_kind_range): - # print the perturbed wav id in the beginning of line - print("{0}-{1}".format(wav_ids[i], "perturbed-"+str(j+1)), end=" ", file=f) - - # generate range file - # format: wav_t_start:wav_t_end:noise_name:noise_t_start:noise_t_end:snr, - the_rest = current_wav_len - wav_t_start = 0.0 - wav_t_end = 0.0 - while (the_rest > float(args.min_additive_noise_len)): - # firstly, we randomly choose a kind of noise and snr - noise_index = random.randrange(0, noise_num_utts) - current_noise_name = noise_ids[noise_index] - current_noise_len = noise_lengths[noise_index] - current_snr = random.randrange(args.max_snr, args.min_snr) - - # Secondly, we randomly select a fragement of the noise file. 
- noise_start_bound = float('{:.2f}'.format(current_noise_len - float(args.min_additive_noise_len))) - noise_t_start = float('{:.2f}'.format(random.uniform(0, noise_start_bound))) - noise_end_upperbound = float('{:.2f}'.format(noise_t_start + float(args.min_additive_noise_len))) - noise_end_lowerbound = float('{:.2f}'.format(min((noise_t_start + the_rest), current_noise_len))) - noise_t_end = float('{:.2f}'.format(random.uniform(noise_end_upperbound, noise_end_lowerbound))) - current_noise_length = noise_t_end - noise_t_start - - # Thirdly, we generate the start and end point of wav - wav_t_start = wav_t_end #the new start is the end of the last. - wav_t_end = wav_t_start + current_noise_length - - # Forthly, update the_rest - the_rest = the_rest - current_noise_length - - # Fifthly, print - print("{0}:{1}:{2}:{3}:{4}:{5}".format(wav_t_start, - wav_t_end, - current_noise_name, - noise_t_start, - noise_t_end, - current_snr), - end=",",file=f) - # deal with the bit of wav - # firstly, we randomly choose a kind of noise and snr - noise_index = random.randrange(0, noise_num_utts) - current_noise_name = noise_ids[noise_index] - current_noise_len = noise_lengths[noise_index] - current_snr = random.randrange(args.max_snr, args.min_snr) - - # Secondly, we randomly select a fragement of the noise file. - noise_start_bound = float('{:.2f}'.format(current_noise_len - the_rest)) - noise_t_start = float('{:.2f}'.format(random.uniform(0, noise_start_bound))) - noise_t_end = noise_t_start + the_rest - current_noise_length = noise_t_end - noise_t_start - - # Thirdly, we generate the start and end point of wav - wav_t_start = wav_t_end #the new start is the end of the last. 
- wav_t_end = wav_t_start + current_noise_length - - # Forthly, print - print("{0}:{1}:{2}:{3}:{4}:{5}".format(wav_t_start, - wav_t_end, - current_noise_name, - noise_t_start, - noise_t_end, - current_snr), - file=f) - num_variable_done += 1 - f.close() -print('''generate_fixed_length_range.py: finished generate the range files for all wav. Compare with our expect, it lacks %d files. Now we totally have %d noise range files.''' %(num_variable_error, num_variable_done) ) - From 8a9febf1b0d73cbf9b96389f704fd738e4ed2080 Mon Sep 17 00:00:00 2001 From: LvHang Date: Sat, 31 Dec 2016 01:18:46 -0500 Subject: [PATCH 17/23] Delete lh_add_noise.sh --- .../s5/steps/nnet3/fvector/lh_add_noise.sh | 41 ------------------- 1 file changed, 41 deletions(-) delete mode 100644 egs/wsj/s5/steps/nnet3/fvector/lh_add_noise.sh diff --git a/egs/wsj/s5/steps/nnet3/fvector/lh_add_noise.sh b/egs/wsj/s5/steps/nnet3/fvector/lh_add_noise.sh deleted file mode 100644 index 6b81af6c6ec..00000000000 --- a/egs/wsj/s5/steps/nnet3/fvector/lh_add_noise.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/bash - -# Begin Configuration section -stage=0 -min_additive_noise_len=2 # the minimum duration of each noise file -num_kind_range=4 # the number of kinds of noise ranges -min_snr=0 # the minimum snr value -max_snr=0 # the maximum snr value -seed=-1 # set the random seed - -# End Configuration section - -data=$1 # contain wav.scp -noise=$2 # contain noise.scp -dir=$3 # eg: ranges/ - - -if [ ! -f $data/utt2dur ]; then - # get original clean wav's duration - utils/data/get_utt2dur.sh $data -fi - -if [ ! 
-f $noise/utt2dur ]; then - # get the duration of each noise file - utils/data/get_utt2dur.sh $noise -fi - -mkdir -p $dir/log -if [ $stage -le 0 ]; then - echo "$0: generate $num_kind_rage kinds of noise range for each original wav" - $cmd $dir/log/generate_noise_range.log \ - steps/nnet3/fvector/generate_noise_range.py \ - --num-kind-range=$num_kind_range \ - --min-additive-noise-len=$min_additive_noise_len \ - --min-snr=$min_snr \ - --max-snr=$max_snr \ - --seed=$seed \ - $data/utt2dur $noise/utt2dur $dir -fi - -exit 0 From 954fa90313dacb1d10bd791c04dbf463b59194e5 Mon Sep 17 00:00:00 2001 From: Hang Lyu Date: Mon, 2 Jan 2017 22:00:04 -0500 Subject: [PATCH 18/23] fix utt-id != recording-id --- egs/wsj/s5/steps/nnet3/fvector/add_noise.sh | 27 +++++++++++++++++-- .../nnet3/fvector/generate_noise_range.py | 12 ++++----- .../fvector/generate_perturb_wav_specifier.py | 11 ++++---- 3 files changed, 36 insertions(+), 14 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/fvector/add_noise.sh b/egs/wsj/s5/steps/nnet3/fvector/add_noise.sh index efd648bcbe0..d7836fb5f01 100644 --- a/egs/wsj/s5/steps/nnet3/fvector/add_noise.sh +++ b/egs/wsj/s5/steps/nnet3/fvector/add_noise.sh @@ -40,17 +40,31 @@ fi data=$1 # contain wav.scp noise=$2 # contain noise.scp -dir=$3 # eg: ranges/ +dir=$3 # eg: data/perturbed if [ ! -f $data/utt2dur ]; then + # remove the segments so that the duration corresponding to recording-id + if [ -f $data/segments ]; then + mv $data/segments $data/segments_backup + fi # get original clean wav's duration - utils/data/get_utt2dur.sh $data + utils/data/get_utt2dur.sh $data + if [ -f $data/segments_backup ]; then + mv $data/segments_backup segments + fi fi if [ ! 
-f $noise/utt2dur ]; then + # remove the segments so that the duration corresponding to recording-id + if [ -f $data/segments ]; then + mv $data/segments $data/segments_backup + fi # get the duration of each noise file utils/data/get_utt2dur.sh $noise + if [ -f $data/segments_backup ]; then + mv $data/segments_backup segments + fi fi mkdir -p $dir/log @@ -65,6 +79,15 @@ if [ $stage -le 0 ]; then --variable-len-additive-noise $variable_len_additive_noise \ --seed=$seed \ $data/utt2dur $noise/utt2dur $dir/ranges $dir/wav2perturbedwav + #if the segments is exist + fi +if [ $stage -le 1 ]; then + echo "$0: generate perturbed_wav_specifier" + $cmd $dir/log/generate_perturb_wav_specifier.log \ + steps/nnet3/fvector/generate_perturb_wav_specifier.py \ + --noise=$noise/wav.scp \ + $data/wav.scp $dir/ranges $dir/wav2perturbedwav $dir/perturbed_wav.scp +fi exit 0 diff --git a/egs/wsj/s5/steps/nnet3/fvector/generate_noise_range.py b/egs/wsj/s5/steps/nnet3/fvector/generate_noise_range.py index 714f187b896..ed147b27d40 100644 --- a/egs/wsj/s5/steps/nnet3/fvector/generate_noise_range.py +++ b/egs/wsj/s5/steps/nnet3/fvector/generate_noise_range.py @@ -144,10 +144,10 @@ def GenerateFixedLengthRangeFile(): # We generate $num_ranges_per_wav ranges for j in range(0, args.num_ranges_per_wav): # print the perturbed wav id in the beginning of line - print("{0}-{1}".format(wav_ids[i], "perturbed-"+str(j+1)), end=" ", file=f) + print("{1}-{0}".format(wav_ids[i], "perturbed"+str(j+1)), end=" ", file=f) # print the perturbedwav_id - print(" {0}-{1}".format(wav_ids[i], "perturbed-"+str(j+1)), end="", file=g) + print(" {1}-{0}".format(wav_ids[i], "perturbed"+str(j+1)), end="", file=g) # select a number from [1 ... 
max_num_additive_noise] num_additive_noise = random.randint(1, max_num_additive_noise) @@ -184,7 +184,7 @@ def GenerateFixedLengthRangeFile(): # deal with the last noise, which cover the rest k = num_additive_noise - 1 wav_t_start = float('{:.2f}'.format(k * additive_noise_len)) - wav_t_end = current_wav_len + wav_t_end = float('{:.2f}'.format(current_wav_len)) noise_index = random.randrange(0, num_noise) current_noise_name = noise_ids[noise_index] @@ -246,14 +246,14 @@ def GenerateVariableLengthRangeFile(): # We generate $num_ranges_per_wav ranges for j in range(0, args.num_ranges_per_wav): # print the perturbed wav id in the beginning of line - print("{0}-{1}".format(wav_ids[i], "perturbed-"+str(j+1)), end=" ", file=f) + print("{1}-{0}".format(wav_ids[i], "perturbed"+str(j+1)), end=" ", file=f) # print the perturbedwav_id - print(" {0}-{1}".format(wav_ids[i], "perturbed-"+str(j+1)), end="", file=g) + print(" {1}-{0}".format(wav_ids[i], "perturbed"+str(j+1)), end="", file=g) # generate range file # format: wav_t_start:wav_t_end:noise_name:noise_t_start:noise_t_end:snr, - the_rest = current_wav_len + the_rest = float('{:.2f}'.format(current_wav_len)) wav_t_start = 0.0 wav_t_end = 0.0 while (the_rest > float(args.min_additive_noise_len)): diff --git a/egs/wsj/s5/steps/nnet3/fvector/generate_perturb_wav_specifier.py b/egs/wsj/s5/steps/nnet3/fvector/generate_perturb_wav_specifier.py index 1e4f0775bea..58892782f05 100644 --- a/egs/wsj/s5/steps/nnet3/fvector/generate_perturb_wav_specifier.py +++ b/egs/wsj/s5/steps/nnet3/fvector/generate_perturb_wav_specifier.py @@ -88,12 +88,11 @@ for i in range(1, len(wav_list)): current_perturbed_wav_id = wav_list[i] current_perturbed_wav_index = perturbed_range_ids.index(current_perturbed_wav_id) - print('''{0} {1} nnet3-fvector-perturb-signal --noise-scp=scp:{3} --noise=\" - {4}\" - |'''.format(current_perturbed_wav_id, - wav_extend_files[current_wav_index], - args.noise, - perturbed_range_contents[current_perturbed_wav_index]), - 
file=g) + print('''{0} {1} nnet3-fvector-perturb-signal --noise-scp=scp:{2} --noise=\"{3}\" - |'''.format( + current_perturbed_wav_id, + wav_extended_files[current_wav_index], + args.noise, + perturbed_range_contents[current_perturbed_wav_index]),file=g) g.close() f.close() print("Finished generating the perturb_wav.scp") From 5ad323d6207c2e46beba9b858f3600a355fd5604 Mon Sep 17 00:00:00 2001 From: Hang Lyu Date: Mon, 2 Jan 2017 22:28:47 -0500 Subject: [PATCH 19/23] fix add_noise.sh --- egs/wsj/s5/steps/nnet3/fvector/add_noise.sh | 56 ++++++++++++++------- 1 file changed, 37 insertions(+), 19 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/fvector/add_noise.sh b/egs/wsj/s5/steps/nnet3/fvector/add_noise.sh index d7836fb5f01..b2f6fd7edb1 100644 --- a/egs/wsj/s5/steps/nnet3/fvector/add_noise.sh +++ b/egs/wsj/s5/steps/nnet3/fvector/add_noise.sh @@ -42,31 +42,41 @@ data=$1 # contain wav.scp noise=$2 # contain noise.scp dir=$3 # eg: data/perturbed - -if [ ! -f $data/utt2dur ]; then - # remove the segments so that the duration corresponding to recording-id - if [ -f $data/segments ]; then - mv $data/segments $data/segments_backup - fi - # get original clean wav's duration - utils/data/get_utt2dur.sh $data - if [ -f $data/segments_backup ]; then - mv $data/segments_backup segments +# remove the segments so that the duration corresponding to recording-id +if [ -f $data/segments ]; then + mv $data/segments $data/segments_backup + if [ -f $data/utt2dur ]; then + mv $data/utt2dur $data/utt2dur.backup + utils/data/get_utt2dur.sh $data + else if + utils/data/get_utt2dur.sh $data fi + mv $data/segments_backup $data/segments +else if + if [ ! -f $data/utt2dur ]; then + # get original clean wav's duration + utils/data/get_utt2dur.sh $data + fi fi -if [ ! 
-f $noise/utt2dur ]; then - # remove the segments so that the duration corresponding to recording-id - if [ -f $data/segments ]; then - mv $data/segments $data/segments_backup - fi - # get the duration of each noise file - utils/data/get_utt2dur.sh $noise - if [ -f $data/segments_backup ]; then - mv $data/segments_backup segments +# remove the segments so that the duration corresponding to recording-id +if [ -f $noise/segments ]; then + mv $noise/segments $noise/segments_backup + if [ -f $noise/utt2dur ]; then + mv $noise/utt2dur $noise/utt2dur.backup + utils/data/get_utt2dur.sh $noise + else if + utils/data/get_utt2dur.sh $noise fi + mv $noise/segments_backup $noise/segments +else if + if [ ! -f $noise/utt2dur ]; then + # get original clean wav's duration + utils/data/get_utt2dur.sh $noise + fi fi + mkdir -p $dir/log if [ $stage -le 0 ]; then echo "$0: generate $num_kind_rage kinds of noise range for each original wav" @@ -90,4 +100,12 @@ if [ $stage -le 1 ]; then --noise=$noise/wav.scp \ $data/wav.scp $dir/ranges $dir/wav2perturbedwav $dir/perturbed_wav.scp fi + +if [ -f $data/utt2dur.backup ]; then + mv $data/utt2dur.backup $data/utt2dur +fi +if [ -f $noise/utt2dur.backup ]; then + mv $noise/utt2dur.backup $noise/utt2dur +fi + exit 0 From cef219cf62b3ae9abe7007e86b15c48c9afa0898 Mon Sep 17 00:00:00 2001 From: Hang Lyu Date: Tue, 3 Jan 2017 22:47:41 -0500 Subject: [PATCH 20/23] generate a intergrated dir --- egs/wsj/s5/steps/nnet3/fvector/add_noise.sh | 46 ++++++-- .../steps/nnet3/fvector/apply_map_one2mult.pl | 111 ++++++++++++++++++ .../nnet3/fvector/generate_noise_range.py | 0 .../fvector/generate_perturb_wav_specifier.py | 0 src/fvectorbin/Makefile | 2 +- 5 files changed, 151 insertions(+), 8 deletions(-) mode change 100644 => 100755 egs/wsj/s5/steps/nnet3/fvector/add_noise.sh create mode 100755 egs/wsj/s5/steps/nnet3/fvector/apply_map_one2mult.pl mode change 100644 => 100755 egs/wsj/s5/steps/nnet3/fvector/generate_noise_range.py mode change 100644 => 100755 
egs/wsj/s5/steps/nnet3/fvector/generate_perturb_wav_specifier.py diff --git a/egs/wsj/s5/steps/nnet3/fvector/add_noise.sh b/egs/wsj/s5/steps/nnet3/fvector/add_noise.sh old mode 100644 new mode 100755 index b2f6fd7edb1..9d40d074a3e --- a/egs/wsj/s5/steps/nnet3/fvector/add_noise.sh +++ b/egs/wsj/s5/steps/nnet3/fvector/add_noise.sh @@ -48,11 +48,11 @@ if [ -f $data/segments ]; then if [ -f $data/utt2dur ]; then mv $data/utt2dur $data/utt2dur.backup utils/data/get_utt2dur.sh $data - else if + else utils/data/get_utt2dur.sh $data fi mv $data/segments_backup $data/segments -else if +else if [ ! -f $data/utt2dur ]; then # get original clean wav's duration utils/data/get_utt2dur.sh $data @@ -65,11 +65,11 @@ if [ -f $noise/segments ]; then if [ -f $noise/utt2dur ]; then mv $noise/utt2dur $noise/utt2dur.backup utils/data/get_utt2dur.sh $noise - else if + else utils/data/get_utt2dur.sh $noise fi mv $noise/segments_backup $noise/segments -else if +else if [ ! -f $noise/utt2dur ]; then # get original clean wav's duration utils/data/get_utt2dur.sh $noise @@ -89,8 +89,6 @@ if [ $stage -le 0 ]; then --variable-len-additive-noise $variable_len_additive_noise \ --seed=$seed \ $data/utt2dur $noise/utt2dur $dir/ranges $dir/wav2perturbedwav - #if the segments is exist - fi if [ $stage -le 1 ]; then @@ -98,7 +96,41 @@ if [ $stage -le 1 ]; then $cmd $dir/log/generate_perturb_wav_specifier.log \ steps/nnet3/fvector/generate_perturb_wav_specifier.py \ --noise=$noise/wav.scp \ - $data/wav.scp $dir/ranges $dir/wav2perturbedwav $dir/perturbed_wav.scp + $data/wav.scp $dir/ranges $dir/wav2perturbedwav $dir/wav.scp +fi + +if [ $stage -le 2 ]; then + echo "$0: generate other files in data directory" + #reco2file_and_channel + cat $dir/wav2perturbedwav | cut -d ' ' -f 1 | paste -d ' ' - $dir/wav2perturbedwav > $dir/perturb_recording_map + steps/nnet3/fvector/apply_map_one2mult.pl -f 1 $dir/perturb_recording_map <$data/reco2file_and_channel >$dir/reco2file_and_channel + if [ -f $data/segments ]; 
then + awk -v num=$num_ranges_per_wav '{ + printf("%s %s",$1, $1); + for(i=1; i<= num; i++){ printf(" %s%s-%s","perturb", i, $1); } + printf("\n"); + }' <$data/segments > $dir/perturb_utt_map + cat $dir/perturb_recording_map > $dir/perturb_map + cat $dir/perturb_utt_map >> $dir/perturb_map + #segments + steps/nnet3/fvector/apply_map_one2mult.pl -f 1 $dir/perturb_map <$data/segments >$dir/segments + #text + steps/nnet3/fvector/apply_map_one2mult.pl -f 1 $dir/perturb_map <$data/text >$dir/text + #utt2spk + steps/nnet3/fvector/apply_map_one2mult.pl -f 1 $dir/perturb_map <$data/utt2spk >$dir/utt2spk + #spk2utt + utt2spk_to_spk2utt.pl <$dir/utt2spk | sort > $dir/spk2utt + else #no segments->wav indexed by utterence-id/ is equal to + cp $dir/perturb_recording_map $dir/perturb_map + #segments + steps/nnet3/fvector/apply_map_one2mult.pl -f 1 $dir/perturb_map <$data/segments >$dir/segments + #text + steps/nnet3/fvector/apply_map_one2mult.pl -f 1 $dir/perturb_map <$data/text >$dir/text + #utt2spk + steps/nnet3/fvector/apply_map_one2mult.pl -f 1 $dir/perturb_map <$data/utt2spk >$dir/utt2spk + #spk2utt + utt2spk_to_spk2utt.pl <$dir/utt2spk | sort > $dir/spk2utt + fi fi if [ -f $data/utt2dur.backup ]; then diff --git a/egs/wsj/s5/steps/nnet3/fvector/apply_map_one2mult.pl b/egs/wsj/s5/steps/nnet3/fvector/apply_map_one2mult.pl new file mode 100755 index 00000000000..fbf92e10331 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/fvector/apply_map_one2mult.pl @@ -0,0 +1,111 @@ +#!/usr/bin/env perl +use warnings; #sed replacement for -w perl parameter +# Copyright 2016 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0. + +# This program try to slove the following problem: +# Assume the map is A A1 A2 A3 A4 +# The input is A B C D +# The output is A1 B C D \n A2 B C D \n A3 B C D \n A4 B C D \n +# This is a one2multiple mapping. + +# Attentation: Use ":" to join the post-map. 
+ + +if (@ARGV > 0 && $ARGV[0] eq "-f") { + shift @ARGV; + $field_spec = shift @ARGV; + if ($field_spec =~ m/^\d+$/) { + $field_begin = $field_spec - 1; $field_end = $field_spec - 1; + } + if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesty (properly, 1-10) + if ($1 ne "") { + $field_begin = $1 - 1; # Change to zero-based indexing. + } + if ($2 ne "") { + $field_end = $2 - 1; # Change to zero-based indexing. + } + } + if (!defined $field_begin && !defined $field_end) { + die "Bad argument to -f option: $field_spec"; + } +} + +# Mapping is obligatory +$permissive = 0; +if (@ARGV > 0 && $ARGV[0] eq '--permissive') { + shift @ARGV; + # Mapping is optional (missing key is printed to output) + $permissive = 1; +} + +if(@ARGV != 1) { + print STDERR "Invalid usage: " . join(" ", @ARGV) . "\n"; + print STDERR "Usage: apply_map_one2mult.pl [options] map <input >output\n" . + "options: [-f <field-range> ]\n" . + "Applies the map 'map' to all input text, where each line of the map\n" . + "is interpreted as a map from the first field to the list of the other fields\n" . + "Note: <field-range> can look like 4-5, or 4-, or 5-, or 1, it means the field\n" . + "range in the input to apply the map to.\n" . + "e.g.: echo A B | apply_map.pl a.txt\n" . + "where a.txt is:\n" . + "A A1 A2\n" . + "B B1\n" . + "will produce:\n" . + "A1 B1\n" . + "A2 B1\n"; + exit(1); +} + +($map) = @ARGV; +open(M, "<$map") || die "Error opening map file $map: $!"; + +while (<M>) { + @A = split(" ", $_); + @A >= 1 || die "apply_map.pl: empty line."; + $i = shift @A; + $o = join(":", @A); + $map{$i} = $o; +} + +sub printcontent { + (my $start, my @string)=@_; + + if ( $start == @string ) { print join(" ",@string) . 
"\n"; + } else { + my $tmp = $string[$start]; + my @Word = split(":", $tmp); + if ( @Word != 1) { + foreach(@Word) { + $string[$start] = $_; + $start++; + &printcontent($start, @string); + $start--; + } + } else { + $start++; + &printcontent($start, @string); + } + } +} + +while(<STDIN>) { + @A = split(" ", $_); + for ($x = 0; $x < @A; $x++) { + if ( (!defined $field_begin || $x >= $field_begin) + && (!defined $field_end || $x <= $field_end)) { + $a = $A[$x]; + if (!defined $map{$a}) { + if (!$permissive) { + die "apply_map.pl: undefined key $a\n"; + } else { + print STDERR "apply_map.pl: warning! missing key $a\n"; + } + } else { + $A[$x] = $map{$a}; + } + } + } + # print the content + &printcontent(0,@A); +} diff --git a/egs/wsj/s5/steps/nnet3/fvector/generate_noise_range.py b/egs/wsj/s5/steps/nnet3/fvector/generate_noise_range.py old mode 100644 new mode 100755 diff --git a/egs/wsj/s5/steps/nnet3/fvector/generate_perturb_wav_specifier.py b/egs/wsj/s5/steps/nnet3/fvector/generate_perturb_wav_specifier.py old mode 100644 new mode 100755 diff --git a/src/fvectorbin/Makefile b/src/fvectorbin/Makefile index 7d826881cf6..73c81a4bbb6 100644 --- a/src/fvectorbin/Makefile +++ b/src/fvectorbin/Makefile @@ -6,7 +6,7 @@ include ../kaldi.mk LDFLAGS += $(CUDA_LDFLAGS) LDLIBS += $(CUDA_LDLIBS) -BINFILES = nnet3-fvector-get-egs +BINFILES = nnet3-fvector-get-egs nnet3-fvector-perturb-signal OBJFILES = From c7ebfc4a0f296f947342b210e85d0d4ee4086a20 Mon Sep 17 00:00:00 2001 From: Hang Lyu Date: Mon, 16 Jan 2017 08:55:16 -0500 Subject: [PATCH 21/23] allocate_examples --- egs/wsj/s5/steps/nnet3/fvector/add_noise.sh | 16 +- .../steps/nnet3/fvector/allocate_examples.py | 234 ++++++++++++++++++ 2 files changed, 248 insertions(+), 2 deletions(-) create mode 100755 egs/wsj/s5/steps/nnet3/fvector/allocate_examples.py diff --git a/egs/wsj/s5/steps/nnet3/fvector/add_noise.sh b/egs/wsj/s5/steps/nnet3/fvector/add_noise.sh index 9d40d074a3e..406adda8c1d 100755 --- 
a/egs/wsj/s5/steps/nnet3/fvector/add_noise.sh +++ b/egs/wsj/s5/steps/nnet3/fvector/add_noise.sh @@ -113,7 +113,13 @@ if [ $stage -le 2 ]; then cat $dir/perturb_recording_map > $dir/perturb_map cat $dir/perturb_utt_map >> $dir/perturb_map #segments - steps/nnet3/fvector/apply_map_one2mult.pl -f 1 $dir/perturb_map <$data/segments >$dir/segments + steps/nnet3/fvector/apply_map_one2mult.pl -f 1 $dir/perturb_map <$data/segments | \ + awk '{ + uttid=$1; start_time=$3; end_time=$4; + split(uttid,S,"[_]"); + recordingid=S[1]; + print uttid " " recordingid " " start_time " " end_time + }' >$dir/segments #text steps/nnet3/fvector/apply_map_one2mult.pl -f 1 $dir/perturb_map <$data/text >$dir/text #utt2spk @@ -123,7 +129,13 @@ if [ $stage -le 2 ]; then else #no segments->wav indexed by utterence-id/ is equal to cp $dir/perturb_recording_map $dir/perturb_map #segments - steps/nnet3/fvector/apply_map_one2mult.pl -f 1 $dir/perturb_map <$data/segments >$dir/segments + steps/nnet3/fvector/apply_map_one2mult.pl -f 1 $dir/perturb_map <$data/segments | \ + awk '{ + uttid=$1; start_time=$3; end_time=$4; + split(uttid,S,"[_]"); + recordingid=S[1]; + print uttid " " recordingid " " start_time " " end_time + }' >$dir/segments #text steps/nnet3/fvector/apply_map_one2mult.pl -f 1 $dir/perturb_map <$data/text >$dir/text #utt2spk diff --git a/egs/wsj/s5/steps/nnet3/fvector/allocate_examples.py b/egs/wsj/s5/steps/nnet3/fvector/allocate_examples.py new file mode 100755 index 00000000000..aae6c53c3b1 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/fvector/allocate_examples.py @@ -0,0 +1,234 @@ +#!/usr/bin/env python + +# This script, for use when training fvectors, decides for you which examples +# will come from which utterances, and at what point. + +# You call it as (e.g.) 
+# +# allocate_examples.py --frames-per-chunk=200 --frames-per-iter=1000000 \ +# --num-archives=169 --num-jobs=24 exp/xvector_a/egs/temp/utt2len.train exp/xvector_a/egs +# +# and this program outputs certain things to the temp directory (exp/xvector_a/egs/temp in this case) +# that will enable you to dump the chunks for xvector training. What we'll eventually be doing is invoking +# the following program with something like the following args: +# +# nnet3-fvector-get-egs [options] exp/xvector_a/temp/ranges.1 scp:data/train/feats.scp \ +# ark:exp/xvector_a/egs/egs_temp.1.ark ark:exp/xvector_a/egs/egs_temp.2.ark \ +# ark:exp/xvector_a/egs/egs_temp.3.ark +# +# where exp/xvector_a/temp/ranges.1 contains something like the following: +# +# 0 1 50 200 +# +# where each line is interpreted as follows: +# +# +# Note: is the zero-based offset of the archive-index +# within the subset of archives that a particular ranges file corresponds to; +# and is the 1-based numeric index of the destination +# archive among the entire list of archives, which will form part of the +# archive's filename (e.g. egs/egs..ark); +# is only kept for debug purposes so you can see which +# archive each line corresponds to. +# +# The list of archives corresponding to ranges.n will be written to output.n, +# so in exp/xvector_a/temp/outputs.1 we'd have: +# +# ark:exp/xvector_a/egs/egs_temp.1.ark ark:exp/xvector_a/egs/egs_temp.2.ark ark:exp/xvector_a/egs/egs_temp.3.ark +# +# The number of these files will equal 'num-jobs'. If you add up the word-counts of +# all the outputs.* files you'll get 'num-archives'. The number of frames in each archive +# will be about the --frames-per-iter. 
+# + +# we're using python 3.x style print but want it to work in python 2.x, +from __future__ import print_function +import re, os, argparse, sys, math, warnings, random + + +parser = argparse.ArgumentParser(description="Writes ranges.*, outputs.* and archive_chunk_lengths files " + "in preparation for dumping egs for xvector training.", + epilog="Called by steps/nnet3/xvector/get_egs.sh") +parser.add_argument("--prefix", type=str, default="", + help="Adds a prefix to the output files. This is used to distinguish between the train " + "and diagnostic files.") +parser.add_argument("--frames-per-chunk", type=int, default=100, + help="The number of frames-per-chunk used for any archive") +parser.add_argument("--frames-per-iter", type=int, default=1000000, + help="Target number of frames for each archive") +parser.add_argument("--num-archives", type=int, default=-1, + help="Number of archives to write") +parser.add_argument("--num-jobs", type=int, default=-1, + help="Number of jobs we're going to use to write the archives; the ranges.* " + "and outputs.* files are indexed by job. Must be <= the --num-archives option.") +parser.add_argument("--seed", type=int, default=1, + help="Seed for random number generator") + +# now the positional arguments +parser.add_argument("utt2len", + help="utt2len file of the features to be used as input (format is: " + " )") +parser.add_argument("oriutt2allutt", + help="oriutt2allutt to be used as input (format is: " + " ... )") +parser.add_argument("egs_dir", + help="Name of egs directory, e.g. exp/xvector_a/egs") + +print(' '.join(sys.argv)) + +args = parser.parse_args() + +if not os.path.exists(args.egs_dir + "/temp"): + os.makedirs(args.egs_dir + "/temp") + +## Check arguments. 
+if args.frames_per_chunk <= 1: + sys.exit("--frames-per-chunk is invalid.") +if args.frames_per_iter < 1000: + sys.exit("--frames-per-iter is invalid.") +if args.num_archives < 1: + sys.exit("--num-archives is invalid") +if args.num_jobs > args.num_archives: + sys.exit("--num-jobs is invalid (must not exceed num-archives)") + +random.seed(args.seed) + +f = open(args.utt2len, "r"); +if f is None: + sys.exit("Error opening utt2len file " + str(args.utt2len)); +utt_ids = [] +lengths = [] +for line in f: + a = line.split() + if len(a) != 2: + sys.exit("bad line in utt2len file " + line); + utt_ids.append(a[0]) + lengths.append(int(a[1])) +f.close() + +num_utts = len(utt_ids) +max_length = max(lengths) + +if args.frames_per_chunk * 3 > max_length: + sys.exit("--frames-per-chunk={0} is not valid: it must be no more " + "than a third of the maximum length {1} from the utt2len file ".format( + args.frames_per_chunk, max_length)) + +# create the map from ori-utt-id to all kinds of utt-id. The ori-utt-id is the +# index, which is same with the elements in utt_ids[] +f = open(args.oriutt2allutt, "r"); +if f is None: + sys.exit("Error opening oriutt2allutt file " + str(args.oriutt2allutt)); +utt_map = {} +for line in f: + a = line.split() + if len(a) < 3: + sys.exit("bad line in oriutt2allutt file " + line); + tmp_list = [] + for i in range(1, len(a)): + tmp_list.append(a[i]) + tuple_list = tuple(tmp_list) + utt_map[a[0]]=tuple_list +f.close() + + +# this function returns a random integer utterance index, limited to utterances +# above a minimum length in frames, with probability proportional to its length. +def RandomUttAtLeastThisLong(min_length): + while True: + i = random.randrange(0, num_utts) + # read the next line as 'with probability lengths[i] / max_length'. + # this allows us to draw utterances with probability with + # prob proportional to their length. 
+ if lengths[i] > min_length and random.random() < lengths[i] / float(max_length): + return i + + +# given an utterance length utt_length (in frames) and a desired chunk length +# (length) that is <= utt_length, +# this function randomly picks the starting frame of the chunk for you. +# the returned offset is uniform over [0, utt_length - length]. +def GetRandomOffsets(utt_length, length): + if length > utt_length: + sys.exit("code error: tot-length > utt-length") + free_length = utt_length - length + offset = random.randrange(0, free_length + 1) + return offset + + +# this function randomly chooses two distinct utt-ids from utt_map depending on ori-utt-id +def ChoosePairs(ori_utt_id): + this_tuple = utt_map[ori_utt_id] + while True: + first_index = random.randint(0, len(this_tuple) - 1) + second_index = random.randint(0, len(this_tuple) - 1) + if first_index != second_index: + break + utt_a = this_tuple[first_index] + utt_b = this_tuple[second_index] + return (utt_a, utt_b) + + +# each element of all_egs (one per archive) is +# an array of 2-tuples (utterance-index, offset) +all_egs= [] + +prefix = "" +if args.prefix != "": + prefix = args.prefix + "_" + +for archive_index in range(args.num_archives): + tot_length = 2 * args.frames_per_chunk + this_num_egs = (args.frames_per_iter // tot_length) + 1 + this_egs = [ ] # this will be an array of 2-tuples (utterance-index, start-frame). + for n in range(this_num_egs): + utt_index = RandomUttAtLeastThisLong(args.frames_per_chunk) + utt_len = lengths[utt_index] + offset = GetRandomOffsets(utt_len, args.frames_per_chunk) + this_egs.append( (utt_index, offset) ) + all_egs.append(this_egs) + +# work out how many archives we assign to each job in an equitable way. 
+num_archives_per_job = [ 0 ] * args.num_jobs +for i in range(0, args.num_archives): + num_archives_per_job[i % args.num_jobs] = num_archives_per_job[i % args.num_jobs] + 1 + + +cur_archive = 0 +for job in range(args.num_jobs): + this_ranges = [] + this_archives_for_job = [] + this_num_archives = num_archives_per_job[job] + + for i in range(0, this_num_archives): + this_archives_for_job.append(cur_archive) + for (utterance_index, offset) in all_egs[cur_archive]: + this_ranges.append( (utterance_index, i, offset) ) + cur_archive = cur_archive + 1 + f = open(args.egs_dir + "/temp/" + prefix + "ranges." + str(job + 1), "w") + if f is None: + sys.exit("Error opening file " + args.egs_dir + "/temp/" + prefix + "ranges." + str(job + 1)) + for (utterance_index, i, offset) in sorted(this_ranges): + archive_index = this_archives_for_job[i] + this_utt_id = utt_ids[utterance_index] + #Random select two utt-id + (utt_a, utt_b) = ChoosePairs(this_utt_id) + print("{0} {1} {2} {3} {4} {5}".format(utt_a, + utt_b, + i, + archive_index + 1, + offset + args.frames_per_chunk, + file=f) + f.close() + + f = open(args.egs_dir + "/temp/" + prefix + "outputs." + str(job + 1), "w") + if f is None: + sys.exit("Error opening file " + args.egs_dir + "/temp/" + prefix + "outputs." 
+ str(job + 1)) + print( " ".join([ str("{0}/" + prefix + "egs_temp.{1}.ark").format(args.egs_dir, n + 1) for n in this_archives_for_job ]), + file=f) + f.close() + + +print("allocate_examples.py: finished generating " + prefix + "ranges.* and " + prefix + "outputs.* files") + From 43db796d5297ed478454eb6641eae91ca8f8961e Mon Sep 17 00:00:00 2001 From: Hang Lyu Date: Tue, 17 Jan 2017 03:30:53 -0500 Subject: [PATCH 22/23] get egs C++ code --- src/fvectorbin/nnet3-fvector-get-egs.cc | 242 +++++++++++++++--------- 1 file changed, 149 insertions(+), 93 deletions(-) diff --git a/src/fvectorbin/nnet3-fvector-get-egs.cc b/src/fvectorbin/nnet3-fvector-get-egs.cc index 2f7fdbfa748..dd05e1efe56 100644 --- a/src/fvectorbin/nnet3-fvector-get-egs.cc +++ b/src/fvectorbin/nnet3-fvector-get-egs.cc @@ -1,6 +1,6 @@ // fvectorbin/nnet3-fvector-get-egs.cc -// Copyright 2016 Johns Hopkins University (author: Daniel Povey) +// Copyright 2012-2016 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // @@ -19,67 +19,71 @@ #include -#include "base/kaldi-common.h" #include "util/common-utils.h" -#include "hmm/transition-model.h" -#include "hmm/posterior.h" #include "nnet3/nnet-example.h" namespace kaldi { namespace nnet3 { - -static void ProcessFile(const MatrixBase &feats, - const std::string &utt_id, - bool compress, - int32 left_context, - int32 right_context, - int32 frames_per_eg, - int64 *num_frames_written, - int64 *num_egs_written, - NnetExampleWriter *example_writer) { - for (int32 t = 0; t < feats.NumRows(); t += frames_per_eg) { - - // actual_frames_per_eg is the number of frames in center. - // At the end of the file we pad with zero posteriors - // so that all examples have the same structure (prevents the need - // for recompilations). 
- int32 actual_frames_per_eg = std::min(frames_per_eg, - feats.NumRows() - t); - - int32 tot_frames = left_context + frames_per_eg + right_context; - - Matrix input_frames(tot_frames, feats.NumCols(), kUndefined); - - // Set up "input_frames". - for (int32 j = -left_context; j < frames_per_eg + right_context; j++) { - int32 t2 = j + t; - if (t2 < 0) t2 = 0; - if (t2 >= feats.NumRows()) t2 = feats.NumRows() - 1; - SubVector src(feats, t2), - dest(input_frames, j + left_context); - dest.CopyFromVec(src); +// A struct for holding information about the position and +// duration of each pair of chunks. +struct FvectorChunkPairInfo { + std::string pair_name; + std::string utt_a; + std::string utt_b; + int32 output_archive_id; + int32 start_frame; + int32 num_frames; +}; + +// Process the range input file and store it as a map from utterance +// name to vector of ChunkPairInfo structs. +static void ProcessRangeFile(const std::string &range_rxfilename, + std::vector *pairs) { + Input range_input(range_rxfilename); + if (!range_rxfilename.empty()) { + std::string line; + while (std::getline(range_input.Stream(), line)) { + FvectorChunkPairInfo *pair = new FvectorChunkPairInfo(); + std::vector fields; + SplitStringToVector(line, " \t\n\r", true, &fields); + if (fields.size() != 6) { + KALDI_ERR << "Expected 6 fields in line of range file, got " + << fields.size() << " instead."; + } + + std::string utt_a = fields[0], + utt_b = fields[1], + start_frame_str = fields[4], + num_frames_str = fields[5]; + + if (!ConvertStringToInteger(fields[2], &(pair->output_archive_id)) || + !ConvertStringToInteger(start_frame_str, &(pair->start_frame)) || + !ConvertStringToInteger(num_frames_str, &(pair->num_frames))) { + KALDI_ERR << "Expected integer for output archive in range file."; + } + pair->pair_name = utt_a + "-" + utt_b + "-" + start_frame_str + "-" + + num_frames_str; + pair->utt_a = utt_a; + pair->utt_b = utt_b; + + pairs->push_back(pair); } - - NnetExample eg; - - // call the 
regular input "input". - eg.io.push_back(NnetIo("input", -left_context, input_frames)); - - if (compress) { eg.Compress();} - - std::ostringstream os; - os << utt_id << "-" << t; - - std::string key = os.str(); // key is - - - *num_frames_written += actual_frames_per_eg; - *num_egs_written += 1; - - example_writer->Write(key, eg); } } +// Delete the dynamically allocated memory. +static void Cleanup(std::vector *pairs, + std::vector *writers) { + for (std::vector::iterator + it = writers->begin(); it != writers->end(); ++it) { + delete *it; + } + for (std::vector::iterator it = pairs->begin(); + it != pairs->end(); ++it) { + delete *it; + } +} } // namespace nnet3 } // namespace kaldi @@ -89,65 +93,117 @@ int main(int argc, char *argv[]) { using namespace kaldi; using namespace kaldi::nnet3; typedef kaldi::int32 int32; - typedef kaldi::int64 int64; const char *usage = - "Get frame-by-frame examples of data for nnet3 neural network training.\n" - "Essentially this is a format change from features into a special frame-by-frame format.\n" - "This program handles the common case where you have some input features\n" - "and convert them to fvector examples format\n" - "Note: In fvector version, there is no need for iVectors, posterior and labels.\n" + "Get examples for training an nnet3 neural network for the fvector\n" + "system. Each output example contains a pair of feature chunks from\n" + "the specified utterance. The location and length of the feature chunks\n" + "are specified in the 'ranges' file. Each line is interpreted as\n" + "follows:\n" + " " + " " + " \n" + "where is interpreted as a zero-based\n" + "index into the wspecifiers specified on the command line (\n" + "and so on), and is ignored by this program.\n" + "For example:\n" + " utt1-p1 utt1-p2 3 13 5 65\n" + " utt2 utt2-pn 0 10 160 50\n" "\n" - "Usage: nnet3-fvector-get-egs [options] \n" + "Usage: nnet3-fvector-get-egs [options] " + " ... 
\n" "\n" - "An example [where $feats expands to the actual features]:\n" - "nnet3-fvector-get-egs --left-context=12 --right-context=9 --compress=true \"$feats\" \\\n" - "\"ark:train.egs\"\n"; - + "For example:\n" + "nnet3-fvector-get-egs ranges.1 \"$feats\" ark:egs_temp.1.ark" + " ark:egs_temp.2.ark ark:egs_temp.3.ark\n"; bool compress = true; - int32 left_context = 0, right_context = 0, num_frames = 1; - + ParseOptions po(usage); po.Register("compress", &compress, "If true, write egs in " "compressed format."); - po.Register("left-context", &left_context, "Number of frames of left " - "context the neural net requires."); - po.Register("right-context", &right_context, "Number of frames of right " - "context the neural net requires."); - po.Register("num-frames", &num_frames, "Number of frames is central"); - + po.Read(argc, argv); - if (po.NumArgs() != 2) { + if (po.NumArgs() < 3) { po.PrintUsage(); exit(1); } - std::string feature_rspecifier = po.GetArg(1), - examples_wspecifier = po.GetArg(2); - - // Read in all the training files. 
- SequentialBaseFloatMatrixReader feat_reader(feature_rspecifier); - NnetExampleWriter example_writer(examples_wspecifier); - - int32 num_done = 0; - int64 num_frames_written = 0, num_egs_written = 0; - - for (; !feat_reader.Done(); feat_reader.Next()) { - std::string key = feat_reader.Key(); - const Matrix &feats = feat_reader.Value(); - ProcessFile(feats, key, compress, left_context, right_context, - num_frames, &num_frames_written, &num_egs_written, - &example_writer); - num_done++; + std::string range_rspecifier = po.GetArg(1); + std::string feature_rspecifier = po.GetArg(2); + std::vector example_writers; + + for (int32 i = 3; i <= po.NumArgs(); i++) { + example_writers.push_back(new NnetExampleWriter(po.GetArg(i))); + } + + std::vector pairs; + // deal with the ranges file and initalize the vector + ProcessRangeFile(range_rspecifier, &pairs); + + RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier); + + int32 num_error = 0, + num_egs_written = 0; + + for (std::vector::iterator iter = pairs.begin(); + iter != pairs.end(); iter++) { + + FvectorChunkPairInfo *pair = *iter; + // get the features + if (!feature_reader.HasKey(pair->utt_a) || !feature_reader.HasKey(pair->utt_b)) { + num_error++; + KALDI_WARN << "The feature " << pair->utt_a << " or " << pair->utt_b + << " is not found."; + continue; + } + const Matrix &feats_a = feature_reader.Value(pair->utt_a); + const Matrix &feats_b = feature_reader.Value(pair->utt_b); + int32 num_rows = feats_a.NumRows(), + feat_dim = feats_a.NumCols(); + if (num_rows < (pair->start_frame + pair->num_frames)) { + num_error++; + KALDI_WARN << "Unable to create examples for utterance " << pair->pair_name + << ". 
Requested chunk boundary is the " + << (pair->start_frame + pair->num_frames) + << "th frmae, but utterance has only " << num_rows << " frames."; + continue; + } else { + SubMatrix chunk1(feats_a, pair->start_frame, + pair->num_frames, 0, feat_dim), + chunk2(feats_b, pair->start_frame, + pair->num_frames, 0, feat_dim); + NnetIo nnet_io1 = NnetIo("input", 0, chunk1), + nnet_io2 = NnetIo("input", 0, chunk2); + for (std::vector::iterator indx_it = nnet_io1.indexes.begin(); + indx_it != nnet_io1.indexes.end(); ++indx_it) { + indx_it->n = 0; + } + for (std::vector::iterator indx_it = nnet_io2.indexes.begin(); + indx_it != nnet_io2.indexes.end(); ++indx_it) { + indx_it->n = 1; + } + NnetExample eg; + eg.io.push_back(nnet_io1); + eg.io.push_back(nnet_io2); + if (compress) + eg.Compress(); + + if (pair->output_archive_id >= example_writers.size()) { + KALDI_ERR << "Requested output index exceeds number of specified " + << "output files."; + } + example_writers[pair->output_archive_id]->Write(pair->pair_name, eg); + num_egs_written += 1; + } } + Cleanup(&pairs, &example_writers); KALDI_LOG << "Finished generating examples, " - << "successfully processed " << num_done - << " feature files, wrote " << num_egs_written << " examples, " - << " with " << num_frames_written << " egs in total."; - return (num_egs_written == 0 || num_done == 0 ? 
1 : 0); + << "successfully wrote " << num_egs_written << " examples; " + << num_error << " files had errors."; + return (num_egs_written == 0); } catch(const std::exception &e) { std::cerr << e.what() << '\n'; return -1; From 752cf66fd15c1f1d06013d0985be45dd4cee942a Mon Sep 17 00:00:00 2001 From: Hang Lyu Date: Mon, 30 Jan 2017 03:45:24 -0500 Subject: [PATCH 23/23] now it can generate the egs --- egs/wsj/s5/steps/nnet3/fvector/add_noise.sh | 156 ++++++++++++------ .../steps/nnet3/fvector/allocate_examples.py | 18 +- .../nnet3/fvector/generate_noise_range.py | 8 +- .../fvector/generate_perturb_wav_specifier.py | 2 +- src/fvectorbin/Makefile | 2 +- src/fvectorbin/nnet3-fvector-get-egs.cc | 21 ++- .../nnet3-fvector-perturb-signal.cc | 49 ++++-- tools/config/common_path.sh | 1 + 8 files changed, 169 insertions(+), 88 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/fvector/add_noise.sh b/egs/wsj/s5/steps/nnet3/fvector/add_noise.sh index 406adda8c1d..205c775e730 100755 --- a/egs/wsj/s5/steps/nnet3/fvector/add_noise.sh +++ b/egs/wsj/s5/steps/nnet3/fvector/add_noise.sh @@ -11,7 +11,7 @@ # Begin Configuration section. stage=0 cmd=run.pl -nj=4 +nj=8 # Begain Configuration. min_additive_noise_len=2.0 # the minimum duration of each noise file in seconds. num_ranges_per_wav=4 # the number of noise ranges for each wav. @@ -20,6 +20,13 @@ max_snr=-15 # the maximum snr value in dB. seed=-1 # set the random seed. variable_len_additive_noise=true #If true, generate the variable-length range files. #If false, generate the fixed-length range files. +# Begin Configuration of section 6 +# for the details, please see steps/nnet3/fvector/get_egs.sh +frames_per_chunk=200 +frames_per_iter=1000000 +frames_per_iter_diagnostic=1000000 +num_diagnostic_archives=3 +num_heldout_utts=500 # End Configuration options. echo "$0 $@" # Print the command line for logging @@ -27,59 +34,63 @@ echo "$0 $@" # Print the command line for logging [ -f path.sh ] && . ./path.sh # source the path. . 
parse_options.sh || exit 1; -if [ $# != 3 ]; then - echo "usage: steps/nnet3/fvector/add_noise.sh " - echo "e.g.: steps/nnet3/fvector/add_noise.sh data/train data/noise ranges" +if [ $# != 4 ]; then + echo "usage: steps/nnet3/fvector/add_noise.sh " + echo "e.g.: steps/nnet3/fvector/add_noise.sh data/train data/noise data/perturbed exp/fvector_a/egs" echo "main options (for others, see top of script file)" echo " --min-additive-noise-len # limit the minimum length of noise" echo " --num-ranges-per-wav # number of noise range kinds" echo " --variable-len-additive-noise (true|false) # decide fixed/variable version" echo " --nj # number of parallel jobs" echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs" + exit 1 fi data=$1 # contain wav.scp noise=$2 # contain noise.scp dir=$3 # eg: data/perturbed +exp=$4 # the directory used to store the egs -# remove the segments so that the duration corresponding to recording-id -if [ -f $data/segments ]; then - mv $data/segments $data/segments_backup - if [ -f $data/utt2dur ]; then - mv $data/utt2dur $data/utt2dur.backup - utils/data/get_utt2dur.sh $data +if [ $stage -le 0 ];then + echo "The 1st stage: generating the duration file for each recording" + # remove the segments so that the duration corresponding to recording-id + if [ -f $data/segments ]; then + mv $data/segments $data/segments_backup + if [ -f $data/utt2dur ]; then + mv $data/utt2dur $data/utt2dur.backup + utils/data/get_utt2dur.sh $data + else + utils/data/get_utt2dur.sh $data + fi + mv $data/segments_backup $data/segments else - utils/data/get_utt2dur.sh $data + if [ ! -f $data/utt2dur ]; then + # get original clean wav's duration + utils/data/get_utt2dur.sh $data + fi fi - mv $data/segments_backup $data/segments -else - if [ ! 
-f $data/utt2dur ]; then - # get original clean wav's duration - utils/data/get_utt2dur.sh $data - fi -fi -# remove the segments so that the duration corresponding to recording-id -if [ -f $noise/segments ]; then - mv $noise/segments $noise/segments_backup - if [ -f $noise/utt2dur ]; then - mv $noise/utt2dur $noise/utt2dur.backup - utils/data/get_utt2dur.sh $noise + # remove the segments so that the duration corresponding to recording-id + if [ -f $noise/segments ]; then + mv $noise/segments $noise/segments_backup + if [ -f $noise/utt2dur ]; then + mv $noise/utt2dur $noise/utt2dur.backup + utils/data/get_utt2dur.sh $noise + else + utils/data/get_utt2dur.sh $noise + fi + mv $noise/segments_backup $noise/segments else - utils/data/get_utt2dur.sh $noise + if [ ! -f $noise/utt2dur ]; then + # get original clean wav's duration + utils/data/get_utt2dur.sh $noise + fi fi - mv $noise/segments_backup $noise/segments -else - if [ ! -f $noise/utt2dur ]; then - # get original clean wav's duration - utils/data/get_utt2dur.sh $noise - fi fi - mkdir -p $dir/log -if [ $stage -le 0 ]; then - echo "$0: generate $num_kind_rage kinds of noise range for each original wav" +if [ $stage -le 1 ]; then + echo "The 2nd stage: generate $num_ranges_per_wav kinds of noise range for each original wav" $cmd $dir/log/generate_noise_range.log \ steps/nnet3/fvector/generate_noise_range.py \ --num-ranges-per-wav=$num_ranges_per_wav \ @@ -91,60 +102,85 @@ if [ $stage -le 0 ]; then $data/utt2dur $noise/utt2dur $dir/ranges $dir/wav2perturbedwav fi -if [ $stage -le 1 ]; then - echo "$0: generate perturbed_wav_specifier" +if [ $stage -le 2 ]; then + echo "The 3rd stage: generate perturbed_wav_specifier" $cmd $dir/log/generate_perturb_wav_specifier.log \ steps/nnet3/fvector/generate_perturb_wav_specifier.py \ --noise=$noise/wav.scp \ - $data/wav.scp $dir/ranges $dir/wav2perturbedwav $dir/wav.scp + $data/wav.scp $dir/ranges $dir/wav2perturbedwav $dir/tmp.wav.scp + cat $dir/tmp.wav.scp | sort > $dir/wav.scp + rm 
-f $dir/tmp.wav.scp fi -if [ $stage -le 2 ]; then - echo "$0: generate other files in data directory" +if [ $stage -le 3 ]; then + echo "The 4th stage: generate other files in data directory" #reco2file_and_channel - cat $dir/wav2perturbedwav | cut -d ' ' -f 1 | paste -d ' ' - $dir/wav2perturbedwav > $dir/perturb_recording_map - steps/nnet3/fvector/apply_map_one2mult.pl -f 1 $dir/perturb_recording_map <$data/reco2file_and_channel >$dir/reco2file_and_channel + cat $dir/wav2perturbedwav | cut -d ' ' -f 1 | paste -d ' ' - $dir/wav2perturbedwav > $dir/.perturb_recording_map + steps/nnet3/fvector/apply_map_one2mult.pl -f 1 $dir/.perturb_recording_map <$data/reco2file_and_channel >$dir/reco2file_and_channel if [ -f $data/segments ]; then awk -v num=$num_ranges_per_wav '{ printf("%s %s",$1, $1); - for(i=1; i<= num; i++){ printf(" %s%s-%s","perturb", i, $1); } + for(i=1; i<= num; i++){ printf(" %s-%s%s", $1, "perturbed", i); } printf("\n"); - }' <$data/segments > $dir/perturb_utt_map - cat $dir/perturb_recording_map > $dir/perturb_map - cat $dir/perturb_utt_map >> $dir/perturb_map + }' <$data/segments > $dir/.perturb_utt_map + cat $dir/.perturb_recording_map > $dir/.perturb_map + cat $dir/.perturb_utt_map >> $dir/.perturb_map + cp $dir/.perturb_utt_map $dir/uniq2utt #segments - steps/nnet3/fvector/apply_map_one2mult.pl -f 1 $dir/perturb_map <$data/segments | \ + steps/nnet3/fvector/apply_map_one2mult.pl -f 1 $dir/.perturb_map <$data/segments | \ awk '{ uttid=$1; start_time=$3; end_time=$4; split(uttid,S,"[_]"); - recordingid=S[1]; + if ( S[2] ~/.*-perturb.*$/ ) { + split(S[2],S1,"[-]"); + recordingid=(S[1]"-"S1[3]); + } else { + recordingid=S[1]; + } print uttid " " recordingid " " start_time " " end_time }' >$dir/segments #text - steps/nnet3/fvector/apply_map_one2mult.pl -f 1 $dir/perturb_map <$data/text >$dir/text + steps/nnet3/fvector/apply_map_one2mult.pl -f 1 $dir/.perturb_map <$data/text | sort >$dir/text #utt2spk - steps/nnet3/fvector/apply_map_one2mult.pl -f 1 
$dir/perturb_map <$data/utt2spk >$dir/utt2spk + steps/nnet3/fvector/apply_map_one2mult.pl -f 1 $dir/.perturb_map <$data/utt2spk | sort >$dir/utt2spk #spk2utt utt2spk_to_spk2utt.pl <$dir/utt2spk | sort > $dir/spk2utt else #no segments->wav indexed by utterence-id/ is equal to - cp $dir/perturb_recording_map $dir/perturb_map + cp $dir/.perturb_recording_map $dir/.perturb_map + cp $dir/.perturb_map $dir/uniq2utt #segments - steps/nnet3/fvector/apply_map_one2mult.pl -f 1 $dir/perturb_map <$data/segments | \ + steps/nnet3/fvector/apply_map_one2mult.pl -f 1 $dir/.perturb_map <$data/segments | \ awk '{ uttid=$1; start_time=$3; end_time=$4; split(uttid,S,"[_]"); - recordingid=S[1]; + if ( S[2] ~/.*-perturb.*$/ ) { + split(S[2],S1,"[-]"); + recordingid=(S[1]"-"S1[3]); + } else { + recordingid=S[1]; + } print uttid " " recordingid " " start_time " " end_time - }' >$dir/segments + }' | sort >$dir/segments #text - steps/nnet3/fvector/apply_map_one2mult.pl -f 1 $dir/perturb_map <$data/text >$dir/text + steps/nnet3/fvector/apply_map_one2mult.pl -f 1 $dir/.perturb_map <$data/text | sort >$dir/text #utt2spk - steps/nnet3/fvector/apply_map_one2mult.pl -f 1 $dir/perturb_map <$data/utt2spk >$dir/utt2spk + steps/nnet3/fvector/apply_map_one2mult.pl -f 1 $dir/.perturb_map <$data/utt2spk | sort >$dir/utt2spk #spk2utt utt2spk_to_spk2utt.pl <$dir/utt2spk | sort > $dir/spk2utt fi fi +#Now, we have already finished generating the perturbed data directory. +if [ $stage -le 4 ]; then + echo "The 5th stage: make features." + mfccdir=mfcc + dir_basename=$(basename $dir) + steps/make_mfcc.sh --nj 50 --cmd "$cmd" \ + $dir exp/make_mfcc/$dir_basename $mfccdir + steps/compute_cmvn_stats.sh $dir exp/make_mfcc/$dir_basename $mfccdir +fi + +#restore if [ -f $data/utt2dur.backup ]; then mv $data/utt2dur.backup $data/utt2dur fi @@ -152,4 +188,14 @@ if [ -f $noise/utt2dur.backup ]; then mv $noise/utt2dur.backup $noise/utt2dur fi +if [ $stage -le 5 ]; then + echo "The 6th stage: generate egs." 
+ steps/nnet3/fvector/get_egs.sh \ + --frames-per-chunk $frames_per_chunk \ + --frames-per-iter $frames_per_iter \ + --frames-per-iter-diagnostic $frames_per_iter_diagnostic \ + --num-diagnostic-archives $num_diagnostic_archives \ + --num-heldout-utts $num_heldout_utts \ + $dir $exp +fi exit 0 diff --git a/egs/wsj/s5/steps/nnet3/fvector/allocate_examples.py b/egs/wsj/s5/steps/nnet3/fvector/allocate_examples.py index aae6c53c3b1..219356e7388 100755 --- a/egs/wsj/s5/steps/nnet3/fvector/allocate_examples.py +++ b/egs/wsj/s5/steps/nnet3/fvector/allocate_examples.py @@ -6,17 +6,17 @@ # You call it as (e.g.) # # allocate_examples.py --frames-per-chunk=200 --frames-per-iter=1000000 \ -# --num-archives=169 --num-jobs=24 exp/xvector_a/egs/temp/utt2len.train exp/xvector_a/egs +# --num-archives=169 --num-jobs=24 exp/fvector_a/egs/temp/utt2len.train exp/fvector_a/egs # # and this program outputs certain things to the temp directory (exp/xvector_a/egs/temp in this case) # that will enable you to dump the chunks for xvector training. What we'll eventually be doing is invoking # the following program with something like the following args: # -# nnet3-fvector-get-egs [options] exp/xvector_a/temp/ranges.1 scp:data/train/feats.scp \ -# ark:exp/xvector_a/egs/egs_temp.1.ark ark:exp/xvector_a/egs/egs_temp.2.ark \ -# ark:exp/xvector_a/egs/egs_temp.3.ark +# nnet3-fvector-get-egs [options] exp/fvector_a/temp/ranges.1 scp:data/train/feats.scp \ +# ark:exp/fvector_a/egs/egs_temp.1.ark ark:exp/fvector_a/egs/egs_temp.2.ark \ +# ark:exp/fvector_a/egs/egs_temp.3.ark # -# where exp/xvector_a/temp/ranges.1 contains something like the following: +# where exp/fvector_a/temp/ranges.1 contains something like the following: # # 0 1 50 200 # @@ -32,9 +32,9 @@ # archive each line corresponds to. 
# # The list of archives corresponding to ranges.n will be written to output.n, -# so in exp/xvector_a/temp/outputs.1 we'd have: +# so in exp/fvector_a/temp/outputs.1 we'd have: # -# ark:exp/xvector_a/egs/egs_temp.1.ark ark:exp/xvector_a/egs/egs_temp.2.ark ark:exp/xvector_a/egs/egs_temp.3.ark +# ark:exp/fvector_a/egs/egs_temp.1.ark ark:exp/fvector_a/egs/egs_temp.2.ark ark:exp/fvector_a/egs/egs_temp.3.ark # # The number of these files will equal 'num-jobs'. If you add up the word-counts of # all the outputs.* files you'll get 'num-archives'. The number of frames in each archive @@ -217,8 +217,8 @@ def ChoosePairs(ori_utt_id): utt_b, i, archive_index + 1, - offset - args.frames_per_chunk, + offset, + args.frames_per_chunk), file=f) f.close() diff --git a/egs/wsj/s5/steps/nnet3/fvector/generate_noise_range.py b/egs/wsj/s5/steps/nnet3/fvector/generate_noise_range.py index ed147b27d40..e29359b8e9b 100755 --- a/egs/wsj/s5/steps/nnet3/fvector/generate_noise_range.py +++ b/egs/wsj/s5/steps/nnet3/fvector/generate_noise_range.py @@ -144,10 +144,10 @@ def GenerateFixedLengthRangeFile(): # We generate $num_ranges_per_wav ranges for j in range(0, args.num_ranges_per_wav): # print the perturbed wav id in the beginning of line - print("{1}-{0}".format(wav_ids[i], "perturbed"+str(j+1)), end=" ", file=f) + print("{0}-{1}".format(wav_ids[i], "perturbed"+str(j+1)), end=" ", file=f) # print the perturbedwav_id - print(" {1}-{0}".format(wav_ids[i], "perturbed"+str(j+1)), end="", file=g) + print(" {0}-{1}".format(wav_ids[i], "perturbed"+str(j+1)), end="", file=g) # select a number from [1 ... 
max_num_additive_noise] num_additive_noise = random.randint(1, max_num_additive_noise) @@ -246,10 +246,10 @@ def GenerateVariableLengthRangeFile(): # We generate $num_ranges_per_wav ranges for j in range(0, args.num_ranges_per_wav): # print the perturbed wav id in the beginning of line - print("{1}-{0}".format(wav_ids[i], "perturbed"+str(j+1)), end=" ", file=f) + print("{0}-{1}".format(wav_ids[i], "perturbed"+str(j+1)), end=" ", file=f) # print the perturbedwav_id - print(" {1}-{0}".format(wav_ids[i], "perturbed"+str(j+1)), end="", file=g) + print(" {0}-{1}".format(wav_ids[i], "perturbed"+str(j+1)), end="", file=g) # generate range file # format: wav_t_start:wav_t_end:noise_name:noise_t_start:noise_t_end:snr, diff --git a/egs/wsj/s5/steps/nnet3/fvector/generate_perturb_wav_specifier.py b/egs/wsj/s5/steps/nnet3/fvector/generate_perturb_wav_specifier.py index 58892782f05..fc49a6bc4df 100755 --- a/egs/wsj/s5/steps/nnet3/fvector/generate_perturb_wav_specifier.py +++ b/egs/wsj/s5/steps/nnet3/fvector/generate_perturb_wav_specifier.py @@ -88,7 +88,7 @@ for i in range(1, len(wav_list)): current_perturbed_wav_id = wav_list[i] current_perturbed_wav_index = perturbed_range_ids.index(current_perturbed_wav_id) - print('''{0} {1} nnet3-fvector-perturb-signal --noise-scp=scp:{2} --noise=\"{3}\" - |'''.format( + print('''{0} {1} nnet3-fvector-perturb-signal --noise=scp:{2} --noise-range=\"{3}\" - - |'''.format( current_perturbed_wav_id, wav_extended_files[current_wav_index], args.noise, diff --git a/src/fvectorbin/Makefile b/src/fvectorbin/Makefile index 73c81a4bbb6..48709027de1 100644 --- a/src/fvectorbin/Makefile +++ b/src/fvectorbin/Makefile @@ -6,7 +6,7 @@ include ../kaldi.mk LDFLAGS += $(CUDA_LDFLAGS) LDLIBS += $(CUDA_LDLIBS) -BINFILES = nnet3-fvector-get-egs nnet3-fvector-perturb-signal +BINFILES = nnet3-fvector-get-egs nnet3-fvector-perturb-signal nnet3-fvector-get-egs-simple OBJFILES = diff --git a/src/fvectorbin/nnet3-fvector-get-egs.cc 
b/src/fvectorbin/nnet3-fvector-get-egs.cc index dd05e1efe56..4e3179eb2d5 100644 --- a/src/fvectorbin/nnet3-fvector-get-egs.cc +++ b/src/fvectorbin/nnet3-fvector-get-egs.cc @@ -158,21 +158,26 @@ int main(int argc, char *argv[]) { << " is not found."; continue; } - const Matrix &feats_a = feature_reader.Value(pair->utt_a); - const Matrix &feats_b = feature_reader.Value(pair->utt_b); + const Matrix feats_a = feature_reader.Value(pair->utt_a); + const Matrix feats_b = feature_reader.Value(pair->utt_b); int32 num_rows = feats_a.NumRows(), feat_dim = feats_a.NumCols(); - if (num_rows < (pair->start_frame + pair->num_frames)) { + if (std::min(num_rows, feats_b.NumRows()) < pair->num_frames) { num_error++; KALDI_WARN << "Unable to create examples for utterance " << pair->pair_name - << ". Requested chunk boundary is the " - << (pair->start_frame + pair->num_frames) - << "th frmae, but utterance has only " << num_rows << " frames."; + << ". Requested chunk size is " + << pair->num_frames + << ", but the utterances have only " << num_rows << " and " << feats_b.NumRows() << " frames."; continue; } else { - SubMatrix chunk1(feats_a, pair->start_frame, + // As the utt2len file is not the exact frames of a utterance, so the + // requested chunk positions are approximate. It's possible that they + // slightly exceed the number of frames in either utterance of the pair. + // If that occurs, we can shift the chunks location back slightly. 
+ int32 shift = std::min(0, std::min(num_rows, feats_b.NumRows()) - pair->start_frame - pair->num_frames); + SubMatrix chunk1(feats_a, pair->start_frame + shift, pair->num_frames, 0, feat_dim), - chunk2(feats_b, pair->start_frame, + chunk2(feats_b, pair->start_frame + shift, pair->num_frames, 0, feat_dim); NnetIo nnet_io1 = NnetIo("input", 0, chunk1), nnet_io2 = NnetIo("input", 0, chunk2); diff --git a/src/fvectorbin/nnet3-fvector-perturb-signal.cc b/src/fvectorbin/nnet3-fvector-perturb-signal.cc index 52992c173c0..a4459ce6d50 100644 --- a/src/fvectorbin/nnet3-fvector-perturb-signal.cc +++ b/src/fvectorbin/nnet3-fvector-perturb-signal.cc @@ -61,10 +61,10 @@ void GenerateController(std::vector &segments, } void ApplyNoise(std::string &noise_scp, const std::vector &controller, - const VectorBase &input_wav, VectorBase *perturbed_wav) { + const VectorBase &input_wav, const int &samp_freq_input, + VectorBase *perturbed_wav) { // about noise list RandomAccessTableReader noise_reader(noise_scp); - int samp_freq_input = input_wav.Dim(); // add noise @@ -83,7 +83,35 @@ void ApplyNoise(std::string &noise_scp, const std::vector &c int32 noise_start_point = samp_freq_noise * controller[i].noise_t_start; int32 noise_end_point = samp_freq_noise * controller[i].noise_t_end - 1; BaseFloat snr = controller[i].snr; + // This part is used to deal with the precision problem. + // e.g. If the wav_t_start = 259.49, the sample frequency is 8000. In theory, + // the wav_start_point is 2075920, however, it will be 2075919 in practice. 
+ int32 input_length = input_end_point - input_start_point + 1; + int32 noise_length = noise_end_point - noise_start_point + 1; + if (input_length != noise_length) { + int32 delta = (input_length > noise_length?(input_length - noise_length) + :(noise_length-input_length)); + if (delta < 0.01*samp_freq_input) { + if (input_length > noise_length) { + input_end_point = input_end_point - delta; + } else { + noise_end_point = noise_end_point - delta; + } + } else { + KALDI_ERR << "There is a problem about input length does not match noise length" + << " where the noise-id is: " << controller[i].noise_uttid + << ", the input length is: " << input_length + << ", the noise length is: " << noise_length << std::endl; + } + } + // End sample must be less than total number + if ((input_end_point > input_wav.Dim()-1) || (noise_end_point > noise.Dim()-1)) { + int32 over_boundary = ((input_end_point - input_wav.Dim() + 1) > (noise_end_point - noise.Dim() + 1) ? + (input_end_point - input_wav.Dim() + 1) : (noise_end_point - noise.Dim() + 1)); + input_end_point = input_end_point - over_boundary; + noise_end_point = noise_end_point - over_boundary; + } // The input vector and noise vector contain the whole content of utt seperately. // According to the AdditiveNoiseRange, we stepwise add the additive noise to input. // To save the space, we use Subvector, because it returns the pointer. 
@@ -159,18 +187,19 @@ int main(int argc, char *argv[]) { // Generate the Noise Controller list std::vector controller; if (!noise_range.empty()) { - int index = noise_range.find_first_of(" "); - std::string perturbed_utt_id = noise_range.substr(0, index); - std::string noise_range_content = noise_range.substr(index+1); + //int index = noise_range.find_first_of(" "); + //std::string perturbed_utt_id = noise_range.substr(0, index); + //std::string noise_range_content = noise_range.substr(index+1); std::vector segments; - SplitStringToVector(noise_range_content, ",", true, &segments); + SplitStringToVector(noise_range, ",", true, &segments); GenerateController(segments, &controller); } + bool binary = true; WaveData input_wave; { WaveHolder waveholder; - Input ki(input_wave_file); + Input ki(input_wave_file, &binary); waveholder.Read(ki.Stream()); input_wave = waveholder.Value(); } @@ -189,14 +218,14 @@ int main(int argc, char *argv[]) { // new output vector and add noise Vector output(input); - ApplyNoise(noise, controller, input, &output); + ApplyNoise(noise, controller, input, samp_freq_input, &output); Matrix out_matrix(1, num_samp_input); out_matrix.CopyRowsFromVec(output); WaveData out_wave(samp_freq_input, out_matrix); - Output ko(output_wave_file, false); - out_wave.Write(ko.Stream()); + Output ko(output_wave_file, binary, false); + WaveHolder::Write(ko.Stream(), true, out_wave); return 0; } catch(const std::exception &e) { diff --git a/tools/config/common_path.sh b/tools/config/common_path.sh index 5534cf2d13b..f95c5acf8e6 100644 --- a/tools/config/common_path.sh +++ b/tools/config/common_path.sh @@ -21,4 +21,5 @@ ${KALDI_ROOT}/src/onlinebin:\ ${KALDI_ROOT}/src/sgmm2bin:\ ${KALDI_ROOT}/src/sgmmbin:\ ${KALDI_ROOT}/src/xvectorbin:\ +${KALDI_ROOT}/src/fvectorbin:\ $PATH