From 715f219415ee4e36403f354f8f1b86a0b5327d4d Mon Sep 17 00:00:00 2001
From: Pegita
Date: Tue, 13 Feb 2018 18:06:30 -0500
Subject: [PATCH 1/6] added new functions to accept NnetExample in
 nnet-chain-training.cc.
---
 src/chain/chain-supervision.cc | 31 ++++++
 src/chain/chain-supervision.h | 4 +
 src/chain/chain-training.cc | 51 ++++++++++
 src/chain/chain-training.h | 12 ++-
 src/chainbin/Makefile | 2 +-
 src/chainbin/nnet3-chain-get-egs.cc | 4 +-
 src/latbin/lattice-1best.cc | 6 +-
 src/latbin/lattice-to-fst.cc | 150 +++++++++++++++++++++++-----
 src/nnet3/nnet-chain-training.cc | 116 ++++++++++++++++++++-
 src/nnet3/nnet-chain-training.h | 10 ++
 src/nnet3/nnet-example-utils.cc | 50 +++++++++-
 src/nnet3/nnet-example-utils.h | 19 ++++
 src/nnet3bin/nnet3-get-egs.cc | 6 +-
 13 files changed, 422 insertions(+), 39 deletions(-)

diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc
index b5597b15667..7d87201dfdd 100644
--- a/src/chain/chain-supervision.cc
+++ b/src/chain/chain-supervision.cc
@@ -650,6 +650,37 @@ void AppendSupervision(const std::vector<const Supervision*> &input,
   }
 }

+bool AddWeightToFst(const fst::StdVectorFst &normalization_fst,
+                    fst::StdVectorFst *supervision_fst) {
+  // remove epsilons before composing. 'normalization_fst' has no epsilons so
+  // the composed result will be epsilon-free.
+  fst::StdVectorFst supervision_fst_noeps(*supervision_fst);
+  fst::RmEpsilon(&supervision_fst_noeps);
+  if (!TryDeterminizeMinimize(kSupervisionMaxStates,
+                              &supervision_fst_noeps))
+    return false;
+
+  // note: by default, 'Compose' will call 'Connect', so if the
+  // resulting FST is not connected, it will end up empty.
+  fst::StdVectorFst composed_fst;
+  fst::Compose(supervision_fst_noeps, normalization_fst,
+               &composed_fst);
+  if (composed_fst.NumStates() == 0)
+    return false;
+  // projection should not be necessary, as both FSTs are acceptors.
+  // determinize and minimize to make it as compact as possible.
+
+  if (!TryDeterminizeMinimize(kSupervisionMaxStates,
+                              &composed_fst))
+    return false;
+  *supervision_fst = composed_fst;
+  // Make sure the states are numbered in increasing order of time.
+  SortBreadthFirstSearch(supervision_fst);
+  KALDI_ASSERT(supervision_fst->Properties(fst::kAcceptor, true) == fst::kAcceptor);
+  KALDI_ASSERT(supervision_fst->Properties(fst::kIEpsilons, true) == 0);
+  return true;
+}
+
 bool AddWeightToSupervisionFst(const fst::StdVectorFst &normalization_fst,
                                Supervision *supervision) {
   // remove epsilons before composing.
'normalization_fst' has no epsilons so
diff --git a/src/chain/chain-supervision.h b/src/chain/chain-supervision.h
index a94f68ade90..c54d4770aa0 100644
--- a/src/chain/chain-supervision.h
+++ b/src/chain/chain-supervision.h
@@ -323,6 +323,10 @@ class SupervisionSplitter {
 bool AddWeightToSupervisionFst(const fst::StdVectorFst &normalization_fst,
                                Supervision *supervision);
+
+bool AddWeightToFst(const fst::StdVectorFst &normalization_fst,
+                    fst::StdVectorFst *supervision_fst);
+
 /// Assuming the 'fst' is epsilon-free, connected, and has the property that all
 /// paths from the start-state are of the same length, output a vector
 /// containing that length (from the start-state to the current state) to
diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc
index 53de69a0e07..40108636da0 100644
--- a/src/chain/chain-training.cc
+++ b/src/chain/chain-training.cc
@@ -25,6 +25,57 @@ namespace kaldi {
 namespace chain {

+void ComputeObjfAndDeriv2(const ChainTrainingOptions &opts,
+                          const DenominatorGraph &den_graph,
+                          const GeneralMatrix &supervision,
+                          const CuMatrixBase<BaseFloat> &nnet_output,
+                          BaseFloat *objf,
+                          BaseFloat *l2_term,
+                          BaseFloat *weight,
+                          CuMatrixBase<BaseFloat> *nnet_output_deriv,
+                          CuMatrixBase<BaseFloat> *xent_output_deriv) {
+  if (nnet_output_deriv) {
+    nnet_output_deriv->SetZero();
+    nnet_output_deriv->CopyFromMat(supervision.GetFullMatrix());
+    if (xent_output_deriv)
+      xent_output_deriv->CopyFromMat(*nnet_output_deriv);
+  } else if (xent_output_deriv) {
+    // this branch will be taken if xent_output_deriv but not
+    // nnet_output_deriv is set, which could happen if you want to compute the
+    // cross-entropy objective but not the derivatives.
+    xent_output_deriv->SetZero();
+    xent_output_deriv->CopyFromMat(supervision.GetFullMatrix());
+  }
+  int32 num_sequences = 64,
+      frames_per_sequence = 150;
+  BaseFloat sup_weight = 1.0;
+  DenominatorComputation denominator(opts, den_graph,
+                                     num_sequences,
+                                     nnet_output);
+  BaseFloat den_logprob = denominator.Forward();
+  bool ok = true;
+  if (nnet_output_deriv)
+    ok = denominator.Backward(-sup_weight, nnet_output_deriv);
+  // we don't consider the log-prob w.r.t. the numerator.
+  *objf = -sup_weight * den_logprob;
+  *weight = sup_weight * num_sequences * frames_per_sequence;
+
+  if (!((*objf) - (*objf) == 0) || !ok) {
+    // inf or NaN detected, or denominator computation returned false.
+ if (nnet_output_deriv) + nnet_output_deriv->SetZero(); + if (xent_output_deriv) + xent_output_deriv->SetZero(); + BaseFloat default_objf = -10; + KALDI_WARN << "Objective function is " << (*objf) + << " and denominator computation (if done) returned " + << std::boolalpha << ok + << ", setting objective function to " << default_objf + << " per frame."; + *objf = default_objf * *weight; + } +} + void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, const DenominatorGraph &den_graph, const Supervision &supervision, diff --git a/src/chain/chain-training.h b/src/chain/chain-training.h index e6143d10846..8c276a4854f 100644 --- a/src/chain/chain-training.h +++ b/src/chain/chain-training.h @@ -63,7 +63,7 @@ struct ChainTrainingOptions { ChainTrainingOptions(): l2_regularize(0.0), leaky_hmm_coefficient(1.0e-05), xent_regularize(0.0) { } - + void Register(OptionsItf *opts) { opts->Register("l2-regularize", &l2_regularize, "l2 regularization " "constant for 'chain' training, applied to the output " @@ -121,8 +121,16 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, BaseFloat *weight, CuMatrixBase *nnet_output_deriv, CuMatrixBase *xent_output_deriv = NULL); - +void ComputeObjfAndDeriv2(const ChainTrainingOptions &opts, + const DenominatorGraph &den_graph, + const GeneralMatrix &supervision, + const CuMatrixBase &nnet_output, + BaseFloat *objf, + BaseFloat *l2_term, + BaseFloat *weight, + CuMatrixBase *nnet_output_deriv, + CuMatrixBase *xent_output_deriv = NULL); } // namespace chain } // namespace kaldi diff --git a/src/chainbin/Makefile b/src/chainbin/Makefile index 096040000eb..2ee87d7ec33 100644 --- a/src/chainbin/Makefile +++ b/src/chainbin/Makefile @@ -7,7 +7,7 @@ LDFLAGS += $(CUDA_LDFLAGS) LDLIBS += $(CUDA_LDLIBS) BINFILES = chain-est-phone-lm chain-get-supervision chain-make-den-fst \ - nnet3-chain-get-egs nnet3-chain-copy-egs nnet3-chain-merge-egs \ + nnet3-chain-get-egs nnet3-chain-get-egs-post nnet3-chain-copy-egs nnet3-chain-merge-egs \ nnet3-chain-shuffle-egs nnet3-chain-subset-egs \ nnet3-chain-acc-lda-stats nnet3-chain-train nnet3-chain-compute-prob \ nnet3-chain-combine nnet3-chain-normalize-egs diff --git a/src/chainbin/nnet3-chain-get-egs.cc b/src/chainbin/nnet3-chain-get-egs.cc index c8c251900ec..206921771c8 100644 --- a/src/chainbin/nnet3-chain-get-egs.cc +++ b/src/chainbin/nnet3-chain-get-egs.cc @@ -42,6 +42,7 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, const GeneralMatrix &feats, const MatrixBase *ivector_feats, int32 ivector_period, + const Lattice &lattice, const chain::Supervision &supervision, const std::string &utt_id, bool compress, @@ -278,12 +279,13 @@ int main(int argc, char *argv[]) { num_err++; continue; } - + /* if (!ProcessFile(normalization_fst, feats, online_ivector_feats, online_ivector_period, supervision, key, compress, &utt_splitter, &example_writer)) num_err++; + */ } } if (num_err > 0) diff --git a/src/latbin/lattice-1best.cc b/src/latbin/lattice-1best.cc index f6723687790..f325cb3016e 100644 --- a/src/latbin/lattice-1best.cc +++ b/src/latbin/lattice-1best.cc @@ -61,9 +61,9 @@ int main(int argc, char *argv[]) { lats_wspecifier = po.GetArg(2); SequentialCompactLatticeReader clat_reader(lats_rspecifier); - + // Write as compact lattice. 
- CompactLatticeWriter compact_1best_writer(lats_wspecifier); + CompactLatticeWriter compact_1best_writer(lats_wspecifier); int32 n_done = 0, n_err = 0; @@ -77,7 +77,7 @@ int main(int argc, char *argv[]) { CompactLattice best_path; CompactLatticeShortestPath(clat, &best_path); - + if (best_path.Start() == fst::kNoStateId) { KALDI_WARN << "Possibly empty lattice for utterance-id " << key << "(no output)"; diff --git a/src/latbin/lattice-to-fst.cc b/src/latbin/lattice-to-fst.cc index 0d2ac29a99b..19f8bf453c1 100644 --- a/src/latbin/lattice-to-fst.cc +++ b/src/latbin/lattice-to-fst.cc @@ -22,6 +22,50 @@ #include "util/common-utils.h" #include "fstext/fstext-lib.h" #include "lat/kaldi-lattice.h" +#include "hmm/transition-model.h" + +namespace kaldi { + +void ConvertLatticeToPdfLabels( + const TransitionModel &tmodel, + const Lattice &ifst, + fst::StdVectorFst *ofst) { + typedef fst::ArcTpl ArcIn; + typedef fst::StdArc ArcOut; + typedef ArcIn::StateId StateId; + ofst->DeleteStates(); + // The states will be numbered exactly the same as the original FST. + // Add the states to the new FST. + StateId num_states = ifst.NumStates(); + for (StateId s = 0; s < num_states; s++) { + StateId news = ofst->AddState(); + assert(news == s); + } + ofst->SetStart(ifst.Start()); + for (StateId s = 0; s < num_states; s++) { + LatticeWeight final_iweight = ifst.Final(s); + if (final_iweight != LatticeWeight::Zero()) { + fst::TropicalWeight final_oweight; + ConvertLatticeWeight(final_iweight, &final_oweight); + ofst->SetFinal(s, final_oweight); + } + for (fst::ArcIterator iter(ifst, s); + !iter.Done(); + iter.Next()) { + ArcIn arc = iter.Value(); + KALDI_PARANOID_ASSERT(arc.weight != LatticeWeight::Zero()); + ArcOut oarc; + ConvertLatticeWeight(arc.weight, &oarc.weight); + oarc.ilabel = tmodel.TransitionIdToPdf(arc.ilabel) + 1; + oarc.olabel = arc.olabel; + oarc.nextstate = arc.nextstate; + ofst->AddArc(s, oarc); + } + } +} + +} + int main(int argc, char *argv[]) { try { @@ -34,20 +78,33 @@ int main(int argc, char *argv[]) { using std::vector; BaseFloat acoustic_scale = 0.0; BaseFloat lm_scale = 0.0; - bool rm_eps = true; - + bool rm_eps = true, read_compact = true, convert_to_pdf_labels = false; + std::string trans_model; + bool project_input = false, project_output = true; + const char *usage = "Turn lattices into normal FSTs, retaining only the word labels\n" "By default, removes all weights and also epsilons (configure with\n" "with --acoustic-scale, --lm-scale and --rm-eps)\n" "Usage: lattice-to-fst [options] lattice-rspecifier fsts-wspecifier\n" " e.g.: lattice-to-fst ark:1.lats ark:1.fsts\n"; - + ParseOptions po(usage); + po.Register("read-compact", &read_compact, "Read compact lattice"); po.Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic likelihoods"); po.Register("lm-scale", &lm_scale, "Scaling factor for graph/lm costs"); po.Register("rm-eps", &rm_eps, "Remove epsilons in resulting FSTs (in lazy way; may not remove all)"); - + po.Register("convert-to-pdf-labels", &convert_to_pdf_labels, + "Convert lattice to pdf labels"); + po.Register("trans-model", &trans_model, + "Transition model"); + po.Register("project-input", &project_input, + "Project to input labels (transition-ids); applicable only " + "when --read-compact=false"); + po.Register("project-output", &project_output, + "Project to output labels (transition-ids); applicable only " + "when --read-compact=false"); + po.Read(argc, argv); if (po.NumArgs() != 2) { @@ -56,35 +113,74 @@ int main(int argc, char *argv[]) { } vector 
> scale = fst::LatticeScale(lm_scale, acoustic_scale); - + std::string lats_rspecifier = po.GetArg(1), fsts_wspecifier = po.GetArg(2); - - SequentialCompactLatticeReader lattice_reader(lats_rspecifier); + + TransitionModel tmodel; + if (!trans_model.empty()) { + ReadKaldiObject(trans_model, &tmodel); + } + + SequentialCompactLatticeReader compact_lattice_reader; + SequentialLatticeReader lattice_reader; + TableWriter fst_writer(fsts_wspecifier); - + int32 n_done = 0; // there is no failure mode, barring a crash. - for (; !lattice_reader.Done(); lattice_reader.Next()) { - std::string key = lattice_reader.Key(); - CompactLattice clat = lattice_reader.Value(); - lattice_reader.FreeCurrent(); - ScaleLattice(scale, &clat); // typically scales to zero. - RemoveAlignmentsFromCompactLattice(&clat); // remove the alignments... - fst::VectorFst fst; - { - Lattice lat; - ConvertLattice(clat, &lat); // convert to non-compact form.. won't introduce - // extra states because already removed alignments. - ConvertLattice(lat, &fst); // this adds up the (lm,acoustic) costs to get - // the normal (tropical) costs. - Project(&fst, fst::PROJECT_OUTPUT); // Because in the standard Lattice format, - // the words are on the output, and we want the word labels. + if (read_compact) { + SequentialCompactLatticeReader compact_lattice_reader(lats_rspecifier); + for (; !compact_lattice_reader.Done(); compact_lattice_reader.Next()) { + std::string key = compact_lattice_reader.Key(); + CompactLattice clat = compact_lattice_reader.Value(); + compact_lattice_reader.FreeCurrent(); + ScaleLattice(scale, &clat); // typically scales to zero. + RemoveAlignmentsFromCompactLattice(&clat); // remove the alignments... + fst::VectorFst fst; + { + Lattice lat; + ConvertLattice(clat, &lat); // convert to non-compact form.. won't introduce + // extra states because already removed alignments. + + if (convert_to_pdf_labels) { + ConvertLatticeToPdfLabels(tmodel, lat, &fst); // this adds up the (lm,acoustic) costs to get + // the normal (tropical) costs. + } else { + ConvertLattice(lat, &fst); + } + + Project(&fst, fst::PROJECT_OUTPUT); // Because in the standard compact_lattice format, + // the words are on the output, and we want the word labels. + } + if (rm_eps) RemoveEpsLocal(&fst); + + fst_writer.Write(key, fst); + n_done++; + } + } else { + SequentialLatticeReader lattice_reader(lats_rspecifier); + for (; !lattice_reader.Done(); lattice_reader.Next()) { + std::string key = lattice_reader.Key(); + Lattice lat = lattice_reader.Value(); + lattice_reader.FreeCurrent(); + ScaleLattice(scale, &lat); // typically scales to zero. + fst::VectorFst fst; + if (convert_to_pdf_labels) { + ConvertLatticeToPdfLabels(tmodel, lat, &fst); + } else { + ConvertLattice(lat, &fst); + } + if (project_input) + Project(&fst, fst::PROJECT_INPUT); + else if (project_output) + Project(&fst, fst::PROJECT_OUTPUT); + if (rm_eps) RemoveEpsLocal(&fst); + + fst_writer.Write(key, fst); + n_done++; } - if (rm_eps) RemoveEpsLocal(&fst); - - fst_writer.Write(key, fst); - n_done++; + } KALDI_LOG << "Done converting " << n_done << " lattices to word-level FSTs"; return (n_done != 0 ? 
0 : 1); diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index 780a7115a8a..4c799ea96c3 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -57,6 +57,22 @@ NnetChainTrainer::NnetChainTrainer(const NnetChainTrainingOptions &opts, } } +void NnetChainTrainer::Train(const NnetExample &eg) { + bool need_model_derivative = true; + const NnetTrainerOptions &nnet_config = opts_.nnet_config; + bool use_xent_regularization = (opts_.chain_config.xent_regularize != 0.0); + ComputationRequest request; + GetComputationRequest(*nnet_, eg, need_model_derivative, + nnet_config.store_component_stats, + use_xent_regularization, need_model_derivative, + &request); + const NnetComputation *computation = compiler_.Compile(request); + + // conventional training + TrainInternal(eg, *computation); + + num_minibatches_processed_++; +} void NnetChainTrainer::Train(const NnetChainExample &chain_eg) { bool need_model_derivative = true; @@ -91,6 +107,41 @@ void NnetChainTrainer::Train(const NnetChainExample &chain_eg) { num_minibatches_processed_++; } +void NnetChainTrainer::TrainInternal(const NnetExample &eg, + const NnetComputation &computation) { + const NnetTrainerOptions &nnet_config = opts_.nnet_config; + NnetComputer computer(nnet_config.compute_config, computation, + *nnet_, delta_nnet_); + // give the inputs to the computer object + computer.AcceptInputs(*nnet_, eg.io); + computer.Run(); + + this->ProcessOutputs(eg, &computer); + computer.Run(); + + // If relevant, add in the part of the gradient that comes from L2 + // regularization. + ApplyL2Regularization(*nnet_, + GetNumNvalues(eg.io, false) * + nnet_config.l2_regularize_factor, + delta_nnet_); + + // Updates the parameters of nnet + bool success = UpdateNnetWithMaxChange(*delta_nnet_, + nnet_config.max_param_change, 1.0, 1.0 - nnet_config.momentum, nnet_, + &num_max_change_per_component_applied_, &num_max_change_global_applied_); + + // Scale down the batchnorm stats (keeps them fresh... this affects what + // happens when we use the model with batchnorm test-mode set). + ScaleBatchnormStats(nnet_config.batchnorm_stats_scale, nnet_); + + // Scale delta_nnet + if (success) + ScaleNnet(nnet_config.momentum, delta_nnet_); + else + ScaleNnet(0.0, delta_nnet_); +} + void NnetChainTrainer::TrainInternal(const NnetChainExample &eg, const NnetComputation &computation) { const NnetTrainerOptions &nnet_config = opts_.nnet_config; @@ -170,6 +221,69 @@ void NnetChainTrainer::TrainInternalBackstitch(const NnetChainExample &eg, ScaleNnet(0.0, delta_nnet_); } +void NnetChainTrainer::ProcessOutputs(const NnetExample &eg, + NnetComputer *computer) { + std::vector::const_iterator iter = eg.io.begin(), + end = eg.io.end(); + for (; iter != end; ++iter) { + const NnetIo &io = *iter; + int32 node_index = nnet_->GetNodeIndex(io.name); + KALDI_ASSERT(node_index >= 0); + if (nnet_->IsOutputNode(node_index)) { + const CuMatrixBase &nnet_output = computer->GetOutput(io.name); + CuMatrix nnet_output_deriv(nnet_output.NumRows(), + nnet_output.NumCols(), + kUndefined); + bool use_xent = (opts_.chain_config.xent_regularize != 0.0); + std::string xent_name = io.name + "-xent"; // typically "output-xent". 
+      CuMatrix<BaseFloat> xent_deriv;
+      if (use_xent)
+        xent_deriv.Resize(nnet_output.NumRows(), nnet_output.NumCols(),
+                          kUndefined);
+
+      BaseFloat tot_objf, tot_l2_term, tot_weight;
+
+      ComputeObjfAndDeriv2(opts_.chain_config, den_graph_,
+                           io.features,
+                           nnet_output,
+                           &tot_objf, &tot_l2_term, &tot_weight,
+                           &nnet_output_deriv,
+                           (use_xent ? &xent_deriv : NULL));
+      if (use_xent) {
+        // this block computes the cross-entropy objective.
+        const CuMatrixBase<BaseFloat> &xent_output = computer->GetOutput(
+            xent_name);
+        // at this point, xent_deriv is posteriors derived from the numerator
+        // computation. note, xent_objf has a factor of '.supervision.weight'
+        CuMatrix<BaseFloat> cu_post(io.features.GetFullMatrix());
+        BaseFloat xent_objf = TraceMatMat(xent_output, cu_post, kTrans);
+        objf_info_[xent_name].UpdateStats(xent_name,
+                                          opts_.nnet_config.print_interval,
+                                          num_minibatches_processed_,
+                                          tot_weight, xent_objf);
+      }
+
+      //if (opts_.apply_deriv_weights && sup.deriv_weights.Dim() != 0) {
+      if (opts_.apply_deriv_weights) {
+        CuVector<BaseFloat> cu_deriv_weights;
+        nnet_output_deriv.MulRowsVec(cu_deriv_weights);
+        if (use_xent)
+          xent_deriv.MulRowsVec(cu_deriv_weights);
+      }
+      computer->AcceptInput(io.name, &nnet_output_deriv);
+
+      objf_info_[io.name].UpdateStats(io.name,
+                                      opts_.nnet_config.print_interval,
+                                      num_minibatches_processed_,
+                                      tot_weight, tot_objf, tot_l2_term);
+      if (use_xent) {
+        xent_deriv.Scale(opts_.chain_config.xent_regularize);
+        computer->AcceptInput(xent_name, &xent_deriv);
+      }
+    }
+  }
+}
+
 void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2,
                                       const NnetChainExample &eg,
                                       NnetComputer *computer) {
@@ -214,7 +328,7 @@ void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2,
       // at this point, xent_deriv is posteriors derived from the numerator
       // computation. note, xent_objf has a factor of '.supervision.weight'
       BaseFloat xent_objf = TraceMatMat(xent_output, xent_deriv, kTrans);
-      objf_info_[xent_name + suffix].UpdateStats(xent_name + suffix,
+      objf_info_[xent_name].UpdateStats(xent_name,
                                         opts_.nnet_config.print_interval,
                                         num_minibatches_processed_,
                                         tot_weight, xent_objf);
diff --git a/src/nnet3/nnet-chain-training.h b/src/nnet3/nnet-chain-training.h
index 5bf6a3f6fce..6e9bbe57ef1 100644
--- a/src/nnet3/nnet-chain-training.h
+++ b/src/nnet3/nnet-chain-training.h
@@ -61,6 +61,9 @@ class NnetChainTrainer {
   // train on one minibatch.
   void Train(const NnetChainExample &eg);
+
+  // train on one minibatch using NnetExample
+  void Train(const NnetExample &eg);
+
   // Prints out the final stats, and return true if there was a nonzero count.
   bool PrintTotalStats() const;
@@ -74,6 +77,10 @@ class NnetChainTrainer {
   void TrainInternal(const NnetChainExample &eg,
                      const NnetComputation &computation);
+
+  // The internal function for doing one step of conventional SGD training.
+  void TrainInternal(const NnetExample &eg,
+                     const NnetComputation &computation);
+
   // The internal function for doing one step of backstitch training. Depending
   // on whether is_backstitch_step1 is true, it could be either the first
   // (backward) step, or the second (forward) step of backstitch.
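(A rough sketch, not part of this patch, of how the new NnetExample-based path is intended to be driven, assuming an already-constructed NnetChainTrainer named 'trainer' and merged egs; the reader and rspecifier names here are illustrative:

    SequentialNnetExampleReader example_reader(examples_rspecifier);
    for (; !example_reader.Done(); example_reader.Next())
      trainer.Train(example_reader.Value());  // dispatches to the new NnetExample overload
    trainer.PrintTotalStats();  // prints the accumulated objective stats
)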
@@ -84,6 +91,9 @@ class NnetChainTrainer {
   void ProcessOutputs(bool is_backstitch_step2, const NnetChainExample &eg,
                       NnetComputer *computer);
+
+  void ProcessOutputs(const NnetExample &eg,
+                      NnetComputer *computer);
+
   const NnetChainTrainingOptions opts_;

   chain::DenominatorGraph den_graph_;
diff --git a/src/nnet3/nnet-example-utils.cc b/src/nnet3/nnet-example-utils.cc
index 65df0c891c1..2151e06bbb4 100644
--- a/src/nnet3/nnet-example-utils.cc
+++ b/src/nnet3/nnet-example-utils.cc
@@ -198,6 +198,54 @@ void ShiftExampleTimes(int32 t_offset,
     }
   }
 }
+void GetComputationRequest(const Nnet &nnet,
+                           const NnetExample &eg,
+                           bool need_model_derivative,
+                           bool store_component_stats,
+                           bool use_xent_regularization,
+                           bool use_xent_derivative,
+                           ComputationRequest *request) {
+  request->inputs.clear();
+  request->inputs.reserve(eg.io.size());
+  request->outputs.clear();
+  request->outputs.reserve(eg.io.size() * 2);
+  request->need_model_derivative = need_model_derivative;
+  request->store_component_stats = store_component_stats;
+  for (size_t i = 0; i < eg.io.size(); i++) {
+    const NnetIo &io = eg.io[i];
+    const std::string &name = io.name;
+    int32 node_index = nnet.GetNodeIndex(name);
+    if (node_index == -1 ||
+        (!nnet.IsInputNode(node_index) && !nnet.IsOutputNode(node_index)))
+      KALDI_ERR << "Nnet example has input or output named '" << name
+                << "', but no such input or output node is in the network.";
+
+    std::vector<IoSpecification> &dest =
+        nnet.IsInputNode(node_index) ? request->inputs : request->outputs;
+    dest.resize(dest.size() + 1);
+    IoSpecification &io_spec = dest.back();
+    io_spec.name = name;
+    io_spec.indexes = io.indexes;
+    io_spec.has_deriv = nnet.IsOutputNode(node_index) && need_model_derivative;
+    if (use_xent_regularization && nnet.IsOutputNode(node_index)) {
+      size_t cur_size = request->outputs.size();
+      request->outputs.resize(cur_size + 1);
+      IoSpecification &io_spec = request->outputs[cur_size - 1],
+          &io_spec_xent = request->outputs[cur_size];
+      // the IoSpecification for the -xent output is the same
+      // as for the regular output, except for its name which has
+      // the -xent suffix (and the has_deriv member may differ).
+      io_spec_xent = io_spec;
+      io_spec_xent.name = name + "-xent";
+      io_spec_xent.has_deriv = use_xent_derivative;
+    }
+  }
+  // check to see if something went wrong.
+  if (request->inputs.empty())
+    KALDI_ERR << "No inputs in computation request.";
+  if (request->outputs.empty())
+    KALDI_ERR << "No outputs in computation request.";
+}

 void GetComputationRequest(const Nnet &nnet,
                            const NnetExample &eg,
@@ -207,7 +255,7 @@ void GetComputationRequest(const Nnet &nnet,
   request->inputs.clear();
   request->inputs.reserve(eg.io.size());
   request->outputs.clear();
-  request->outputs.reserve(eg.io.size());
+  request->outputs.reserve(eg.io.size() * 2);
   request->need_model_derivative = need_model_derivative;
   request->store_component_stats = store_component_stats;
   for (size_t i = 0; i < eg.io.size(); i++) {
diff --git a/src/nnet3/nnet-example-utils.h b/src/nnet3/nnet-example-utils.h
index 02620df7485..05f35fb44de 100644
--- a/src/nnet3/nnet-example-utils.h
+++ b/src/nnet3/nnet-example-utils.h
@@ -64,6 +64,25 @@ void GetComputationRequest(const Nnet &nnet,
                            ComputationRequest *computation_request);

+/** This function takes an NnetExample and produces a ComputationRequest.
+   It assumes you don't want the derivatives w.r.t. the input.
+
+   If use_xent_regularization == true, then it assumes that for each output
+   name (e.g.
"output" in the eg, there is another output with the same + dimension and with the suffix "-xent" on its name, e.g. named + "output-xent". The derivative w.r.t. the xent objective will only be + supplied to the nnet computation if 'use_xent_derivative' is true (we + propagate back the xent derivative to the model only in training, not in + model-combination in nnet3-chain-combine). +*/ +void GetComputationRequest(const Nnet &nnet, + const NnetExample &eg, + bool need_model_derivative, + bool store_component_stats, + bool use_xent_regularization, + bool use_xent_derivative, + ComputationRequest *computation_request); + // Writes as unsigned char a vector 'vec' that is required to have // values between 0 and 1. void WriteVectorAsChar(std::ostream &os, diff --git a/src/nnet3bin/nnet3-get-egs.cc b/src/nnet3bin/nnet3-get-egs.cc index cec9549541d..de7904a8d6c 100644 --- a/src/nnet3bin/nnet3-get-egs.cc +++ b/src/nnet3bin/nnet3-get-egs.cc @@ -166,7 +166,7 @@ int main(int argc, char *argv[]) { bool compress = true; int32 num_pdfs = -1, length_tolerance = 100, - targets_length_tolerance = 2, + targets_length_tolerance = 2, online_ivector_period = 1; ExampleGenerationConfig eg_config; // controls num-frames, @@ -192,7 +192,7 @@ int main(int argc, char *argv[]) { "--online-ivectors option"); po.Register("length-tolerance", &length_tolerance, "Tolerance for " "difference in num-frames between feat and ivector matrices"); - po.Register("targets-length-tolerance", &targets_length_tolerance, + po.Register("targets-length-tolerance", &targets_length_tolerance, "Tolerance for " "difference in num-frames (after subsampling) between " "feature matrix and posterior"); @@ -260,7 +260,7 @@ int main(int argc, char *argv[]) { } if (!ProcessFile(feats, online_ivector_feats, online_ivector_period, - pdf_post, key, compress, num_pdfs, + pdf_post, key, compress, num_pdfs, targets_length_tolerance, &utt_splitter, &example_writer)) num_err++; From ae22eece2f90d9d2d7ed4a37f7d92e17ca7b063e Mon Sep 17 00:00:00 2001 From: Pegita Date: Thu, 22 Feb 2018 16:40:51 -0500 Subject: [PATCH 2/6] fixed issues w.r.t comments (part 1). 
---
 src/chain/chain-training.cc | 3 +-
 src/chain/chain-training.h | 6 +-
 src/chainbin/nnet3-chain-get-egs-post.cc | 397 +++++++++++++++++++++++
 src/nnet3/nnet-chain-training.cc | 8 +-
 src/nnet3/nnet-example-utils.cc | 41 +--
 src/nnet3/nnet-example-utils.h | 16 +-
 6 files changed, 413 insertions(+), 58 deletions(-)
 create mode 100644 src/chainbin/nnet3-chain-get-egs-post.cc

diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc
index 40108636da0..38c72efe057 100644
--- a/src/chain/chain-training.cc
+++ b/src/chain/chain-training.cc
@@ -29,6 +29,7 @@ void ComputeObjfAndDeriv2(const ChainTrainingOptions &opts,
                           const DenominatorGraph &den_graph,
                           const GeneralMatrix &supervision,
                           const CuMatrixBase<BaseFloat> &nnet_output,
+                          int32 num_sequences, int32 frames_per_sequence,
                           BaseFloat *objf,
                           BaseFloat *l2_term,
                           BaseFloat *weight,
@@ -46,8 +47,6 @@ void ComputeObjfAndDeriv2(const ChainTrainingOptions &opts,
     xent_output_deriv->SetZero();
     xent_output_deriv->CopyFromMat(supervision.GetFullMatrix());
   }
-  int32 num_sequences = 64,
-      frames_per_sequence = 150;
   BaseFloat sup_weight = 1.0;
   DenominatorComputation denominator(opts, den_graph,
                                      num_sequences,
diff --git a/src/chain/chain-training.h b/src/chain/chain-training.h
index 8c276a4854f..5b9f43e04e8 100644
--- a/src/chain/chain-training.h
+++ b/src/chain/chain-training.h
@@ -121,11 +121,15 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts,
                               BaseFloat *weight,
                               CuMatrixBase<BaseFloat> *nnet_output_deriv,
                               CuMatrixBase<BaseFloat> *xent_output_deriv = NULL);
-
+/**
+   This function uses the supervision as the numerator and does the denominator
+   computation. It can be used where the numerator is fixed, e.g. in
+   teacher-student (TS) learning.
+*/
 void ComputeObjfAndDeriv2(const ChainTrainingOptions &opts,
                           const DenominatorGraph &den_graph,
                           const GeneralMatrix &supervision,
                           const CuMatrixBase<BaseFloat> &nnet_output,
+                          int32 num_sequences, int32 frames_per_sequence,
                           BaseFloat *objf,
                           BaseFloat *l2_term,
                           BaseFloat *weight,
diff --git a/src/chainbin/nnet3-chain-get-egs-post.cc b/src/chainbin/nnet3-chain-get-egs-post.cc
new file mode 100644
index 00000000000..9aa0eba0fb8
--- /dev/null
+++ b/src/chainbin/nnet3-chain-get-egs-post.cc
@@ -0,0 +1,397 @@
+// chainbin/nnet3-chain-get-egs-post.cc
+
+// Copyright 2015 Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sstream>
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "hmm/transition-model.h"
+#include "hmm/posterior.h"
+#include "nnet3/nnet-example.h"
+#include "nnet3/nnet-chain-example.h"
+#include "nnet3/nnet-example-utils.h"
+#include "lat/lattice-functions.h"
+#include "chain/chain-supervision.h"
+
+namespace kaldi {
+namespace nnet3 {
+
+/** This function scales the arc weights of an FST.
+*/
+void ScaleFst(BaseFloat scale,
+              fst::StdVectorFst *fst) {
+  typedef fst::StdArc StdArc;
+  typedef fst::StdArc::Weight Weight;
+  int32 num_states = fst->NumStates();
+  for (int32 s = 0; s < num_states; s++) {
+    for (fst::MutableArcIterator<fst::StdVectorFst> iter(fst, s);
+         !iter.Done(); iter.Next()) {
+      StdArc arc = iter.Value();
+      BaseFloat scaled_weight = scale * iter.Value().weight.Value();
+      arc.weight = scaled_weight;
+      iter.SetValue(arc);
+    }
+    // also scale the final weight, if this is a final state.
+    Weight final_weight = fst->Final(s);
+    if (final_weight != Weight::Zero())
+      fst->SetFinal(s, Weight(scale * final_weight.Value()));
+  }
+}
+
+/** This function converts a lattice to an FST, with each weight equal to the
+    weighted combination of its acoustic and language scores.
+*/
+void ConvertLatticeToPdfLabels(
+    const TransitionModel &tmodel,
+    const Lattice &ifst,
+    fst::StdVectorFst *ofst) {
+  typedef fst::ArcTpl<LatticeWeight> ArcIn;
+  typedef fst::StdArc ArcOut;
+  typedef ArcIn::StateId StateId;
+  ofst->DeleteStates();
+  // The states will be numbered exactly the same as the original FST.
+  // Add the states to the new FST.
+  StateId num_states = ifst.NumStates();
+  for (StateId s = 0; s < num_states; s++) {
+    StateId news = ofst->AddState();
+    assert(news == s);
+  }
+  ofst->SetStart(ifst.Start());
+  for (StateId s = 0; s < num_states; s++) {
+    LatticeWeight final_iweight = ifst.Final(s);
+    if (final_iweight != LatticeWeight::Zero()) {
+      fst::TropicalWeight final_oweight;
+      ConvertLatticeWeight(final_iweight, &final_oweight);
+      ofst->SetFinal(s, final_oweight);
+    }
+    for (fst::ArcIterator<Lattice> iter(ifst, s);
+         !iter.Done();
+         iter.Next()) {
+      ArcIn arc = iter.Value();
+      KALDI_PARANOID_ASSERT(arc.weight != LatticeWeight::Zero());
+      ArcOut oarc;
+      ConvertLatticeWeight(arc.weight, &oarc.weight);
+      oarc.ilabel = tmodel.TransitionIdToPdf(arc.ilabel) + 1;
+      oarc.olabel = tmodel.TransitionIdToPdf(arc.ilabel) + 1;
+      oarc.nextstate = arc.nextstate;
+      ofst->AddArc(s, oarc);
+    }
+  }
+}
+
+
+/**
+   This function does all the processing for one utterance, and outputs the
+   supervision objects to 'example_writer'. Note: if normalization_fst is the
+   empty FST (with no states), it skips the final stage of egs preparation and
+   you should do it later with nnet3-chain-normalize-egs.
+*/
+
+static bool ProcessFile(const fst::StdVectorFst &normalization_fst,
+                        const GeneralMatrix &feats,
+                        const MatrixBase<BaseFloat> *ivector_feats,
+                        int32 ivector_period,
+                        const Lattice &lat,
+                        int32 num_output_frames,
+                        const std::string &utt_id,
+                        bool compress,
+                        int32 num_pdfs,
+                        TransitionModel &tmodel,
+                        UtteranceSplitter *utt_splitter,
+                        NnetExampleWriter *example_writer) {
+  //KALDI_ASSERT(supervision.num_sequences == 1);
+  int32 num_input_frames = feats.NumRows();
+
+  if (!utt_splitter->LengthsMatch(utt_id, num_input_frames, num_output_frames))
+    return false; // LengthsMatch() will have printed a warning.
+
+  std::vector<ChunkTimeInfo> chunks;
+
+  utt_splitter->GetChunksForUtterance(num_input_frames, &chunks);
+
+  if (chunks.empty()) {
+    KALDI_WARN << "Not producing egs for utterance " << utt_id
+               << " because it is too short: "
+               << num_input_frames << " frames.";
+    return false;
+  }
+
+  int32 frame_subsampling_factor = utt_splitter->Config().frame_subsampling_factor;
+
+  fst::StdVectorFst sup_fst,
+      scaled_normalization_fst(normalization_fst);
+  ConvertLatticeToPdfLabels(tmodel, lat, &sup_fst);
+  ScaleFst(0.5, &scaled_normalization_fst); // scale the normalization FST so
+                                            // its weights are comparable to the
+                                            // combined lm and acoustic weights
+                                            // in sup_fst
+  if (normalization_fst.NumStates() > 0 &&
+      !chain::AddWeightToFst(scaled_normalization_fst, &sup_fst)) {
+    KALDI_WARN << "For utterance " << utt_id
+               << ", FST was empty after composing with normalization FST. "
+               << "This should be extremely rare (a few per corpus, at most)";
+  }
+
+  // Convert the FST to a lattice so we can extract posteriors using
+  // forward-backward.
+  Lattice sup_lat;
+  ConvertFstToLattice(sup_fst, &sup_lat);
+  Posterior pdf_post;
+  LatticeForwardBackward(sup_lat, &pdf_post);
+
+  for (size_t c = 0; c < chunks.size(); c++) {
+    ChunkTimeInfo &chunk = chunks[c];
+
+    int32 start_frame_subsampled = chunk.first_frame / frame_subsampling_factor,
+        num_frames_subsampled = chunk.num_frames / frame_subsampling_factor;
+
+
+    // Do we need to subtract 1 from the posteriors to convert them back to
+    // pdf-ids? Select the subset of the posterior corresponding to this
+    // chunk (a subset of the pdf-ids).
+    Posterior labels(num_frames_subsampled);
+    for (int i = 0; i < num_frames_subsampled; i++) {
+      int t = i + start_frame_subsampled;
+      if (t < pdf_post.size())
+        labels[i] = pdf_post[t];
+      //for (std::vector<std::pair<int32, BaseFloat> >::iterator
+      //    iter = labels[i].begin(); iter != labels[i].end(); ++iter)
+      //  iter->second *= chunk.output_weights[i];
+    }
+
+    int32 first_frame = 0; // we shift the time-indexes of all these parts so
+                           // that the supervised part starts from frame 0.
+
+    SubVector<BaseFloat> output_weights(
+        &(chunk.output_weights[0]),
+        static_cast<int32>(chunk.output_weights.size()));
+
+    NnetExample nnet_eg;
+    nnet_eg.io.push_back(NnetIo("output", num_pdfs, 0, labels));
+    nnet_eg.io.resize(ivector_feats != NULL ? 3 : 2);
+
+    int32 tot_input_frames = chunk.left_context + chunk.num_frames +
+        chunk.right_context,
+        start_frame = chunk.first_frame - chunk.left_context;
+
+    GeneralMatrix input_frames;
+    ExtractRowRangeWithPadding(feats, start_frame, tot_input_frames,
+                               &input_frames);
+
+    NnetIo input_io("input", -chunk.left_context, input_frames);
+    nnet_eg.io[0].Swap(&input_io);
+
+    if (ivector_feats != NULL) {
+      // if applicable, add the iVector feature.
+      // choose iVector from a random frame in the chunk
+      int32 ivector_frame = RandInt(start_frame,
+                                    start_frame + num_input_frames - 1),
+          ivector_frame_subsampled = ivector_frame / ivector_period;
+      if (ivector_frame_subsampled < 0)
+        ivector_frame_subsampled = 0;
+      if (ivector_frame_subsampled >= ivector_feats->NumRows())
+        ivector_frame_subsampled = ivector_feats->NumRows() - 1;
+      Matrix<BaseFloat> ivector(1, ivector_feats->NumCols());
+      ivector.Row(0).CopyFromVec(ivector_feats->Row(ivector_frame_subsampled));
+      NnetIo ivector_io("ivector", 0, ivector);
+      nnet_eg.io[2].Swap(&ivector_io);
+    }
+
+    if (compress)
+      nnet_eg.Compress();
+
+    std::ostringstream os;
+    os << utt_id << "-" << chunk.first_frame;
+
+    std::string key = os.str(); // key is <utt_id>-<frame_id>
+
+    example_writer->Write(key, nnet_eg);
+  }
+  return true;
+}
+
+} // namespace nnet3
+} // namespace kaldi
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace kaldi::nnet3;
+    using namespace kaldi::chain;
+    typedef kaldi::int32 int32;
+    typedef kaldi::int64 int64;
+
+    const char *usage =
+        "Get frame-by-frame examples of data for nnet3+chain neural network\n"
+        "training. This involves breaking up utterances into pieces of a\n"
+        "fixed size.\n"
+        "The input is a lattice, which will be transformed into a new lattice\n"
+        "with pdf labels. Then it will be composed with <normalization-fst>,\n"
+        "and forward-backward is done to get posteriors.\n"
+        "This egs generation can be used in a teacher-student learning setup,\n"
+        "where the lattice is extracted from the teacher network.\n"
+        "Note: if <normalization-fst> is not supplied the egs will not be\n"
+        "ready for training; in that case they should later be processed\n"
+        "with nnet3-chain-normalize-egs\n"
+        "\n"
+        "Usage: nnet3-chain-get-egs [options] [] "
+        " \n"
+        "\n"
+        "An example [where $feats expands to the actual features]:\n"
+        "chain-get-supervision [args] | \\\n"
+        " nnet3-chain-get-egs --left-context=25 --right-context=9 --num-frames=20 dir/normalization.fst \\\n"
+        " \"$feats\" ark,s,cs:- ark:cegs.1.ark\n"
+        "Note: the --frame-subsampling-factor option must be the same as given to\n"
+        "chain-get-supervision.\n";
+
+    bool compress = true;
+    int32 length_tolerance = 100, online_ivector_period = 1;
+
+    ExampleGenerationConfig eg_config; // controls num-frames,
+                                       // left/right-context, etc.
+
+    int32 srand_seed = 0, num_pdfs = -1;
+    std::string online_ivector_rspecifier,
+        trans_model;
+
+    ParseOptions po(usage);
+    po.Register("compress", &compress, "If true, write egs with input features "
+                "in compressed format (recommended).
Update: this is now "
+                "only relevant if the features being read are un-compressed; "
+                "if already compressed, we keep the same compressed format when "
+                "dumping egs.");
+    po.Register("ivectors", &online_ivector_rspecifier, "Alias for "
+                "--online-ivectors option, for back compatibility");
+    po.Register("online-ivectors", &online_ivector_rspecifier, "Rspecifier of "
+                "ivector features, as a matrix.");
+    po.Register("online-ivector-period", &online_ivector_period, "Number of "
+                "frames between iVectors in matrices supplied to the "
+                "--online-ivectors option");
+    po.Register("srand", &srand_seed, "Seed for random number generator ");
+    po.Register("length-tolerance", &length_tolerance, "Tolerance for "
+                "difference in num-frames between feat and ivector matrices");
+    po.Register("num-pdfs", &num_pdfs, "Number of pdfs in the acoustic "
+                "model");
+    po.Register("trans-model", &trans_model,
+                "Transition model");
+
+    eg_config.Register(&po);
+
+    po.Read(argc, argv);
+
+    srand(srand_seed);
+
+    if (po.NumArgs() < 3 || po.NumArgs() > 4) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    if (num_pdfs <= 0)
+      KALDI_ERR << "--num-pdfs option is required.";
+    TransitionModel tmodel;
+    if (!trans_model.empty())
+      ReadKaldiObject(trans_model, &tmodel);
+
+    std::string
+        normalization_fst_rxfilename,
+        feature_rspecifier,
+        lattice_rspecifier,
+        examples_wspecifier;
+    if (po.NumArgs() == 3) {
+      feature_rspecifier = po.GetArg(1);
+      lattice_rspecifier = po.GetArg(2);
+      examples_wspecifier = po.GetArg(3);
+    } else {
+      normalization_fst_rxfilename = po.GetArg(1);
+      KALDI_ASSERT(!normalization_fst_rxfilename.empty());
+      feature_rspecifier = po.GetArg(2);
+      lattice_rspecifier = po.GetArg(3);
+      examples_wspecifier = po.GetArg(4);
+    }
+
+    eg_config.ComputeDerived();
+    UtteranceSplitter utt_splitter(eg_config);
+
+    fst::StdVectorFst normalization_fst;
+    if (!normalization_fst_rxfilename.empty()) {
+      ReadFstKaldi(normalization_fst_rxfilename, &normalization_fst);
+      KALDI_ASSERT(normalization_fst.NumStates() > 0);
+    }
+
+    // Read as GeneralMatrix so we don't need to un-compress and re-compress
+    // when selecting parts of matrices.
+    SequentialGeneralMatrixReader feat_reader(feature_rspecifier);
+    //chain::RandomAccessSupervisionReader supervision_reader(
+    //    supervision_rspecifier);
+    RandomAccessLatticeReader lattice_reader(lattice_rspecifier);
+    NnetExampleWriter example_writer(examples_wspecifier);
+    RandomAccessBaseFloatMatrixReader online_ivector_reader(
+        online_ivector_rspecifier);
+
+    int32 num_err = 0;
+
+    for (; !feat_reader.Done(); feat_reader.Next()) {
+      std::string key = feat_reader.Key();
+      const GeneralMatrix &feats = feat_reader.Value();
+      if (!lattice_reader.HasKey(key)) {
+        KALDI_WARN << "No lattice for key " << key;
+        num_err++;
+      } else {
+        //const chain::Supervision &supervision = supervision_reader.Value(key);
+        const Lattice &lat = lattice_reader.Value(key);
+        const Matrix<BaseFloat> *online_ivector_feats = NULL;
+        if (!online_ivector_rspecifier.empty()) {
+          if (!online_ivector_reader.HasKey(key)) {
+            KALDI_WARN << "No iVectors for utterance " << key;
+            num_err++;
+            continue;
+          } else {
+            // this address will be valid until we call HasKey() or Value()
+            // again.
+ online_ivector_feats = &(online_ivector_reader.Value(key)); + } + } + if (online_ivector_feats != NULL && + (abs(feats.NumRows() - (online_ivector_feats->NumRows() * + online_ivector_period)) > length_tolerance + || online_ivector_feats->NumRows() == 0)) { + KALDI_WARN << "Length difference between feats " << feats.NumRows() + << " and iVectors " << online_ivector_feats->NumRows() + << "exceeds tolerance " << length_tolerance; + num_err++; + continue; + } + int32 num_output_frames = 1; + if (!ProcessFile(normalization_fst, feats, + online_ivector_feats, online_ivector_period, + lat, num_output_frames, key, compress, num_pdfs, + tmodel, + &utt_splitter, &example_writer)) + num_err++; + } + } + if (num_err > 0) + KALDI_WARN << num_err << " utterances had errors and could " + "not be processed."; + // utt_splitter prints stats in its destructor. + return utt_splitter.ExitStatus(); + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index 4c799ea96c3..7608aea831e 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -63,9 +63,8 @@ void NnetChainTrainer::Train(const NnetExample &eg) { bool use_xent_regularization = (opts_.chain_config.xent_regularize != 0.0); ComputationRequest request; GetComputationRequest(*nnet_, eg, need_model_derivative, - nnet_config.store_component_stats, - use_xent_regularization, need_model_derivative, - &request); + nnet_config.store_component_stats, &request, + use_xent_regularization, need_model_derivative); const NnetComputation *computation = compiler_.Compile(request); // conventional training @@ -242,10 +241,11 @@ void NnetChainTrainer::ProcessOutputs(const NnetExample &eg, kUndefined); BaseFloat tot_objf, tot_l2_term, tot_weight; - + int32 num_sequences = 64, frames_per_sequence = 150; ComputeObjfAndDeriv2(opts_.chain_config, den_graph_, io.features, nnet_output, + num_sequences, frames_per_sequence, &tot_objf, &tot_l2_term, &tot_weight, &nnet_output_deriv, (use_xent ? &xent_deriv : NULL)); diff --git a/src/nnet3/nnet-example-utils.cc b/src/nnet3/nnet-example-utils.cc index 2151e06bbb4..62fc88521bc 100644 --- a/src/nnet3/nnet-example-utils.cc +++ b/src/nnet3/nnet-example-utils.cc @@ -202,13 +202,13 @@ void GetComputationRequest(const Nnet &nnet, const NnetExample &eg, bool need_model_derivative, bool store_component_stats, + ComputationRequest *request, bool use_xent_regularization, - bool use_xent_derivative, - ComputationRequest *request) { + bool use_xent_derivative) { request->inputs.clear(); request->inputs.reserve(eg.io.size()); request->outputs.clear(); - request->outputs.reserve(eg.io.size() * 2); + request->outputs.reserve((use_xent_regularization ? 
2 : 1) * eg.io.size());
   request->need_model_derivative = need_model_derivative;
   request->store_component_stats = store_component_stats;
   for (size_t i = 0; i < eg.io.size(); i++) {
@@ -247,41 +247,6 @@ void GetComputationRequest(const Nnet &nnet,
     KALDI_ERR << "No outputs in computation request.";
 }

-void GetComputationRequest(const Nnet &nnet,
-                           const NnetExample &eg,
-                           bool need_model_derivative,
-                           bool store_component_stats,
-                           ComputationRequest *request) {
-  request->inputs.clear();
-  request->inputs.reserve(eg.io.size());
-  request->outputs.clear();
-  request->outputs.reserve(eg.io.size() * 2);
-  request->need_model_derivative = need_model_derivative;
-  request->store_component_stats = store_component_stats;
-  for (size_t i = 0; i < eg.io.size(); i++) {
-    const NnetIo &io = eg.io[i];
-    const std::string &name = io.name;
-    int32 node_index = nnet.GetNodeIndex(name);
-    if (node_index == -1 ||
-        (!nnet.IsInputNode(node_index) && !nnet.IsOutputNode(node_index)))
-      KALDI_ERR << "Nnet example has input or output named '" << name
-                << "', but no such input or output node is in the network.";
-
-    std::vector<IoSpecification> &dest =
-        nnet.IsInputNode(node_index) ? request->inputs : request->outputs;
-    dest.resize(dest.size() + 1);
-    IoSpecification &io_spec = dest.back();
-    io_spec.name = name;
-    io_spec.indexes = io.indexes;
-    io_spec.has_deriv = nnet.IsOutputNode(node_index) && need_model_derivative;
-  }
-  // check to see if something went wrong.
-  if (request->inputs.empty())
-    KALDI_ERR << "No inputs in computation request.";
-  if (request->outputs.empty())
-    KALDI_ERR << "No outputs in computation request.";
-}
-
 void WriteVectorAsChar(std::ostream &os,
                        bool binary,
                        const VectorBase<BaseFloat> &vec) {
diff --git a/src/nnet3/nnet-example-utils.h b/src/nnet3/nnet-example-utils.h
index 05f35fb44de..5f6c69f7d96 100644
--- a/src/nnet3/nnet-example-utils.h
+++ b/src/nnet3/nnet-example-utils.h
@@ -56,16 +56,6 @@ void ShiftExampleTimes(int32 t_offset,
    inputs; if you do, you can create/modify the ComputationRequest manually.
    Assumes that if need_model_derivative is true, you will be supplying
    derivatives w.r.t. all outputs.
-*/
-void GetComputationRequest(const Nnet &nnet,
-                           const NnetExample &eg,
-                           bool need_model_derivative,
-                           bool store_component_stats,
-                           ComputationRequest *computation_request);
-
-
-/** This function takes an NnetExample and produces a ComputationRequest.
-   It assumes you don't want the derivatives w.r.t. the input.

    If use_xent_regularization == true, then it assumes that for each output
    name (e.g. "output") in the eg, there is another output with the same
    dimension and with the suffix "-xent" on its name, e.g. named
    "output-xent". The derivative w.r.t. the xent objective will only be
    supplied to the nnet computation if 'use_xent_derivative' is true (we
    propagate back the xent derivative to the model only in training, not in
    model-combination in nnet3-chain-combine).
 */
 void GetComputationRequest(const Nnet &nnet,
                            const NnetExample &eg,
                            bool need_model_derivative,
                            bool store_component_stats,
-                           bool use_xent_regularization,
-                           bool use_xent_derivative,
-                           ComputationRequest *computation_request);
+                           ComputationRequest *computation_request,
+                           bool use_xent_regularization = false,
+                           bool use_xent_derivative = false);

 // Writes as unsigned char a vector 'vec' that is required to have
 // values between 0 and 1.

From f94738faa3f6c3e68e70d980e2cdbce2152e1bad Mon Sep 17 00:00:00 2001
From: Pegita
Date: Sun, 25 Feb 2018 22:07:04 -0500
Subject: [PATCH 3/6] modified functions to accept new sort (sort by t and
 then n) in nnet3-merge-egs.
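To summarize the row ordering this commit introduces (a sketch, not part of the diff): when N input matrices of T rows each are appended with sort_by_t == true, row t of input matrix n lands at merged row t * N + n, so the merged rows are ordered first by 't' and then by 'n'. Equivalently:

    // destination row for local row 't' of input matrix 'n', given
    // 'num_inputs' input matrices that all have the same number of rows
    static int32 MergedRowIndex(int32 n, int32 t, int32 num_inputs) {
      return t * num_inputs + n;
    }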
---
 src/matrix/sparse-matrix.cc | 83 ++++++++++++++++++++++++-------
 src/matrix/sparse-matrix.h | 11 ++++-
 src/nnet3/nnet-example-utils.cc | 23 +++++++--
 src/nnet3/nnet-example-utils.h | 18 ++++---
 4 files changed, 104 insertions(+), 31 deletions(-)

diff --git a/src/matrix/sparse-matrix.cc b/src/matrix/sparse-matrix.cc
index 38ad940fb45..5ad7f2bfeca 100644
--- a/src/matrix/sparse-matrix.cc
+++ b/src/matrix/sparse-matrix.cc
@@ -654,26 +654,50 @@ void SparseMatrix<Real>::Resize(MatrixIndexT num_rows,

 template <typename Real>
 void SparseMatrix<Real>::AppendSparseMatrixRows(
-    std::vector<SparseMatrix<Real> > *inputs) {
+    std::vector<SparseMatrix<Real> > *inputs,
+    bool sort_by_t) {
   rows_.clear();
   size_t num_rows = 0;
   typename std::vector<SparseMatrix<Real> >::iterator
       input_iter = inputs->begin(),
      input_end = inputs->end();
+  int32 local_row_size = input_iter->rows_.size(),
+      num_inputs = inputs->size();
-  for (; input_iter != input_end; ++input_iter)
+  for (; input_iter != input_end; ++input_iter) {
     num_rows += input_iter->rows_.size();
+    if (sort_by_t)
+      if (input_iter->rows_.size() != local_row_size)
+        KALDI_ERR << "we cannot append sparse matrices with inconsistent "
+                  << "number of rows, if sort_by_t is true";
+  }
   rows_.resize(num_rows);
   typename std::vector<SparseVector<Real> >::iterator
       row_iter = rows_.begin(),
       row_end = rows_.end();
-  for (input_iter = inputs->begin(); input_iter != input_end; ++input_iter) {
-    typename std::vector<SparseVector<Real> >::iterator
-        input_row_iter = input_iter->rows_.begin(),
-        input_row_end = input_iter->rows_.end();
-    for (; input_row_iter != input_row_end; ++input_row_iter, ++row_iter)
-      row_iter->Swap(&(*input_row_iter));
+  if (sort_by_t) {
+    // If sort_by_t is true, the appended rows are ordered first by the original
+    // row index (t) and then by the matrix's position in the input,
+    // i.e. all rows with the same local row index end up in the same block.
+    int32 n = 0, t = 0; // 'n' is the index over matrices and 't' is the index over rows within a matrix.
+    for (input_iter = inputs->begin(); input_iter != input_end; ++input_iter, ++n) {
+      typename std::vector<SparseVector<Real> >::iterator
+          input_row_iter = input_iter->rows_.begin(),
+          input_row_end = input_iter->rows_.end();
+      t = 0;
+      for (; input_row_iter != input_row_end; ++input_row_iter, ++t) {
+        int32 dest_row_index = n + t * num_inputs;
+        rows_[dest_row_index].Swap(&(*input_row_iter));
+      }
+    }
+  } else {
+    for (input_iter = inputs->begin(); input_iter != input_end; ++input_iter) {
+      typename std::vector<SparseVector<Real> >::iterator
+          input_row_iter = input_iter->rows_.begin(),
+          input_row_end = input_iter->rows_.end();
+      for (; input_row_iter != input_row_end; ++input_row_iter, ++row_iter)
+        row_iter->Swap(&(*input_row_iter));
+    }
+    KALDI_ASSERT(row_iter == row_end);
   }
-  KALDI_ASSERT(row_iter == row_end);
   int32 num_cols = NumCols();
   for (row_iter = rows_.begin(); row_iter != row_end; ++row_iter) {
     if (row_iter->Dim() != num_cols)
@@ -916,7 +940,8 @@ void GeneralMatrix::Read(std::istream &is, bool binary) {

 void AppendGeneralMatrixRows(const std::vector<const GeneralMatrix *> &src,
-                             GeneralMatrix *mat) {
+                             GeneralMatrix *mat,
+                             bool sort_by_t) {
   mat->Clear();
   int32 size = src.size();
   if (size == 0)
@@ -933,7 +958,7 @@ void AppendGeneralMatrixRows(const std::vector<const GeneralMatrix *> &src,
     for (int32 i = 0; i < size; i++)
       sparse_mats[i] = src[i]->GetSparseMatrix();
     SparseMatrix<BaseFloat> appended_mat;
-    appended_mat.AppendSparseMatrixRows(&sparse_mats);
+    appended_mat.AppendSparseMatrixRows(&sparse_mats, sort_by_t);
     mat->SwapSparseMatrix(&appended_mat);
   } else {
     int32 tot_rows = 0, num_cols = -1;
@@ -950,15 +975,37 @@ void AppendGeneralMatrixRows(const std::vector<const GeneralMatrix *> &src,
     }
     Matrix<BaseFloat> appended_mat(tot_rows, num_cols, kUndefined);
     int32 row_offset = 0;
-    for (int32 i = 0; i < size; i++) {
-      const GeneralMatrix &src_mat = *(src[i]);
-      int32 src_rows = src_mat.NumRows();
-      if (src_rows != 0) {
-        SubMatrix<BaseFloat> dest_submat(appended_mat, row_offset, src_rows,
-                                         0, num_cols);
-        src_mat.CopyToMat(&dest_submat);
+    if (sort_by_t) {
+      // Reorder the source rows as they are inserted into the appended matrix,
+      // so that the result is sorted first by 't' and then by 'n'.
+      int32 local_row_size = src[0]->NumRows();
+      for (int32 i = 0; i < size; i++) {
+        const GeneralMatrix &src_mat = *(src[i]);
+        Matrix<BaseFloat> full_src_mat(src_mat.NumRows(), src_mat.NumCols());
+        src_mat.CopyToMat(&full_src_mat);
+        int32 src_rows = src_mat.NumRows();
+        if (src_rows != local_row_size)
+          KALDI_ERR << "Appending rows of matrices with inconsistent num-rows "
+                    << "with sort-by-t=true is not possible.";
+        std::vector<MatrixIndexT> reorder_indexes(local_row_size,
+                                                  static_cast<MatrixIndexT>(NULL));
+        for (int32 j = 0; j < src_rows; j++) {
+          reorder_indexes[j] = j * size + i;
+        }
+        full_src_mat.AddToRows(1.0, &(reorder_indexes[0]), &appended_mat);
         row_offset += src_rows;
       }
+    } else {
+      for (int32 i = 0; i < size; i++) {
+        const GeneralMatrix &src_mat = *(src[i]);
+        int32 src_rows = src_mat.NumRows();
+        if (src_rows != 0) {
+          SubMatrix<BaseFloat> dest_submat(appended_mat, row_offset, src_rows,
+                                           0, num_cols);
+          src_mat.CopyToMat(&dest_submat);
+          row_offset += src_rows;
+        }
+      }
     }
     KALDI_ASSERT(row_offset == tot_rows);
     mat->SwapFullMatrix(&appended_mat);
diff --git a/src/matrix/sparse-matrix.h b/src/matrix/sparse-matrix.h
index 60085b93fbe..48e085f1e4f 100644
--- a/src/matrix/sparse-matrix.h
+++ b/src/matrix/sparse-matrix.h
@@ -201,7 +201,13 @@ class SparseMatrix {
   /// function is destructive of the inputs. Requires, obviously,
   /// that the inputs all have the same dimension (although some may be
   /// empty).
-  void AppendSparseMatrixRows(std::vector<SparseMatrix<Real> > *inputs);
+  ///
+  /// If sort_by_t is true, the sparse matrices are appended so that the result
+  /// is sorted first by their local row index and then by the matrix index,
+  /// i.e. all rows with the same local row index end up in the same block.
+  /// In that case the number of rows in all matrices must be equal.
+  void AppendSparseMatrixRows(std::vector<SparseMatrix<Real> > *inputs,
+                              bool sort_by_t=false);

   SparseMatrix() { }
@@ -383,7 +389,8 @@ class GeneralMatrix {
 /// Does not preserve compression, if inputs were compressed; you have to
 /// re-compress manually, if that's what you need.
 void AppendGeneralMatrixRows(const std::vector<const GeneralMatrix *> &src,
-                             GeneralMatrix *mat);
+                             GeneralMatrix *mat,
+                             bool sort_by_t = false);

 /// Outputs a SparseMatrix containing only the rows r of "in" such that
diff --git a/src/nnet3/nnet-example-utils.cc b/src/nnet3/nnet-example-utils.cc
index 62fc88521bc..82fbee1cf22 100644
--- a/src/nnet3/nnet-example-utils.cc
+++ b/src/nnet3/nnet-example-utils.cc
@@ -89,7 +89,8 @@ static void MergeIo(const std::vector<NnetExample> &src,
                     const std::vector<std::string> &names,
                     const std::vector<int32> &sizes,
                     bool compress,
-                    NnetExample *merged_eg) {
+                    NnetExample *merged_eg,
+                    bool sort_by_t) {
   // The total number of Indexes we have across all examples.
   int32 num_feats = names.size();
@@ -143,13 +144,24 @@ static void MergeIo(const std::vector<NnetExample> &src,
                    "Merging already-merged egs? Not currently supported.");
       output_iter[i].n = n;
     }
+    this_offset += this_size; // note: this_offset is a reference.
   }
 }
+  // If sort_by_t is true, the indexes are rearranged to be sorted
+  // first by 't' and then by 'n'.
+  for (int32 f = 0; f < num_feats; f++) {
+    NnetIo &output_io = merged_eg->io[f];
+    if (sort_by_t)
+      if (output_io.name == "output")
+        std::sort(output_io.indexes.begin(), output_io.indexes.end());
+  }
+
   KALDI_ASSERT(cur_size == sizes);
   for (int32 f = 0; f < num_feats; f++) {
     AppendGeneralMatrixRows(output_lists[f],
-                            &(merged_eg->io[f].features));
+                            &(merged_eg->io[f].features),
+                            sort_by_t);
     if (compress) {
       // the following won't do anything if the features were sparse.
       merged_eg->io[f].features.Compress();
@@ -161,14 +173,15 @@ static void MergeIo(const std::vector<NnetExample> &src,

 void MergeExamples(const std::vector<NnetExample> &src,
                    bool compress,
-                   NnetExample *merged_eg) {
+                   NnetExample *merged_eg,
+                   bool sort_by_t) {
   KALDI_ASSERT(!src.empty());
   std::vector<std::string> io_names;
   GetIoNames(src, &io_names);
   // the sizes are the total number of Indexes we have across all examples.
   std::vector<int32> io_sizes;
   GetIoSizes(src, io_names, &io_sizes);
   MergeIo(src, io_names, io_sizes, compress, merged_eg, sort_by_t);
 }

 void ShiftExampleTimes(int32 t_offset,
@@ -1225,7 +1238,7 @@ void ExampleMerger::WriteMinibatch(const std::vector<NnetExample> &egs) {
   int32 minibatch_size = egs.size();
   stats_.WroteExample(eg_size, structure_hash, minibatch_size);
   NnetExample merged_eg;
-  MergeExamples(egs, config_.compress, &merged_eg);
+  MergeExamples(egs, config_.compress, &merged_eg, config_.sort_by_t);
   std::ostringstream key;
   key << "merged-" << (num_egs_written_++) << "-" << minibatch_size;
   writer_->Write(key.str(), merged_eg);
diff --git a/src/nnet3/nnet-example-utils.h b/src/nnet3/nnet-example-utils.h
index 5f6c69f7d96..9d55f3b0d7d 100644
--- a/src/nnet3/nnet-example-utils.h
+++ b/src/nnet3/nnet-example-utils.h
@@ -33,10 +33,14 @@ namespace nnet3 {
 /** Merge a set of input examples into a single example (typically the size of
     "src" will be the minibatch size). Will crash if "src" is the empty vector.
If "compress" is true, it will compress any non-sparse features in the output. + + If sort_by_t is true, the examples and indexes for output are sorted first + by 't' and then by 'n' index. */ void MergeExamples(const std::vector &src, bool compress, - NnetExample *dest); + NnetExample *dest, + bool sort_by_t = false); /** Shifts the time-index t of everything in the "eg" by adding "t_offset" to @@ -334,12 +338,14 @@ class ExampleMergingConfig { std::string measure_output_frames; // for back-compatibility, not used. std::string minibatch_size; std::string discard_partial_minibatches; // for back-compatibility, not used. - + bool sort_by_t; // If true, the examples and indexes are sorted + // first by 't' and next by 'n'. ExampleMergingConfig(const char *default_minibatch_size = "256"): compress(false), measure_output_frames("deprecated"), minibatch_size(default_minibatch_size), - discard_partial_minibatches("deprecated") { } + discard_partial_minibatches("deprecated"), + sort_by_t(false) { } void Register(OptionsItf *po) { po->Register("compress", &compress, "If true, compress the output examples " @@ -363,6 +369,9 @@ class ExampleMergingConfig { "--minibatch-size=128=64:128,256/256=32:64,128. Egs are given " "minibatch-sizes based on the specified eg-size closest to " "their actual size."); + po->Register("sort-by-t", &sort_by_t, + "If true, the features in examples and indexes are sorted " + "first by 't' and next by 'n'."); } @@ -517,7 +526,6 @@ class ExampleMerger { const ExampleMergingConfig &config_; NnetExampleWriter *writer_; ExampleMergingStats stats_; - // Note: the "key" into the egs is the first element of the vector. typedef unordered_map, NnetExampleStructureHasher, @@ -525,8 +533,6 @@ class ExampleMerger { MapType eg_to_egs_; }; - - } // namespace nnet3 } // namespace kaldi From 40fa1541cfa5cd4403ecc5ac11af836c656b9266 Mon Sep 17 00:00:00 2001 From: Pegita Date: Tue, 27 Feb 2018 15:52:45 -0500 Subject: [PATCH 4/6] fixed some issues. --- src/chain/chain-supervision.cc | 29 +-------------------- src/chain/chain-supervision.h | 7 +++-- src/chainbin/nnet3-chain-get-egs-post.cc | 33 +++++++++++------------- src/chainbin/nnet3-chain-get-egs.cc | 4 +-- 4 files changed, 20 insertions(+), 53 deletions(-) diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc index 7d87201dfdd..c38cd4698f7 100644 --- a/src/chain/chain-supervision.cc +++ b/src/chain/chain-supervision.cc @@ -683,34 +683,7 @@ bool AddWeightToFst(const fst::StdVectorFst &normalization_fst, bool AddWeightToSupervisionFst(const fst::StdVectorFst &normalization_fst, Supervision *supervision) { - // remove epsilons before composing. 'normalization_fst' has noepsilons so - // the composed result will be epsilon free. - fst::StdVectorFst supervision_fst_noeps(supervision->fst); - fst::RmEpsilon(&supervision_fst_noeps); - if (!TryDeterminizeMinimize(kSupervisionMaxStates, - &supervision_fst_noeps)) - return false; - - // note: by default, 'Compose' will call 'Connect', so if the - // resulting FST is not connected, it will end up empty. - fst::StdVectorFst composed_fst; - fst::Compose(supervision_fst_noeps, normalization_fst, - &composed_fst); - if (composed_fst.NumStates() == 0) - return false; - // projection should not be necessary, as both FSTs are acceptors. - // determinize and minimize to make it as compact as possible. 
diff --git a/src/chain/chain-supervision.h b/src/chain/chain-supervision.h
index c54d4770aa0..36401009b15 100644
--- a/src/chain/chain-supervision.h
+++ b/src/chain/chain-supervision.h
@@ -320,13 +320,12 @@ class SupervisionSplitter {
 /// This function also removes epsilons and makes sure supervision->fst has the
 /// required sorting of states.  Think of it as the final stage in preparation
 /// of the supervision FST.
-bool AddWeightToSupervisionFst(const fst::StdVectorFst &normalization_fst,
-                               Supervision *supervision);
-
-
 bool AddWeightToFst(const fst::StdVectorFst &normalization_fst,
                     fst::StdVectorFst *supervision_fst);

+bool AddWeightToSupervisionFst(const fst::StdVectorFst &normalization_fst,
+                               Supervision *supervision);
+
 /// Assuming the 'fst' is epsilon-free, connected, and has the property that all
 /// paths from the start-state are of the same length, output a vector
 /// containing that length (from the start-state to the current state) to
diff --git a/src/chainbin/nnet3-chain-get-egs-post.cc b/src/chainbin/nnet3-chain-get-egs-post.cc
index 9aa0eba0fb8..f3b82f2229d 100644
--- a/src/chainbin/nnet3-chain-get-egs-post.cc
+++ b/src/chainbin/nnet3-chain-get-egs-post.cc
@@ -134,13 +134,7 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst,
   }
   int32 frame_subsampling_factor =
       utt_splitter->Config().frame_subsampling_factor;
-
-  fst::StdVectorFst sup_fst,
-      scaled_normalization_fst(normalization_fst);
+  fst::StdVectorFst sup_fst;
   ConvertLatticeToPdfLabels(tmodel, lat, &sup_fst);
-  ScaleFst(0.5, &scaled_normalization_fst);  // Scale lattice to have weights similar
-                                             // to weights used to combine lm weight
-                                             // with acoustic weight in sup_lat
+  // Note: the normalization FST is now scaled once, in main(), before it
+  // is passed in here.
   if (normalization_fst.NumStates() > 0 &&
       !chain::AddWeightToFst(normalization_fst, &sup_fst)) {
     KALDI_WARN << "For utterance " << utt_id << ", feature frames "
@@ -249,15 +243,13 @@ int main(int argc, char *argv[]) {
         "ready for training; in that case they should later be processed\n"
         "with nnet3-chain-normalize-egs\n"
         "\n"
-        "Usage:  nnet3-chain-get-egs [options] [<normalization-fst>] <features-rspecifier> "
-        "<chain-supervision-rspecifier> <egs-wspecifier>\n"
+        "Usage:  nnet3-chain-get-egs-post [options] [<normalization-fst>] "
+        "<features-rspecifier> <lattice-rspecifier> <egs-wspecifier>\n"
         "\n"
         "An example [where $feats expands to the actual features]:\n"
-        "chain-get-supervision [args] | \\\n"
-        "  nnet3-chain-get-egs --left-context=25 --right-context=9 --num-frames=20 dir/normalization.fst \\\n"
-        "  \"$feats\" ark,s,cs:- ark:cegs.1.ark\n"
-        "Note: the --frame-subsampling-factor option must be the same as given to\n"
-        "chain-get-supervision.\n";
+        "nnet3-chain-get-egs-post --left-context=25 --right-context=9 \\\n"
+        "  --num-frames=20 dir/normalization.fst \"$feats\" \\\n"
+        "  ark:lat.1.ark ark:cegs.1.ark\n";

     bool compress = true;
     int32 length_tolerance = 100, online_ivector_period = 1;
@@ -278,9 +270,7 @@ int main(int argc, char *argv[]) {
     po.Register("ivectors", &online_ivector_rspecifier, "Alias for "
                 "--online-ivectors option, for back compatibility");
     po.Register("online-ivectors", &online_ivector_rspecifier, "Rspecifier of "
                 "ivector features, as a matrix.");
    po.Register("online-ivector-period", &online_ivector_period, "Number of "
                "frames between iVectors in matrices supplied to the "
                "--online-ivectors option");
    po.Register("srand", &srand_seed, "Seed for random number generator");
    po.Register("length-tolerance", &length_tolerance, "Tolerance for "
@@ -376,8 +366,15 @@ int main(int argc, char *argv[]) {
          num_err++;
          continue;
        }
+        // Scale the normalization FST so that its weights are comparable to
+        // the weights used to combine the LM weight with the acoustic weight
+        // in the supervision lattice.
+        fst::StdVectorFst scaled_normalization_fst(normalization_fst);
+        ScaleFst(0.5, &scaled_normalization_fst);
        int32 num_output_frames = 1;
-        if (!ProcessFile(normalization_fst, feats,
+        if (!ProcessFile(scaled_normalization_fst, feats,
                         online_ivector_feats, online_ivector_period,
                         lat, num_output_frames, key, compress,
                         num_pdfs, tmodel,
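The ScaleFst(0.5, ...) call above halves the normalization FST's costs before it is passed to ProcessFile. ScaleFst itself is not shown in this patch series, so the following is only a sketch of what such a helper presumably does in plain OpenFst terms: in the tropical semiring, weights are costs, and scaling every arc and final cost by 0.5 mirrors the 0.5 scale used when combining LM and acoustic weights in the supervision lattice.

    #include <fst/fstlib.h>

    // Multiply every arc cost and final cost of 'fst' by 'scale'.
    // Sketch only; not the actual ScaleFst() from the patch.
    void ScaleFstWeights(float scale, fst::StdVectorFst *fst) {
      typedef fst::StdArc::Weight Weight;
      for (fst::StateIterator<fst::StdVectorFst> siter(*fst);
           !siter.Done(); siter.Next()) {
        int s = siter.Value();
        for (fst::MutableArcIterator<fst::StdVectorFst> aiter(fst, s);
             !aiter.Done(); aiter.Next()) {
          fst::StdArc arc = aiter.Value();
          arc.weight = Weight(arc.weight.Value() * scale);
          aiter.SetValue(arc);
        }
        // Scale final costs too (skip non-final states, whose final
        // weight is Zero(), i.e. infinite cost).
        if (fst->Final(s) != Weight::Zero())
          fst->SetFinal(s, Weight(fst->Final(s).Value() * scale));
      }
    }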
diff --git a/src/chainbin/nnet3-chain-get-egs.cc b/src/chainbin/nnet3-chain-get-egs.cc
index 206921771c8..c8c251900ec 100644
--- a/src/chainbin/nnet3-chain-get-egs.cc
+++ b/src/chainbin/nnet3-chain-get-egs.cc
@@ -42,7 +42,6 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst,
                         const GeneralMatrix &feats,
                         const MatrixBase<BaseFloat> *ivector_feats,
                         int32 ivector_period,
-                        const Lattice &lattice,
                         const chain::Supervision &supervision,
                         const std::string &utt_id,
                         bool compress,
@@ -279,13 +278,12 @@ int main(int argc, char *argv[]) {
          num_err++;
          continue;
        }
-        /*
+
        if (!ProcessFile(normalization_fst, feats,
                         online_ivector_feats, online_ivector_period,
                         supervision, key, compress,
                         &utt_splitter, &example_writer))
          num_err++;
-        */
      }
    }
    if (num_err > 0)

From 46a96c521feeea0b6d79f3fd5da7d2497b2b3498 Mon Sep 17 00:00:00 2001
From: Pegita
Date: Fri, 13 Apr 2018 15:00:13 -0400
Subject: [PATCH 5/6] added small change.

---
 src/nnet3/nnet-example-utils.cc | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/nnet3/nnet-example-utils.cc b/src/nnet3/nnet-example-utils.cc
index 82fbee1cf22..4ff60b9413e 100644
--- a/src/nnet3/nnet-example-utils.cc
+++ b/src/nnet3/nnet-example-utils.cc
@@ -80,9 +80,6 @@ static void GetIoSizes(const std::vector<NnetExample> &src,
   }
 }

-
-
-
 // Do the final merging of NnetIo, once we have obtained the names, dims and
 // sizes for each feature/supervision type.
 static void MergeIo(const std::vector<NnetExample> &src,

From 417dad64051a383d93f6c91877efa04acfddfc28 Mon Sep 17 00:00:00 2001
From: Pegita
Date: Fri, 13 Apr 2018 16:25:26 -0400
Subject: [PATCH 6/6] added run_tdnn_7{n,o}.sh

---
 .../s5c/local/chain/tuning/run_tdnn_7n.sh     | 386 ++++++++++++++++
 .../s5c/local/chain/tuning/run_tdnn_7o.sh     | 411 ++++++++++++++++++
 2 files changed, 797 insertions(+)
 create mode 100755 egs/swbd/s5c/local/chain/tuning/run_tdnn_7n.sh
 create mode 100755 egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh

diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7n.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7n.sh
new file mode 100755
index 00000000000..a45ba6314c9
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7n.sh
@@ -0,0 +1,386 @@
+#!/bin/bash
+# _7n is as _7i but it has a single hint-guide in the middle of the primary
+# and sibling networks.
+# _7i is as _7h but it uses a sibling network and multi-stage training to
+# transfer information from the larger network to the smaller network.
+# It uses multi-stage training to train the sibling network, which has fewer
+# parameters.
+# The 1st stage of training is the same as the baseline tdnn_7d and trains
+# the primary network.
+# The 2nd stage trains the sibling network using the regularizers at all
+# layers as objectives, and in the 3rd stage we train the sibling network
+# with the chain objective for 1 epoch.
+
+#System                  tdnn_7g     tdnn_7h
+#WER on train_dev(tg)    13.98       13.84
+#WER on train_dev(fg)    12.78       12.84
+#WER on eval2000(tg)     16.7        16.5
+#WER on eval2000(fg)     14.9        14.8
+#Final train prob        -0.0817467  -0.0889771
+#Final valid prob        -0.110475   -0.113102
+#Final train prob (xent) -1.20065    -1.2533
+#Final valid prob (xent) -1.3313     -1.36743
+#
+set -e
+
+# configs for 'chain'
+affix=
+stage=12
+multi_stage_train=1
+train_stage=-10
+get_egs_stage=-10
+speed_perturb=true
+dir=exp/chain/tdnn_7n  # Note: _sp will get added to this if $speed_perturb == true.
+decode_iter=
+chain_regularize=0.5
+num_epochs_s2=1
+num_epochs_s3=1
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+initial_effective_lrate_s2=0.001
+final_effective_lrate_s2=0.0001
+initial_effective_lrate_s3=0.0005
+final_effective_lrate_s3=0.00005
+leftmost_questions_truncate=-1
+max_param_change=2.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+num_jobs_initial_s2=3
+num_jobs_final_s2=8
+num_jobs_initial_s3=3
+num_jobs_final_s3=8
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+common_egs_dir=exp/chain/tdnn_7h_sp/egs
+xent_regularize=0.1
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <$lang/topo
+fi
+
+if [ $stage -le 11 ]; then
+  # Build a tree using our new topology.  This is the critically different
+  # step compared with other recipes.
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate $leftmost_questions_truncate \
+      --context-opts "--context-width=2 --central-position=1" \
+      --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then
+  echo "$0: creating neural net configs using the xconfig parser for the primary network";
+
+  num_targets=$(tree-info exp/chain/tri5_7d_tree_sp/tree |grep num-pdfs|awk '{print $2}')
+  learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python)
+
+  mkdir -p $dir/stage1/configs
+  cat <<EOF > $dir/stage1/configs/network.xconfig
+  input dim=100 name=ivector
+  input dim=40 name=input
+
+  # please note that it is important to have the input layer with name=input
+  # as the layer immediately preceding the fixed-affine-layer, to enable
+  # the use of the short notation for the descriptor
+  fixed-affine-layer name=lda input=Append(input@-1,input,input@1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/stage1/configs/lda.mat
+
+  # the first splicing is moved before the lda layer, so no splicing here
+  relu-renorm-layer name=tdnn1 dim=625
+  relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=625
+  relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=625
+  relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=625
+  relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=625
+  relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=625
+  relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=625
+
+  ## adding the layers for the chain branch
+  relu-renorm-layer name=prefinal-chain input=tdnn7 dim=625 target-rms=0.5
+  output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5
+
+  # adding the layers for the xent branch
+  # This block prints the configs for a separate output that will be
+  # trained with a cross-entropy objective in the 'chain' models... this
+  # has the effect of regularizing the hidden parts of the model.  we use
+  # 0.5 / args.xent_regularize as the learning rate factor- the factor of
+  # 0.5 / args.xent_regularize is suitable as it means the xent
+  # final-layer learns at a rate independent of the regularization
+  # constant; and the 0.5 was tuned so as to make the relative progress
+  # similar in the xent and regular final layers.
+  relu-renorm-layer name=prefinal-xent input=tdnn7 dim=625 target-rms=0.5
+  output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5
+
+EOF
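The learning_rate_factor computed just above is 0.5 / xent_regularize; with the script's xent_regularize=0.1 that comes to 5.0, so the xent output layer's learning rate is scaled up to compensate for the 0.1 scale on its objective, as the comment in the config explains. A one-line check of that arithmetic (mirroring the script's python one-liner):

    #include <cstdio>

    // The xent branch's objective is scaled by xent_regularize, so scaling
    // its final layer's learning rate by 0.5/xent_regularize makes it learn
    // at a rate independent of that constant.
    int main() {
      double xent_regularize = 0.1;
      double learning_rate_factor = 0.5 / xent_regularize;  // = 5.0
      std::printf("learning_rate_factor = %g\n", learning_rate_factor);
      return 0;
    }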
+  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/stage1/configs/network.xconfig \
+    --config-dir $dir/stage1/configs/
+
+  echo "$0: creating neural net configs using the xconfig parser for the sibling network"
+  sibling_dim=300
+  primary_dim=625
+  regressor_lr_factor=1.0
+  regressor_scale=`echo $regressor_lr_factor $primary_dim | awk '{printf "%.8f \n", $1/$2}'`
+  regressor_scale_vec=""
+  for i in `seq $primary_dim`; do
+    regressor_scale_vec="$regressor_scale_vec $regressor_scale"
+  done
+
+  mkdir -p $dir/stage2
+  cat <<EOF > $dir/stage2/regressor_scale.vec
+[ $regressor_scale_vec ]
+EOF
+
+  mkdir -p $dir/stage2/configs
+  cat <<EOF > $dir/stage2/configs/network.xconfig
+
+  # the first splicing is moved before the lda layer, so no splicing here
+  relu-renorm-layer name=tdnn1-sibling input=lda dim=$sibling_dim
+  relu-renorm-layer name=tdnn2-sibling input=Append(-1,0,1) dim=$sibling_dim
+  relu-renorm-layer name=tdnn3-sibling input=Append(-1,0,1) dim=$sibling_dim
+  relu-renorm-layer name=tdnn4-sibling input=Append(-3,0,3) dim=$sibling_dim
+  relu-renorm-layer name=tdnn5-sibling input=Append(-3,0,3) dim=$sibling_dim
+  relu-renorm-layer name=tdnn6-sibling input=Append(-3,0,3) dim=$sibling_dim
+  relu-renorm-layer name=tdnn7-sibling input=Append(-3,0,3) dim=$sibling_dim
+
+  ## adding the layers for the chain branch
+  relu-renorm-layer name=prefinal-chain-sibling input=tdnn7-sibling dim=$sibling_dim target-rms=0.5
+  output-layer name=output-sibling include-log-softmax=false dim=$num_targets max-change=1.5
+
+  relu-renorm-layer name=prefinal-xent-sibling input=tdnn7-sibling dim=$sibling_dim target-rms=0.5
+  output-layer name=output-xent-sibling dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5
+
+  ## adding the regressor outputs to the sibling network configs.
+  relu-renorm-layer name=tdnn4-regressor input=tdnn4-sibling dim=$primary_dim
+  regressor-layer name=regressor-4 input1=tdnn4-regressor input2=tdnn4 objective-type=linear max-change=1.5 dim=$primary_dim regressor-scale-file=$dir/stage2/regressor_scale.vec supervision-type=unsupervised
+EOF
+  steps/nnet3/xconfig_to_configs.py --aux-xconfig-file $dir/stage1/configs/network.xconfig \
+    --xconfig-file $dir/stage2/configs/network.xconfig --config-dir $dir/stage2/configs/
+
+  # edits.config contains the edits required for the different stages of
+  # training.  It is applied to the 0.mdl generated at the
+  # prepare_initial_network stage in iter -1.
+  # The edits for the 2nd stage rename the primary network's outputs to
+  # *-primary, so that training does not use those outputs, and rename the
+  # sibling network's outputs to 'output' and 'output-xent'.
+  cat <<EOF > $dir/stage2/configs/edits.config
+rename-node old-name=output new-name=output-primary
+rename-node old-name=output-xent new-name=output-xent-primary
+rename-node old-name=output-sibling new-name=output
+rename-node old-name=output-xent-sibling new-name=output-xent
+EOF
+  # edits.config for the 3rd stage of training.
+  mkdir -p $dir/stage3/configs
+  cat <<EOF > $dir/stage3/configs/edits.config
+remove-output-nodes name=regressor*
+remove-output-nodes name=*-primary
+remove-orphans
+EOF
+  # we skip the add_compatibility stage in xconfig_to_configs.py;
+  # we copy vars from stage1 to stage2 and stage3 for now.
+  cp -r $dir/stage1/configs/vars $dir/stage2/configs/.
+  cp -r $dir/stage1/configs/vars $dir/stage3/configs/.
+fi
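The regressor_scale.vec written above holds 1/primary_dim in every dimension, so the hint (regressor) objective between tdnn4-regressor and tdnn4 is averaged over the primary layer's dimensions rather than summed, keeping its magnitude comparable across layer widths. The regressor-layer implementation is not part of this patch, and it is configured with objective-type=linear above; the sketch below uses the quadratic variant purely to illustrate how the per-dimension scale behaves (illustration only, under those assumptions):

    #include <cstdio>
    #include <vector>

    // Quadratic mismatch between a sibling-network projection x and a
    // primary-network activation y, scaled per dimension by 1/dim (one
    // entry of regressor_scale.vec), so the objective does not grow with
    // the layer width.
    double ScaledQuadraticHint(const std::vector<double> &x,
                               const std::vector<double> &y) {
      double scale = 1.0 / x.size();
      double objf = 0.0;
      for (size_t i = 0; i < x.size(); i++) {
        double diff = x[i] - y[i];
        objf += -0.5 * scale * diff * diff;  // maximized when x matches y
      }
      return objf;
    }

    int main() {
      std::vector<double> x = {0.5, -1.0, 2.0}, y = {0.0, -1.0, 1.0};
      std::printf("hint objective = %.4f\n", ScaledQuadraticHint(x, y));
      return 0;
    }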
+if [ $stage -le 13 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+  if [ $multi_stage_train -le 0 ] && [ ! -f $dir/stage1/final.mdl ]; then
+    echo "$0: Training the primary network"
+    steps/nnet3/chain/train.py --stage $train_stage \
+      --cmd "$decode_cmd" \
+      --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \
+      --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
+      --chain.xent-regularize $xent_regularize \
+      --chain.leaky-hmm-coefficient 0.1 \
+      --chain.l2-regularize 0.00005 \
+      --chain.apply-deriv-weights false \
+      --chain.lm-opts="--num-extra-lm-states=2000" \
+      --egs.dir "$common_egs_dir" \
+      --egs.stage $get_egs_stage \
+      --egs.opts "--frames-overlap-per-eg 0" \
+      --egs.chunk-width $frames_per_eg \
+      --trainer.num-chunk-per-minibatch $minibatch_size \
+      --trainer.frames-per-iter 1500000 \
+      --trainer.num-epochs $num_epochs \
+      --trainer.optimization.num-jobs-initial $num_jobs_initial \
+      --trainer.optimization.num-jobs-final $num_jobs_final \
+      --trainer.optimization.initial-effective-lrate $initial_effective_lrate \
+      --trainer.optimization.final-effective-lrate $final_effective_lrate \
+      --trainer.max-param-change $max_param_change \
+      --cleanup.remove-egs $remove_egs \
+      --feat-dir data/${train_set}_hires \
+      --tree-dir $treedir \
+      --lat-dir exp/tri4_lats_nodup$suffix \
+      --dir $dir/stage1 || exit 1;
+  fi
+
+  if [ $multi_stage_train -le 1 ]; then
+    mkdir -p $dir/stage2
+    echo "$0: copying the final primary network in $dir/stage1/final.mdl to"
+    echo "$dir/stage2/init.raw, with all learning-rate factors set to zero,"
+    echo "to serve as the fixed primary network during sibling training."
+    nnet3-am-copy --raw=true \
+      --edits='set-learning-rate-factor name=* learning-rate-factor=0.0;' \
+      $dir/stage1/final.mdl $dir/stage2/init.raw || exit 1;
+
+    echo "$0: Training the sibling network using the regularizer objectives."
+    steps/nnet3/chain/train.py --stage $train_stage \
+      --cmd "$decode_cmd" \
+      --init-raw-model $dir/stage2/init.raw \
+      --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \
+      --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
+      --chain.leaky-hmm-coefficient 0.1 \
+      --chain.l2-regularize 0.00005 \
+      --chain.chain-regularize $chain_regularize \
+      --chain.apply-deriv-weights false \
+      --chain.lm-opts="--num-extra-lm-states=2000" \
+      --egs.dir "$common_egs_dir" \
+      --egs.stage $get_egs_stage \
+      --egs.opts "--frames-overlap-per-eg 0" \
+      --egs.chunk-width $frames_per_eg \
+      --trainer.num-chunk-per-minibatch $minibatch_size \
+      --trainer.frames-per-iter 1500000 \
+      --trainer.num-epochs $num_epochs_s2 \
+      --trainer.optimization.num-jobs-initial $num_jobs_initial_s2 \
+      --trainer.optimization.num-jobs-final $num_jobs_final_s2 \
+      --trainer.optimization.initial-effective-lrate $initial_effective_lrate_s2 \
+      --trainer.optimization.final-effective-lrate $final_effective_lrate_s2 \
+      --trainer.max-param-change $max_param_change \
+      --cleanup.remove-egs false \
+      --feat-dir data/${train_set}_hires \
+      --tree-dir $treedir \
+      --lat-dir exp/tri4_lats_nodup$suffix \
+      --dir $dir/stage2 || exit 1;
+  fi
+  if [ $multi_stage_train -le 2 ]; then
+    cp $dir/stage2/den.fst $dir/stage3/.
+    echo "$0: removing the sibling network's regularizer outputs and renaming"
+    echo "its chain output, so that the sibling network is trained with the"
+    echo "chain objective."
+    echo "A teacher-student objective could be added in the future."
+    nnet3-am-copy --edits-config=$dir/stage3/configs/edits.config \
+      $dir/stage2/final.mdl $dir/stage3/0.mdl || exit 1;
+    mkdir -p $dir/stage3/configs
+    train_stage_s3=0
+    if [ $train_stage -gt $train_stage_s3 ]; then
+      train_stage_s3=$train_stage
+    fi
+    cp -r $dir/stage2/configs $dir/stage3/configs || exit 1;
+    steps/nnet3/chain/train.py --stage $train_stage_s3 \
+      --cmd "$decode_cmd" \
+      --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \
+      --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
+      --chain.xent-regularize $xent_regularize \
+      --chain.leaky-hmm-coefficient 0.1 \
+      --chain.l2-regularize 0.00005 \
+      --chain.apply-deriv-weights false \
+      --chain.lm-opts="--num-extra-lm-states=2000" \
+      --egs.dir "$common_egs_dir" \
+      --egs.stage $get_egs_stage \
+      --egs.opts "--frames-overlap-per-eg 0" \
+      --egs.chunk-width $frames_per_eg \
+      --trainer.num-chunk-per-minibatch $minibatch_size \
+      --trainer.frames-per-iter 1500000 \
+      --trainer.num-epochs $num_epochs_s3 \
+      --trainer.optimization.num-jobs-initial $num_jobs_initial_s3 \
+      --trainer.optimization.num-jobs-final $num_jobs_final_s3 \
+      --trainer.optimization.initial-effective-lrate $initial_effective_lrate_s3 \
+      --trainer.optimization.final-effective-lrate $final_effective_lrate_s3 \
+      --trainer.max-param-change $max_param_change \
+      --cleanup.remove-egs $remove_egs \
+      --feat-dir data/${train_set}_hires \
+      --tree-dir $treedir \
+      --lat-dir exp/tri4_lats_nodup$suffix \
+      --dir $dir/stage3 || exit 1;
+
+  fi
+fi
+dir=$dir/stage2
+if [ $stage -le 14 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+if [ $stage -le 15 ]; then
+  iter_opts=
+  if [ ! -z $decode_iter ]; then
+    iter_opts=" --iter $decode_iter "
+  fi
+  for decode_set in train_dev eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+          --nj 50 --cmd "$decode_cmd" $iter_opts \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+          $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1;
+      if $has_fisher; then
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) &
+  done
+fi
+wait;
+exit 0;
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh
new file mode 100755
index 00000000000..cb86ffd71d6
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh
@@ -0,0 +1,411 @@
+#!/bin/bash
+# _7o is as _7n but it has an extra regressor-layer which maximizes the
+# weighted log-likelihood product of the two networks, where the weights
+# are the priors of each output class.
+# _7n is as _7i but it has a single hint-guide in the middle of the primary
+# and sibling networks.
+# _7i is as _7h but it uses a sibling network and multi-stage training to
+# transfer information from the larger network to the smaller network.
+# It uses multi-stage training to train the sibling network, which has fewer
+# parameters.
+# The 1st stage of training is the same as the baseline tdnn_7d and trains
+# the primary network.
+# The 2nd stage trains the sibling network using the regularizers at all
+# layers as objectives, and in the 3rd stage we train the sibling network
+# with the chain objective for 1 epoch.
+
+#System                  tdnn_7g     tdnn_7h
+#WER on train_dev(tg)    13.98       13.84
+#WER on train_dev(fg)    12.78       12.84
+#WER on eval2000(tg)     16.7        16.5
+#WER on eval2000(fg)     14.9        14.8
+#Final train prob        -0.0817467  -0.0889771
+#Final valid prob        -0.110475   -0.113102
+#Final train prob (xent) -1.20065    -1.2533
+#Final valid prob (xent) -1.3313     -1.36743
+#
+set -e
+
+# configs for 'chain'
+affix=
+stage=12
+multi_stage_train=1
+train_stage=-10
+get_egs_stage=-10
+speed_perturb=true
+dir=exp/chain/tdnn_7o  # Note: _sp will get added to this if $speed_perturb == true.
+decode_iter=
+chain_regularize=0.5
+num_epochs_s2=1
+num_epochs_s3=1
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+initial_effective_lrate_s2=0.001
+final_effective_lrate_s2=0.0001
+initial_effective_lrate_s3=0.0005
+final_effective_lrate_s3=0.00005
+leftmost_questions_truncate=-1
+max_param_change=2.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+num_jobs_initial_s2=3
+num_jobs_final_s2=16
+num_jobs_initial_s3=3
+num_jobs_final_s3=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+common_egs_dir=exp/chain/tdnn_7h_sp/egs
+xent_regularize=0.1
+src_mdl=exp/chain/tdnn_7h_sp/final.mdl
+regressor_prior_scale=-0.25
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <$lang/topo
+fi
+
+if [ $stage -le 11 ]; then
+  # Build a tree using our new topology.  This is the critically different
+  # step compared with other recipes.
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate $leftmost_questions_truncate \
+      --context-opts "--context-width=2 --central-position=1" \
+      --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then
+  echo "$0: creating neural net configs using the xconfig parser for the primary network";
+
+  num_targets=$(tree-info exp/chain/tri5_7d_tree_sp/tree |grep num-pdfs|awk '{print $2}')
+  learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python)
+
+  mkdir -p $dir/stage1/configs
+  cat <<EOF > $dir/stage1/configs/network.xconfig
+  input dim=100 name=ivector
+  input dim=40 name=input
+
+  # please note that it is important to have the input layer with name=input
+  # as the layer immediately preceding the fixed-affine-layer, to enable
+  # the use of the short notation for the descriptor
+  fixed-affine-layer name=lda input=Append(input@-1,input,input@1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/stage1/configs/lda.mat
+
+  # the first splicing is moved before the lda layer, so no splicing here
+  relu-renorm-layer name=tdnn1 dim=625
+  relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=625
+  relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=625
+  relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=625
+  relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=625
+  relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=625
+  relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=625
+
+  ## adding the layers for the chain branch
+  relu-renorm-layer name=prefinal-chain input=tdnn7 dim=625 target-rms=0.5
+  output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5
+
+  # adding the layers for the xent branch
+  # This block prints the configs for a separate output that will be
+  # trained with a cross-entropy objective in the 'chain' models... this
+  # has the effect of regularizing the hidden parts of the model.  we use
+  # 0.5 / args.xent_regularize as the learning rate factor- the factor of
+  # 0.5 / args.xent_regularize is suitable as it means the xent
+  # final-layer learns at a rate independent of the regularization
+  # constant; and the 0.5 was tuned so as to make the relative progress
+  # similar in the xent and regular final layers.
+  relu-renorm-layer name=prefinal-xent input=tdnn7 dim=625 target-rms=0.5
+  output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5
+
+EOF
+  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/stage1/configs/network.xconfig \
+    --config-dir $dir/stage1/configs/
+
+  echo "$0: creating neural net configs using the xconfig parser for the sibling network"
+  sibling_dim=300
+  primary_dim=625
+  regressor_lr_factor=1.0
+  regressor_scale=`echo $regressor_lr_factor $primary_dim | awk '{printf "%.8f \n", $1/$2}'`
+  prior_scale_factor=`echo $regressor_lr_factor $num_targets | awk '{printf "%.8f \n", $1/sqrt($2)}'`
+  regressor_scale_vec=""
+  negate_vec=""
+  for i in `seq $primary_dim`; do
+    regressor_scale_vec="$regressor_scale_vec $regressor_scale"
+  done
+  for i in `seq $num_targets`; do
+    negate_vec="$negate_vec -1.0"
+  done
+  mkdir -p $dir/stage2
+  cat <<EOF > $dir/stage2/regressor_scale.vec
+[ $regressor_scale_vec ]
+EOF
+  cat <<EOF > $dir/stage2/negate.vec
+[ $negate_vec ]
+EOF
+  mkdir -p $dir/stage2/configs
+  cat <<EOF > $dir/stage2/configs/network.xconfig
+
+  # the first splicing is moved before the lda layer, so no splicing here
+  relu-renorm-layer name=tdnn1-sibling input=lda dim=$sibling_dim
+  relu-renorm-layer name=tdnn2-sibling input=Append(-1,0,1) dim=$sibling_dim
+  relu-renorm-layer name=tdnn3-sibling input=Append(-1,0,1) dim=$sibling_dim
+  relu-renorm-layer name=tdnn4-sibling input=Append(-3,0,3) dim=$sibling_dim
+  relu-renorm-layer name=tdnn5-sibling input=Append(-3,0,3) dim=$sibling_dim
+  relu-renorm-layer name=tdnn6-sibling input=Append(-3,0,3) dim=$sibling_dim
+  relu-renorm-layer name=tdnn7-sibling input=Append(-3,0,3) dim=$sibling_dim
+
+  ## adding the layers for the chain branch
+  relu-renorm-layer name=prefinal-chain-sibling input=tdnn7-sibling dim=$sibling_dim target-rms=0.5
+  output-layer name=output-sibling include-log-softmax=false dim=$num_targets max-change=1.5
+
+  relu-renorm-layer name=prefinal-xent-sibling input=tdnn7-sibling dim=$sibling_dim target-rms=0.5
+  output-layer name=output-xent-sibling dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5
+
+  ## adding the regressor outputs to the sibling network configs.
+  relu-renorm-layer name=tdnn4-regressor input=tdnn4-sibling dim=$primary_dim
+  regressor-layer name=regressor-4 input1=tdnn4-regressor input2=tdnn4 objective-type=linear max-change=1.5 dim=$primary_dim regressor-scale-file=$dir/stage2/regressor_scale.vec supervision-type=unsupervised
+  ## adding a regressor on the log-likelihood outputs, to maximize the
+  ## weighted log-likelihood product of the two networks.
+  regressor-layer name=regressor-final input1=output-sibling.affine input2=output.affine objective-type=quadratic max-change=1.5 dim=$num_targets negate-file=$dir/stage2/negate.vec supervision-type=unsupervised regressor-scale-file=$dir/stage2/presoftmax_prior_scale.vec
+EOF
+  steps/nnet3/xconfig_to_configs.py --aux-xconfig-file $dir/stage1/configs/network.xconfig \
+    --xconfig-file $dir/stage2/configs/network.xconfig --config-dir $dir/stage2/configs/
+
+  # edits.config contains the edits required for the different stages of
+  # training.  It is applied to the 0.mdl generated at the
+  # prepare_initial_network stage in iter -1.
+  # The edits for the 2nd stage rename the primary network's outputs to
+  # *-primary, so that training does not use those outputs, and rename the
+  # sibling network's outputs to 'output' and 'output-xent'.
+  cat <<EOF > $dir/stage2/configs/edits.config
+rename-node old-name=output new-name=output-primary
+rename-node old-name=output-xent new-name=output-xent-primary
+rename-node old-name=output-sibling new-name=output
+rename-node old-name=output-xent-sibling new-name=output-xent
+EOF
+  # edits.config for the 3rd stage of training.
+  mkdir -p $dir/stage3/configs
+  cat <<EOF > $dir/stage3/configs/edits.config
+remove-output-nodes name=regressor*
+remove-output-nodes name=*-primary
+remove-orphans
+EOF
+  # we skip the add_compatibility stage in xconfig_to_configs.py;
+  # we copy vars from stage1 to stage2 and stage3 for now.
+  cp -r $dir/stage1/configs/vars $dir/stage2/configs/.
+  cp -r $dir/stage1/configs/vars $dir/stage3/configs/.
+fi
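For the regressor-final layer configured above, the script's header describes maximizing a weighted log-likelihood product of the two networks, with per-class weights derived from the output priors; note the prior_scale_factor = 1/sqrt(num_targets) computed earlier, and that the referenced presoftmax_prior_scale.vec is not created in this script. The regressor-layer internals are not part of this patch, so the following is only a loose, self-contained sketch of the stated objective under those assumptions, not the actual implementation:

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Assumed form of the regressor-final objective: a per-class weighted
    // quadratic match between the pre-softmax outputs a (sibling) and b
    // (primary), maximized when the two networks agree on the heavily
    // weighted classes.  'weights' would come from the class priors.
    double WeightedMatchObjf(const std::vector<double> &a,
                             const std::vector<double> &b,
                             const std::vector<double> &weights) {
      double objf = 0.0;
      for (size_t i = 0; i < a.size(); i++) {
        double diff = a[i] - b[i];
        objf += -0.5 * weights[i] * diff * diff;
      }
      return objf;
    }

    int main() {
      // Toy example with 4 classes; weights are priors scaled by
      // 1/sqrt(num_targets), mirroring prior_scale_factor in the script.
      int num_targets = 4;
      double scale = 1.0 / std::sqrt(static_cast<double>(num_targets));
      std::vector<double> priors = {0.4, 0.3, 0.2, 0.1}, weights;
      for (double p : priors) weights.push_back(scale * p);
      std::vector<double> a = {1.0, 0.5, -0.2, 0.1}, b = {0.8, 0.6, -0.1, 0.0};
      std::printf("objf = %.6f\n", WeightedMatchObjf(a, b, weights));
      return 0;
    }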
+if [ $stage -le 13 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+  if [ $multi_stage_train -le 0 ] && [ ! -f $src_mdl ]; then
+    echo "$0: Training the primary network"
+    steps/nnet3/chain/train.py --stage $train_stage \
+      --cmd "$decode_cmd" \
+      --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \
+      --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
+      --chain.xent-regularize $xent_regularize \
+      --chain.leaky-hmm-coefficient 0.1 \
+      --chain.l2-regularize 0.00005 \
+      --chain.apply-deriv-weights false \
+      --chain.lm-opts="--num-extra-lm-states=2000" \
+      --egs.dir "$common_egs_dir" \
+      --egs.stage $get_egs_stage \
+      --egs.opts "--frames-overlap-per-eg 0" \
+      --egs.chunk-width $frames_per_eg \
+      --trainer.num-chunk-per-minibatch $minibatch_size \
+      --trainer.frames-per-iter 1500000 \
+      --trainer.num-epochs $num_epochs \
+      --trainer.optimization.num-jobs-initial $num_jobs_initial \
+      --trainer.optimization.num-jobs-final $num_jobs_final \
+      --trainer.optimization.initial-effective-lrate $initial_effective_lrate \
+      --trainer.optimization.final-effective-lrate $final_effective_lrate \
+      --trainer.max-param-change $max_param_change \
+      --cleanup.remove-egs $remove_egs \
+      --feat-dir data/${train_set}_hires \
+      --tree-dir $treedir \
+      --lat-dir exp/tri4_lats_nodup$suffix \
+      --dir $dir/stage1 || exit 1;
+  fi
+
+  if [ $multi_stage_train -le 1 ]; then
+    mkdir -p $dir/stage2
+    echo "$0: copying the final primary network in $src_mdl to"
+    echo "$dir/stage2/init.raw, with all learning-rate factors set to zero,"
+    echo "to serve as the fixed primary network during sibling training."
+    nnet3-am-copy --raw=true \
+      --edits='set-learning-rate-factor name=* learning-rate-factor=0.0;' \
+      $src_mdl $dir/stage2/init.raw || exit 1;
+
+    echo "$0: Training the sibling network using the regularizer objectives."
+    steps/nnet3/chain/train.py --stage $train_stage \
+      --cmd "$decode_cmd" \
+      --init-raw-model $dir/stage2/init.raw \
+      --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \
+      --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
+      --chain.leaky-hmm-coefficient 0.1 \
+      --chain.l2-regularize 0.00005 \
+      --chain.chain-regularize $chain_regularize \
+      --chain.apply-deriv-weights false \
+      --chain.lm-opts="--num-extra-lm-states=2000" \
+      --egs.dir "$common_egs_dir" \
+      --egs.stage $get_egs_stage \
+      --egs.opts "--frames-overlap-per-eg 0" \
+      --egs.chunk-width $frames_per_eg \
+      --trainer.num-chunk-per-minibatch $minibatch_size \
+      --trainer.frames-per-iter 1500000 \
+      --trainer.num-epochs $num_epochs_s2 \
+      --trainer.optimization.num-jobs-initial $num_jobs_initial_s2 \
+      --trainer.optimization.num-jobs-final $num_jobs_final_s2 \
+      --trainer.optimization.initial-effective-lrate $initial_effective_lrate_s2 \
+      --trainer.optimization.final-effective-lrate $final_effective_lrate_s2 \
+      --trainer.presoftmax-prior-scale-power $regressor_prior_scale \
+      --trainer.prior-scale-factor $prior_scale_factor \
+      --trainer.max-param-change $max_param_change \
+      --cleanup.remove-egs false \
+      --feat-dir data/${train_set}_hires \
+      --tree-dir $treedir \
+      --lat-dir exp/tri4_lats_nodup$suffix \
+      --dir $dir/stage2 || exit 1;
+  fi
+  if [ $multi_stage_train -le 2 ]; then
+    cp $dir/stage2/den.fst $dir/stage3/.
+    echo "$0: removing the sibling network's regularizer outputs and renaming"
+    echo "its chain output, so that the sibling network is trained with the"
+    echo "chain objective."
+    echo "A teacher-student objective could be added in the future."
+ nnet3-am-copy --edits-config=$dir/stage3/configs/edits.config \ + $dir/stage2/final.mdl $dir/stage3/0.mdl || exit 1; + mkdir -p $dir/stage3/configs + train_stage_s3=0 + if [ $train_stage -gt $train_stage_s3 ]; then + train_stage_s3=$train_stage + fi + cp -r $dir/stage2/configs $dir/stage3/configs || exit 1; + steps/nnet3/chain/train.py --stage $train_stage_s3 \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_s3 \ + --trainer.optimization.num-jobs-initial $num_jobs_initial_s3 \ + --trainer.optimization.num-jobs-final $num_jobs_final_s3 \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate_s3 \ + --trainer.optimization.final-effective-lrate $final_effective_lrate_s3 \ + --trainer.max-param-change $max_param_change \ + --trainer.presoftmax-prior-scale-power $regressor_prior_scale \ + --trainer.prior-scale-factor $prior_scale_factor \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir/stage3 || exit 1; + + fi +fi +#dirs_for_decode="$dir/stage2 $dir/stage3" +dirs_for_decode="$dir/stage2" +#dirs_for_decode="$dir/stage3" +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + for dset in $dirs_for_decode; do + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dset $dset/graph_sw1_tg + done +fi + +decode_suff=sw1_tg +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for dset in $dirs_for_decode; do + graph_dir=$dset/graph_sw1_tg + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dset/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dset/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done + done +fi +wait; +exit 0;