From 715f219415ee4e36403f354f8f1b86a0b5327d4d Mon Sep 17 00:00:00 2001
From: Pegita
Date: Tue, 13 Feb 2018 18:06:30 -0500
Subject: [PATCH 1/6] added new functions to accept NnetExample in
 nnet-chain-training.cc.
---
 src/chain/chain-supervision.cc | 31 ++++++
 src/chain/chain-supervision.h | 4 +
 src/chain/chain-training.cc | 51 ++++++++++
 src/chain/chain-training.h | 12 ++-
 src/chainbin/Makefile | 2 +-
 src/chainbin/nnet3-chain-get-egs.cc | 4 +-
 src/latbin/lattice-1best.cc | 6 +-
 src/latbin/lattice-to-fst.cc | 150 +++++++++++++++++++++++-----
 src/nnet3/nnet-chain-training.cc | 116 ++++++++++++++++++++-
 src/nnet3/nnet-chain-training.h | 10 ++
 src/nnet3/nnet-example-utils.cc | 50 +++++++++-
 src/nnet3/nnet-example-utils.h | 19 ++++
 src/nnet3bin/nnet3-get-egs.cc | 6 +-
 13 files changed, 422 insertions(+), 39 deletions(-)

diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc
index b5597b15667..7d87201dfdd 100644
--- a/src/chain/chain-supervision.cc
+++ b/src/chain/chain-supervision.cc
@@ -650,6 +650,37 @@ void AppendSupervision(const std::vector<const Supervision*> &input,
   }
 }

+bool AddWeightToFst(const fst::StdVectorFst &normalization_fst,
+                    fst::StdVectorFst *supervision_fst) {
+  // remove epsilons before composing. 'normalization_fst' has no epsilons so
+  // the composed result will be epsilon-free.
+  fst::StdVectorFst supervision_fst_noeps(*supervision_fst);
+  fst::RmEpsilon(&supervision_fst_noeps);
+  if (!TryDeterminizeMinimize(kSupervisionMaxStates,
+                              &supervision_fst_noeps))
+    return false;
+
+  // note: by default, 'Compose' will call 'Connect', so if the
+  // resulting FST is not connected, it will end up empty.
+  fst::StdVectorFst composed_fst;
+  fst::Compose(supervision_fst_noeps, normalization_fst,
+               &composed_fst);
+  if (composed_fst.NumStates() == 0)
+    return false;
+  // projection should not be necessary, as both FSTs are acceptors.
+  // determinize and minimize to make it as compact as possible.
+
+  if (!TryDeterminizeMinimize(kSupervisionMaxStates,
+                              &composed_fst))
+    return false;
+  *supervision_fst = composed_fst;
+  // Make sure the states are numbered in increasing order of time.
+  SortBreadthFirstSearch(supervision_fst);
+  KALDI_ASSERT(supervision_fst->Properties(fst::kAcceptor, true) == fst::kAcceptor);
+  KALDI_ASSERT(supervision_fst->Properties(fst::kIEpsilons, true) == 0);
+  return true;
+}
+
 bool AddWeightToSupervisionFst(const fst::StdVectorFst &normalization_fst,
                                Supervision *supervision) {
   // remove epsilons before composing.
'normalization_fst' has no epsilons so
diff --git a/src/chain/chain-supervision.h b/src/chain/chain-supervision.h
index a94f68ade90..c54d4770aa0 100644
--- a/src/chain/chain-supervision.h
+++ b/src/chain/chain-supervision.h
@@ -323,6 +323,10 @@ class SupervisionSplitter {
 bool AddWeightToSupervisionFst(const fst::StdVectorFst &normalization_fst,
                                Supervision *supervision);
+
+bool AddWeightToFst(const fst::StdVectorFst &normalization_fst,
+                    fst::StdVectorFst *supervision_fst);
+
 /// Assuming the 'fst' is epsilon-free, connected, and has the property that all
 /// paths from the start-state are of the same length, output a vector
 /// containing that length (from the start-state to the current state) to
diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc
index 53de69a0e07..40108636da0 100644
--- a/src/chain/chain-training.cc
+++ b/src/chain/chain-training.cc
@@ -25,6 +25,57 @@ namespace kaldi {
 namespace chain {

+void ComputeObjfAndDeriv2(const ChainTrainingOptions &opts,
+                          const DenominatorGraph &den_graph,
+                          const GeneralMatrix &supervision,
+                          const CuMatrixBase<BaseFloat> &nnet_output,
+                          BaseFloat *objf,
+                          BaseFloat *l2_term,
+                          BaseFloat *weight,
+                          CuMatrixBase<BaseFloat> *nnet_output_deriv,
+                          CuMatrixBase<BaseFloat> *xent_output_deriv) {
+  if (nnet_output_deriv) {
+    nnet_output_deriv->SetZero();
+    nnet_output_deriv->CopyFromMat(supervision.GetFullMatrix());
+    if (xent_output_deriv)
+      xent_output_deriv->CopyFromMat(*nnet_output_deriv);
+  } else if (xent_output_deriv) {
+    // this branch will be taken if xent_output_deriv but not
+    // nnet_output_deriv is set, which could happen if you want to compute the
+    // cross-entropy objective but not the derivatives.
+    xent_output_deriv->SetZero();
+    xent_output_deriv->CopyFromMat(supervision.GetFullMatrix());
+  }
+  int32 num_sequences = 64,
+      frames_per_sequence = 150;
+  BaseFloat sup_weight = 1.0;
+  DenominatorComputation denominator(opts, den_graph,
+                                     num_sequences,
+                                     nnet_output);
+  BaseFloat den_logprob = denominator.Forward();
+  bool ok = true;
+  if (nnet_output_deriv)
+    ok = denominator.Backward(-sup_weight, nnet_output_deriv);
+  // we don't consider the log-prob w.r.t. the numerator.
+  *objf = -sup_weight * den_logprob;
+  *weight = sup_weight * num_sequences * frames_per_sequence;
+
+  if (!((*objf) - (*objf) == 0) || !ok) {
+    // inf or NaN detected, or denominator computation returned false.
+ if (nnet_output_deriv) + nnet_output_deriv->SetZero(); + if (xent_output_deriv) + xent_output_deriv->SetZero(); + BaseFloat default_objf = -10; + KALDI_WARN << "Objective function is " << (*objf) + << " and denominator computation (if done) returned " + << std::boolalpha << ok + << ", setting objective function to " << default_objf + << " per frame."; + *objf = default_objf * *weight; + } +} + void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, const DenominatorGraph &den_graph, const Supervision &supervision, diff --git a/src/chain/chain-training.h b/src/chain/chain-training.h index e6143d10846..8c276a4854f 100644 --- a/src/chain/chain-training.h +++ b/src/chain/chain-training.h @@ -63,7 +63,7 @@ struct ChainTrainingOptions { ChainTrainingOptions(): l2_regularize(0.0), leaky_hmm_coefficient(1.0e-05), xent_regularize(0.0) { } - + void Register(OptionsItf *opts) { opts->Register("l2-regularize", &l2_regularize, "l2 regularization " "constant for 'chain' training, applied to the output " @@ -121,8 +121,16 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, BaseFloat *weight, CuMatrixBase *nnet_output_deriv, CuMatrixBase *xent_output_deriv = NULL); - +void ComputeObjfAndDeriv2(const ChainTrainingOptions &opts, + const DenominatorGraph &den_graph, + const GeneralMatrix &supervision, + const CuMatrixBase &nnet_output, + BaseFloat *objf, + BaseFloat *l2_term, + BaseFloat *weight, + CuMatrixBase *nnet_output_deriv, + CuMatrixBase *xent_output_deriv = NULL); } // namespace chain } // namespace kaldi diff --git a/src/chainbin/Makefile b/src/chainbin/Makefile index 096040000eb..2ee87d7ec33 100644 --- a/src/chainbin/Makefile +++ b/src/chainbin/Makefile @@ -7,7 +7,7 @@ LDFLAGS += $(CUDA_LDFLAGS) LDLIBS += $(CUDA_LDLIBS) BINFILES = chain-est-phone-lm chain-get-supervision chain-make-den-fst \ - nnet3-chain-get-egs nnet3-chain-copy-egs nnet3-chain-merge-egs \ + nnet3-chain-get-egs nnet3-chain-get-egs-post nnet3-chain-copy-egs nnet3-chain-merge-egs \ nnet3-chain-shuffle-egs nnet3-chain-subset-egs \ nnet3-chain-acc-lda-stats nnet3-chain-train nnet3-chain-compute-prob \ nnet3-chain-combine nnet3-chain-normalize-egs diff --git a/src/chainbin/nnet3-chain-get-egs.cc b/src/chainbin/nnet3-chain-get-egs.cc index c8c251900ec..206921771c8 100644 --- a/src/chainbin/nnet3-chain-get-egs.cc +++ b/src/chainbin/nnet3-chain-get-egs.cc @@ -42,6 +42,7 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, const GeneralMatrix &feats, const MatrixBase *ivector_feats, int32 ivector_period, + const Lattice &lattice, const chain::Supervision &supervision, const std::string &utt_id, bool compress, @@ -278,12 +279,13 @@ int main(int argc, char *argv[]) { num_err++; continue; } - + /* if (!ProcessFile(normalization_fst, feats, online_ivector_feats, online_ivector_period, supervision, key, compress, &utt_splitter, &example_writer)) num_err++; + */ } } if (num_err > 0) diff --git a/src/latbin/lattice-1best.cc b/src/latbin/lattice-1best.cc index f6723687790..f325cb3016e 100644 --- a/src/latbin/lattice-1best.cc +++ b/src/latbin/lattice-1best.cc @@ -61,9 +61,9 @@ int main(int argc, char *argv[]) { lats_wspecifier = po.GetArg(2); SequentialCompactLatticeReader clat_reader(lats_rspecifier); - + // Write as compact lattice. 
- CompactLatticeWriter compact_1best_writer(lats_wspecifier); + CompactLatticeWriter compact_1best_writer(lats_wspecifier); int32 n_done = 0, n_err = 0; @@ -77,7 +77,7 @@ int main(int argc, char *argv[]) { CompactLattice best_path; CompactLatticeShortestPath(clat, &best_path); - + if (best_path.Start() == fst::kNoStateId) { KALDI_WARN << "Possibly empty lattice for utterance-id " << key << "(no output)"; diff --git a/src/latbin/lattice-to-fst.cc b/src/latbin/lattice-to-fst.cc index 0d2ac29a99b..19f8bf453c1 100644 --- a/src/latbin/lattice-to-fst.cc +++ b/src/latbin/lattice-to-fst.cc @@ -22,6 +22,50 @@ #include "util/common-utils.h" #include "fstext/fstext-lib.h" #include "lat/kaldi-lattice.h" +#include "hmm/transition-model.h" + +namespace kaldi { + +void ConvertLatticeToPdfLabels( + const TransitionModel &tmodel, + const Lattice &ifst, + fst::StdVectorFst *ofst) { + typedef fst::ArcTpl ArcIn; + typedef fst::StdArc ArcOut; + typedef ArcIn::StateId StateId; + ofst->DeleteStates(); + // The states will be numbered exactly the same as the original FST. + // Add the states to the new FST. + StateId num_states = ifst.NumStates(); + for (StateId s = 0; s < num_states; s++) { + StateId news = ofst->AddState(); + assert(news == s); + } + ofst->SetStart(ifst.Start()); + for (StateId s = 0; s < num_states; s++) { + LatticeWeight final_iweight = ifst.Final(s); + if (final_iweight != LatticeWeight::Zero()) { + fst::TropicalWeight final_oweight; + ConvertLatticeWeight(final_iweight, &final_oweight); + ofst->SetFinal(s, final_oweight); + } + for (fst::ArcIterator iter(ifst, s); + !iter.Done(); + iter.Next()) { + ArcIn arc = iter.Value(); + KALDI_PARANOID_ASSERT(arc.weight != LatticeWeight::Zero()); + ArcOut oarc; + ConvertLatticeWeight(arc.weight, &oarc.weight); + oarc.ilabel = tmodel.TransitionIdToPdf(arc.ilabel) + 1; + oarc.olabel = arc.olabel; + oarc.nextstate = arc.nextstate; + ofst->AddArc(s, oarc); + } + } +} + +} + int main(int argc, char *argv[]) { try { @@ -34,20 +78,33 @@ int main(int argc, char *argv[]) { using std::vector; BaseFloat acoustic_scale = 0.0; BaseFloat lm_scale = 0.0; - bool rm_eps = true; - + bool rm_eps = true, read_compact = true, convert_to_pdf_labels = false; + std::string trans_model; + bool project_input = false, project_output = true; + const char *usage = "Turn lattices into normal FSTs, retaining only the word labels\n" "By default, removes all weights and also epsilons (configure with\n" "with --acoustic-scale, --lm-scale and --rm-eps)\n" "Usage: lattice-to-fst [options] lattice-rspecifier fsts-wspecifier\n" " e.g.: lattice-to-fst ark:1.lats ark:1.fsts\n"; - + ParseOptions po(usage); + po.Register("read-compact", &read_compact, "Read compact lattice"); po.Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic likelihoods"); po.Register("lm-scale", &lm_scale, "Scaling factor for graph/lm costs"); po.Register("rm-eps", &rm_eps, "Remove epsilons in resulting FSTs (in lazy way; may not remove all)"); - + po.Register("convert-to-pdf-labels", &convert_to_pdf_labels, + "Convert lattice to pdf labels"); + po.Register("trans-model", &trans_model, + "Transition model"); + po.Register("project-input", &project_input, + "Project to input labels (transition-ids); applicable only " + "when --read-compact=false"); + po.Register("project-output", &project_output, + "Project to output labels (transition-ids); applicable only " + "when --read-compact=false"); + po.Read(argc, argv); if (po.NumArgs() != 2) { @@ -56,35 +113,74 @@ int main(int argc, char *argv[]) { } vector 
> scale = fst::LatticeScale(lm_scale, acoustic_scale); - + std::string lats_rspecifier = po.GetArg(1), fsts_wspecifier = po.GetArg(2); - - SequentialCompactLatticeReader lattice_reader(lats_rspecifier); + + TransitionModel tmodel; + if (!trans_model.empty()) { + ReadKaldiObject(trans_model, &tmodel); + } + + SequentialCompactLatticeReader compact_lattice_reader; + SequentialLatticeReader lattice_reader; + TableWriter fst_writer(fsts_wspecifier); - + int32 n_done = 0; // there is no failure mode, barring a crash. - for (; !lattice_reader.Done(); lattice_reader.Next()) { - std::string key = lattice_reader.Key(); - CompactLattice clat = lattice_reader.Value(); - lattice_reader.FreeCurrent(); - ScaleLattice(scale, &clat); // typically scales to zero. - RemoveAlignmentsFromCompactLattice(&clat); // remove the alignments... - fst::VectorFst fst; - { - Lattice lat; - ConvertLattice(clat, &lat); // convert to non-compact form.. won't introduce - // extra states because already removed alignments. - ConvertLattice(lat, &fst); // this adds up the (lm,acoustic) costs to get - // the normal (tropical) costs. - Project(&fst, fst::PROJECT_OUTPUT); // Because in the standard Lattice format, - // the words are on the output, and we want the word labels. + if (read_compact) { + SequentialCompactLatticeReader compact_lattice_reader(lats_rspecifier); + for (; !compact_lattice_reader.Done(); compact_lattice_reader.Next()) { + std::string key = compact_lattice_reader.Key(); + CompactLattice clat = compact_lattice_reader.Value(); + compact_lattice_reader.FreeCurrent(); + ScaleLattice(scale, &clat); // typically scales to zero. + RemoveAlignmentsFromCompactLattice(&clat); // remove the alignments... + fst::VectorFst fst; + { + Lattice lat; + ConvertLattice(clat, &lat); // convert to non-compact form.. won't introduce + // extra states because already removed alignments. + + if (convert_to_pdf_labels) { + ConvertLatticeToPdfLabels(tmodel, lat, &fst); // this adds up the (lm,acoustic) costs to get + // the normal (tropical) costs. + } else { + ConvertLattice(lat, &fst); + } + + Project(&fst, fst::PROJECT_OUTPUT); // Because in the standard compact_lattice format, + // the words are on the output, and we want the word labels. + } + if (rm_eps) RemoveEpsLocal(&fst); + + fst_writer.Write(key, fst); + n_done++; + } + } else { + SequentialLatticeReader lattice_reader(lats_rspecifier); + for (; !lattice_reader.Done(); lattice_reader.Next()) { + std::string key = lattice_reader.Key(); + Lattice lat = lattice_reader.Value(); + lattice_reader.FreeCurrent(); + ScaleLattice(scale, &lat); // typically scales to zero. + fst::VectorFst fst; + if (convert_to_pdf_labels) { + ConvertLatticeToPdfLabels(tmodel, lat, &fst); + } else { + ConvertLattice(lat, &fst); + } + if (project_input) + Project(&fst, fst::PROJECT_INPUT); + else if (project_output) + Project(&fst, fst::PROJECT_OUTPUT); + if (rm_eps) RemoveEpsLocal(&fst); + + fst_writer.Write(key, fst); + n_done++; } - if (rm_eps) RemoveEpsLocal(&fst); - - fst_writer.Write(key, fst); - n_done++; + } KALDI_LOG << "Done converting " << n_done << " lattices to word-level FSTs"; return (n_done != 0 ? 
0 : 1); diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index 780a7115a8a..4c799ea96c3 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -57,6 +57,22 @@ NnetChainTrainer::NnetChainTrainer(const NnetChainTrainingOptions &opts, } } +void NnetChainTrainer::Train(const NnetExample &eg) { + bool need_model_derivative = true; + const NnetTrainerOptions &nnet_config = opts_.nnet_config; + bool use_xent_regularization = (opts_.chain_config.xent_regularize != 0.0); + ComputationRequest request; + GetComputationRequest(*nnet_, eg, need_model_derivative, + nnet_config.store_component_stats, + use_xent_regularization, need_model_derivative, + &request); + const NnetComputation *computation = compiler_.Compile(request); + + // conventional training + TrainInternal(eg, *computation); + + num_minibatches_processed_++; +} void NnetChainTrainer::Train(const NnetChainExample &chain_eg) { bool need_model_derivative = true; @@ -91,6 +107,41 @@ void NnetChainTrainer::Train(const NnetChainExample &chain_eg) { num_minibatches_processed_++; } +void NnetChainTrainer::TrainInternal(const NnetExample &eg, + const NnetComputation &computation) { + const NnetTrainerOptions &nnet_config = opts_.nnet_config; + NnetComputer computer(nnet_config.compute_config, computation, + *nnet_, delta_nnet_); + // give the inputs to the computer object + computer.AcceptInputs(*nnet_, eg.io); + computer.Run(); + + this->ProcessOutputs(eg, &computer); + computer.Run(); + + // If relevant, add in the part of the gradient that comes from L2 + // regularization. + ApplyL2Regularization(*nnet_, + GetNumNvalues(eg.io, false) * + nnet_config.l2_regularize_factor, + delta_nnet_); + + // Updates the parameters of nnet + bool success = UpdateNnetWithMaxChange(*delta_nnet_, + nnet_config.max_param_change, 1.0, 1.0 - nnet_config.momentum, nnet_, + &num_max_change_per_component_applied_, &num_max_change_global_applied_); + + // Scale down the batchnorm stats (keeps them fresh... this affects what + // happens when we use the model with batchnorm test-mode set). + ScaleBatchnormStats(nnet_config.batchnorm_stats_scale, nnet_); + + // Scale delta_nnet + if (success) + ScaleNnet(nnet_config.momentum, delta_nnet_); + else + ScaleNnet(0.0, delta_nnet_); +} + void NnetChainTrainer::TrainInternal(const NnetChainExample &eg, const NnetComputation &computation) { const NnetTrainerOptions &nnet_config = opts_.nnet_config; @@ -170,6 +221,69 @@ void NnetChainTrainer::TrainInternalBackstitch(const NnetChainExample &eg, ScaleNnet(0.0, delta_nnet_); } +void NnetChainTrainer::ProcessOutputs(const NnetExample &eg, + NnetComputer *computer) { + std::vector::const_iterator iter = eg.io.begin(), + end = eg.io.end(); + for (; iter != end; ++iter) { + const NnetIo &io = *iter; + int32 node_index = nnet_->GetNodeIndex(io.name); + KALDI_ASSERT(node_index >= 0); + if (nnet_->IsOutputNode(node_index)) { + const CuMatrixBase &nnet_output = computer->GetOutput(io.name); + CuMatrix nnet_output_deriv(nnet_output.NumRows(), + nnet_output.NumCols(), + kUndefined); + bool use_xent = (opts_.chain_config.xent_regularize != 0.0); + std::string xent_name = io.name + "-xent"; // typically "output-xent". 
+      CuMatrix<BaseFloat> xent_deriv;
+      if (use_xent)
+        xent_deriv.Resize(nnet_output.NumRows(), nnet_output.NumCols(),
+                          kUndefined);
+
+      BaseFloat tot_objf, tot_l2_term, tot_weight;
+
+      ComputeObjfAndDeriv2(opts_.chain_config, den_graph_,
+                           io.features,
+                           nnet_output,
+                           &tot_objf, &tot_l2_term, &tot_weight,
+                           &nnet_output_deriv,
+                           (use_xent ? &xent_deriv : NULL));
+      if (use_xent) {
+        // this block computes the cross-entropy objective.
+        const CuMatrixBase<BaseFloat> &xent_output = computer->GetOutput(
+            xent_name);
+        // at this point, xent_deriv is posteriors derived from the numerator
+        // computation. note, xent_objf has a factor of '.supervision.weight'
+        CuMatrix<BaseFloat> cu_post(io.features.GetFullMatrix());
+        BaseFloat xent_objf = TraceMatMat(xent_output, cu_post, kTrans);
+        objf_info_[xent_name].UpdateStats(xent_name,
+                                          opts_.nnet_config.print_interval,
+                                          num_minibatches_processed_,
+                                          tot_weight, xent_objf);
+      }
+
+      //if (opts_.apply_deriv_weights && sup.deriv_weights.Dim() != 0) {
+      if (opts_.apply_deriv_weights) {
+        CuVector<BaseFloat> cu_deriv_weights;
+        nnet_output_deriv.MulRowsVec(cu_deriv_weights);
+        if (use_xent)
+          xent_deriv.MulRowsVec(cu_deriv_weights);
+      }
+      computer->AcceptInput(io.name, &nnet_output_deriv);
+
+      objf_info_[io.name].UpdateStats(io.name,
+                                      opts_.nnet_config.print_interval,
+                                      num_minibatches_processed_,
+                                      tot_weight, tot_objf, tot_l2_term);
+      if (use_xent) {
+        xent_deriv.Scale(opts_.chain_config.xent_regularize);
+        computer->AcceptInput(xent_name, &xent_deriv);
+      }
+    }
+  }
+}
+
 void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2,
                                       const NnetChainExample &eg,
                                       NnetComputer *computer) {
@@ -214,7 +328,7 @@ void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2,
       // at this point, xent_deriv is posteriors derived from the numerator
       // computation. note, xent_objf has a factor of '.supervision.weight'
       BaseFloat xent_objf = TraceMatMat(xent_output, xent_deriv, kTrans);
-      objf_info_[xent_name + suffix].UpdateStats(xent_name + suffix,
+      objf_info_[xent_name].UpdateStats(xent_name,
                                         opts_.nnet_config.print_interval,
                                         num_minibatches_processed_,
                                         tot_weight, xent_objf);
diff --git a/src/nnet3/nnet-chain-training.h b/src/nnet3/nnet-chain-training.h
index 5bf6a3f6fce..6e9bbe57ef1 100644
--- a/src/nnet3/nnet-chain-training.h
+++ b/src/nnet3/nnet-chain-training.h
@@ -61,6 +61,9 @@ class NnetChainTrainer {
   // train on one minibatch.
   void Train(const NnetChainExample &eg);
+
+  // train on one minibatch using NnetExample
+  void Train(const NnetExample &eg);
+
   // Prints out the final stats, and return true if there was a nonzero count.
   bool PrintTotalStats() const;
@@ -74,6 +77,10 @@ class NnetChainTrainer {
   void TrainInternal(const NnetChainExample &eg,
                      const NnetComputation &computation);
+
+  // The internal function for doing one step of conventional SGD training.
+  void TrainInternal(const NnetExample &eg,
+                     const NnetComputation &computation);
+
   // The internal function for doing one step of backstitch training. Depending
   // on whether is_backstitch_step1 is true, it could be either the first
   // (backward) step, or the second (forward) step of backstitch.
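(A rough sketch, not part of this patch, of how the new NnetExample-based path is intended to be driven, assuming an already-constructed NnetChainTrainer named 'trainer' and merged egs; the reader and rspecifier names here are illustrative:

    SequentialNnetExampleReader example_reader(examples_rspecifier);
    for (; !example_reader.Done(); example_reader.Next())
      trainer.Train(example_reader.Value());  // dispatches to the new NnetExample overload
    trainer.PrintTotalStats();  // prints the accumulated objective stats
)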
@@ -84,6 +91,9 @@ class NnetChainTrainer {
   void ProcessOutputs(bool is_backstitch_step2, const NnetChainExample &eg,
                       NnetComputer *computer);
+
+  void ProcessOutputs(const NnetExample &eg,
+                      NnetComputer *computer);
+
   const NnetChainTrainingOptions opts_;

   chain::DenominatorGraph den_graph_;
diff --git a/src/nnet3/nnet-example-utils.cc b/src/nnet3/nnet-example-utils.cc
index 65df0c891c1..2151e06bbb4 100644
--- a/src/nnet3/nnet-example-utils.cc
+++ b/src/nnet3/nnet-example-utils.cc
@@ -198,6 +198,54 @@ void ShiftExampleTimes(int32 t_offset,
     }
   }
 }
+void GetComputationRequest(const Nnet &nnet,
+                           const NnetExample &eg,
+                           bool need_model_derivative,
+                           bool store_component_stats,
+                           bool use_xent_regularization,
+                           bool use_xent_derivative,
+                           ComputationRequest *request) {
+  request->inputs.clear();
+  request->inputs.reserve(eg.io.size());
+  request->outputs.clear();
+  request->outputs.reserve(eg.io.size() * 2);
+  request->need_model_derivative = need_model_derivative;
+  request->store_component_stats = store_component_stats;
+  for (size_t i = 0; i < eg.io.size(); i++) {
+    const NnetIo &io = eg.io[i];
+    const std::string &name = io.name;
+    int32 node_index = nnet.GetNodeIndex(name);
+    if (node_index == -1 ||
+        (!nnet.IsInputNode(node_index) && !nnet.IsOutputNode(node_index)))
+      KALDI_ERR << "Nnet example has input or output named '" << name
+                << "', but no such input or output node is in the network.";
+
+    std::vector<IoSpecification> &dest =
+        nnet.IsInputNode(node_index) ? request->inputs : request->outputs;
+    dest.resize(dest.size() + 1);
+    IoSpecification &io_spec = dest.back();
+    io_spec.name = name;
+    io_spec.indexes = io.indexes;
+    io_spec.has_deriv = nnet.IsOutputNode(node_index) && need_model_derivative;
+    if (use_xent_regularization && nnet.IsOutputNode(node_index)) {
+      size_t cur_size = request->outputs.size();
+      request->outputs.resize(cur_size + 1);
+      IoSpecification &io_spec = request->outputs[cur_size - 1],
+          &io_spec_xent = request->outputs[cur_size];
+      // the IoSpecification for the -xent output is the same
+      // as for the regular output, except for its name which has
+      // the -xent suffix (and the has_deriv member may differ).
+      io_spec_xent = io_spec;
+      io_spec_xent.name = name + "-xent";
+      io_spec_xent.has_deriv = use_xent_derivative;
+    }
+  }
+  // check to see if something went wrong.
+  if (request->inputs.empty())
+    KALDI_ERR << "No inputs in computation request.";
+  if (request->outputs.empty())
+    KALDI_ERR << "No outputs in computation request.";
+}

 void GetComputationRequest(const Nnet &nnet,
                            const NnetExample &eg,
@@ -207,7 +255,7 @@ void GetComputationRequest(const Nnet &nnet,
   request->inputs.clear();
   request->inputs.reserve(eg.io.size());
   request->outputs.clear();
-  request->outputs.reserve(eg.io.size());
+  request->outputs.reserve(eg.io.size() * 2);
   request->need_model_derivative = need_model_derivative;
   request->store_component_stats = store_component_stats;
   for (size_t i = 0; i < eg.io.size(); i++) {
diff --git a/src/nnet3/nnet-example-utils.h b/src/nnet3/nnet-example-utils.h
index 02620df7485..05f35fb44de 100644
--- a/src/nnet3/nnet-example-utils.h
+++ b/src/nnet3/nnet-example-utils.h
@@ -64,6 +64,25 @@ void GetComputationRequest(const Nnet &nnet,
                            ComputationRequest *computation_request);

+/** This function takes an NnetExample and produces a ComputationRequest.
+   It assumes you don't want the derivatives w.r.t. the input.
+
+   If use_xent_regularization == true, then it assumes that for each output
+   name (e.g.
"output" in the eg, there is another output with the same + dimension and with the suffix "-xent" on its name, e.g. named + "output-xent". The derivative w.r.t. the xent objective will only be + supplied to the nnet computation if 'use_xent_derivative' is true (we + propagate back the xent derivative to the model only in training, not in + model-combination in nnet3-chain-combine). +*/ +void GetComputationRequest(const Nnet &nnet, + const NnetExample &eg, + bool need_model_derivative, + bool store_component_stats, + bool use_xent_regularization, + bool use_xent_derivative, + ComputationRequest *computation_request); + // Writes as unsigned char a vector 'vec' that is required to have // values between 0 and 1. void WriteVectorAsChar(std::ostream &os, diff --git a/src/nnet3bin/nnet3-get-egs.cc b/src/nnet3bin/nnet3-get-egs.cc index cec9549541d..de7904a8d6c 100644 --- a/src/nnet3bin/nnet3-get-egs.cc +++ b/src/nnet3bin/nnet3-get-egs.cc @@ -166,7 +166,7 @@ int main(int argc, char *argv[]) { bool compress = true; int32 num_pdfs = -1, length_tolerance = 100, - targets_length_tolerance = 2, + targets_length_tolerance = 2, online_ivector_period = 1; ExampleGenerationConfig eg_config; // controls num-frames, @@ -192,7 +192,7 @@ int main(int argc, char *argv[]) { "--online-ivectors option"); po.Register("length-tolerance", &length_tolerance, "Tolerance for " "difference in num-frames between feat and ivector matrices"); - po.Register("targets-length-tolerance", &targets_length_tolerance, + po.Register("targets-length-tolerance", &targets_length_tolerance, "Tolerance for " "difference in num-frames (after subsampling) between " "feature matrix and posterior"); @@ -260,7 +260,7 @@ int main(int argc, char *argv[]) { } if (!ProcessFile(feats, online_ivector_feats, online_ivector_period, - pdf_post, key, compress, num_pdfs, + pdf_post, key, compress, num_pdfs, targets_length_tolerance, &utt_splitter, &example_writer)) num_err++; From ae22eece2f90d9d2d7ed4a37f7d92e17ca7b063e Mon Sep 17 00:00:00 2001 From: Pegita Date: Thu, 22 Feb 2018 16:40:51 -0500 Subject: [PATCH 2/6] fixed issues w.r.t comments (part 1). 
---
 src/chain/chain-training.cc | 3 +-
 src/chain/chain-training.h | 6 +-
 src/chainbin/nnet3-chain-get-egs-post.cc | 397 +++++++++++++++++++++++
 src/nnet3/nnet-chain-training.cc | 8 +-
 src/nnet3/nnet-example-utils.cc | 41 +--
 src/nnet3/nnet-example-utils.h | 16 +-
 6 files changed, 413 insertions(+), 58 deletions(-)
 create mode 100644 src/chainbin/nnet3-chain-get-egs-post.cc

diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc
index 40108636da0..38c72efe057 100644
--- a/src/chain/chain-training.cc
+++ b/src/chain/chain-training.cc
@@ -29,6 +29,7 @@ void ComputeObjfAndDeriv2(const ChainTrainingOptions &opts,
                           const DenominatorGraph &den_graph,
                           const GeneralMatrix &supervision,
                           const CuMatrixBase<BaseFloat> &nnet_output,
+                          int32 num_sequences, int32 frames_per_sequence,
                           BaseFloat *objf,
                           BaseFloat *l2_term,
                           BaseFloat *weight,
@@ -46,8 +47,6 @@ void ComputeObjfAndDeriv2(const ChainTrainingOptions &opts,
     xent_output_deriv->SetZero();
     xent_output_deriv->CopyFromMat(supervision.GetFullMatrix());
   }
-  int32 num_sequences = 64,
-      frames_per_sequence = 150;
   BaseFloat sup_weight = 1.0;
   DenominatorComputation denominator(opts, den_graph,
                                      num_sequences,
diff --git a/src/chain/chain-training.h b/src/chain/chain-training.h
index 8c276a4854f..5b9f43e04e8 100644
--- a/src/chain/chain-training.h
+++ b/src/chain/chain-training.h
@@ -121,11 +121,15 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts,
                               BaseFloat *weight,
                               CuMatrixBase<BaseFloat> *nnet_output_deriv,
                               CuMatrixBase<BaseFloat> *xent_output_deriv = NULL);
-
+/**
+   This function uses the supervision as the numerator and does the denominator
+   computation. It can be used where the numerator is fixed, e.g. in
+   teacher-student (TS) learning.
+*/
 void ComputeObjfAndDeriv2(const ChainTrainingOptions &opts,
                           const DenominatorGraph &den_graph,
                           const GeneralMatrix &supervision,
                           const CuMatrixBase<BaseFloat> &nnet_output,
+                          int32 num_sequences, int32 frames_per_sequence,
                           BaseFloat *objf,
                           BaseFloat *l2_term,
                           BaseFloat *weight,
diff --git a/src/chainbin/nnet3-chain-get-egs-post.cc b/src/chainbin/nnet3-chain-get-egs-post.cc
new file mode 100644
index 00000000000..9aa0eba0fb8
--- /dev/null
+++ b/src/chainbin/nnet3-chain-get-egs-post.cc
@@ -0,0 +1,397 @@
+// chainbin/nnet3-chain-get-egs-post.cc
+
+// Copyright 2015 Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sstream>
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "hmm/transition-model.h"
+#include "hmm/posterior.h"
+#include "nnet3/nnet-example.h"
+#include "nnet3/nnet-chain-example.h"
+#include "nnet3/nnet-example-utils.h"
+#include "lat/lattice-functions.h"
+#include "chain/chain-supervision.h"
+
+namespace kaldi {
+namespace nnet3 {
+
+/** This function scales the arc weights of an FST.
+*/
+void ScaleFst(BaseFloat scale,
+              fst::StdVectorFst *fst) {
+  typedef fst::StdArc StdArc;
+  typedef fst::StdArc::Weight Weight;
+  int32 num_states = fst->NumStates();
+  for (int32 s = 0; s < num_states; s++) {
+    for (fst::MutableArcIterator<fst::StdVectorFst> iter(fst, s);
+         !iter.Done(); iter.Next()) {
+      StdArc arc = iter.Value();
+      BaseFloat scaled_weight = scale * iter.Value().weight.Value();
+      arc.weight = scaled_weight;
+      iter.SetValue(arc);
+    }
+    // also scale the final weight, if this is a final state.
+    Weight final_weight = fst->Final(s);
+    if (final_weight != Weight::Zero())
+      fst->SetFinal(s, Weight(scale * final_weight.Value()));
+  }
+}
+
+/** This function converts a lattice to an FST, with each weight equal to the
+    weighted combination of its acoustic and language scores.
+*/
+void ConvertLatticeToPdfLabels(
+    const TransitionModel &tmodel,
+    const Lattice &ifst,
+    fst::StdVectorFst *ofst) {
+  typedef fst::ArcTpl<LatticeWeight> ArcIn;
+  typedef fst::StdArc ArcOut;
+  typedef ArcIn::StateId StateId;
+  ofst->DeleteStates();
+  // The states will be numbered exactly the same as the original FST.
+  // Add the states to the new FST.
+  StateId num_states = ifst.NumStates();
+  for (StateId s = 0; s < num_states; s++) {
+    StateId news = ofst->AddState();
+    assert(news == s);
+  }
+  ofst->SetStart(ifst.Start());
+  for (StateId s = 0; s < num_states; s++) {
+    LatticeWeight final_iweight = ifst.Final(s);
+    if (final_iweight != LatticeWeight::Zero()) {
+      fst::TropicalWeight final_oweight;
+      ConvertLatticeWeight(final_iweight, &final_oweight);
+      ofst->SetFinal(s, final_oweight);
+    }
+    for (fst::ArcIterator<Lattice> iter(ifst, s);
+         !iter.Done();
+         iter.Next()) {
+      ArcIn arc = iter.Value();
+      KALDI_PARANOID_ASSERT(arc.weight != LatticeWeight::Zero());
+      ArcOut oarc;
+      ConvertLatticeWeight(arc.weight, &oarc.weight);
+      oarc.ilabel = tmodel.TransitionIdToPdf(arc.ilabel) + 1;
+      oarc.olabel = tmodel.TransitionIdToPdf(arc.ilabel) + 1;
+      oarc.nextstate = arc.nextstate;
+      ofst->AddArc(s, oarc);
+    }
+  }
+}
+
+
+/**
+   This function does all the processing for one utterance, and outputs the
+   supervision objects to 'example_writer'. Note: if normalization_fst is the
+   empty FST (with no states), it skips the final stage of egs preparation and
+   you should do it later with nnet3-chain-normalize-egs.
+*/
+
+static bool ProcessFile(const fst::StdVectorFst &normalization_fst,
+                        const GeneralMatrix &feats,
+                        const MatrixBase<BaseFloat> *ivector_feats,
+                        int32 ivector_period,
+                        const Lattice &lat,
+                        int32 num_output_frames,
+                        const std::string &utt_id,
+                        bool compress,
+                        int32 num_pdfs,
+                        TransitionModel &tmodel,
+                        UtteranceSplitter *utt_splitter,
+                        NnetExampleWriter *example_writer) {
+  //KALDI_ASSERT(supervision.num_sequences == 1);
+  int32 num_input_frames = feats.NumRows();
+
+  if (!utt_splitter->LengthsMatch(utt_id, num_input_frames, num_output_frames))
+    return false; // LengthsMatch() will have printed a warning.
+
+  std::vector<ChunkTimeInfo> chunks;
+
+  utt_splitter->GetChunksForUtterance(num_input_frames, &chunks);
+
+  if (chunks.empty()) {
+    KALDI_WARN << "Not producing egs for utterance " << utt_id
+               << " because it is too short: "
+               << num_input_frames << " frames.";
+    return false;
+  }
+
+  int32 frame_subsampling_factor = utt_splitter->Config().frame_subsampling_factor;
+
+  fst::StdVectorFst sup_fst,
+      scaled_normalization_fst(normalization_fst);
+  ConvertLatticeToPdfLabels(tmodel, lat, &sup_fst);
+  ScaleFst(0.5, &scaled_normalization_fst); // scale the normalization FST so
+                                            // its weights are comparable to the
+                                            // combined lm and acoustic weights
+                                            // in sup_fst
+  if (normalization_fst.NumStates() > 0 &&
+      !chain::AddWeightToFst(scaled_normalization_fst, &sup_fst)) {
+    KALDI_WARN << "For utterance " << utt_id
+               << ", FST was empty after composing with normalization FST. "
+               << "This should be extremely rare (a few per corpus, at most)";
+  }
+
+  // Convert the FST to a lattice so we can extract posteriors using
+  // forward-backward.
+  Lattice sup_lat;
+  ConvertFstToLattice(sup_fst, &sup_lat);
+  Posterior pdf_post;
+  LatticeForwardBackward(sup_lat, &pdf_post);
+
+  for (size_t c = 0; c < chunks.size(); c++) {
+    ChunkTimeInfo &chunk = chunks[c];
+
+    int32 start_frame_subsampled = chunk.first_frame / frame_subsampling_factor,
+        num_frames_subsampled = chunk.num_frames / frame_subsampling_factor;
+
+
+    // Do we need to subtract 1 from the posteriors to convert them back to
+    // pdf-ids? Select the subset of the posterior corresponding to this
+    // chunk (a subset of the pdf-ids).
+    Posterior labels(num_frames_subsampled);
+    for (int i = 0; i < num_frames_subsampled; i++) {
+      int t = i + start_frame_subsampled;
+      if (t < pdf_post.size())
+        labels[i] = pdf_post[t];
+      //for (std::vector<std::pair<int32, BaseFloat> >::iterator
+      //    iter = labels[i].begin(); iter != labels[i].end(); ++iter)
+      //  iter->second *= chunk.output_weights[i];
+    }
+
+    int32 first_frame = 0; // we shift the time-indexes of all these parts so
+                           // that the supervised part starts from frame 0.
+
+    SubVector<BaseFloat> output_weights(
+        &(chunk.output_weights[0]),
+        static_cast<int32>(chunk.output_weights.size()));
+
+    NnetExample nnet_eg;
+    nnet_eg.io.push_back(NnetIo("output", num_pdfs, 0, labels));
+    nnet_eg.io.resize(ivector_feats != NULL ? 3 : 2);
+
+    int32 tot_input_frames = chunk.left_context + chunk.num_frames +
+        chunk.right_context,
+        start_frame = chunk.first_frame - chunk.left_context;
+
+    GeneralMatrix input_frames;
+    ExtractRowRangeWithPadding(feats, start_frame, tot_input_frames,
+                               &input_frames);
+
+    NnetIo input_io("input", -chunk.left_context, input_frames);
+    nnet_eg.io[0].Swap(&input_io);
+
+    if (ivector_feats != NULL) {
+      // if applicable, add the iVector feature.
+      // choose iVector from a random frame in the chunk
+      int32 ivector_frame = RandInt(start_frame,
+                                    start_frame + num_input_frames - 1),
+          ivector_frame_subsampled = ivector_frame / ivector_period;
+      if (ivector_frame_subsampled < 0)
+        ivector_frame_subsampled = 0;
+      if (ivector_frame_subsampled >= ivector_feats->NumRows())
+        ivector_frame_subsampled = ivector_feats->NumRows() - 1;
+      Matrix<BaseFloat> ivector(1, ivector_feats->NumCols());
+      ivector.Row(0).CopyFromVec(ivector_feats->Row(ivector_frame_subsampled));
+      NnetIo ivector_io("ivector", 0, ivector);
+      nnet_eg.io[2].Swap(&ivector_io);
+    }
+
+    if (compress)
+      nnet_eg.Compress();
+
+    std::ostringstream os;
+    os << utt_id << "-" << chunk.first_frame;
+
+    std::string key = os.str(); // key is <utt_id>-<frame_id>
+
+    example_writer->Write(key, nnet_eg);
+  }
+  return true;
+}
+
+} // namespace nnet3
+} // namespace kaldi
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace kaldi::nnet3;
+    using namespace kaldi::chain;
+    typedef kaldi::int32 int32;
+    typedef kaldi::int64 int64;
+
+    const char *usage =
+        "Get frame-by-frame examples of data for nnet3+chain neural network\n"
+        "training. This involves breaking up utterances into pieces of a\n"
+        "fixed size.\n"
+        "The input is a lattice, which will be transformed into a new lattice\n"
+        "with pdf labels. Then it will be composed with <normalization-fst>,\n"
+        "and forward-backward is done to get posteriors.\n"
+        "This egs generation can be used in a teacher-student learning setup,\n"
+        "where the lattice is extracted from the teacher network.\n"
+        "Note: if <normalization-fst> is not supplied the egs will not be\n"
+        "ready for training; in that case they should later be processed\n"
+        "with nnet3-chain-normalize-egs\n"
+        "\n"
+        "Usage: nnet3-chain-get-egs [options] [] "
+        " \n"
+        "\n"
+        "An example [where $feats expands to the actual features]:\n"
+        "chain-get-supervision [args] | \\\n"
+        " nnet3-chain-get-egs --left-context=25 --right-context=9 --num-frames=20 dir/normalization.fst \\\n"
+        " \"$feats\" ark,s,cs:- ark:cegs.1.ark\n"
+        "Note: the --frame-subsampling-factor option must be the same as given to\n"
+        "chain-get-supervision.\n";
+
+    bool compress = true;
+    int32 length_tolerance = 100, online_ivector_period = 1;
+
+    ExampleGenerationConfig eg_config; // controls num-frames,
+                                       // left/right-context, etc.
+
+    int32 srand_seed = 0, num_pdfs = -1;
+    std::string online_ivector_rspecifier,
+        trans_model;
+
+    ParseOptions po(usage);
+    po.Register("compress", &compress, "If true, write egs with input features "
+                "in compressed format (recommended).
Update: this is now "
+                "only relevant if the features being read are un-compressed; "
+                "if already compressed, we keep the same compressed format when "
+                "dumping egs.");
+    po.Register("ivectors", &online_ivector_rspecifier, "Alias for "
+                "--online-ivectors option, for back compatibility");
+    po.Register("online-ivectors", &online_ivector_rspecifier, "Rspecifier of "
+                "ivector features, as a matrix.");
+    po.Register("online-ivector-period", &online_ivector_period, "Number of "
+                "frames between iVectors in matrices supplied to the "
+                "--online-ivectors option");
+    po.Register("srand", &srand_seed, "Seed for random number generator ");
+    po.Register("length-tolerance", &length_tolerance, "Tolerance for "
+                "difference in num-frames between feat and ivector matrices");
+    po.Register("num-pdfs", &num_pdfs, "Number of pdfs in the acoustic "
+                "model");
+    po.Register("trans-model", &trans_model,
+                "Transition model");
+
+    eg_config.Register(&po);
+
+    po.Read(argc, argv);
+
+    srand(srand_seed);
+
+    if (po.NumArgs() < 3 || po.NumArgs() > 4) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    if (num_pdfs <= 0)
+      KALDI_ERR << "--num-pdfs option is required.";
+    TransitionModel tmodel;
+    if (!trans_model.empty())
+      ReadKaldiObject(trans_model, &tmodel);
+
+    std::string
+        normalization_fst_rxfilename,
+        feature_rspecifier,
+        lattice_rspecifier,
+        examples_wspecifier;
+    if (po.NumArgs() == 3) {
+      feature_rspecifier = po.GetArg(1);
+      lattice_rspecifier = po.GetArg(2);
+      examples_wspecifier = po.GetArg(3);
+    } else {
+      normalization_fst_rxfilename = po.GetArg(1);
+      KALDI_ASSERT(!normalization_fst_rxfilename.empty());
+      feature_rspecifier = po.GetArg(2);
+      lattice_rspecifier = po.GetArg(3);
+      examples_wspecifier = po.GetArg(4);
+    }
+
+    eg_config.ComputeDerived();
+    UtteranceSplitter utt_splitter(eg_config);
+
+    fst::StdVectorFst normalization_fst;
+    if (!normalization_fst_rxfilename.empty()) {
+      ReadFstKaldi(normalization_fst_rxfilename, &normalization_fst);
+      KALDI_ASSERT(normalization_fst.NumStates() > 0);
+    }
+
+    // Read as GeneralMatrix so we don't need to un-compress and re-compress
+    // when selecting parts of matrices.
+    SequentialGeneralMatrixReader feat_reader(feature_rspecifier);
+    //chain::RandomAccessSupervisionReader supervision_reader(
+    //    supervision_rspecifier);
+    RandomAccessLatticeReader lattice_reader(lattice_rspecifier);
+    NnetExampleWriter example_writer(examples_wspecifier);
+    RandomAccessBaseFloatMatrixReader online_ivector_reader(
+        online_ivector_rspecifier);
+
+    int32 num_err = 0;
+
+    for (; !feat_reader.Done(); feat_reader.Next()) {
+      std::string key = feat_reader.Key();
+      const GeneralMatrix &feats = feat_reader.Value();
+      if (!lattice_reader.HasKey(key)) {
+        KALDI_WARN << "No lattice for key " << key;
+        num_err++;
+      } else {
+        //const chain::Supervision &supervision = supervision_reader.Value(key);
+        const Lattice &lat = lattice_reader.Value(key);
+        const Matrix<BaseFloat> *online_ivector_feats = NULL;
+        if (!online_ivector_rspecifier.empty()) {
+          if (!online_ivector_reader.HasKey(key)) {
+            KALDI_WARN << "No iVectors for utterance " << key;
+            num_err++;
+            continue;
+          } else {
+            // this address will be valid until we call HasKey() or Value()
+            // again.
+ online_ivector_feats = &(online_ivector_reader.Value(key)); + } + } + if (online_ivector_feats != NULL && + (abs(feats.NumRows() - (online_ivector_feats->NumRows() * + online_ivector_period)) > length_tolerance + || online_ivector_feats->NumRows() == 0)) { + KALDI_WARN << "Length difference between feats " << feats.NumRows() + << " and iVectors " << online_ivector_feats->NumRows() + << "exceeds tolerance " << length_tolerance; + num_err++; + continue; + } + int32 num_output_frames = 1; + if (!ProcessFile(normalization_fst, feats, + online_ivector_feats, online_ivector_period, + lat, num_output_frames, key, compress, num_pdfs, + tmodel, + &utt_splitter, &example_writer)) + num_err++; + } + } + if (num_err > 0) + KALDI_WARN << num_err << " utterances had errors and could " + "not be processed."; + // utt_splitter prints stats in its destructor. + return utt_splitter.ExitStatus(); + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index 4c799ea96c3..7608aea831e 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -63,9 +63,8 @@ void NnetChainTrainer::Train(const NnetExample &eg) { bool use_xent_regularization = (opts_.chain_config.xent_regularize != 0.0); ComputationRequest request; GetComputationRequest(*nnet_, eg, need_model_derivative, - nnet_config.store_component_stats, - use_xent_regularization, need_model_derivative, - &request); + nnet_config.store_component_stats, &request, + use_xent_regularization, need_model_derivative); const NnetComputation *computation = compiler_.Compile(request); // conventional training @@ -242,10 +241,11 @@ void NnetChainTrainer::ProcessOutputs(const NnetExample &eg, kUndefined); BaseFloat tot_objf, tot_l2_term, tot_weight; - + int32 num_sequences = 64, frames_per_sequence = 150; ComputeObjfAndDeriv2(opts_.chain_config, den_graph_, io.features, nnet_output, + num_sequences, frames_per_sequence, &tot_objf, &tot_l2_term, &tot_weight, &nnet_output_deriv, (use_xent ? &xent_deriv : NULL)); diff --git a/src/nnet3/nnet-example-utils.cc b/src/nnet3/nnet-example-utils.cc index 2151e06bbb4..62fc88521bc 100644 --- a/src/nnet3/nnet-example-utils.cc +++ b/src/nnet3/nnet-example-utils.cc @@ -202,13 +202,13 @@ void GetComputationRequest(const Nnet &nnet, const NnetExample &eg, bool need_model_derivative, bool store_component_stats, + ComputationRequest *request, bool use_xent_regularization, - bool use_xent_derivative, - ComputationRequest *request) { + bool use_xent_derivative) { request->inputs.clear(); request->inputs.reserve(eg.io.size()); request->outputs.clear(); - request->outputs.reserve(eg.io.size() * 2); + request->outputs.reserve((use_xent_regularization ? 
2 : 1) * eg.io.size());
   request->need_model_derivative = need_model_derivative;
   request->store_component_stats = store_component_stats;
   for (size_t i = 0; i < eg.io.size(); i++) {
@@ -247,41 +247,6 @@ void GetComputationRequest(const Nnet &nnet,
     KALDI_ERR << "No outputs in computation request.";
 }

-void GetComputationRequest(const Nnet &nnet,
-                           const NnetExample &eg,
-                           bool need_model_derivative,
-                           bool store_component_stats,
-                           ComputationRequest *request) {
-  request->inputs.clear();
-  request->inputs.reserve(eg.io.size());
-  request->outputs.clear();
-  request->outputs.reserve(eg.io.size() * 2);
-  request->need_model_derivative = need_model_derivative;
-  request->store_component_stats = store_component_stats;
-  for (size_t i = 0; i < eg.io.size(); i++) {
-    const NnetIo &io = eg.io[i];
-    const std::string &name = io.name;
-    int32 node_index = nnet.GetNodeIndex(name);
-    if (node_index == -1 ||
-        (!nnet.IsInputNode(node_index) && !nnet.IsOutputNode(node_index)))
-      KALDI_ERR << "Nnet example has input or output named '" << name
-                << "', but no such input or output node is in the network.";
-
-    std::vector<IoSpecification> &dest =
-        nnet.IsInputNode(node_index) ? request->inputs : request->outputs;
-    dest.resize(dest.size() + 1);
-    IoSpecification &io_spec = dest.back();
-    io_spec.name = name;
-    io_spec.indexes = io.indexes;
-    io_spec.has_deriv = nnet.IsOutputNode(node_index) && need_model_derivative;
-  }
-  // check to see if something went wrong.
-  if (request->inputs.empty())
-    KALDI_ERR << "No inputs in computation request.";
-  if (request->outputs.empty())
-    KALDI_ERR << "No outputs in computation request.";
-}
-
 void WriteVectorAsChar(std::ostream &os,
                        bool binary,
                        const VectorBase<BaseFloat> &vec) {
diff --git a/src/nnet3/nnet-example-utils.h b/src/nnet3/nnet-example-utils.h
index 05f35fb44de..5f6c69f7d96 100644
--- a/src/nnet3/nnet-example-utils.h
+++ b/src/nnet3/nnet-example-utils.h
@@ -56,16 +56,6 @@ void ShiftExampleTimes(int32 t_offset,
    inputs; if you do, you can create/modify the ComputationRequest manually.
    Assumes that if need_model_derivative is true, you will be supplying
    derivatives w.r.t. all outputs.
-*/
-void GetComputationRequest(const Nnet &nnet,
-                           const NnetExample &eg,
-                           bool need_model_derivative,
-                           bool store_component_stats,
-                           ComputationRequest *computation_request);
-
-
-/** This function takes an NnetExample and produces a ComputationRequest.
-   It assumes you don't want the derivatives w.r.t. the input.

    If use_xent_regularization == true, then it assumes that for each output
    name (e.g. "output") in the eg, there is another output with the same
    dimension and with the suffix "-xent" on its name, e.g. named
    "output-xent". The derivative w.r.t. the xent objective will only be
    supplied to the nnet computation if 'use_xent_derivative' is true (we
    propagate back the xent derivative to the model only in training, not in
    model-combination in nnet3-chain-combine).
 */
 void GetComputationRequest(const Nnet &nnet,
                            const NnetExample &eg,
                            bool need_model_derivative,
                            bool store_component_stats,
-                           bool use_xent_regularization,
-                           bool use_xent_derivative,
-                           ComputationRequest *computation_request);
+                           ComputationRequest *computation_request,
+                           bool use_xent_regularization = false,
+                           bool use_xent_derivative = false);

 // Writes as unsigned char a vector 'vec' that is required to have
 // values between 0 and 1.

From f94738faa3f6c3e68e70d980e2cdbce2152e1bad Mon Sep 17 00:00:00 2001
From: Pegita
Date: Sun, 25 Feb 2018 22:07:04 -0500
Subject: [PATCH 3/6] modified functions to accept new sort (sort by t and
 then n) in nnet3-merge-egs.
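To summarize the row ordering this commit introduces (a sketch, not part of the diff): when N input matrices of T rows each are appended with sort_by_t == true, row t of input matrix n lands at merged row t * N + n, so the merged rows are ordered first by 't' and then by 'n'. Equivalently:

    // destination row for local row 't' of input matrix 'n', given
    // 'num_inputs' input matrices that all have the same number of rows
    static int32 MergedRowIndex(int32 n, int32 t, int32 num_inputs) {
      return t * num_inputs + n;
    }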
---
 src/matrix/sparse-matrix.cc | 83 ++++++++++++++++++++++++-------
 src/matrix/sparse-matrix.h | 11 ++++-
 src/nnet3/nnet-example-utils.cc | 23 +++++++--
 src/nnet3/nnet-example-utils.h | 18 ++++---
 4 files changed, 104 insertions(+), 31 deletions(-)

diff --git a/src/matrix/sparse-matrix.cc b/src/matrix/sparse-matrix.cc
index 38ad940fb45..5ad7f2bfeca 100644
--- a/src/matrix/sparse-matrix.cc
+++ b/src/matrix/sparse-matrix.cc
@@ -654,26 +654,50 @@ void SparseMatrix<Real>::Resize(MatrixIndexT num_rows,

 template <typename Real>
 void SparseMatrix<Real>::AppendSparseMatrixRows(
-    std::vector<SparseMatrix<Real> > *inputs) {
+    std::vector<SparseMatrix<Real> > *inputs,
+    bool sort_by_t) {
   rows_.clear();
   size_t num_rows = 0;
   typename std::vector<SparseMatrix<Real> >::iterator
       input_iter = inputs->begin(),
      input_end = inputs->end();
+  int32 local_row_size = input_iter->rows_.size(),
+      num_inputs = inputs->size();
-  for (; input_iter != input_end; ++input_iter)
+  for (; input_iter != input_end; ++input_iter) {
     num_rows += input_iter->rows_.size();
+    if (sort_by_t)
+      if (input_iter->rows_.size() != local_row_size)
+        KALDI_ERR << "we cannot append sparse matrices with inconsistent "
+                  << "number of rows, if sort_by_t is true";
+  }
   rows_.resize(num_rows);
   typename std::vector<SparseVector<Real> >::iterator
       row_iter = rows_.begin(),
       row_end = rows_.end();
-  for (input_iter = inputs->begin(); input_iter != input_end; ++input_iter) {
-    typename std::vector<SparseVector<Real> >::iterator
-        input_row_iter = input_iter->rows_.begin(),
-        input_row_end = input_iter->rows_.end();
-    for (; input_row_iter != input_row_end; ++input_row_iter, ++row_iter)
-      row_iter->Swap(&(*input_row_iter));
+  if (sort_by_t) {
+    // If sort_by_t is true, the appended rows are ordered first by the original
+    // row index (t) and then by the matrix's position in the input,
+    // i.e. all rows with the same local row index end up in the same block.
+    int32 n = 0, t = 0; // 'n' is the index over matrices and 't' is the index over rows within a matrix.
+    for (input_iter = inputs->begin(); input_iter != input_end; ++input_iter, ++n) {
+      typename std::vector<SparseVector<Real> >::iterator
+          input_row_iter = input_iter->rows_.begin(),
+          input_row_end = input_iter->rows_.end();
+      t = 0;
+      for (; input_row_iter != input_row_end; ++input_row_iter, ++t) {
+        int32 dest_row_index = n + t * num_inputs;
+        rows_[dest_row_index].Swap(&(*input_row_iter));
+      }
+    }
+  } else {
+    for (input_iter = inputs->begin(); input_iter != input_end; ++input_iter) {
+      typename std::vector<SparseVector<Real> >::iterator
+          input_row_iter = input_iter->rows_.begin(),
+          input_row_end = input_iter->rows_.end();
+      for (; input_row_iter != input_row_end; ++input_row_iter, ++row_iter)
+        row_iter->Swap(&(*input_row_iter));
+    }
+    KALDI_ASSERT(row_iter == row_end);
   }
-  KALDI_ASSERT(row_iter == row_end);
   int32 num_cols = NumCols();
   for (row_iter = rows_.begin(); row_iter != row_end; ++row_iter) {
     if (row_iter->Dim() != num_cols)
@@ -916,7 +940,8 @@ void GeneralMatrix::Read(std::istream &is, bool binary) {

 void AppendGeneralMatrixRows(const std::vector<const GeneralMatrix *> &src,
-                             GeneralMatrix *mat) {
+                             GeneralMatrix *mat,
+                             bool sort_by_t) {
   mat->Clear();
   int32 size = src.size();
   if (size == 0)
@@ -933,7 +958,7 @@ void AppendGeneralMatrixRows(const std::vector<const GeneralMatrix *> &src,
     for (int32 i = 0; i < size; i++)
       sparse_mats[i] = src[i]->GetSparseMatrix();
     SparseMatrix<BaseFloat> appended_mat;
-    appended_mat.AppendSparseMatrixRows(&sparse_mats);
+    appended_mat.AppendSparseMatrixRows(&sparse_mats, sort_by_t);
     mat->SwapSparseMatrix(&appended_mat);
   } else {
     int32 tot_rows = 0, num_cols = -1;
@@ -950,15 +975,37 @@ void AppendGeneralMatrixRows(const std::vector<const GeneralMatrix *> &src,
     }
     Matrix<BaseFloat> appended_mat(tot_rows, num_cols, kUndefined);
     int32 row_offset = 0;
-    for (int32 i = 0; i < size; i++) {
-      const GeneralMatrix &src_mat = *(src[i]);
-      int32 src_rows = src_mat.NumRows();
-      if (src_rows != 0) {
-        SubMatrix<BaseFloat> dest_submat(appended_mat, row_offset, src_rows,
-                                         0, num_cols);
-        src_mat.CopyToMat(&dest_submat);
+    if (sort_by_t) {
+      // Reorder the source rows as they are inserted into the appended matrix,
+      // so that the result is sorted first by 't' and then by 'n'.
+      int32 local_row_size = src[0]->NumRows();
+      for (int32 i = 0; i < size; i++) {
+        const GeneralMatrix &src_mat = *(src[i]);
+        Matrix<BaseFloat> full_src_mat(src_mat.NumRows(), src_mat.NumCols());
+        src_mat.CopyToMat(&full_src_mat);
+        int32 src_rows = src_mat.NumRows();
+        if (src_rows != local_row_size)
+          KALDI_ERR << "Appending rows of matrices with inconsistent num-rows "
+                    << "with sort-by-t=true is not possible.";
+        std::vector<MatrixIndexT> reorder_indexes(local_row_size,
+                                                  static_cast<MatrixIndexT>(NULL));
+        for (int32 j = 0; j < src_rows; j++) {
+          reorder_indexes[j] = j * size + i;
+        }
+        full_src_mat.AddToRows(1.0, &(reorder_indexes[0]), &appended_mat);
         row_offset += src_rows;
       }
+    } else {
+      for (int32 i = 0; i < size; i++) {
+        const GeneralMatrix &src_mat = *(src[i]);
+        int32 src_rows = src_mat.NumRows();
+        if (src_rows != 0) {
+          SubMatrix<BaseFloat> dest_submat(appended_mat, row_offset, src_rows,
+                                           0, num_cols);
+          src_mat.CopyToMat(&dest_submat);
+          row_offset += src_rows;
+        }
+      }
     }
     KALDI_ASSERT(row_offset == tot_rows);
     mat->SwapFullMatrix(&appended_mat);
diff --git a/src/matrix/sparse-matrix.h b/src/matrix/sparse-matrix.h
index 60085b93fbe..48e085f1e4f 100644
--- a/src/matrix/sparse-matrix.h
+++ b/src/matrix/sparse-matrix.h
@@ -201,7 +201,13 @@ class SparseMatrix {
   /// function is destructive of the inputs. Requires, obviously,
   /// that the inputs all have the same dimension (although some may be
   /// empty).
-  void AppendSparseMatrixRows(std::vector<SparseMatrix<Real> > *inputs);
+  ///
+  /// If sort_by_t is true, the sparse matrices are appended so that the result
+  /// is sorted first by their local row index and then by the matrix index,
+  /// i.e. all rows with the same local row index end up in the same block.
+  /// In that case the number of rows in all matrices must be equal.
+  void AppendSparseMatrixRows(std::vector<SparseMatrix<Real> > *inputs,
+                              bool sort_by_t=false);

   SparseMatrix() { }
@@ -383,7 +389,8 @@ class GeneralMatrix {
 /// Does not preserve compression, if inputs were compressed; you have to
 /// re-compress manually, if that's what you need.
 void AppendGeneralMatrixRows(const std::vector<const GeneralMatrix *> &src,
-                             GeneralMatrix *mat);
+                             GeneralMatrix *mat,
+                             bool sort_by_t = false);

 /// Outputs a SparseMatrix containing only the rows r of "in" such that
diff --git a/src/nnet3/nnet-example-utils.cc b/src/nnet3/nnet-example-utils.cc
index 62fc88521bc..82fbee1cf22 100644
--- a/src/nnet3/nnet-example-utils.cc
+++ b/src/nnet3/nnet-example-utils.cc
@@ -89,7 +89,8 @@ static void MergeIo(const std::vector<NnetExample> &src,
                     const std::vector<std::string> &names,
                     const std::vector<int32> &sizes,
                     bool compress,
-                    NnetExample *merged_eg) {
+                    NnetExample *merged_eg,
+                    bool sort_by_t) {
   // The total number of Indexes we have across all examples.
   int32 num_feats = names.size();
@@ -143,13 +144,24 @@ static void MergeIo(const std::vector<NnetExample> &src,
                    "Merging already-merged egs? Not currently supported.");
       output_iter[i].n = n;
     }
+    this_offset += this_size; // note: this_offset is a reference.
   }
 }
+  // If sort_by_t is true, the indexes are rearranged to be sorted
+  // first by 't' and then by 'n'.
+  for (int32 f = 0; f < num_feats; f++) {
+    NnetIo &output_io = merged_eg->io[f];
+    if (sort_by_t)
+      if (output_io.name == "output")
+        std::sort(output_io.indexes.begin(), output_io.indexes.end());
+  }
+
   KALDI_ASSERT(cur_size == sizes);
   for (int32 f = 0; f < num_feats; f++) {
     AppendGeneralMatrixRows(output_lists[f],
-                            &(merged_eg->io[f].features));
+                            &(merged_eg->io[f].features),
+                            sort_by_t);
     if (compress) {
       // the following won't do anything if the features were sparse.
       merged_eg->io[f].features.Compress();
@@ -161,14 +173,15 @@ static void MergeIo(const std::vector<NnetExample> &src,

 void MergeExamples(const std::vector<NnetExample> &src,
                    bool compress,
-                   NnetExample *merged_eg) {
+                   NnetExample *merged_eg,
+                   bool sort_by_t) {
   KALDI_ASSERT(!src.empty());
   std::vector<std::string> io_names;
   GetIoNames(src, &io_names);
   // the sizes are the total number of Indexes we have across all examples.
   std::vector<int32> io_sizes;
   GetIoSizes(src, io_names, &io_sizes);
   MergeIo(src, io_names, io_sizes, compress, merged_eg, sort_by_t);
 }

 void ShiftExampleTimes(int32 t_offset,
@@ -1225,7 +1238,7 @@ void ExampleMerger::WriteMinibatch(const std::vector<NnetExample> &egs) {
   int32 minibatch_size = egs.size();
   stats_.WroteExample(eg_size, structure_hash, minibatch_size);
   NnetExample merged_eg;
-  MergeExamples(egs, config_.compress, &merged_eg);
+  MergeExamples(egs, config_.compress, &merged_eg, config_.sort_by_t);
   std::ostringstream key;
   key << "merged-" << (num_egs_written_++) << "-" << minibatch_size;
   writer_->Write(key.str(), merged_eg);
diff --git a/src/nnet3/nnet-example-utils.h b/src/nnet3/nnet-example-utils.h
index 5f6c69f7d96..9d55f3b0d7d 100644
--- a/src/nnet3/nnet-example-utils.h
+++ b/src/nnet3/nnet-example-utils.h
@@ -33,10 +33,14 @@ namespace nnet3 {
 /** Merge a set of input examples into a single example (typically the size of
     "src" will be the minibatch size). Will crash if "src" is the empty vector.
If "compress" is true, it will compress any non-sparse features in the output. + + If sort_by_t is true, the examples and indexes for output are sorted first + by 't' and then by 'n' index. */ void MergeExamples(const std::vector &src, bool compress, - NnetExample *dest); + NnetExample *dest, + bool sort_by_t = false); /** Shifts the time-index t of everything in the "eg" by adding "t_offset" to @@ -334,12 +338,14 @@ class ExampleMergingConfig { std::string measure_output_frames; // for back-compatibility, not used. std::string minibatch_size; std::string discard_partial_minibatches; // for back-compatibility, not used. - + bool sort_by_t; // If true, the examples and indexes are sorted + // first by 't' and next by 'n'. ExampleMergingConfig(const char *default_minibatch_size = "256"): compress(false), measure_output_frames("deprecated"), minibatch_size(default_minibatch_size), - discard_partial_minibatches("deprecated") { } + discard_partial_minibatches("deprecated"), + sort_by_t(false) { } void Register(OptionsItf *po) { po->Register("compress", &compress, "If true, compress the output examples " @@ -363,6 +369,9 @@ class ExampleMergingConfig { "--minibatch-size=128=64:128,256/256=32:64,128. Egs are given " "minibatch-sizes based on the specified eg-size closest to " "their actual size."); + po->Register("sort-by-t", &sort_by_t, + "If true, the features in examples and indexes are sorted " + "first by 't' and next by 'n'."); } @@ -517,7 +526,6 @@ class ExampleMerger { const ExampleMergingConfig &config_; NnetExampleWriter *writer_; ExampleMergingStats stats_; - // Note: the "key" into the egs is the first element of the vector. typedef unordered_map, NnetExampleStructureHasher, @@ -525,8 +533,6 @@ class ExampleMerger { MapType eg_to_egs_; }; - - } // namespace nnet3 } // namespace kaldi From 40fa1541cfa5cd4403ecc5ac11af836c656b9266 Mon Sep 17 00:00:00 2001 From: Pegita Date: Tue, 27 Feb 2018 15:52:45 -0500 Subject: [PATCH 4/6] fixed some issues. --- src/chain/chain-supervision.cc | 29 +-------------------- src/chain/chain-supervision.h | 7 +++-- src/chainbin/nnet3-chain-get-egs-post.cc | 33 +++++++++++------------- src/chainbin/nnet3-chain-get-egs.cc | 4 +-- 4 files changed, 20 insertions(+), 53 deletions(-) diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc index 7d87201dfdd..c38cd4698f7 100644 --- a/src/chain/chain-supervision.cc +++ b/src/chain/chain-supervision.cc @@ -683,34 +683,7 @@ bool AddWeightToFst(const fst::StdVectorFst &normalization_fst, bool AddWeightToSupervisionFst(const fst::StdVectorFst &normalization_fst, Supervision *supervision) { - // remove epsilons before composing. 'normalization_fst' has noepsilons so - // the composed result will be epsilon free. - fst::StdVectorFst supervision_fst_noeps(supervision->fst); - fst::RmEpsilon(&supervision_fst_noeps); - if (!TryDeterminizeMinimize(kSupervisionMaxStates, - &supervision_fst_noeps)) - return false; - - // note: by default, 'Compose' will call 'Connect', so if the - // resulting FST is not connected, it will end up empty. - fst::StdVectorFst composed_fst; - fst::Compose(supervision_fst_noeps, normalization_fst, - &composed_fst); - if (composed_fst.NumStates() == 0) - return false; - // projection should not be necessary, as both FSTs are acceptors. - // determinize and minimize to make it as compact as possible. 
diff --git a/src/chain/chain-supervision.h b/src/chain/chain-supervision.h
index c54d4770aa0..36401009b15 100644
--- a/src/chain/chain-supervision.h
+++ b/src/chain/chain-supervision.h
@@ -320,13 +320,12 @@ class SupervisionSplitter {
 /// This function also removes epsilons and makes sure supervision->fst has the
 /// required sorting of states.  Think of it as the final stage in preparation
 /// of the supervision FST.
-bool AddWeightToSupervisionFst(const fst::StdVectorFst &normalization_fst,
-                               Supervision *supervision);
-
-
 bool AddWeightToFst(const fst::StdVectorFst &normalization_fst,
                     fst::StdVectorFst *supervision_fst);

+bool AddWeightToSupervisionFst(const fst::StdVectorFst &normalization_fst,
+                               Supervision *supervision);
+
 /// Assuming the 'fst' is epsilon-free, connected, and has the property that all
 /// paths from the start-state are of the same length, output a vector
 /// containing that length (from the start-state to the current state) to
diff --git a/src/chainbin/nnet3-chain-get-egs-post.cc b/src/chainbin/nnet3-chain-get-egs-post.cc
index 9aa0eba0fb8..f3b82f2229d 100644
--- a/src/chainbin/nnet3-chain-get-egs-post.cc
+++ b/src/chainbin/nnet3-chain-get-egs-post.cc
@@ -134,13 +134,7 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst,
   }
   int32 frame_subsampling_factor =
       utt_splitter->Config().frame_subsampling_factor;
-
-  fst::StdVectorFst sup_fst,
-      scaled_normalization_fst(normalization_fst);
+  fst::StdVectorFst sup_fst;
   ConvertLatticeToPdfLabels(tmodel, lat, &sup_fst);
-  ScaleFst(0.5, &scaled_normalization_fst);  // Scale lattice to have weights similar
-                                             // to weights used to combine lm weight
-                                             // with acoustic weight in sup_lat
+  // Note: the normalization FST is now scaled once, in main(), before it
+  // is passed in here.
   if (normalization_fst.NumStates() > 0 &&
       !chain::AddWeightToFst(normalization_fst, &sup_fst)) {
     KALDI_WARN << "For utterance " << utt_id << ", feature frames "
@@ -249,15 +243,13 @@ int main(int argc, char *argv[]) {
         "ready for training; in that case they should later be processed\n"
         "with nnet3-chain-normalize-egs\n"
         "\n"
-        "Usage:  nnet3-chain-get-egs [options] [<normalization-fst>] <features-rspecifier> "
-        "<chain-supervision-rspecifier> <egs-wspecifier>\n"
+        "Usage:  nnet3-chain-get-egs-post [options] [<normalization-fst>] "
+        "<features-rspecifier> <lattice-rspecifier> <egs-wspecifier>\n"
         "\n"
         "An example [where $feats expands to the actual features]:\n"
-        "chain-get-supervision [args] | \\\n"
-        "  nnet3-chain-get-egs --left-context=25 --right-context=9 --num-frames=20 dir/normalization.fst \\\n"
-        "  \"$feats\" ark,s,cs:- ark:cegs.1.ark\n"
-        "Note: the --frame-subsampling-factor option must be the same as given to\n"
-        "chain-get-supervision.\n";
+        "nnet3-chain-get-egs-post --left-context=25 --right-context=9 \\\n"
+        "  --num-frames=20 dir/normalization.fst \"$feats\" \\\n"
+        "  ark:lat.1.ark ark:cegs.1.ark\n";

     bool compress = true;
     int32 length_tolerance = 100, online_ivector_period = 1;
@@ -278,9 +270,7 @@ int main(int argc, char *argv[]) {
     po.Register("ivectors", &online_ivector_rspecifier, "Alias for "
                 "--online-ivectors option, for back compatibility");
     po.Register("online-ivectors", &online_ivector_rspecifier, "Rspecifier of "
                 "ivector features, as a matrix.");
    po.Register("online-ivector-period", &online_ivector_period, "Number of "
                "frames between iVectors in matrices supplied to the "
                "--online-ivectors option");
    po.Register("srand", &srand_seed, "Seed for random number generator");
    po.Register("length-tolerance", &length_tolerance, "Tolerance for "
@@ -376,8 +366,15 @@ int main(int argc, char *argv[]) {
          num_err++;
          continue;
        }
+        // Scale the normalization FST so that its weights are comparable to
+        // the weights used to combine the LM weight with the acoustic weight
+        // in the supervision lattice.
+        fst::StdVectorFst scaled_normalization_fst(normalization_fst);
+        ScaleFst(0.5, &scaled_normalization_fst);
        int32 num_output_frames = 1;
-        if (!ProcessFile(normalization_fst, feats,
+        if (!ProcessFile(scaled_normalization_fst, feats,
                         online_ivector_feats, online_ivector_period,
                         lat, num_output_frames, key, compress,
                         num_pdfs, tmodel,
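The ScaleFst(0.5, ...) call above halves the normalization FST's costs before it is passed to ProcessFile. ScaleFst itself is not shown in this patch series, so the following is only a sketch of what such a helper presumably does in plain OpenFst terms: in the tropical semiring, weights are costs, and scaling every arc and final cost by 0.5 mirrors the 0.5 scale used when combining LM and acoustic weights in the supervision lattice.

    #include <fst/fstlib.h>

    // Multiply every arc cost and final cost of 'fst' by 'scale'.
    // Sketch only; not the actual ScaleFst() from the patch.
    void ScaleFstWeights(float scale, fst::StdVectorFst *fst) {
      typedef fst::StdArc::Weight Weight;
      for (fst::StateIterator<fst::StdVectorFst> siter(*fst);
           !siter.Done(); siter.Next()) {
        int s = siter.Value();
        for (fst::MutableArcIterator<fst::StdVectorFst> aiter(fst, s);
             !aiter.Done(); aiter.Next()) {
          fst::StdArc arc = aiter.Value();
          arc.weight = Weight(arc.weight.Value() * scale);
          aiter.SetValue(arc);
        }
        // Scale final costs too (skip non-final states, whose final
        // weight is Zero(), i.e. infinite cost).
        if (fst->Final(s) != Weight::Zero())
          fst->SetFinal(s, Weight(fst->Final(s).Value() * scale));
      }
    }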
diff --git a/src/chainbin/nnet3-chain-get-egs.cc b/src/chainbin/nnet3-chain-get-egs.cc
index 206921771c8..c8c251900ec 100644
--- a/src/chainbin/nnet3-chain-get-egs.cc
+++ b/src/chainbin/nnet3-chain-get-egs.cc
@@ -42,7 +42,6 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst,
                         const GeneralMatrix &feats,
                         const MatrixBase<BaseFloat> *ivector_feats,
                         int32 ivector_period,
-                        const Lattice &lattice,
                         const chain::Supervision &supervision,
                         const std::string &utt_id,
                         bool compress,
@@ -279,13 +278,12 @@ int main(int argc, char *argv[]) {
          num_err++;
          continue;
        }
-        /*
+
        if (!ProcessFile(normalization_fst, feats,
                         online_ivector_feats, online_ivector_period,
                         supervision, key, compress,
                         &utt_splitter, &example_writer))
          num_err++;
-        */
      }
    }
    if (num_err > 0)

From 46a96c521feeea0b6d79f3fd5da7d2497b2b3498 Mon Sep 17 00:00:00 2001
From: Pegita
Date: Fri, 13 Apr 2018 15:00:13 -0400
Subject: [PATCH 5/6] added small change.

---
 src/nnet3/nnet-example-utils.cc | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/nnet3/nnet-example-utils.cc b/src/nnet3/nnet-example-utils.cc
index 82fbee1cf22..4ff60b9413e 100644
--- a/src/nnet3/nnet-example-utils.cc
+++ b/src/nnet3/nnet-example-utils.cc
@@ -80,9 +80,6 @@ static void GetIoSizes(const std::vector<NnetExample> &src,
   }
 }

-
-
-
 // Do the final merging of NnetIo, once we have obtained the names, dims and
 // sizes for each feature/supervision type.
 static void MergeIo(const std::vector<NnetExample> &src,

From 417dad64051a383d93f6c91877efa04acfddfc28 Mon Sep 17 00:00:00 2001
From: Pegita
Date: Fri, 13 Apr 2018 16:25:26 -0400
Subject: [PATCH 6/6] added run_tdnn_7{n,o}.sh

---
 .../s5c/local/chain/tuning/run_tdnn_7n.sh     | 386 ++++++++++++++++
 .../s5c/local/chain/tuning/run_tdnn_7o.sh     | 411 ++++++++++++++++++
 2 files changed, 797 insertions(+)
 create mode 100755 egs/swbd/s5c/local/chain/tuning/run_tdnn_7n.sh
 create mode 100755 egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh

diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7n.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7n.sh
new file mode 100755
index 00000000000..a45ba6314c9
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7n.sh
@@ -0,0 +1,386 @@
+#!/bin/bash
+# _7n is as _7i but it has a single hint-guide in the middle of the primary
+# and sibling networks.
+# _7i is as _7h but it uses a sibling network and multi-stage training to
+# transfer information from the larger network to the smaller network.
+# It uses multi-stage training to train the sibling network, which has fewer
+# parameters.
+# The 1st stage of training is the same as the baseline tdnn_7d and trains
+# the primary network.
+# The 2nd stage trains the sibling network using the regularizers at all
+# layers as objectives, and in the 3rd stage we train the sibling network
+# with the chain objective for 1 epoch.
+
+#System                  tdnn_7g     tdnn_7h
+#WER on train_dev(tg)    13.98       13.84
+#WER on train_dev(fg)    12.78       12.84
+#WER on eval2000(tg)     16.7        16.5
+#WER on eval2000(fg)     14.9        14.8
+#Final train prob        -0.0817467  -0.0889771
+#Final valid prob        -0.110475   -0.113102
+#Final train prob (xent) -1.20065    -1.2533
+#Final valid prob (xent) -1.3313     -1.36743
+#
+set -e
+
+# configs for 'chain'
+affix=
+stage=12
+multi_stage_train=1
+train_stage=-10
+get_egs_stage=-10
+speed_perturb=true
+dir=exp/chain/tdnn_7n  # Note: _sp will get added to this if $speed_perturb == true.
+decode_iter=
+chain_regularize=0.5
+num_epochs_s2=1
+num_epochs_s3=1
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+initial_effective_lrate_s2=0.001
+final_effective_lrate_s2=0.0001
+initial_effective_lrate_s3=0.0005
+final_effective_lrate_s3=0.00005
+leftmost_questions_truncate=-1
+max_param_change=2.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+num_jobs_initial_s2=3
+num_jobs_final_s2=8
+num_jobs_initial_s3=3
+num_jobs_final_s3=8
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+common_egs_dir=exp/chain/tdnn_7h_sp/egs
+xent_regularize=0.1
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <$lang/topo
+fi
+
+if [ $stage -le 11 ]; then
+  # Build a tree using our new topology.  This is the critically different
+  # step compared with other recipes.
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate $leftmost_questions_truncate \
+      --context-opts "--context-width=2 --central-position=1" \
+      --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then
+  echo "$0: creating neural net configs using the xconfig parser for the primary network";
+
+  num_targets=$(tree-info exp/chain/tri5_7d_tree_sp/tree |grep num-pdfs|awk '{print $2}')
+  learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python)
+
+  mkdir -p $dir/stage1/configs
+  cat <<EOF > $dir/stage1/configs/network.xconfig
+  input dim=100 name=ivector
+  input dim=40 name=input
+
+  # please note that it is important to have the input layer with name=input
+  # as the layer immediately preceding the fixed-affine-layer, to enable
+  # the use of the short notation for the descriptor
+  fixed-affine-layer name=lda input=Append(input@-1,input,input@1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/stage1/configs/lda.mat
+
+  # the first splicing is moved before the lda layer, so no splicing here
+  relu-renorm-layer name=tdnn1 dim=625
+  relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=625
+  relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=625
+  relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=625
+  relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=625
+  relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=625
+  relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=625
+
+  ## adding the layers for the chain branch
+  relu-renorm-layer name=prefinal-chain input=tdnn7 dim=625 target-rms=0.5
+  output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5
+
+  # adding the layers for the xent branch
+  # This block prints the configs for a separate output that will be
+  # trained with a cross-entropy objective in the 'chain' models... this
+  # has the effect of regularizing the hidden parts of the model.  we use
+  # 0.5 / args.xent_regularize as the learning rate factor- the factor of
+  # 0.5 / args.xent_regularize is suitable as it means the xent
+  # final-layer learns at a rate independent of the regularization
+  # constant; and the 0.5 was tuned so as to make the relative progress
+  # similar in the xent and regular final layers.
+  relu-renorm-layer name=prefinal-xent input=tdnn7 dim=625 target-rms=0.5
+  output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5
+
+EOF
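The learning_rate_factor computed just above is 0.5 / xent_regularize; with the script's xent_regularize=0.1 that comes to 5.0, so the xent output layer's learning rate is scaled up to compensate for the 0.1 scale on its objective, as the comment in the config explains. A one-line check of that arithmetic (mirroring the script's python one-liner):

    #include <cstdio>

    // The xent branch's objective is scaled by xent_regularize, so scaling
    // its final layer's learning rate by 0.5/xent_regularize makes it learn
    // at a rate independent of that constant.
    int main() {
      double xent_regularize = 0.1;
      double learning_rate_factor = 0.5 / xent_regularize;  // = 5.0
      std::printf("learning_rate_factor = %g\n", learning_rate_factor);
      return 0;
    }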
+  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/stage1/configs/network.xconfig \
+    --config-dir $dir/stage1/configs/
+
+  echo "$0: creating neural net configs using the xconfig parser for the sibling network"
+  sibling_dim=300
+  primary_dim=625
+  regressor_lr_factor=1.0
+  regressor_scale=`echo $regressor_lr_factor $primary_dim | awk '{printf "%.8f \n", $1/$2}'`
+  regressor_scale_vec=""
+  for i in `seq $primary_dim`; do
+    regressor_scale_vec="$regressor_scale_vec $regressor_scale"
+  done
+
+  mkdir -p $dir/stage2
+  cat <<EOF > $dir/stage2/regressor_scale.vec
+[ $regressor_scale_vec ]
+EOF
+
+  mkdir -p $dir/stage2/configs
+  cat <<EOF > $dir/stage2/configs/network.xconfig
+
+  # the first splicing is moved before the lda layer, so no splicing here
+  relu-renorm-layer name=tdnn1-sibling input=lda dim=$sibling_dim
+  relu-renorm-layer name=tdnn2-sibling input=Append(-1,0,1) dim=$sibling_dim
+  relu-renorm-layer name=tdnn3-sibling input=Append(-1,0,1) dim=$sibling_dim
+  relu-renorm-layer name=tdnn4-sibling input=Append(-3,0,3) dim=$sibling_dim
+  relu-renorm-layer name=tdnn5-sibling input=Append(-3,0,3) dim=$sibling_dim
+  relu-renorm-layer name=tdnn6-sibling input=Append(-3,0,3) dim=$sibling_dim
+  relu-renorm-layer name=tdnn7-sibling input=Append(-3,0,3) dim=$sibling_dim
+
+  ## adding the layers for the chain branch
+  relu-renorm-layer name=prefinal-chain-sibling input=tdnn7-sibling dim=$sibling_dim target-rms=0.5
+  output-layer name=output-sibling include-log-softmax=false dim=$num_targets max-change=1.5
+
+  relu-renorm-layer name=prefinal-xent-sibling input=tdnn7-sibling dim=$sibling_dim target-rms=0.5
+  output-layer name=output-xent-sibling dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5
+
+  ## adding the regressor outputs to the sibling network configs.
+  relu-renorm-layer name=tdnn4-regressor input=tdnn4-sibling dim=$primary_dim
+  regressor-layer name=regressor-4 input1=tdnn4-regressor input2=tdnn4 objective-type=linear max-change=1.5 dim=$primary_dim regressor-scale-file=$dir/stage2/regressor_scale.vec supervision-type=unsupervised
+EOF
+  steps/nnet3/xconfig_to_configs.py --aux-xconfig-file $dir/stage1/configs/network.xconfig \
+    --xconfig-file $dir/stage2/configs/network.xconfig --config-dir $dir/stage2/configs/
+
+  # edits.config contains the edits required for the different stages of
+  # training.  It is applied to the 0.mdl generated at the
+  # prepare_initial_network stage in iter -1.
+  # The edits for the 2nd stage rename the primary network's outputs to
+  # *-primary, so that training does not use those outputs, and rename the
+  # sibling network's outputs to 'output' and 'output-xent'.
+  cat <<EOF > $dir/stage2/configs/edits.config
+rename-node old-name=output new-name=output-primary
+rename-node old-name=output-xent new-name=output-xent-primary
+rename-node old-name=output-sibling new-name=output
+rename-node old-name=output-xent-sibling new-name=output-xent
+EOF
+  # edits.config for the 3rd stage of training.
+  mkdir -p $dir/stage3/configs
+  cat <<EOF > $dir/stage3/configs/edits.config
+remove-output-nodes name=regressor*
+remove-output-nodes name=*-primary
+remove-orphans
+EOF
+  # we skip the add_compatibility stage in xconfig_to_configs.py;
+  # we copy vars from stage1 to stage2 and stage3 for now.
+  cp -r $dir/stage1/configs/vars $dir/stage2/configs/.
+  cp -r $dir/stage1/configs/vars $dir/stage3/configs/.
+fi
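The regressor_scale.vec written above holds 1/primary_dim in every dimension, so the hint (regressor) objective between tdnn4-regressor and tdnn4 is averaged over the primary layer's dimensions rather than summed, keeping its magnitude comparable across layer widths. The regressor-layer implementation is not part of this patch, and it is configured with objective-type=linear above; the sketch below uses the quadratic variant purely to illustrate how the per-dimension scale behaves (illustration only, under those assumptions):

    #include <cstdio>
    #include <vector>

    // Quadratic mismatch between a sibling-network projection x and a
    // primary-network activation y, scaled per dimension by 1/dim (one
    // entry of regressor_scale.vec), so the objective does not grow with
    // the layer width.
    double ScaledQuadraticHint(const std::vector<double> &x,
                               const std::vector<double> &y) {
      double scale = 1.0 / x.size();
      double objf = 0.0;
      for (size_t i = 0; i < x.size(); i++) {
        double diff = x[i] - y[i];
        objf += -0.5 * scale * diff * diff;  // maximized when x matches y
      }
      return objf;
    }

    int main() {
      std::vector<double> x = {0.5, -1.0, 2.0}, y = {0.0, -1.0, 1.0};
      std::printf("hint objective = %.4f\n", ScaledQuadraticHint(x, y));
      return 0;
    }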
+if [ $stage -le 13 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+  if [ $multi_stage_train -le 0 ] && [ ! -f $dir/stage1/final.mdl ]; then
+    echo "$0: Training the primary network"
+    steps/nnet3/chain/train.py --stage $train_stage \
+      --cmd "$decode_cmd" \
+      --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \
+      --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
+      --chain.xent-regularize $xent_regularize \
+      --chain.leaky-hmm-coefficient 0.1 \
+      --chain.l2-regularize 0.00005 \
+      --chain.apply-deriv-weights false \
+      --chain.lm-opts="--num-extra-lm-states=2000" \
+      --egs.dir "$common_egs_dir" \
+      --egs.stage $get_egs_stage \
+      --egs.opts "--frames-overlap-per-eg 0" \
+      --egs.chunk-width $frames_per_eg \
+      --trainer.num-chunk-per-minibatch $minibatch_size \
+      --trainer.frames-per-iter 1500000 \
+      --trainer.num-epochs $num_epochs \
+      --trainer.optimization.num-jobs-initial $num_jobs_initial \
+      --trainer.optimization.num-jobs-final $num_jobs_final \
+      --trainer.optimization.initial-effective-lrate $initial_effective_lrate \
+      --trainer.optimization.final-effective-lrate $final_effective_lrate \
+      --trainer.max-param-change $max_param_change \
+      --cleanup.remove-egs $remove_egs \
+      --feat-dir data/${train_set}_hires \
+      --tree-dir $treedir \
+      --lat-dir exp/tri4_lats_nodup$suffix \
+      --dir $dir/stage1 || exit 1;
+  fi
+
+  if [ $multi_stage_train -le 1 ]; then
+    mkdir -p $dir/stage2
+    echo "$0: copying the final primary network in $dir/stage1/final.mdl to"
+    echo "$dir/stage2/init.raw, with all learning-rate factors set to zero,"
+    echo "to serve as the fixed primary network during sibling training."
+    nnet3-am-copy --raw=true \
+      --edits='set-learning-rate-factor name=* learning-rate-factor=0.0;' \
+      $dir/stage1/final.mdl $dir/stage2/init.raw || exit 1;
+
+    echo "$0: Training the sibling network using the regularizer objectives."
+    steps/nnet3/chain/train.py --stage $train_stage \
+      --cmd "$decode_cmd" \
+      --init-raw-model $dir/stage2/init.raw \
+      --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \
+      --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
+      --chain.leaky-hmm-coefficient 0.1 \
+      --chain.l2-regularize 0.00005 \
+      --chain.chain-regularize $chain_regularize \
+      --chain.apply-deriv-weights false \
+      --chain.lm-opts="--num-extra-lm-states=2000" \
+      --egs.dir "$common_egs_dir" \
+      --egs.stage $get_egs_stage \
+      --egs.opts "--frames-overlap-per-eg 0" \
+      --egs.chunk-width $frames_per_eg \
+      --trainer.num-chunk-per-minibatch $minibatch_size \
+      --trainer.frames-per-iter 1500000 \
+      --trainer.num-epochs $num_epochs_s2 \
+      --trainer.optimization.num-jobs-initial $num_jobs_initial_s2 \
+      --trainer.optimization.num-jobs-final $num_jobs_final_s2 \
+      --trainer.optimization.initial-effective-lrate $initial_effective_lrate_s2 \
+      --trainer.optimization.final-effective-lrate $final_effective_lrate_s2 \
+      --trainer.max-param-change $max_param_change \
+      --cleanup.remove-egs false \
+      --feat-dir data/${train_set}_hires \
+      --tree-dir $treedir \
+      --lat-dir exp/tri4_lats_nodup$suffix \
+      --dir $dir/stage2 || exit 1;
+  fi
+  if [ $multi_stage_train -le 2 ]; then
+    cp $dir/stage2/den.fst $dir/stage3/.
+    echo "$0: removing the sibling network's regularizer outputs and renaming"
+    echo "its chain output, so that the sibling network is trained with the"
+    echo "chain objective."
+    echo "A teacher-student objective could be added in the future."
+    nnet3-am-copy --edits-config=$dir/stage3/configs/edits.config \
+      $dir/stage2/final.mdl $dir/stage3/0.mdl || exit 1;
+    mkdir -p $dir/stage3/configs
+    train_stage_s3=0
+    if [ $train_stage -gt $train_stage_s3 ]; then
+      train_stage_s3=$train_stage
+    fi
+    cp -r $dir/stage2/configs $dir/stage3/configs || exit 1;
+    steps/nnet3/chain/train.py --stage $train_stage_s3 \
+      --cmd "$decode_cmd" \
+      --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \
+      --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
+      --chain.xent-regularize $xent_regularize \
+      --chain.leaky-hmm-coefficient 0.1 \
+      --chain.l2-regularize 0.00005 \
+      --chain.apply-deriv-weights false \
+      --chain.lm-opts="--num-extra-lm-states=2000" \
+      --egs.dir "$common_egs_dir" \
+      --egs.stage $get_egs_stage \
+      --egs.opts "--frames-overlap-per-eg 0" \
+      --egs.chunk-width $frames_per_eg \
+      --trainer.num-chunk-per-minibatch $minibatch_size \
+      --trainer.frames-per-iter 1500000 \
+      --trainer.num-epochs $num_epochs_s3 \
+      --trainer.optimization.num-jobs-initial $num_jobs_initial_s3 \
+      --trainer.optimization.num-jobs-final $num_jobs_final_s3 \
+      --trainer.optimization.initial-effective-lrate $initial_effective_lrate_s3 \
+      --trainer.optimization.final-effective-lrate $final_effective_lrate_s3 \
+      --trainer.max-param-change $max_param_change \
+      --cleanup.remove-egs $remove_egs \
+      --feat-dir data/${train_set}_hires \
+      --tree-dir $treedir \
+      --lat-dir exp/tri4_lats_nodup$suffix \
+      --dir $dir/stage3 || exit 1;
+
+  fi
+fi
+dir=$dir/stage2
+if [ $stage -le 14 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+if [ $stage -le 15 ]; then
+  iter_opts=
+  if [ ! -z $decode_iter ]; then
+    iter_opts=" --iter $decode_iter "
+  fi
+  for decode_set in train_dev eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+          --nj 50 --cmd "$decode_cmd" $iter_opts \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+          $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1;
+      if $has_fisher; then
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) &
+  done
+fi
+wait;
+exit 0;
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh
new file mode 100755
index 00000000000..cb86ffd71d6
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh
@@ -0,0 +1,411 @@
+#!/bin/bash
+# _7o is as _7n but it has an extra regressor-layer which maximizes the
+# weighted log-likelihood product of the two networks, where the weights
+# are the priors of each output class.
+# _7n is as _7i but it has a single hint-guide in the middle of the primary
+# and sibling networks.
+# _7i is as _7h but it uses a sibling network and multi-stage training to
+# transfer information from the larger network to the smaller network.
+# It uses multi-stage training to train the sibling network, which has fewer
+# parameters.
+# The 1st stage of training is the same as the baseline tdnn_7d and trains
+# the primary network.
+# The 2nd stage trains the sibling network using the regularizers at all
+# layers as objectives, and in the 3rd stage we train the sibling network
+# with the chain objective for 1 epoch.
+
+#System                  tdnn_7g     tdnn_7h
+#WER on train_dev(tg)    13.98       13.84
+#WER on train_dev(fg)    12.78       12.84
+#WER on eval2000(tg)     16.7        16.5
+#WER on eval2000(fg)     14.9        14.8
+#Final train prob        -0.0817467  -0.0889771
+#Final valid prob        -0.110475   -0.113102
+#Final train prob (xent) -1.20065    -1.2533
+#Final valid prob (xent) -1.3313     -1.36743
+#
+set -e
+
+# configs for 'chain'
+affix=
+stage=12
+multi_stage_train=1
+train_stage=-10
+get_egs_stage=-10
+speed_perturb=true
+dir=exp/chain/tdnn_7o  # Note: _sp will get added to this if $speed_perturb == true.
+decode_iter=
+chain_regularize=0.5
+num_epochs_s2=1
+num_epochs_s3=1
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+initial_effective_lrate_s2=0.001
+final_effective_lrate_s2=0.0001
+initial_effective_lrate_s3=0.0005
+final_effective_lrate_s3=0.00005
+leftmost_questions_truncate=-1
+max_param_change=2.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+num_jobs_initial_s2=3
+num_jobs_final_s2=16
+num_jobs_initial_s3=3
+num_jobs_final_s3=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+common_egs_dir=exp/chain/tdnn_7h_sp/egs
+xent_regularize=0.1
+src_mdl=exp/chain/tdnn_7h_sp/final.mdl
+regressor_prior_scale=-0.25
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <$lang/topo
+fi
+
+if [ $stage -le 11 ]; then
+  # Build a tree using our new topology.  This is the critically different
+  # step compared with other recipes.
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate $leftmost_questions_truncate \
+      --context-opts "--context-width=2 --central-position=1" \
+      --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then
+  echo "$0: creating neural net configs using the xconfig parser for the primary network";
+
+  num_targets=$(tree-info exp/chain/tri5_7d_tree_sp/tree |grep num-pdfs|awk '{print $2}')
+  learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python)
+
+  mkdir -p $dir/stage1/configs
+  cat <<EOF > $dir/stage1/configs/network.xconfig
+  input dim=100 name=ivector
+  input dim=40 name=input
+
+  # please note that it is important to have the input layer with name=input
+  # as the layer immediately preceding the fixed-affine-layer, to enable
+  # the use of the short notation for the descriptor
+  fixed-affine-layer name=lda input=Append(input@-1,input,input@1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/stage1/configs/lda.mat
+
+  # the first splicing is moved before the lda layer, so no splicing here
+  relu-renorm-layer name=tdnn1 dim=625
+  relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=625
+  relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=625
+  relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=625
+  relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=625
+  relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=625
+  relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=625
+
+  ## adding the layers for the chain branch
+  relu-renorm-layer name=prefinal-chain input=tdnn7 dim=625 target-rms=0.5
+  output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5
+
+  # adding the layers for the xent branch
+  # This block prints the configs for a separate output that will be
+  # trained with a cross-entropy objective in the 'chain' models... this
+  # has the effect of regularizing the hidden parts of the model.  we use
+  # 0.5 / args.xent_regularize as the learning rate factor- the factor of
+  # 0.5 / args.xent_regularize is suitable as it means the xent
+  # final-layer learns at a rate independent of the regularization
+  # constant; and the 0.5 was tuned so as to make the relative progress
+  # similar in the xent and regular final layers.
+  relu-renorm-layer name=prefinal-xent input=tdnn7 dim=625 target-rms=0.5
+  output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5
+
+EOF
+  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/stage1/configs/network.xconfig \
+    --config-dir $dir/stage1/configs/
+
+  echo "$0: creating neural net configs using the xconfig parser for the sibling network"
+  sibling_dim=300
+  primary_dim=625
+  regressor_lr_factor=1.0
+  regressor_scale=`echo $regressor_lr_factor $primary_dim | awk '{printf "%.8f \n", $1/$2}'`
+  prior_scale_factor=`echo $regressor_lr_factor $num_targets | awk '{printf "%.8f \n", $1/sqrt($2)}'`
+  regressor_scale_vec=""
+  negate_vec=""
+  for i in `seq $primary_dim`; do
+    regressor_scale_vec="$regressor_scale_vec $regressor_scale"
+  done
+  for i in `seq $num_targets`; do
+    negate_vec="$negate_vec -1.0"
+  done
+  mkdir -p $dir/stage2
+  cat <<EOF > $dir/stage2/regressor_scale.vec
+[ $regressor_scale_vec ]
+EOF
+  cat <<EOF > $dir/stage2/negate.vec
+[ $negate_vec ]
+EOF
+  mkdir -p $dir/stage2/configs
+  cat <<EOF > $dir/stage2/configs/network.xconfig
+
+  # the first splicing is moved before the lda layer, so no splicing here
+  relu-renorm-layer name=tdnn1-sibling input=lda dim=$sibling_dim
+  relu-renorm-layer name=tdnn2-sibling input=Append(-1,0,1) dim=$sibling_dim
+  relu-renorm-layer name=tdnn3-sibling input=Append(-1,0,1) dim=$sibling_dim
+  relu-renorm-layer name=tdnn4-sibling input=Append(-3,0,3) dim=$sibling_dim
+  relu-renorm-layer name=tdnn5-sibling input=Append(-3,0,3) dim=$sibling_dim
+  relu-renorm-layer name=tdnn6-sibling input=Append(-3,0,3) dim=$sibling_dim
+  relu-renorm-layer name=tdnn7-sibling input=Append(-3,0,3) dim=$sibling_dim
+
+  ## adding the layers for the chain branch
+  relu-renorm-layer name=prefinal-chain-sibling input=tdnn7-sibling dim=$sibling_dim target-rms=0.5
+  output-layer name=output-sibling include-log-softmax=false dim=$num_targets max-change=1.5
+
+  relu-renorm-layer name=prefinal-xent-sibling input=tdnn7-sibling dim=$sibling_dim target-rms=0.5
+  output-layer name=output-xent-sibling dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5
+
+  ## adding the regressor outputs to the sibling network configs.
+  relu-renorm-layer name=tdnn4-regressor input=tdnn4-sibling dim=$primary_dim
+  regressor-layer name=regressor-4 input1=tdnn4-regressor input2=tdnn4 objective-type=linear max-change=1.5 dim=$primary_dim regressor-scale-file=$dir/stage2/regressor_scale.vec supervision-type=unsupervised
+  ## adding a regressor on the log-likelihood outputs, to maximize the
+  ## weighted log-likelihood product of the two networks.
+  regressor-layer name=regressor-final input1=output-sibling.affine input2=output.affine objective-type=quadratic max-change=1.5 dim=$num_targets negate-file=$dir/stage2/negate.vec supervision-type=unsupervised regressor-scale-file=$dir/stage2/presoftmax_prior_scale.vec
+EOF
+  steps/nnet3/xconfig_to_configs.py --aux-xconfig-file $dir/stage1/configs/network.xconfig \
+    --xconfig-file $dir/stage2/configs/network.xconfig --config-dir $dir/stage2/configs/
+
+  # edits.config contains the edits required for the different stages of
+  # training.  It is applied to the 0.mdl generated at the
+  # prepare_initial_network stage in iter -1.
+  # The edits for the 2nd stage rename the primary network's outputs to
+  # *-primary, so that training does not use those outputs, and rename the
+  # sibling network's outputs to 'output' and 'output-xent'.
+  cat <<EOF > $dir/stage2/configs/edits.config
+rename-node old-name=output new-name=output-primary
+rename-node old-name=output-xent new-name=output-xent-primary
+rename-node old-name=output-sibling new-name=output
+rename-node old-name=output-xent-sibling new-name=output-xent
+EOF
+  # edits.config for the 3rd stage of training.
+  mkdir -p $dir/stage3/configs
+  cat <<EOF > $dir/stage3/configs/edits.config
+remove-output-nodes name=regressor*
+remove-output-nodes name=*-primary
+remove-orphans
+EOF
+  # we skip the add_compatibility stage in xconfig_to_configs.py;
+  # we copy vars from stage1 to stage2 and stage3 for now.
+  cp -r $dir/stage1/configs/vars $dir/stage2/configs/.
+  cp -r $dir/stage1/configs/vars $dir/stage3/configs/.
+fi
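For the regressor-final layer configured above, the script's header describes maximizing a weighted log-likelihood product of the two networks, with per-class weights derived from the output priors; note the prior_scale_factor = 1/sqrt(num_targets) computed earlier, and that the referenced presoftmax_prior_scale.vec is not created in this script. The regressor-layer internals are not part of this patch, so the following is only a loose, self-contained sketch of the stated objective under those assumptions, not the actual implementation:

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Assumed form of the regressor-final objective: a per-class weighted
    // quadratic match between the pre-softmax outputs a (sibling) and b
    // (primary), maximized when the two networks agree on the heavily
    // weighted classes.  'weights' would come from the class priors.
    double WeightedMatchObjf(const std::vector<double> &a,
                             const std::vector<double> &b,
                             const std::vector<double> &weights) {
      double objf = 0.0;
      for (size_t i = 0; i < a.size(); i++) {
        double diff = a[i] - b[i];
        objf += -0.5 * weights[i] * diff * diff;
      }
      return objf;
    }

    int main() {
      // Toy example with 4 classes; weights are priors scaled by
      // 1/sqrt(num_targets), mirroring prior_scale_factor in the script.
      int num_targets = 4;
      double scale = 1.0 / std::sqrt(static_cast<double>(num_targets));
      std::vector<double> priors = {0.4, 0.3, 0.2, 0.1}, weights;
      for (double p : priors) weights.push_back(scale * p);
      std::vector<double> a = {1.0, 0.5, -0.2, 0.1}, b = {0.8, 0.6, -0.1, 0.0};
      std::printf("objf = %.6f\n", WeightedMatchObjf(a, b, weights));
      return 0;
    }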
+if [ $stage -le 13 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+  if [ $multi_stage_train -le 0 ] && [ ! -f $src_mdl ]; then
+    echo "$0: Training the primary network"
+    steps/nnet3/chain/train.py --stage $train_stage \
+      --cmd "$decode_cmd" \
+      --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \
+      --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
+      --chain.xent-regularize $xent_regularize \
+      --chain.leaky-hmm-coefficient 0.1 \
+      --chain.l2-regularize 0.00005 \
+      --chain.apply-deriv-weights false \
+      --chain.lm-opts="--num-extra-lm-states=2000" \
+      --egs.dir "$common_egs_dir" \
+      --egs.stage $get_egs_stage \
+      --egs.opts "--frames-overlap-per-eg 0" \
+      --egs.chunk-width $frames_per_eg \
+      --trainer.num-chunk-per-minibatch $minibatch_size \
+      --trainer.frames-per-iter 1500000 \
+      --trainer.num-epochs $num_epochs \
+      --trainer.optimization.num-jobs-initial $num_jobs_initial \
+      --trainer.optimization.num-jobs-final $num_jobs_final \
+      --trainer.optimization.initial-effective-lrate $initial_effective_lrate \
+      --trainer.optimization.final-effective-lrate $final_effective_lrate \
+      --trainer.max-param-change $max_param_change \
+      --cleanup.remove-egs $remove_egs \
+      --feat-dir data/${train_set}_hires \
+      --tree-dir $treedir \
+      --lat-dir exp/tri4_lats_nodup$suffix \
+      --dir $dir/stage1 || exit 1;
+  fi
+
+  if [ $multi_stage_train -le 1 ]; then
+    mkdir -p $dir/stage2
+    echo "$0: copying the final primary network in $src_mdl to"
+    echo "$dir/stage2/init.raw, with all learning-rate factors set to zero,"
+    echo "to serve as the fixed primary network during sibling training."
+    nnet3-am-copy --raw=true \
+      --edits='set-learning-rate-factor name=* learning-rate-factor=0.0;' \
+      $src_mdl $dir/stage2/init.raw || exit 1;
+
+    echo "$0: Training the sibling network using the regularizer objectives."
+    steps/nnet3/chain/train.py --stage $train_stage \
+      --cmd "$decode_cmd" \
+      --init-raw-model $dir/stage2/init.raw \
+      --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \
+      --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
+      --chain.leaky-hmm-coefficient 0.1 \
+      --chain.l2-regularize 0.00005 \
+      --chain.chain-regularize $chain_regularize \
+      --chain.apply-deriv-weights false \
+      --chain.lm-opts="--num-extra-lm-states=2000" \
+      --egs.dir "$common_egs_dir" \
+      --egs.stage $get_egs_stage \
+      --egs.opts "--frames-overlap-per-eg 0" \
+      --egs.chunk-width $frames_per_eg \
+      --trainer.num-chunk-per-minibatch $minibatch_size \
+      --trainer.frames-per-iter 1500000 \
+      --trainer.num-epochs $num_epochs_s2 \
+      --trainer.optimization.num-jobs-initial $num_jobs_initial_s2 \
+      --trainer.optimization.num-jobs-final $num_jobs_final_s2 \
+      --trainer.optimization.initial-effective-lrate $initial_effective_lrate_s2 \
+      --trainer.optimization.final-effective-lrate $final_effective_lrate_s2 \
+      --trainer.presoftmax-prior-scale-power $regressor_prior_scale \
+      --trainer.prior-scale-factor $prior_scale_factor \
+      --trainer.max-param-change $max_param_change \
+      --cleanup.remove-egs false \
+      --feat-dir data/${train_set}_hires \
+      --tree-dir $treedir \
+      --lat-dir exp/tri4_lats_nodup$suffix \
+      --dir $dir/stage2 || exit 1;
+  fi
+  if [ $multi_stage_train -le 2 ]; then
+    cp $dir/stage2/den.fst $dir/stage3/.
+    echo "$0: removing the sibling network's regularizer outputs and renaming"
+    echo "its chain output, so that the sibling network is trained with the"
+    echo "chain objective."
+    echo "A teacher-student objective could be added in the future."
+ nnet3-am-copy --edits-config=$dir/stage3/configs/edits.config \ + $dir/stage2/final.mdl $dir/stage3/0.mdl || exit 1; + mkdir -p $dir/stage3/configs + train_stage_s3=0 + if [ $train_stage -gt $train_stage_s3 ]; then + train_stage_s3=$train_stage + fi + cp -r $dir/stage2/configs $dir/stage3/configs || exit 1; + steps/nnet3/chain/train.py --stage $train_stage_s3 \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_s3 \ + --trainer.optimization.num-jobs-initial $num_jobs_initial_s3 \ + --trainer.optimization.num-jobs-final $num_jobs_final_s3 \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate_s3 \ + --trainer.optimization.final-effective-lrate $final_effective_lrate_s3 \ + --trainer.max-param-change $max_param_change \ + --trainer.presoftmax-prior-scale-power $regressor_prior_scale \ + --trainer.prior-scale-factor $prior_scale_factor \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir/stage3 || exit 1; + + fi +fi +#dirs_for_decode="$dir/stage2 $dir/stage3" +dirs_for_decode="$dir/stage2" +#dirs_for_decode="$dir/stage3" +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + for dset in $dirs_for_decode; do + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dset $dset/graph_sw1_tg + done +fi + +decode_suff=sw1_tg +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for dset in $dirs_for_decode; do + graph_dir=$dset/graph_sw1_tg + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dset/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dset/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done + done +fi +wait; +exit 0;