diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 00000000000..660c62884be --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,18 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: bug +assignees: '' + +--- + + diff --git a/.github/ISSUE_TEMPLATE/feature-proposal-discussion.md b/.github/ISSUE_TEMPLATE/feature-proposal-discussion.md new file mode 100644 index 00000000000..61e797b9ca1 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature-proposal-discussion.md @@ -0,0 +1,18 @@ +--- +name: Feature proposal or discussion +about: Suggest an idea for Kaldi +title: '' +labels: discussion +assignees: '' + +--- + + diff --git a/.gitignore b/.gitignore index 910d5cb019d..5764bfe22c6 100644 --- a/.gitignore +++ b/.gitignore @@ -83,6 +83,7 @@ GSYMS /tools/ATLAS/ /tools/atlas3.8.3.tar.gz /tools/irstlm/ +/tools/mitlm/ /tools/openfst /tools/openfst-1.3.2.tar.gz /tools/openfst-1.3.2/ @@ -147,3 +148,4 @@ GSYMS /tools/cub-1.8.0.zip /tools/cub-1.8.0/ /tools/cub +/tools/python/ diff --git a/docker/README.md b/docker/README.md new file mode 100644 index 00000000000..852e9531bd6 --- /dev/null +++ b/docker/README.md @@ -0,0 +1,30 @@ +# Kaldi Docker images + +Kaldi offers two sets of images: CPU-based images and GPU-based images. Builds of the latest master branch (both CPU and GPU images) are pushed daily to [DockerHub](https://hub.docker.com/r/kaldiasr/kaldi). + +## Using pre-built images +Sample usage of the CPU-based image: +```bash +docker run -it kaldiasr/kaldi:latest bash +``` + +Sample usage of the GPU-based image: + +Note: use [nvidia-docker](https://github.com/NVIDIA/nvidia-docker) to run the GPU images. + +```bash +docker run -it --runtime=nvidia kaldiasr/kaldi:gpu-latest bash +``` + +## Building images locally +To build the CPU-based image: +```bash +cd docker/debian9.8-cpu +docker build --tag kaldiasr/kaldi:latest . +``` + +and for the GPU-based image: +```bash +cd docker/ubuntu16.04-gpu +docker build --tag kaldiasr/kaldi:gpu-latest .
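# Optional sanity check (a hedged example, not part of the upstream README):
# if the NVIDIA container runtime is installed as noted above, the freshly
# built image should be able to see the GPU.
docker run --rm --runtime=nvidia kaldiasr/kaldi:gpu-latest nvidia-smi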
+``` diff --git a/docker/debian9.8-cpu/Dockerfile b/docker/debian9.8-cpu/Dockerfile new file mode 100644 index 00000000000..fb2ef6e8db6 --- /dev/null +++ b/docker/debian9.8-cpu/Dockerfile @@ -0,0 +1,40 @@ + +FROM debian:9.8 +LABEL maintainer="mdoulaty@gmail.com" + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + g++ \ + make \ + automake \ + autoconf \ + bzip2 \ + unzip \ + wget \ + sox \ + libtool \ + git \ + subversion \ + python2.7 \ + python3 \ + zlib1g-dev \ + ca-certificates \ + patch \ + ffmpeg \ + vim && \ + rm -rf /var/lib/apt/lists/* + +RUN ln -s /usr/bin/python2.7 /usr/bin/python + +RUN git clone --depth 1 https://github.com/kaldi-asr/kaldi.git /opt/kaldi && \ + cd /opt/kaldi && \ + cd /opt/kaldi/tools && \ + ./extras/install_mkl.sh && \ + make -j $(nproc) && \ + cd /opt/kaldi/src && \ + ./configure --shared && \ + make depend -j $(nproc) && \ + make -j $(nproc) + +WORKDIR /opt/kaldi/ + diff --git a/docker/ubuntu16.04-gpu/Dockerfile b/docker/ubuntu16.04-gpu/Dockerfile new file mode 100644 index 00000000000..49189b2970f --- /dev/null +++ b/docker/ubuntu16.04-gpu/Dockerfile @@ -0,0 +1,40 @@ + +FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu16.04 +LABEL maintainer="mdoulaty@gmail.com" + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + g++ \ + make \ + automake \ + autoconf \ + bzip2 \ + unzip \ + wget \ + sox \ + libtool \ + git \ + subversion \ + python2.7 \ + python3 \ + zlib1g-dev \ + ca-certificates \ + patch \ + ffmpeg \ + vim && \ + rm -rf /var/lib/apt/lists/* + +RUN ln -s /usr/bin/python2.7 /usr/bin/python + +RUN git clone --depth 1 https://github.com/kaldi-asr/kaldi.git /opt/kaldi && \ + cd /opt/kaldi && \ + cd /opt/kaldi/tools && \ + ./extras/install_mkl.sh && \ + make -j $(nproc) && \ + cd /opt/kaldi/src && \ + ./configure --shared --use-cuda && \ + make depend -j $(nproc) && \ + make -j $(nproc) + +WORKDIR /opt/kaldi/ + diff --git a/egs/aidatatang_200zh/README.md b/egs/aidatatang_200zh/README.md new file mode 100644 index 00000000000..097454d84ce --- /dev/null +++ b/egs/aidatatang_200zh/README.md @@ -0,0 +1,21 @@ +Aidatatang_200zh is a free Chinese Mandarin speech corpus provided by Beijing DataTang Technology Co., Ltd. under the Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International Public License. + +**About the aidatatang_200zh corpus:** + +- The corpus contains 200 hours of acoustic data, which is mostly mobile-recorded data. +- 600 speakers from different accent areas in China were invited to participate in the recording. +- The transcription accuracy for each sentence is higher than 98%. +- Recordings were conducted in a quiet indoor environment. +- The database is divided into training, validation, and test sets in a ratio of 7:1:2. +- Detailed information such as speech data coding and speaker information is preserved in the metadata file. +- Segmented transcripts are also provided. + +You can get the corpus from [here](https://www.datatang.com/webfront/opensource.html). + +DataTang is a community of creators: world-changers and future-builders. We are invested in collaborating with a diverse set of voices in the AI world and are excited about working on large-scale projects. Beyond speech, we also provide resources for image and text. For more details, please visit [datatang]().
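The corpus is also available from OpenSLR, and the recipe ships a small helper script for fetching and unpacking it. A minimal sketch (the destination path is a placeholder; the URL base is taken from the usage example inside local/download_and_untar.sh, and the archive is roughly 18 GB):

```bash
# Run from egs/aidatatang_200zh/s5; downloads the archive and un-tars the per-set wav bundles.
local/download_and_untar.sh /path/to/corpus_storage www.openslr.org/resources/62 aidatatang_200zh
```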
+ +**About the recipe:** + +To demonstrate that this corpus is a reasonable data resource for Chinese Mandarin speech recognition research, a baseline recipe is provided here for everyone to explore their own systems easily and quickly. + +In this directory, each subdirectory contains the scripts for a sequence of experiments. The recipe in subdirectory "s5" is based on the hkust s5 recipe and aishell s5 recipe. It generates an integrated phonetic lexicon with CMU dictionary and cedit dictionary. This recipe follows the Mono+Triphone+SAT+fMLLR+DNN pipeline. In addition, this directory will be extended as scripts for speaker diarization and so on are created. diff --git a/egs/aidatatang_200zh/s5/RESULTS b/egs/aidatatang_200zh/s5/RESULTS new file mode 100644 index 00000000000..8c458e8015e --- /dev/null +++ b/egs/aidatatang_200zh/s5/RESULTS @@ -0,0 +1,17 @@ +%WER 37.09 [ 173936 / 468933, 4868 ins, 31143 del, 137925 sub ] exp/mono/decode_test/cer_10_0.0 +%WER 17.98 [ 84305 / 468933, 4724 ins, 12637 del, 66944 sub ] exp/tri1/decode_test/cer_13_0.0 +%WER 17.94 [ 84149 / 468933, 5025 ins, 12427 del, 66697 sub ] exp/tri2/decode_test/cer_13_0.0 +%WER 17.26 [ 80945 / 468933, 4421 ins, 12958 del, 63566 sub ] exp/tri3a/decode_test/cer_14_0.0 +%WER 14.16 [ 66424 / 468933, 4567 ins, 10224 del, 51633 sub ] exp/tri4a/decode_test/cer_14_0.0 +%WER 12.22 [ 57304 / 468933, 4799 ins, 8197 del, 44308 sub ] exp/tri5a/decode_test/cer_14_0.0 +%WER 5.59 [ 26232 / 468933, 1701 ins, 4377 del, 20154 sub ] exp/chain/tdnn_1a_sp/decode_test/cer_10_0.0 + +# nnet3 tdnn with online pitch, local/nnet3/tuning/run_tdnn_2a.sh +%WER 7.21 [ 33797 / 468933, 2141 ins, 6117 del, 25539 sub ] exp/nnet3/tdnn_sp/decode_test/cer_13_0.0 +%WER 7.44 [ 34878 / 468933, 2252 ins, 5854 del, 26772 sub ] exp/nnet3/tdnn_sp_online/decode_test/cer_12_0.0 +%WER 7.79 [ 36542 / 468933, 2527 ins, 5674 del, 28341 sub ] exp/nnet3/tdnn_sp_online/decode_test_per_utt/cer_12_0.0 + +# chain with online pitch, local/chain/tuning/run_tdnn_2a.sh +%WER 5.61 [ 26311 / 468933, 1773 ins, 4789 del, 19749 sub ] exp/chain/tdnn_2a_sp/decode_test/cer_11_0.0 +%WER 5.69 [ 26661 / 468933, 1723 ins, 4724 del, 20214 sub ] exp/chain/tdnn_2a_sp_online/decode_test/cer_11_0.0 +%WER 5.98 [ 28046 / 468933, 2031 ins, 4527 del, 21488 sub ] exp/chain/tdnn_2a_sp_online/decode_test_per_utt/cer_11_0.0 diff --git a/egs/aidatatang_200zh/s5/cmd.sh b/egs/aidatatang_200zh/s5/cmd.sh new file mode 100644 index 00000000000..811adcde474 --- /dev/null +++ b/egs/aidatatang_200zh/s5/cmd.sh @@ -0,0 +1,14 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
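For a single machine with no queueing system, the two exports below can simply be switched to run.pl, as the comment above describes. A minimal sketch (run.pl ignores the --mem options, so keep the number of parallel jobs modest):

```bash
# Local-machine alternative to the queue.pl defaults below:
export train_cmd=run.pl
export decode_cmd=run.pl
```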
+ +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" diff --git a/egs/aidatatang_200zh/s5/conf/cmu2pinyin b/egs/aidatatang_200zh/s5/conf/cmu2pinyin new file mode 100644 index 00000000000..c02eb600fcc --- /dev/null +++ b/egs/aidatatang_200zh/s5/conf/cmu2pinyin @@ -0,0 +1,39 @@ +AA A +AE A +AH A +AO UO +AW U +AY AI +B B +CH CH +D D +DH S I +EH AI +ER E +EY AI +F F +G G +HH H +IH I +IY I +JH ZH +K K +L L +M M +N N +NG N +OW UO +OY UO +P P +R R +S S +SH SH +T T +TH S +UH U +UW U +V W +W W +Y Y +Z Z +ZH X diff --git a/egs/aidatatang_200zh/s5/conf/decode.config b/egs/aidatatang_200zh/s5/conf/decode.config new file mode 100644 index 00000000000..d91f86183af --- /dev/null +++ b/egs/aidatatang_200zh/s5/conf/decode.config @@ -0,0 +1,5 @@ +beam=11.0 # beam for decoding. Was 13.0 in the scripts. +first_beam=8.0 # beam for 1st-pass decoding in SAT. + + + diff --git a/egs/aidatatang_200zh/s5/conf/mfcc.conf b/egs/aidatatang_200zh/s5/conf/mfcc.conf new file mode 100644 index 00000000000..a1aa3d6c158 --- /dev/null +++ b/egs/aidatatang_200zh/s5/conf/mfcc.conf @@ -0,0 +1,2 @@ +--use-energy=false # only non-default option. +--sample-frequency=16000 diff --git a/egs/aidatatang_200zh/s5/conf/mfcc_hires.conf b/egs/aidatatang_200zh/s5/conf/mfcc_hires.conf new file mode 100644 index 00000000000..ca067e77b37 --- /dev/null +++ b/egs/aidatatang_200zh/s5/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training. +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--sample-frequency=16000 # aidatatang_200zh data is sampled at 16kHz +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=40 # low cutoff frequency for mel bins +--high-freq=-200 # high cutoff frequency, relative to the Nyquist of 8000 (=7800) diff --git a/egs/aidatatang_200zh/s5/conf/online_cmvn.conf b/egs/aidatatang_200zh/s5/conf/online_cmvn.conf new file mode 100644 index 00000000000..591367e7ae9 --- /dev/null +++ b/egs/aidatatang_200zh/s5/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used when invoking online2-wav-nnet3-latgen-faster.
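In the recipe these configs are consumed by the wrapper scripts (steps/make_mfcc_pitch.sh and friends, driven from local/nnet3/run_ivector_common.sh), but the high-resolution config above can also be handed directly to Kaldi's feature extractor. A hedged one-off example, assuming a prepared data directory with a wav.scp:

```bash
# One-off high-resolution MFCC extraction (pitch is appended separately in the
# recipe itself, via steps/make_mfcc_pitch.sh):
compute-mfcc-feats --config=conf/mfcc_hires.conf \
  scp:data/train_sp_hires/wav.scp ark,scp:mfcc_hires.ark,mfcc_hires.scp
```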
diff --git a/egs/aidatatang_200zh/s5/conf/online_pitch.conf b/egs/aidatatang_200zh/s5/conf/online_pitch.conf new file mode 100644 index 00000000000..c0f1342160d --- /dev/null +++ b/egs/aidatatang_200zh/s5/conf/online_pitch.conf @@ -0,0 +1,4 @@ +--sample-frequency=16000 +--simulate-first-pass-online=true +--normalization-right-context=25 +--frames-per-chunk=10 diff --git a/egs/aidatatang_200zh/s5/conf/pinyin2cmu b/egs/aidatatang_200zh/s5/conf/pinyin2cmu new file mode 100644 index 00000000000..a6e53620479 --- /dev/null +++ b/egs/aidatatang_200zh/s5/conf/pinyin2cmu @@ -0,0 +1,58 @@ +A AA +AI AY +AN AE N +ANG AE NG +AO AW +B B +CH CH +C T S +D D +E ER +EI EY +EN AH N +ENG AH NG +ER AA R +F F +G G +H HH +IA IY AA +IANG IY AE NG +IAN IY AE N +IAO IY AW +IE IY EH +I IY +ING IY NG +IN IY N +IONG IY UH NG +IU IY UH +J J +K K +L L +M M +N N +O AO +ONG UH NG +OU OW +P P +Q Q +R R +SH SH +S S +T T +UAI UW AY +UANG UW AE NG +UAN UW AE N +UA UW AA +UI UW IY +UN UW AH N +UO UW AO +U UW +UE IY EH +VE IY EH +V IY UW +VN IY N +W W +X X +Y Y +ZH JH +Z Z diff --git a/egs/aidatatang_200zh/s5/conf/pinyin_initial b/egs/aidatatang_200zh/s5/conf/pinyin_initial new file mode 100644 index 00000000000..e263ad07e2a --- /dev/null +++ b/egs/aidatatang_200zh/s5/conf/pinyin_initial @@ -0,0 +1,23 @@ +B +C +CH +D +F +G +H +J +K +L +M +N +P +Q +R +S +SH +T +W +X +Y +Z +ZH diff --git a/egs/aidatatang_200zh/s5/conf/pitch.conf b/egs/aidatatang_200zh/s5/conf/pitch.conf new file mode 100644 index 00000000000..e959a19d5b8 --- /dev/null +++ b/egs/aidatatang_200zh/s5/conf/pitch.conf @@ -0,0 +1 @@ +--sample-frequency=16000 diff --git a/egs/aidatatang_200zh/s5/local/chain/compare_wer.sh b/egs/aidatatang_200zh/s5/local/chain/compare_wer.sh new file mode 100755 index 00000000000..71e6fbe106d --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/chain/compare_wer.sh @@ -0,0 +1,97 @@ +#!/bin/bash +# Copyright 2018 Emotech LTD (Author: Xuechen Liu) + +# compare wer between diff. models in aidatatang_200zh chain directory +# exemplar usage: local/chain/compare_wer.sh --online exp/chain/tdnn_2a_sp +# note: this script is made quite general since we kinda wanna give more flexibility to +# users on adding affix for their own use when training models. + +set -e +. ./cmd.sh +. ./path.sh + +if [ $# == 0 ]; then + echo "Usage: $0: [--online] [ ... ]" + echo "e.g.: $0 --online exp/chain/tdnn_2a_sp" + exit 1 +fi + +echo "# $0 $*" + +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) +} + +# print model names +echo -n "# Model " +for x in $*; do + printf "% 10s" " $(basename $x)" +done +echo + +# print decode WER results +echo -n "# WER(%) " +for x in $*; do + set_names $x + wer=$([ -d $x ] && grep WER $x/decode_test/cer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +# so how about online WER? 
+if $include_online; then + echo -n "# WER(%)[online] " + for x in $*; do + set_names $x + wer=$(cat ${x}_online/decode_test/cer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + echo -n "# WER(%)[per-utt] " + for x in $*; do + set_names $x + wer_per_utt=$(cat ${x}_online/decode_test_per_utt/cer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer_per_utt + done + echo +fi + +# print final log prob for train & validation +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf($8)}' | cut -c1-7) + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf($8)}' | cut -c1-7) + printf "% 10s" $prob +done +echo + +# do the same for xent objective +echo -n "# Final train prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo diff --git a/egs/aidatatang_200zh/s5/local/chain/run_tdnn.sh b/egs/aidatatang_200zh/s5/local/chain/run_tdnn.sh new file mode 120000 index 00000000000..34499362831 --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/chain/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1a.sh \ No newline at end of file diff --git a/egs/aidatatang_200zh/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/aidatatang_200zh/s5/local/chain/tuning/run_tdnn_1a.sh new file mode 100644 index 00000000000..0be0e2c79c6 --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/chain/tuning/run_tdnn_1a.sh @@ -0,0 +1,193 @@ +#!/bin/bash + +# This script is based on run_tdnn_7h.sh in swbd chain recipe. + +# results +# local/chain/compare_wer.sh exp/chain/tdnn_1a_sp/ +# Model tdnn_1a_sp +# WER(%) 5.59 +# Final train prob -0.0488 +# Final valid prob -0.0925 +# Final train prob (xent) -0.8001 +# Final valid prob (xent) -1.0398 + +set -e + +# configs for 'chain' +affix= +stage=0 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn_1a # Note: _sp will get added to this +decode_iter= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=2 +num_jobs_final=12 +minibatch_size=128 +frames_per_eg=150,110,90 +remove_egs=true +common_egs_dir= +xent_regularize=0.1 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 9 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 5000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 10 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=625 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=625 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=625 + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=625 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=625 + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=625 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 11 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aidatatang-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri5a_sp_lats \ + --dir $dir || exit 1; +fi + +if [ $stage -le 12 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph +fi + +graph_dir=$dir/graph +if [ $stage -le 13 ]; then + for test_set in dev test; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 10 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_$test_set \ + $graph_dir data/${test_set}_hires $dir/decode_${test_set} || exit 1; + done +fi + +exit; diff --git a/egs/aidatatang_200zh/s5/local/chain/tuning/run_tdnn_2a.sh b/egs/aidatatang_200zh/s5/local/chain/tuning/run_tdnn_2a.sh new file mode 100644 index 00000000000..78dd4000e58 --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/chain/tuning/run_tdnn_2a.sh @@ -0,0 +1,238 @@ +#!/bin/bash + +# This script is based on run_tdnn_1a.sh. +# This setup used online pitch to train the neural network. +# It requires a online_pitch.conf in the conf dir. 
+ +# results +# local/chain/compare_wer.sh exp/chain/tdnn_2a_sp +# Model tdnn_2a_sp +# WER(%) 5.61 +# Final train prob -0.0502 +# Final valid prob -0.0913 +# Final train prob (xent) -0.8047 +# Final valid prob (xent) -1.0292 + +# local/chain/compare_wer.sh --online exp/chain/tdnn_2a_sp +# Model tdnn_2a_sp +# WER(%) 5.61 +# WER(%)[online] 5.69 +# WER(%)[per-utt] 5.98 +# Final train prob -0.0502 +# Final valid prob -0.0913 +# Final train prob (xent) -0.8047 +# Final valid prob (xent) -1.0292 + +# local/chain/compare_wer.sh exp/chain/tdnn_1a_sp exp/chain/tdnn_2a_sp +# Model tdnn_1a_sp tdnn_2a_sp +# WER(%) 5.59 5.61 +# Final train prob -0.0488 -0.0502 +# Final valid prob -0.0925 -0.0913 +# Final train prob (xent) -0.8001 -0.8047 +# Final valid prob (xent) -1.0398 -1.0292 + +set -e + +# configs for 'chain' +affix= +stage=0 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn_2a # Note: _sp will get added to this +decode_iter= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=2 +num_jobs_final=12 +minibatch_size=128 +frames_per_eg=150,110,90 +remove_egs=true +common_egs_dir= +xent_regularize=0.1 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 9 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 5000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 10 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=625 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=625 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=625 + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=625 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=625 + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=625 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 11 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aidatatang-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires_online \ + --tree-dir $treedir \ + --lat-dir exp/tri5a_sp_lats \ + --dir $dir || exit 1; +fi + +if [ $stage -le 12 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph +fi + +graph_dir=$dir/graph +if [ $stage -le 13 ]; then + for test_set in dev test; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 10 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_$test_set \ + $graph_dir data/${test_set}_hires_online $dir/decode_${test_set} || exit 1; + done +fi + +if [ $stage -le 14 ]; then + steps/online/nnet3/prepare_online_decoding.sh --mfcc-config conf/mfcc_hires.conf \ + --add-pitch true \ + $lang exp/nnet3/extractor "$dir" ${dir}_online || exit 1; +fi + +dir=${dir}_online +if [ $stage -le 15 ]; then + for test_set in dev test; do + steps/online/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 10 --cmd "$decode_cmd" \ + --config conf/decode.config \ + $graph_dir data/${test_set}_hires_online $dir/decode_${test_set} || exit 1; + done +fi + +if [ $stage -le 16 ]; then + for test_set in dev test; do + steps/online/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 10 --cmd "$decode_cmd" --per-utt true \ + --config conf/decode.config \ + $graph_dir data/${test_set}_hires_online $dir/decode_${test_set}_per_utt || exit 1; + done +fi + +exit; diff --git a/egs/aidatatang_200zh/s5/local/create_oov_char_lexicon.pl b/egs/aidatatang_200zh/s5/local/create_oov_char_lexicon.pl new file mode 100644 index 00000000000..33e2e8061c3 --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/create_oov_char_lexicon.pl @@ -0,0 +1,48 @@ +#!/usr/bin/env perl +# Copyright 2016 Alibaba Robotics Corp. (Author: Xingyu Na) +# +# A script for char-based Chinese OOV lexicon generation. +# +# Input 1: char-based dictionary, example +# CHAR1 ph1 ph2 +# CHAR2 ph3 +# CHAR3 ph2 ph4 +# +# Input 2: OOV word list, example +# WORD1 +# WORD2 +# WORD3 +# +# where WORD1 is in the format of "CHAR1CHAR2". +# +# Output: OOV lexicon, in the format of normal lexicon + +if($#ARGV != 1) { + print STDERR "usage: perl create_oov_char_lexicon.pl chardict oovwordlist > oovlex\n\n"; + print STDERR "### chardict: a dict in which each line contains the pronunciation of one Chinese char\n"; + print STDERR "### oovwordlist: OOV word list\n"; + print STDERR "### oovlex: output OOV lexicon\n"; + exit; +} + +use utf8; +my %prons; +open(DICT, $ARGV[0]) || die("Can't open dict ".$ARGV[0]."\n"); +binmode(DICT,":encoding(utf8)"); +foreach () { + chomp; @A = split(" ", $_); $prons{$A[0]} = $A[1]; +} +close DICT; + +open(WORDS, $ARGV[1]) || die("Can't open oov word list ".$ARGV[1]."\n"); +binmode(WORDS,":encoding(utf8)"); +while () { + chomp; + print $_; + @A = split("", $_); + foreach (@A) { + print " $prons{$_}"; + } + print "\n"; +} +close WORDS; diff --git a/egs/aidatatang_200zh/s5/local/data_prep.sh b/egs/aidatatang_200zh/s5/local/data_prep.sh new file mode 100644 index 00000000000..bb278a7d904 --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/data_prep.sh @@ -0,0 +1,68 @@ +#!/bin/bash + +# Copyright 2017 Xingyu Na +# Apache 2.0 + +. ./path.sh || exit 1; + +if [ $# != 2 ]; then + echo "Usage: $0 " + echo " $0 /export/a05/xna/data/data_aidatatang_200zh/corpus /export/a05/xna/data/data_aidatatang_200zh/transcript" + exit 1; +fi + +aidatatang_audio_dir=$1 +aidatatang_text=$2/aidatatang_200_zh_transcript.txt + +train_dir=data/local/train +dev_dir=data/local/dev +test_dir=data/local/test +tmp_dir=data/local/tmp + +mkdir -p $train_dir +mkdir -p $dev_dir +mkdir -p $test_dir +mkdir -p $tmp_dir + +# data directory check +if [ ! -d $aidatatang_audio_dir ] || [ ! 
-f $aidatatang_text ]; then + echo "Error: $0 requires two directory arguments" + exit 1; +fi + +# find wav audio file for train, dev and test resp. +find $aidatatang_audio_dir -iname "*.wav" > $tmp_dir/wav.flist +n=`cat $tmp_dir/wav.flist | wc -l` +[ $n -ne 237265 ] && \ + echo Warning: expected 237265 data files, found $n + +grep -i "corpus/train" $tmp_dir/wav.flist > $train_dir/wav.flist || exit 1; +grep -i "corpus/dev" $tmp_dir/wav.flist > $dev_dir/wav.flist || exit 1; +grep -i "corpus/test" $tmp_dir/wav.flist > $test_dir/wav.flist || exit 1; + +rm -r $tmp_dir + +# Transcriptions preparation +for dir in $train_dir $dev_dir $test_dir; do + echo Preparing $dir transcriptions + sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list + sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{i=NF-1;printf("%s %s\n",$NF,$i)}' > $dir/utt2spk_all + paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all + utils/filter_scp.pl -f 1 $dir/utt.list $aidatatang_text > $dir/transcripts.txt + awk '{print $1}' $dir/transcripts.txt > $dir/utt.list + utils/filter_scp.pl -f 1 $dir/utt.list $dir/utt2spk_all | sort -u > $dir/utt2spk + utils/filter_scp.pl -f 1 $dir/utt.list $dir/wav.scp_all | sort -u > $dir/wav.scp + sort -u $dir/transcripts.txt > $dir/text + utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt +done + +mkdir -p data/train data/dev data/test + +for f in spk2utt utt2spk wav.scp text; do + cp $train_dir/$f data/train/$f || exit 1; + cp $dev_dir/$f data/dev/$f || exit 1; + cp $test_dir/$f data/test/$f || exit 1; +done + +echo "$0: aidatatang_200zh data preparation succeeded" +exit 0; diff --git a/egs/aidatatang_200zh/s5/local/download_and_untar.sh b/egs/aidatatang_200zh/s5/local/download_and_untar.sh new file mode 100644 index 00000000000..39f9ac01ff7 --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/download_and_untar.sh @@ -0,0 +1,110 @@ +#!/bin/bash + +# Copyright 2014 Johns Hopkins University (author: Daniel Povey) +# 2017 Xingyu Na +# Apache 2.0 + +remove_archive=false + +if [ "$1" == --remove-archive ]; then + remove_archive=true + shift +fi + +if [ $# -ne 3 ]; then + echo "Usage: $0 [--remove-archive] " + echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/62 aidatatang_200zh" + echo "With --remove-archive it will remove the archive after successfully un-tarring it." + echo " can be one of: aidatatang_200zh." +fi + +data=$1 +url=$2 +part=$3 + +if [ ! -d "$data" ]; then + echo "$0: no such directory $data" + exit 1; +fi + +part_ok=false +list="aidatatang_200zh" +for x in $list; do + if [ "$part" == $x ]; then part_ok=true; fi +done +if ! $part_ok; then + echo "$0: expected to be one of $list, but got '$part'" + exit 1; +fi + +if [ -z "$url" ]; then + echo "$0: empty URL base." + exit 1; +fi + +if [ -f $data/$part/.complete ]; then + echo "$0: data part $part was already successfully extracted, nothing to do." + exit 0; +fi + +# sizes of the archive files in bytes. +sizes="18756983399" + +if [ -f $data/$part.tgz ]; then + size=$(/bin/ls -l $data/$part.tgz | awk '{print $5}') + size_ok=false + for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done + if ! $size_ok; then + echo "$0: removing existing file $data/$part.tgz because its size in bytes $size" + echo "does not equal the size of one of the archives." + rm $data/$part.gz + else + echo "$data/$part.tgz exists and appears to be complete." + fi +fi + +if [ ! -f $data/$part.tgz ]; then + if ! which wget >/dev/null; then + echo "$0: wget is not installed." 
+ exit 1; + fi + full_url=$url/$part.tgz + echo "$0: downloading data from $full_url. This may take some time, please be patient." + + cd $data + if ! wget --no-check-certificate $full_url; then + echo "$0: error executing wget $full_url" + exit 1; + fi +fi + +cd $data + +if ! tar -xvzf $part.tgz; then + echo "$0: error un-tarring archive $data/$part.tgz" + exit 1; +fi + +touch $data/$part/.complete + +dev_dir=$data/$part/corpus/dev +test_dir=$data/$part/corpus/test +train_dir=$data/$part/corpus/train +if [ $part == "aidatatang_200zh" ]; then + for set in $dev_dir $test_dir $train_dir;do + cd $set + for wav in ./*.tar.gz; do + echo "Extracting wav from $wav" + tar -zxf $wav && rm $wav + done + done +fi + +echo "$0: Successfully downloaded and un-tarred $data/$part.tgz" + +if $remove_archive; then + echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied." + rm $data/$part.tgz +fi + +exit 0; diff --git a/egs/gale_arabic/s5b/local/gale_format_data.sh b/egs/aidatatang_200zh/s5/local/format_data.sh old mode 100755 new mode 100644 similarity index 73% rename from egs/gale_arabic/s5b/local/gale_format_data.sh rename to egs/aidatatang_200zh/s5/local/format_data.sh index b69c34e68b9..47af9dd9dfd --- a/egs/gale_arabic/s5b/local/gale_format_data.sh +++ b/egs/aidatatang_200zh/s5/local/format_data.sh @@ -1,23 +1,25 @@ #!/bin/bash +# -# Copyright 2014 QCRI (author: Ahmed Ali) -# Apache 2.0 +. ./path.sh -if [ -f path.sh ]; then - . ./path.sh; else - echo "$0: missing path.sh"; exit 1; -fi +silprob=0.5 +mkdir -p data/lang_test data/train data/dev -for dir in test train; do - cp -pr data/local/$dir data/$dir -done - - -mkdir -p data/lang_test arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz [ ! -f $arpa_lm ] && echo No such file $arpa_lm && exit 1; +# Copy stuff into its final locations... + +for f in spk2utt utt2spk wav.scp text; do + cp data/local/train/$f data/train/$f || exit 1; +done + +for f in spk2utt utt2spk wav.scp text; do + cp data/local/dev/$f data/dev/$f || exit 1; +done + rm -r data/lang_test cp -r data/lang data/lang_test @@ -26,15 +28,15 @@ gunzip -c "$arpa_lm" | \ --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst -echo "$0: Checking how stochastic G is (the first of these numbers should be small):" +echo "Checking how stochastic G is (the first of these numbers should be small):" fstisstochastic data/lang_test/G.fst ## Check lexicon. ## just have a look and make sure it seems sane. -echo "$0: First few lines of lexicon FST:" +echo "First few lines of lexicon FST:" fstprint --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt data/lang/L.fst | head -echo "$0: Performing further checks" +echo Performing further checks # Checking that G.fst is determinizable. fstdeterminize data/lang_test/G.fst /dev/null || echo Error determinizing G. @@ -55,6 +57,4 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \ fstisstochastic || echo LG is not stochastic -echo gale_format_data succeeded. - -exit 0 +echo format_data succeeded. diff --git a/egs/aidatatang_200zh/s5/local/nnet3/compare_wer.sh b/egs/aidatatang_200zh/s5/local/nnet3/compare_wer.sh new file mode 100755 index 00000000000..2d85626c356 --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/nnet3/compare_wer.sh @@ -0,0 +1,82 @@ +#!/bin/bash +# Copyright 2018 Emotech LTD (Author: Xuechen Liu) + +# compare wer between diff. 
models in aidatatang_200zh nnet3 directory +# exemplar usage: local/nnet3/compare_wer.sh exp/nnet3/tdnn_sp +# note: this script is made quite general since we kinda wanna give more flexibility to +# users on adding affix for their own use when training models. + +set -e +. ./cmd.sh +. ./path.sh + +if [ $# == 0 ]; then + echo "Usage: $0: [--online] [ ... ]" + echo "e.g.: $0 exp/nnet3/tdnn_sp exp/nnet3/tdnn_sp_pr" + exit 1 +fi + +echo "# $0 $*" + +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) +} + +# print model names +echo -n "# Model " +for x in $*; do + printf "% 10s" " $(basename $x)" +done +echo + +# print decode WER results +echo -n "# WER(%) " +for x in $*; do + set_names $x + wer=$([ -d $x ] && grep WER $x/decode_test/cer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +# so how about online WER? +if $include_online; then + echo -n "# WER(%)[online] " + for x in $*; do + set_names $x + wer=$(cat ${x}_online/decode_test/cer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + echo -n "# WER(%)[per-utt] " + for x in $*; do + set_names $x + wer_per_utt=$(cat ${x}_online/decode_test_per_utt/cer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer_per_utt + done + echo +fi + +# print log for train & validation +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.combined.log | grep log-like | awk '{printf($8)}' | cut -c1-7) + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.combined.log | grep log-like | awk '{printf($8)}' | cut -c1-7) + printf "% 10s" $prob +done +echo diff --git a/egs/aidatatang_200zh/s5/local/nnet3/run_ivector_common.sh b/egs/aidatatang_200zh/s5/local/nnet3/run_ivector_common.sh new file mode 100644 index 00000000000..0fe55ecf000 --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/nnet3/run_ivector_common.sh @@ -0,0 +1,160 @@ +#!/bin/bash + +set -euo pipefail + +# This script is modified based on mini_librispeech/s5/local/nnet3/run_ivector_common.sh + +# This script is called from local/nnet3/run_tdnn.sh and +# local/chain/run_tdnn.sh (and may eventually be called by more +# scripts). It contains the common feature preparation and +# iVector-related parts of the script. See those scripts for examples +# of usage. + +stage=0 +train_set=train +test_sets="dev test" +gmm=tri5a +online=false +nnet3_affix= + +. ./cmd.sh +. ./path.sh +. utils/parse_options.sh + +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_sp_ali + +for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do + if [ ! 
-f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + +online_affix= +if [ $online = true ]; then + online_affix=_online +fi + +if [ $stage -le 1 ]; then + # Although the nnet will be trained by high resolution data, we still have to + # perturb the normal data to get the alignment _sp stands for speed-perturbed + echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp + echo "$0: making MFCC features for low-resolution speed-perturbed data" + steps/make_mfcc_pitch.sh --cmd "$train_cmd" --nj 70 data/${train_set}_sp \ + exp/make_mfcc/train_sp mfcc_perturbed || exit 1; + steps/compute_cmvn_stats.sh data/${train_set}_sp \ + exp/make_mfcc/train_sp mfcc_perturbed || exit 1; + utils/fix_data_dir.sh data/${train_set}_sp +fi + +if [ $stage -le 2 ]; then + echo "$0: aligning with the perturbed low-resolution data" + steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ + data/${train_set}_sp data/lang $gmm_dir $ali_dir || exit 1 +fi + +if [ $stage -le 3 ]; then + # Create high-resolution MFCC features (with 40 cepstra instead of 13). + # this shows how you can split across multiple file-systems. + echo "$0: creating high-resolution MFCC features" + mfccdir=mfcc_perturbed_hires$online_affix + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/mfcc/aidatatang-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp ${test_sets}; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires$online_affix + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires$online_affix || exit 1; + + for datadir in ${train_set}_sp ${test_sets}; do + steps/make_mfcc_pitch$online_affix.sh --nj 10 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires$online_affix exp/make_hires/$datadir $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires$online_affix exp/make_hires/$datadir $mfccdir || exit 1; + utils/fix_data_dir.sh data/${datadir}_hires$online_affix || exit 1; + # create MFCC data dir without pitch to extract iVector + utils/data/limit_feature_dim.sh 0:39 data/${datadir}_hires$online_affix data/${datadir}_hires_nopitch || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires_nopitch exp/make_hires/$datadir $mfccdir || exit 1; + done +fi + +if [ $stage -le 4 ]; then + echo "$0: computing a subset of data to train the diagonal UBM." + # We'll use about a quarter of the data. 
+ mkdir -p exp/nnet3${nnet3_affix}/diag_ubm + temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm + + num_utts_total=$(wc -l $dir/configs/network.xconfig + input dim=$ivector_dim name=ivector + input dim=$feat_dim name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=850 + relu-batchnorm-layer name=tdnn2 dim=850 input=Append(-1,0,2) + relu-batchnorm-layer name=tdnn3 dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn4 dim=850 input=Append(-7,0,2) + relu-batchnorm-layer name=tdnn5 dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn6 dim=850 + output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 8 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aidatatang-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval 500 \ + --feat-dir=data/${train_set}_hires_online \ + --ali-dir $ali_dir \ + --lang data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 9 ]; then + # this version of the decoding treats each utterance separately + # without carrying forward speaker information. + for decode_set in dev test; do + num_jobs=`cat data/${decode_set}_hires_online/utt2spk|cut -d' ' -f2|sort -u|wc -l` + decode_dir=${dir}/decode_$decode_set + steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires_online $decode_dir || exit 1; + done +fi + +if [ $stage -le 10 ]; then + steps/online/nnet3/prepare_online_decoding.sh --mfcc-config conf/mfcc_hires.conf \ + --add-pitch true \ + data/lang exp/nnet3/extractor "$dir" ${dir}_online || exit 1; +fi + +if [ $stage -le 11 ]; then + # do the actual online decoding with iVectors, carrying info forward from + # previous utterances of the same speaker. + for decode_set in dev test; do + num_jobs=`cat data/${decode_set}_hires_online/utt2spk|cut -d' ' -f2|sort -u|wc -l` + decode_dir=${dir}_online/decode_$decode_set + steps/online/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ + --config conf/decode.config \ + $graph_dir data/${decode_set}_hires_online $decode_dir || exit 1; + done +fi + +if [ $stage -le 12 ]; then + # this version of the decoding treats each utterance separately + # without carrying forward speaker information. 
+ for decode_set in dev test; do + num_jobs=`cat data/${decode_set}_hires_online/utt2spk|cut -d' ' -f2|sort -u|wc -l` + decode_dir=${dir}_online/decode_${decode_set}_per_utt + steps/online/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ + --config conf/decode.config --per-utt true \ + $graph_dir data/${decode_set}_hires_online $decode_dir || exit 1; + done +fi + +wait; +exit 0; diff --git a/egs/aidatatang_200zh/s5/local/prepare_dict.sh b/egs/aidatatang_200zh/s5/local/prepare_dict.sh new file mode 100644 index 00000000000..aa72bcd48d2 --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/prepare_dict.sh @@ -0,0 +1,320 @@ +#!/bin/bash +#Copyright 2016 LeSpeech (Author: Xingyu Na) + +# prepare dictionary for aidatatang +# it is done for English and Chinese separately, +# For English, we use CMU dictionary, and Sequitur G2P +# for OOVs, while all englist phone set will concert to Chinese +# phone set at the end. For Chinese, we use an online dictionary, +# for OOV, we just produce pronunciation using Charactrt Mapping. + +. ./path.sh + +[ $# != 0 ] && echo "Usage: $0" && exit 1; + +train_dir=data/local/train +dev_dir=data/local/dev +test_dir=data/local/test +dict_dir=data/local/dict +mkdir -p $dict_dir +mkdir -p $dict_dir/lexicon-{en,ch} + +# extract full vocabulary +cat $train_dir/text $dev_dir/text $test_dir/text | awk '{for (i = 2; i <= NF; i++) print $i}' |\ + perl -ape 's/ /\n/g;' | sort -u | grep -v '\[LAUGHTER\]' | grep -v '\[NOISE\]' |\ + grep -v '\[VOCALIZED-NOISE\]' > $dict_dir/words.txt || exit 1; + +# split into English and Chinese +cat $dict_dir/words.txt | grep '[a-zA-Z]' > $dict_dir/lexicon-en/words-en.txt || exit 1; +cat $dict_dir/words.txt | grep -v '[a-zA-Z]' > $dict_dir/lexicon-ch/words-ch.txt || exit 1; + + +##### produce pronunciations for english +if [ ! -f $dict_dir/cmudict/cmudict.0.7a ]; then + echo "--- Downloading CMU dictionary ..." + svn co -r 13068 https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict \ + $dict_dir/cmudict || exit 1; +fi + +# format cmudict +echo "--- Striping stress and pronunciation variant markers from cmudict ..." +perl $dict_dir/cmudict/scripts/make_baseform.pl \ + $dict_dir/cmudict/cmudict.0.7a /dev/stdout |\ + sed -e 's:^\([^\s(]\+\)([0-9]\+)\(\s\+\)\(.*\):\1\2\3:' > $dict_dir/cmudict/cmudict-plain.txt || exit 1; + +# extract in-vocab lexicon and oov words +echo "--- Searching for English OOV words ..." +awk 'NR==FNR{words[$1]; next;} !($1 in words)' \ + $dict_dir/cmudict/cmudict-plain.txt $dict_dir/lexicon-en/words-en.txt |\ + egrep -v '<.?s>' > $dict_dir/lexicon-en/words-en-oov.txt || exit 1; + +awk 'NR==FNR{words[$1]; next;} ($1 in words)' \ + $dict_dir/lexicon-en/words-en.txt $dict_dir/cmudict/cmudict-plain.txt |\ + egrep -v '<.?s>' > $dict_dir/lexicon-en/lexicon-en-iv.txt || exit 1; + +wc -l $dict_dir/lexicon-en/words-en-oov.txt +wc -l $dict_dir/lexicon-en/lexicon-en-iv.txt + +# setup g2p and generate oov lexicon +if [ ! -f conf/g2p_model ]; then + echo "--- Downloading a pre-trained Sequitur G2P model ..." + wget http://sourceforge.net/projects/kaldi/files/sequitur-model4 -O conf/g2p_model + if [ ! -f conf/g2p_model ]; then + echo "Failed to download the g2p model!" + exit 1 + fi +fi + +echo "--- Preparing pronunciations for OOV words ..." +g2p=`which g2p.py` +if [ ! -x $g2p ]; then + echo "g2p.py is not found. Checkout tools/extras/install_sequitur.sh." 
+ exit 1 +fi +g2p.py --model=conf/g2p_model --apply $dict_dir/lexicon-en/words-en-oov.txt \ + > $dict_dir/lexicon-en/lexicon-en-oov.txt || exit 1; + +# merge in-vocab and oov lexicon +cat $dict_dir/lexicon-en/lexicon-en-oov.txt $dict_dir/lexicon-en/lexicon-en-iv.txt |\ + sort > $dict_dir/lexicon-en/lexicon-en-phn.txt || exit 1; + +# convert cmu phoneme to pinyin phonenme +mkdir -p $dict_dir/map +cat conf/cmu2pinyin | awk '{print $1;}' | sort -u > $dict_dir/map/cmu || exit 1; +cat conf/pinyin2cmu | awk -v cmu=$dict_dir/map/cmu \ + 'BEGIN{while((getline $dict_dir/map/cmu-used || exit 1; +cat $dict_dir/map/cmu | awk -v cmu=$dict_dir/map/cmu-used \ + 'BEGIN{while((getline $dict_dir/map/cmu-not-used || exit 1; + +awk 'NR==FNR{words[$1]; next;} ($1 in words)' \ + $dict_dir/map/cmu-not-used conf/cmu2pinyin |\ + egrep -v '<.?s>' > $dict_dir/map/cmu-py || exit 1; + +cat $dict_dir/map/cmu-py | \ + perl -e ' + open(MAPS, $ARGV[0]) or die("could not open map file"); + my %py2ph; + foreach $line () { + @A = split(" ", $line); + $py = shift(@A); + $py2ph{$py} = [@A]; + } + my @entry; + while () { + @A = split(" ", $_); + @entry = (); + $W = shift(@A); + push(@entry, $W); + for($i = 0; $i < @A; $i++) { push(@entry, @{$py2ph{$A[$i]}}); } + print "@entry"; + print "\n"; + } +' conf/pinyin2cmu > $dict_dir/map/cmu-cmu || exit 1; + +cat $dict_dir/lexicon-en/lexicon-en-phn.txt | \ + perl -e ' + open(MAPS, $ARGV[0]) or die("could not open map file"); + my %py2ph; + foreach $line () { + @A = split(" ", $line); + $py = shift(@A); + $py2ph{$py} = [@A]; + } + my @entry; + while () { + @A = split(" ", $_); + @entry = (); + $W = shift(@A); + push(@entry, $W); + for($i = 0; $i < @A; $i++) { + if (exists $py2ph{$A[$i]}) { push(@entry, @{$py2ph{$A[$i]}}); } + else {push(@entry, $A[$i])}; + } + print "@entry"; + print "\n"; + } +' $dict_dir/map/cmu-cmu > $dict_dir/lexicon-en/lexicon-en.txt || exit 1; + + +##### produce pronunciations for chinese +if [ ! -f $dict_dir/cedict/cedict_1_0_ts_utf-8_mdbg.txt ]; then + echo "------------- Downloading cedit dictionary ---------------" + mkdir -p $dict_dir/cedict + wget -P $dict_dir/cedict http://www.mdbg.net/chindict/export/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz + gunzip $dict_dir/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz +fi + +cat $dict_dir/cedict/cedict_1_0_ts_utf-8_mdbg.txt | grep -v '#' | awk -F '/' '{print $1}' |\ + perl -e ' + while () { + @A = split(" ", $_); + print $A[1]; + for($n = 2; $n < @A; $n++) { + $A[$n] =~ s:\[?([a-zA-Z0-9\:]+)\]?:$1:; + $tmp = uc($A[$n]); + print " $tmp"; + } + print "\n"; + } + ' | sort -k1 > $dict_dir/cedict/ch-dict.txt || exit 1; + +echo "--- Searching for Chinese OOV words ..." 
+awk 'NR==FNR{words[$1]; next;} !($1 in words)' \ + $dict_dir/cedict/ch-dict.txt $dict_dir/lexicon-ch/words-ch.txt |\ + egrep -v '<.?s>' > $dict_dir/lexicon-ch/words-ch-oov.txt || exit 1; + +awk 'NR==FNR{words[$1]; next;} ($1 in words)' \ + $dict_dir/lexicon-ch/words-ch.txt $dict_dir/cedict/ch-dict.txt |\ + egrep -v '<.?s>' > $dict_dir/lexicon-ch/lexicon-ch-iv.txt || exit 1; + +wc -l $dict_dir/lexicon-ch/words-ch-oov.txt +wc -l $dict_dir/lexicon-ch/lexicon-ch-iv.txt + + +# validate Chinese dictionary and compose a char-based +# dictionary in order to get OOV pronunciations +cat $dict_dir/cedict/ch-dict.txt |\ + perl -e ' + use utf8; + binmode(STDIN,":encoding(utf8)"); + binmode(STDOUT,":encoding(utf8)"); + while () { + @A = split(" ", $_); + $word_len = length($A[0]); + $proun_len = @A - 1 ; + if ($word_len == $proun_len) {print $_;} + } + ' > $dict_dir/cedict/ch-dict-1.txt || exit 1; + +# extract chars +cat $dict_dir/cedict/ch-dict-1.txt | awk '{print $1}' |\ + perl -e ' + use utf8; + binmode(STDIN,":encoding(utf8)"); + binmode(STDOUT,":encoding(utf8)"); + while () { + @A = split(" ", $_); + @chars = split("", $A[0]); + foreach (@chars) { + print "$_\n"; + } + } + ' | grep -v '^$' > $dict_dir/lexicon-ch/ch-char.txt || exit 1; + +# extract individual pinyins +cat $dict_dir/cedict/ch-dict-1.txt |\ + awk '{for(i=2; i<=NF; i++) print $i}' |\ + perl -ape 's/ /\n/g;' > $dict_dir/lexicon-ch/ch-char-pinyin.txt || exit 1; + +# first make sure number of characters and pinyins +# are equal, so that a char-based dictionary can +# be composed. +nchars=`wc -l < $dict_dir/lexicon-ch/ch-char.txt` +npinyin=`wc -l < $dict_dir/lexicon-ch/ch-char-pinyin.txt` +if [ $nchars -ne $npinyin ]; then + echo "Found $nchars chars and $npinyin pinyin. Please check!" + exit 1 +fi + +paste $dict_dir/lexicon-ch/ch-char.txt $dict_dir/lexicon-ch/ch-char-pinyin.txt |\ + sort -u > $dict_dir/lexicon-ch/ch-char-dict.txt || exit 1; + +# create a multiple pronunciation dictionary +cat $dict_dir/lexicon-ch/ch-char-dict.txt |\ + perl -e ' + my $prev = ""; + my $out_line = ""; + while () { + @A = split(" ", $_); + $cur = $A[0]; + $cur_py = $A[1]; + #print length($prev); + if (length($prev) == 0) { $out_line = $_; chomp($out_line);} + if (length($prev)>0 && $cur ne $prev) { print $out_line; print "\n"; $out_line = $_; chomp($out_line);} + if (length($prev)>0 && $cur eq $prev) { $out_line = $out_line."/"."$cur_py";} + $prev = $cur; + } + print $out_line; + ' > $dict_dir/lexicon-ch/ch-char-dict-mp.txt || exit 1; + +# get lexicon for Chinese OOV words +local/create_oov_char_lexicon.pl $dict_dir/lexicon-ch/ch-char-dict-mp.txt \ + $dict_dir/lexicon-ch/words-ch-oov.txt > $dict_dir/lexicon-ch/lexicon-ch-oov.txt || exit 1; + +# seperate multiple prons for Chinese OOV lexicon +cat $dict_dir/lexicon-ch/lexicon-ch-oov.txt |\ + perl -e ' + my @entry; + my @entry1; + while () { + @A = split(" ", $_); + @entry = (); + push(@entry, $A[0]); + for($i = 1; $i < @A; $i++ ) { + @py = split("/", $A[$i]); + @entry1 = @entry; + @entry = (); + for ($j = 0; $j < @entry1; $j++) { + for ($k = 0; $k < @py; $k++) { + $tmp = $entry1[$j]." 
".$py[$k]; + push(@entry, $tmp); + } + } + } + for ($i = 0; $i < @entry; $i++) { + print $entry[$i]; + print "\n"; + } + } + ' > $dict_dir/lexicon-ch/lexicon-ch-oov-mp.txt || exit 1; + +# compose IV and OOV lexicons for Chinese +cat $dict_dir/lexicon-ch/lexicon-ch-oov-mp.txt $dict_dir/lexicon-ch/lexicon-ch-iv.txt |\ + awk '{if (NF > 1 && $2 ~ /[A-Za-z0-9]+/) print $0;}' > $dict_dir/lexicon-ch/lexicon-ch.txt || exit 1; + +# convert Chinese pinyin to CMU format +cat $dict_dir/lexicon-ch/lexicon-ch.txt | sed -e 's/U:/V/g' | sed -e 's/ R\([0-9]\)/ ER\1/g'|\ + utils/pinyin_map.pl conf/pinyin2cmu > $dict_dir/lexicon-ch/lexicon-ch-cmu.txt || exit 1; + +# combine English and Chinese lexicons +cat $dict_dir/lexicon-en/lexicon-en.txt $dict_dir/lexicon-ch/lexicon-ch-cmu.txt |\ + sort -u > $dict_dir/lexicon1.txt || exit 1; + +cat $dict_dir/lexicon1.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}'| \ + sort -u |\ + perl -e ' + my %ph_cl; + while () { + $phone = $_; + chomp($phone); + chomp($_); + $phone =~ s:([A-Z]+)[0-9]:$1:; + if (exists $ph_cl{$phone}) { push(@{$ph_cl{$phone}}, $_) } + else { $ph_cl{$phone} = [$_]; } + } + foreach $key ( keys %ph_cl ) { + print "@{ $ph_cl{$key} }\n" + } + ' | sort -k1 > $dict_dir/nonsilence_phones.txt || exit 1; + +( echo SIL; echo SPN; echo NSN; echo LAU ) > $dict_dir/silence_phones.txt + +echo SIL > $dict_dir/optional_silence.txt + +# No "extra questions" in the input to this setup, as we don't +# have stress or tone + +cat $dict_dir/silence_phones.txt| awk '{printf("%s ", $1);} END{printf "\n";}' > $dict_dir/extra_questions.txt || exit 1; +cat $dict_dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", $_)) { + $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \ + >> $dict_dir/extra_questions.txt || exit 1; + +# Add to the lexicon the silences, noises etc. +(echo '!SIL SIL'; echo '[VOCALIZED-NOISE] SPN'; echo '[NOISE] NSN'; echo '[LAUGHTER] LAU'; + echo ' SPN' ) | \ + cat - $dict_dir/lexicon1.txt > $dict_dir/lexicon.txt || exit 1; + +echo "$0: aidatatang_200zh dict preparation succeeded" +exit 0; diff --git a/egs/aidatatang_200zh/s5/local/score.sh b/egs/aidatatang_200zh/s5/local/score.sh new file mode 100644 index 00000000000..a9786169973 --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/score.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +set -e -o pipefail +set -x +steps/score_kaldi.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" + +echo "$0: Done" diff --git a/egs/aidatatang_200zh/s5/local/train_lms.sh b/egs/aidatatang_200zh/s5/local/train_lms.sh new file mode 100644 index 00000000000..bc52f8acb20 --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/train_lms.sh @@ -0,0 +1,92 @@ +#!/bin/bash + + +# To be run from one directory above this script. + + +text=data/local/train/text +lexicon=data/local/dict/lexicon.txt + +for f in "$text" "$lexicon"; do + [ ! -f $x ] && echo "$0: No such file $f" && exit 1; +done + +# This script takes no arguments. It assumes you have already run +# aidatatang_data_prep.sh. +# It takes as input the files +#data/local/train/text +#data/local/dict/lexicon.txt +dir=data/local/lm +mkdir -p $dir + +export LC_ALL=C # You'll get errors about things being not sorted, if you + # have a different locale. +kaldi_lm=`which train_lm.sh` +if [ ! -x $kaldi_lm ]; then + echo "$0: train_lm.sh is not found. 
That might mean it's not installed" + echo "$0: or it is not added to PATH" + echo "$0: Use the script tools/extra/install_kaldi_lm.sh to install it" + exit 1 +fi + +cleantext=$dir/text.no_oov + +cat $text | awk -v lex=$lexicon 'BEGIN{while((getline0){ seen[$1]=1; } } + {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf(" ");} } printf("\n");}' \ + > $cleantext || exit 1; + + +cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \ + sort -nr > $dir/word.counts || exit 1; + + +# Get counts from acoustic training transcripts, and add one-count +# for each word in the lexicon (but not silence, we don't want it +# in the LM-- we'll add it optionally later). +cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \ + cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \ + sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1; + +# note: we probably won't really make use of as there aren't any OOVs +cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "" "" "" > $dir/word_map \ + || exit 1; + +# note: ignore 1st field of train.txt, it's the utterance-id. +cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline0)map[$1]=$2;} + { for(n=2;n<=NF;n++) { printf map[$n]; if(n$dir/train.gz \ + || exit 1; + +train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1; + +# LM is small enough that we don't need to prune it (only about 0.7M N-grams). +# Perplexity over 128254.000000 words is 90.446690 + +# note: output is +# data/local/lm/3gram-mincount/lm_unpruned.gz + +exit 0 + + +# From here is some commands to do a baseline with SRILM (assuming +# you have it installed). +heldout_sent=10000 # Don't change this if you want result to be comparable with + # kaldi_lm results +sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities. +mkdir -p $sdir +cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n $sdir/heldout +cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n $sdir/train + +cat $dir/word_map | awk '{print $1}' | cat - <(echo ""; echo "" ) > $sdir/wordlist + + +ngram-count -text $sdir/train -order 3 -limit-vocab -vocab $sdir/wordlist -unk \ + -map-unk "" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz +ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/heldout +# 0 zeroprobs, logprob= -250954 ppl= 90.5091 ppl1= 132.482 + +# Note: perplexity SRILM gives to Kaldi-LM model is same as kaldi-lm reports above. +# Difference in WSJ must have been due to different treatment of . 
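As an aside on where the LM noted above ends up: the unpruned ARPA file data/local/lm/3gram-mincount/lm_unpruned.gz is what the recipe's later "G compilation" step (local/format_data.sh, invoked from run.sh) turns into a grammar FST. A hedged sketch of that conversion in the style of utils/format_lm.sh; the lang and output paths below are illustrative assumptions, not taken from this diff:

```bash
# Sketch only; the recipe's own local/format_data.sh is authoritative.
lm=data/local/lm/3gram-mincount/lm_unpruned.gz
lang=data/lang        # produced earlier by utils/prepare_lang.sh
out=data/lang_test    # assumed output directory for the decoding lang dir
mkdir -p $out && cp -r $lang/. $out
gunzip -c $lm | \
  arpa2fst --disambig-symbol=#0 --read-symbol-table=$out/words.txt - $out/G.fst
fstisstochastic $out/G.fst || echo "G.fst is not stochastic (usually harmless)"
```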
+ngram -lm $dir/3gram-mincount/lm_unpruned.gz -ppl $sdir/heldout +# 0 zeroprobs, logprob= -250913 ppl= 90.4439 ppl1= 132.379 diff --git a/egs/aidatatang_200zh/s5/local/wer_hyp_filter b/egs/aidatatang_200zh/s5/local/wer_hyp_filter new file mode 100644 index 00000000000..a1bfdb57efc --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/wer_hyp_filter @@ -0,0 +1,19 @@ +#!/usr/bin/env perl + +@filters=('[NOISE]','[LAUGHTER]','[VOCALIZED-NOISE]','','%HESITATION'); + +foreach $w (@filters) { + $bad{$w} = 1; +} + +while() { + @A = split(" ", $_); + $id = shift @A; + print "$id "; + foreach $a (@A) { + if (!defined $bad{$a}) { + print "$a "; + } + } + print "\n"; +} diff --git a/egs/aidatatang_200zh/s5/local/wer_output_filter b/egs/aidatatang_200zh/s5/local/wer_output_filter new file mode 100644 index 00000000000..aceeeec41b4 --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/wer_output_filter @@ -0,0 +1,25 @@ +#!/usr/bin/env perl +# Copyright 2012-2014 Johns Hopkins University (Author: Yenda Trmal) +# Apache 2.0 +use utf8; + +use open qw(:encoding(utf8)); +binmode STDIN, ":utf8"; +binmode STDOUT, ":utf8"; +binmode STDERR, ":utf8"; + +while (<>) { + @F = split " "; + print $F[0] . " "; + foreach $s (@F[1..$#F]) { + if (($s =~ /\[.*\]/) || ($s =~ /\<.*\>/) || ($s =~ "!SIL")) { + print ""; + } else { + print "$s" + } + print " "; + } + print "\n"; +} + + diff --git a/egs/aidatatang_200zh/s5/local/wer_ref_filter b/egs/aidatatang_200zh/s5/local/wer_ref_filter new file mode 100644 index 00000000000..a1bfdb57efc --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/wer_ref_filter @@ -0,0 +1,19 @@ +#!/usr/bin/env perl + +@filters=('[NOISE]','[LAUGHTER]','[VOCALIZED-NOISE]','','%HESITATION'); + +foreach $w (@filters) { + $bad{$w} = 1; +} + +while() { + @A = split(" ", $_); + $id = shift @A; + print "$id "; + foreach $a (@A) { + if (!defined $bad{$a}) { + print "$a "; + } + } + print "\n"; +} diff --git a/egs/aidatatang_200zh/s5/path.sh b/egs/aidatatang_200zh/s5/path.sh new file mode 100644 index 00000000000..2d17b17a84a --- /dev/null +++ b/egs/aidatatang_200zh/s5/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/aidatatang_200zh/s5/run.sh b/egs/aidatatang_200zh/s5/run.sh new file mode 100644 index 00000000000..47e46a660cd --- /dev/null +++ b/egs/aidatatang_200zh/s5/run.sh @@ -0,0 +1,150 @@ +#!/bin/bash + +# Copyright 2019 Beijing DataTang Tech. Co. Ltd. (Author: Liyuan Wang) +# 2017 Hui Bu +# 2017 Jiayu Du +# 2017 Xingyu Na +# 2017 Bengu Wu +# 2017 Hao Zheng +# Apache 2.0 + +# This is a shell script, but it's recommended that you run the commands one by +# one by copying and pasting into the shell. +# Caution: some of the graph creation steps use quite a bit of memory, so you +# should run this on a machine that has sufficient memory. + + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. 
./path.sh + + +# corpus directory and download URL +data=/export/a05/xna/data +data_url=www.openslr.org/resources/62 + +# Obtain the database +#[ -d $data ] || mkdir -p $data || exit 1; +local/download_and_untar.sh $data $data_url aidatatang_200zh || exit 1; + +# Data Preparation: generate text, wav.scp, utt2spk, spk2utt +local/data_prep.sh $data/aidatatang_200zh/corpus $data/aidatatang_200zh/transcript || exit 1; + +# Lexicon Preparation: build a large lexicon that invovles words in both the training and decoding +local/prepare_dict.sh || exit 1; + +# Prepare Language Stuff +# Phone Sets, questions, L compilation +utils/prepare_lang.sh --position-dependent-phones false data/local/dict "" data/local/lang data/lang || exit 1; + +# LM training +local/train_lms.sh || exit 1; + +# G compilation, check LG composition +local/format_data.sh + +# Now make MFCC plus pitch features. +# mfccdir should be some place with a largish disk where you want to store MFCC features. +mfccdir=mfcc +for x in train dev test; do + steps/make_mfcc_pitch.sh --write_utt2dur false --write_utt2num_frames false --cmd "$train_cmd" --nj 10 data/$x exp/make_mfcc/$x $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1; + utils/fix_data_dir.sh data/$x || exit 1; +done + +steps/train_mono.sh --cmd "$train_cmd" --nj 10 \ + data/train data/lang exp/mono || exit 1; + +# Monophone decoding +utils/mkgraph.sh data/lang_test exp/mono exp/mono/graph || exit 1; +steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 10 \ + exp/mono/graph data/dev exp/mono/decode_dev + +steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 10 \ + exp/mono/graph data/test exp/mono/decode_test + +# Get alignments from monophone system. +steps/align_si.sh --cmd "$train_cmd" --nj 10 \ + data/train data/lang exp/mono exp/mono_ali || exit 1; + +# train tri1 [first triphone pass] +steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 20000 data/train data/lang exp/mono_ali exp/tri1 || exit 1; + +# decode tri1 +utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph || exit 1; +steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 10 \ + exp/tri1/graph data/dev exp/tri1/decode_dev +steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 10 \ + exp/tri1/graph data/test exp/tri1/decode_test + +# align tri1 +steps/align_si.sh --cmd "$train_cmd" --nj 10 \ + data/train data/lang exp/tri1 exp/tri1_ali || exit 1; + +# train tri2 [delta+delta-deltas] +steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 20000 data/train data/lang exp/tri1_ali exp/tri2 || exit 1; + +# decode tri2 +utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph +steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 10 \ + exp/tri2/graph data/dev exp/tri2/decode_dev +steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 10 \ + exp/tri2/graph data/test exp/tri2/decode_test + +#align tri2 +steps/align_si.sh --cmd "$train_cmd" --nj 10 \ + data/train data/lang exp/tri2 exp/tri2_ali || exit 1; + +# Train tri3a, which is LDA+MLLT, +steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 2500 20000 data/train data/lang exp/tri2_ali exp/tri3a || exit 1; + +utils/mkgraph.sh data/lang_test exp/tri3a exp/tri3a/graph || exit 1; +steps/decode.sh --cmd "$decode_cmd" --nj 10 --config conf/decode.config \ + exp/tri3a/graph data/dev exp/tri3a/decode_dev +steps/decode.sh --cmd "$decode_cmd" --nj 10 --config conf/decode.config \ + exp/tri3a/graph data/test exp/tri3a/decode_test + +# From now, we start 
building a more serious system (with SAT), and we'll +# do the alignment with fMLLR. +steps/align_fmllr.sh --cmd "$train_cmd" --nj 10 \ + data/train data/lang exp/tri3a exp/tri3a_ali || exit 1; + +steps/train_sat.sh --cmd "$train_cmd" \ + 2500 20000 data/train data/lang exp/tri3a_ali exp/tri4a || exit 1; + +utils/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph +steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 10 --config conf/decode.config \ + exp/tri4a/graph data/dev exp/tri4a/decode_dev +steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 10 --config conf/decode.config \ + exp/tri4a/graph data/test exp/tri4a/decode_test + +steps/align_fmllr.sh --cmd "$train_cmd" --nj 10 \ + data/train data/lang exp/tri4a exp/tri4a_ali + +# Building a larger SAT system. + +steps/train_sat.sh --cmd "$train_cmd" \ + 3500 100000 data/train data/lang exp/tri4a_ali exp/tri5a || exit 1; + +utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph || exit 1; +steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 10 --config conf/decode.config \ + exp/tri5a/graph data/dev exp/tri5a/decode_dev || exit 1; +steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 10 --config conf/decode.config \ + exp/tri5a/graph data/test exp/tri5a/decode_test || exit 1; + +steps/align_fmllr.sh --cmd "$train_cmd" --nj 10 \ + data/train data/lang exp/tri5a exp/tri5a_ali || exit 1; + +# nnet3 +local/nnet3/run_tdnn.sh + +# chain +local/chain/run_tdnn.sh + +# getting results (see RESULTS file) +for x in exp/*/decode_test; do [ -d $x ] && grep WER $x/cer_* | utils/best_wer.sh; done 2>/dev/null + +exit 0; diff --git a/egs/aidatatang_200zh/s5/steps b/egs/aidatatang_200zh/s5/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/aidatatang_200zh/s5/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/aidatatang_200zh/s5/utils b/egs/aidatatang_200zh/s5/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/aidatatang_200zh/s5/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/egs/aishell/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/aishell/s5/local/chain/tuning/run_tdnn_1a.sh index a0b183e3c5a..b38fa4d9c7a 100755 --- a/egs/aishell/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/aishell/s5/local/chain/tuning/run_tdnn_1a.sh @@ -90,7 +90,7 @@ if [ $stage -le 10 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/aishell/s5/local/chain/tuning/run_tdnn_2a.sh b/egs/aishell/s5/local/chain/tuning/run_tdnn_2a.sh index 2ebe2a3092b..6b7223785d9 100755 --- a/egs/aishell/s5/local/chain/tuning/run_tdnn_2a.sh +++ b/egs/aishell/s5/local/chain/tuning/run_tdnn_2a.sh @@ -92,7 +92,7 @@ if [ $stage -le 10 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/aishell/s5/local/download_and_untar.sh b/egs/aishell/s5/local/download_and_untar.sh index 3578a1c0835..58a278241d7 100755 --- a/egs/aishell/s5/local/download_and_untar.sh +++ 
b/egs/aishell/s5/local/download_and_untar.sh @@ -57,7 +57,7 @@ if [ -f $data/$part.tgz ]; then if ! $size_ok; then echo "$0: removing existing file $data/$part.tgz because its size in bytes $size" echo "does not equal the size of one of the archives." - rm $data/$part.gz + rm $data/$part.tgz else echo "$data/$part.tgz exists and appears to be complete." fi diff --git a/egs/aishell/v1/local/download_and_untar.sh b/egs/aishell/v1/local/download_and_untar.sh index 0189bad1d4a..3578a1c0835 100755 --- a/egs/aishell/v1/local/download_and_untar.sh +++ b/egs/aishell/v1/local/download_and_untar.sh @@ -15,7 +15,7 @@ if [ $# -ne 3 ]; then echo "Usage: $0 [--remove-archive] " echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/33 data_aishell" echo "With --remove-archive it will remove the archive after successfully un-tarring it." - echo " can be one of: data_aishell, resource." + echo " can be one of: data_aishell, resource_aishell." fi data=$1 @@ -28,7 +28,7 @@ if [ ! -d "$data" ]; then fi part_ok=false -list="data_aishell resource" +list="data_aishell resource_aishell" for x in $list; do if [ "$part" == $x ]; then part_ok=true; fi done diff --git a/egs/aishell2/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/aishell2/s5/local/chain/tuning/run_tdnn_1a.sh index 459bd64eeb5..86c9becac5b 100755 --- a/egs/aishell2/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/aishell2/s5/local/chain/tuning/run_tdnn_1a.sh @@ -103,7 +103,7 @@ fi if [ $stage -le 10 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.002" linear_opts="orthonormal-constraint=1.0" output_opts="l2-regularize=0.0005 bottleneck-dim=256" diff --git a/egs/aishell2/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/aishell2/s5/local/chain/tuning/run_tdnn_1b.sh index ba2a4344349..d8560e63909 100755 --- a/egs/aishell2/s5/local/chain/tuning/run_tdnn_1b.sh +++ b/egs/aishell2/s5/local/chain/tuning/run_tdnn_1b.sh @@ -150,7 +150,7 @@ if [ $stage -le 10 ]; then echo "$0: creating neural net configs using the xconfig parser"; feat_dim=$(feat-to-dim scp:data/${train_set}_hires/feats.scp -) num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.002" linear_opts="orthonormal-constraint=1.0" output_opts="l2-regularize=0.0005 bottleneck-dim=256" diff --git a/egs/ami/s5/local/ami_ihm_scoring_data_prep.sh b/egs/ami/s5/local/ami_ihm_scoring_data_prep.sh index 3157d7ffec7..7112e0259a0 100755 --- a/egs/ami/s5/local/ami_ihm_scoring_data_prep.sh +++ b/egs/ami/s5/local/ami_ihm_scoring_data_prep.sh @@ -87,18 +87,15 @@ sort -k 2 $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt || exit 1; join $dir/utt2spk $dir/segments | \ perl -ne '{BEGIN{$pu=""; $pt=0.0;} split; if ($pu eq $_[1] && $pt > $_[3]) { - print "$_[0] $_[2] $_[3] $_[4]>$_[0] $_[2] $pt $_[4]\n" + print "s/^$_[0] $_[2] $_[3] $_[4]\$/$_[0] $_[2] $pt $_[4]/;\n" } - $pu=$_[1]; $pt=$_[4]; + $pu=$_[1]; $pt=$_[4]; }' > $dir/segments_to_fix -if [ `cat $dir/segments_to_fix | wc -l` -gt 0 ]; then + +if [ -s $dir/segments_to_fix ]; then echo "$0. 
Applying following fixes to segments" cat $dir/segments_to_fix - while read line; do - p1=`echo $line | awk -F'>' '{print $1}'` - p2=`echo $line | awk -F'>' '{print $2}'` - sed -ir "s!$p1!$p2!" $dir/segments - done < $dir/segments_to_fix + perl -i -pf $dir/segments_to_fix $dir/segments fi # Copy stuff into its final locations diff --git a/egs/ami/s5/local/ami_mdm_scoring_data_prep.sh b/egs/ami/s5/local/ami_mdm_scoring_data_prep.sh index 4cfa9110edf..9c4b55308f2 100755 --- a/egs/ami/s5/local/ami_mdm_scoring_data_prep.sh +++ b/egs/ami/s5/local/ami_mdm_scoring_data_prep.sh @@ -94,19 +94,15 @@ awk '{print $1}' $tmpdir/segments | \ join $tmpdir/utt2spk_stm $tmpdir/segments | \ awk '{ utt=$1; spk=$2; wav=$3; t_beg=$4; t_end=$5; if(spk_prev == spk && t_end_prev > t_beg) { - print utt, wav, t_beg, t_end">"utt, wav, t_end_prev, t_end; + print "s/^"utt, wav, t_beg, t_end"$/"utt, wav, t_end_prev, t_end"/;"; } spk_prev=spk; t_end_prev=t_end; }' > $tmpdir/segments_to_fix -if [ `cat $tmpdir/segments_to_fix | wc -l` -gt 0 ]; then +if [ -s $tmpdir/segments_to_fix ]; then echo "$0. Applying following fixes to segments" cat $tmpdir/segments_to_fix - while read line; do - p1=`echo $line | awk -F'>' '{print $1}'` - p2=`echo $line | awk -F'>' '{print $2}'` - sed -ir "s:$p1:$p2:" $tmpdir/segments - done < $tmpdir/segments_to_fix + perl -i -pf $tmpdir/segments_to_fix $tmpdir/segments fi # Copy stuff into its final locations [this has been moved from the format_data diff --git a/egs/ami/s5/local/ami_sdm_scoring_data_prep.sh b/egs/ami/s5/local/ami_sdm_scoring_data_prep.sh index 91baa37d6e1..815e1b2d270 100755 --- a/egs/ami/s5/local/ami_sdm_scoring_data_prep.sh +++ b/egs/ami/s5/local/ami_sdm_scoring_data_prep.sh @@ -101,19 +101,15 @@ awk '{print $1}' $tmpdir/segments | \ join $tmpdir/utt2spk_stm $tmpdir/segments | \ awk '{ utt=$1; spk=$2; wav=$3; t_beg=$4; t_end=$5; if(spk_prev == spk && t_end_prev > t_beg) { - print utt, wav, t_beg, t_end">"utt, wav, t_end_prev, t_end; + print "s/^"utt, wav, t_beg, t_end"$/"utt, wav, t_end_prev, t_end"/;"; } spk_prev=spk; t_end_prev=t_end; }' > $tmpdir/segments_to_fix -if [ `cat $tmpdir/segments_to_fix | wc -l` -gt 0 ]; then +if [ -s $tmpdir/segments_to_fix ]; then echo "$0. Applying following fixes to segments" cat $tmpdir/segments_to_fix - while read line; do - p1=`echo $line | awk -F'>' '{print $1}'` - p2=`echo $line | awk -F'>' '{print $2}'` - sed -ir "s:$p1:$p2:" $tmpdir/segments - done < $tmpdir/segments_to_fix + perl -i -pf $tmpdir/segments_to_fix $tmpdir/segments fi # Copy stuff into its final locations [this has been moved from the format_data diff --git a/egs/ami/s5b/local/ami_ihm_scoring_data_prep.sh b/egs/ami/s5b/local/ami_ihm_scoring_data_prep.sh index 746c42c4c1a..c54876331f1 100755 --- a/egs/ami/s5b/local/ami_ihm_scoring_data_prep.sh +++ b/egs/ami/s5b/local/ami_ihm_scoring_data_prep.sh @@ -93,18 +93,15 @@ sort -k 2 $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt || exit 1; join $dir/utt2spk $dir/segments | \ perl -ne '{BEGIN{$pu=""; $pt=0.0;} split; if ($pu eq $_[1] && $pt > $_[3]) { - print "$_[0] $_[2] $_[3] $_[4]>$_[0] $_[2] $pt $_[4]\n" + print "s/^$_[0] $_[2] $_[3] $_[4]\$/$_[0] $_[2] $pt $_[4]/;\n" } $pu=$_[1]; $pt=$_[4]; }' > $dir/segments_to_fix -if [ `cat $dir/segments_to_fix | wc -l` -gt 0 ]; then + +if [ -s $dir/segments_to_fix ]; then echo "$0. Applying following fixes to segments" cat $dir/segments_to_fix - while read line; do - p1=`echo $line | awk -F'>' '{print $1}'` - p2=`echo $line | awk -F'>' '{print $2}'` - sed -ir "s!$p1!$p2!" 
$dir/segments - done < $dir/segments_to_fix + perl -i -pf $dir/segments_to_fix $dir/segments fi # Copy stuff into its final locations diff --git a/egs/ami/s5b/local/ami_mdm_scoring_data_prep.sh b/egs/ami/s5b/local/ami_mdm_scoring_data_prep.sh index 65f514f223c..475ef5405ba 100755 --- a/egs/ami/s5b/local/ami_mdm_scoring_data_prep.sh +++ b/egs/ami/s5b/local/ami_mdm_scoring_data_prep.sh @@ -99,19 +99,15 @@ awk '{print $1}' $tmpdir/segments | \ join $tmpdir/utt2spk_stm $tmpdir/segments | \ awk '{ utt=$1; spk=$2; wav=$3; t_beg=$4; t_end=$5; if(spk_prev == spk && t_end_prev > t_beg) { - print utt, wav, t_beg, t_end">"utt, wav, t_end_prev, t_end; + print "s/^"utt, wav, t_beg, t_end"$/"utt, wav, t_end_prev, t_end"/;"; } spk_prev=spk; t_end_prev=t_end; }' > $tmpdir/segments_to_fix -if [ `cat $tmpdir/segments_to_fix | wc -l` -gt 0 ]; then +if [ -s $tmpdir/segments_to_fix ]; then echo "$0. Applying following fixes to segments" cat $tmpdir/segments_to_fix - while read line; do - p1=`echo $line | awk -F'>' '{print $1}'` - p2=`echo $line | awk -F'>' '{print $2}'` - sed -ir "s:$p1:$p2:" $tmpdir/segments - done < $tmpdir/segments_to_fix + perl -i -pf $tmpdir/segments_to_fix $tmpdir/segments fi # Copy stuff into its final locations [this has been moved from the format_data diff --git a/egs/ami/s5b/local/ami_sdm_scoring_data_prep.sh b/egs/ami/s5b/local/ami_sdm_scoring_data_prep.sh index 1378f8b8965..d7ce038c0a7 100755 --- a/egs/ami/s5b/local/ami_sdm_scoring_data_prep.sh +++ b/egs/ami/s5b/local/ami_sdm_scoring_data_prep.sh @@ -111,25 +111,21 @@ awk '{print $1}' $tmpdir/segments | \ join $tmpdir/utt2spk_stm $tmpdir/segments | \ awk '{ utt=$1; spk=$2; wav=$3; t_beg=$4; t_end=$5; if(spk_prev == spk && t_end_prev > t_beg) { - print utt, wav, t_beg, t_end">"utt, wav, t_end_prev, t_end; + print "s/^"utt, wav, t_beg, t_end"$/"utt, wav, t_end_prev, t_end"/;"; } spk_prev=spk; t_end_prev=t_end; }' > $tmpdir/segments_to_fix -if [ `cat $tmpdir/segments_to_fix | wc -l` -gt 0 ]; then +if [ -s $tmpdir/segments_to_fix ]; then echo "$0. 
Applying following fixes to segments" cat $tmpdir/segments_to_fix - while read line; do - p1=`echo $line | awk -F'>' '{print $1}'` - p2=`echo $line | awk -F'>' '{print $2}'` - sed -ir "s:$p1:$p2:" $tmpdir/segments - done < $tmpdir/segments_to_fix + perl -i -pf $tmpdir/segments_to_fix $tmpdir/segments fi # Copy stuff into its final locations [this has been moved from the format_data # script] mkdir -p $dir -for f in spk2utt utt2spk utt2spk_stm wav.scp text segments reco2file_and_channel; do +for f in segments_to_fix spk2utt utt2spk utt2spk_stm wav.scp text segments reco2file_and_channel; do cp $tmpdir/$f $dir/$f || exit 1; done diff --git a/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_1a.sh b/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_1a.sh index 1fc641f1166..4d260e3c517 100755 --- a/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_1a.sh +++ b/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_1a.sh @@ -220,7 +220,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) affine_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" diff --git a/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1a.sh b/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1a.sh index a8494420b0d..3546b6a7ced 100755 --- a/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1a.sh +++ b/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1a.sh @@ -211,7 +211,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1b.sh b/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1b.sh index a12e7efa7b9..1a839b045bd 100755 --- a/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1b.sh +++ b/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1b.sh @@ -235,7 +235,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) tdnn_opts="l2-regularize=0.006" lstm_opts="l2-regularize=0.0025 decay-time=20 dropout-proportion=0.0" output_opts="l2-regularize=0.001" diff --git a/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1a.sh b/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1a.sh index 16d1f4044f5..d926c1dc6d7 100644 --- a/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1a.sh +++ b/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1a.sh @@ -184,7 +184,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | 
python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" diff --git a/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1b.sh b/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1b.sh index 83e6a95582f..d9cd1c356e8 100644 --- a/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1b.sh +++ b/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1b.sh @@ -176,7 +176,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20 dropout-proportion=0" diff --git a/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1c.sh b/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1c.sh index 387b4bfcc88..a0805b4f9f1 100755 --- a/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1c.sh +++ b/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1c.sh @@ -185,7 +185,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=40" diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1b.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1b.sh index 57108dbddae..997357b80a9 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1b.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1b.sh @@ -164,7 +164,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1c.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1c.sh index f87e1a12d36..4d062e65429 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1c.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1c.sh @@ -151,7 +151,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1d.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1d.sh index eb84a1cd876..387570388d0 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1d.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1d.sh @@ -163,7 +163,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1e.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1e.sh index e6592b667dc..0436b08cdc0 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1e.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1e.sh @@ -161,7 +161,7 @@ 
if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1f.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1f.sh index 8bf2b73dada..4ca526d63b8 100644 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1f.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1f.sh @@ -165,7 +165,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1g.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1g.sh index dfb6dfedee7..baed760bb68 100644 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1g.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1g.sh @@ -166,7 +166,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1h.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1h.sh index 3e26a8b38bd..e721a858c0a 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1h.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1h.sh @@ -167,7 +167,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1i.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1i.sh index 1931127c86d..de40cb2d1a4 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1i.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1i.sh @@ -168,7 +168,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.02" output_opts="l2-regularize=0.004" diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh index d63712f1f0f..4f580b88f6b 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -171,7 +171,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git 
a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1b.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1b.sh index a53785f45c2..904a079d7de 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1b.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1b.sh @@ -173,7 +173,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1c.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1c.sh index 76a9f735c5f..511e520465a 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1c.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1c.sh @@ -172,7 +172,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1d.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1d.sh index 8cc1a4e15fa..bd81b7df4eb 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1d.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1d.sh @@ -172,7 +172,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1e.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1e.sh index accfd158a9d..50903e78b6d 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1e.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1e.sh @@ -174,7 +174,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1f.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1f.sh index 2b275e4e27d..f6c53001498 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1f.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1f.sh @@ -173,7 +173,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1g.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1g.sh index 1c90af38c4c..79fd9ef3fb5 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1g.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1g.sh @@ -174,7 +174,7 @@ if [ $stage -le 15 ]; then 
echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1h.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1h.sh index fb4b6a475e2..e58a7f89e03 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1h.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1h.sh @@ -171,7 +171,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh index 92636b4c17e..13f894f5a48 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh @@ -174,7 +174,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1j.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1j.sh index 89fd8ce2915..48b31832e8c 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1j.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1j.sh @@ -181,7 +181,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1k.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1k.sh index b8d947d8e92..e675bc494bb 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1k.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1k.sh @@ -177,7 +177,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh index 74c0f5a6ead..2d019398274 100644 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh @@ -224,7 +224,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git 
a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh index b0e7af0618d..9e5b971bbe2 100644 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh @@ -226,7 +226,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20 dropout-proportion=0.0" diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1n.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1n.sh index bee4d997b01..9575c3cf686 100644 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1n.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1n.sh @@ -178,7 +178,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1o.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1o.sh index 1e4111adc6a..a7f2625c181 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1o.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1o.sh @@ -182,7 +182,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) tdnn_opts="l2-regularize=0.025" lstm_opts="l2-regularize=0.01" output_opts="l2-regularize=0.004" diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_bs_1a.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_bs_1a.sh index b672a44e572..ca920869b30 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_bs_1a.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_bs_1a.sh @@ -180,7 +180,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) tdnn_opts="l2-regularize=0.003" lstm_opts="l2-regularize=0.005" output_opts="l2-regularize=0.001" diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1a.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1a.sh index f68c4203767..53dbd5238db 100644 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1a.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1a.sh @@ -178,7 +178,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) gru_opts="dropout-per-frame=true dropout-proportion=0.0" mkdir -p $dir/configs diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1b.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1b.sh index ac4266ca162..dafef668e60 100644 --- 
a/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1b.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1b.sh @@ -177,7 +177,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) gru_opts="dropout-per-frame=true dropout-proportion=0.0" mkdir -p $dir/configs diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1c.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1c.sh index 74b21f10c33..677946d0b9a 100644 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1c.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1c.sh @@ -176,7 +176,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) gru_opts="dropout-per-frame=true dropout-proportion=0.0" mkdir -p $dir/configs diff --git a/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh b/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh index 8ff59d83ed0..bd13010c791 100755 --- a/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh +++ b/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh @@ -138,7 +138,7 @@ if [ $stage -le 11 ]; then num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" diff --git a/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh b/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh index 0ca6062e9c8..b5979a3ce6b 100755 --- a/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh +++ b/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh @@ -208,7 +208,7 @@ if [ $stage -le 14 ]; then extra_right_context=$[$chunk_right_context+10] # %WER 26.8 | 2120 27220 | 80.2 11.7 8.1 7.0 26.8 76.5 | -0.804 | exp/chain/blstm_asp_1/decode_dev_aspire_whole_uniformsegmented_win10_over5_v7_iterfinal_pp_fg/score_9/penalty_0.0/ - local/nnet3/prep_test_aspire.sh --stage 4 --decode-num-jobs 30 --affix "v7" \ + local/multi_condition/prep_test_aspire.sh --stage 4 --decode-num-jobs 30 --affix "v7" \ --extra-left-context $extra_left_context \ --extra-right-context $extra_right_context \ --frames-per-chunk $chunk_width \ diff --git a/egs/aspire/s5/local/chain/tuning/run_tdnn_7b.sh b/egs/aspire/s5/local/chain/tuning/run_tdnn_7b.sh index 201f61dc64b..cd548142598 100755 --- a/egs/aspire/s5/local/chain/tuning/run_tdnn_7b.sh +++ b/egs/aspire/s5/local/chain/tuning/run_tdnn_7b.sh @@ -136,7 +136,7 @@ if [ $stage -le 11 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig @@ -182,6 +182,7 @@ if [ $stage -le 12 ]; then /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage fi + mkdir -p $dir/egs touch $dir/egs/.nodelete # keep egs around when that run dies. 
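The learning_rate_factor hunks repeated throughout these tuning scripts all make the same change: the expression piped into python gains parentheses. A brief illustration of why this matters, under the assumption that the system python may resolve to either Python 2 or Python 3 (python2 and python3 are used explicitly below only to make the contrast visible):

```bash
xent_regularize=0.1
# Python 3 rejects the old statement form of print:
echo "print 0.5/$xent_regularize"   | python3   # SyntaxError: Missing parentheses in call to 'print'
# The parenthesised form is accepted by both interpreters:
echo "print (0.5/$xent_regularize)" | python3   # 5.0
echo "print (0.5/$xent_regularize)" | python2   # 5.0
```

So the scripts keep piping into plain python, but the expression now parses under either major version.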
steps/nnet3/chain/train.py --stage $train_stage \ diff --git a/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_1a.sh index 63d3a7ca988..f98dff5e6fa 100755 --- a/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_1a.sh +++ b/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -26,7 +26,6 @@ cell_dim=1024 projection_dim=256 # training options -num_epochs=2 minibatch_size=64,32 chunk_left_context=40 chunk_right_context=0 @@ -95,7 +94,7 @@ if [ $stage -le 8 ]; then for n in `seq $nj`; do awk '{print $1}' data/${train_set}/split$nj/$n/utt2spk | \ - perl -ane 's/rev[1-3]_//g' > $lat_dir/uttlist.$n.$nj + perl -ane 's/rev[1-3]-//g' > $lat_dir/uttlist.$n.$nj done rm -f $lat_dir/lat_tmp.*.{ark,scp} 2>/dev/null @@ -106,7 +105,7 @@ if [ $stage -le 8 ]; then ark,scp:$lat_dir/lat_tmp.JOB.ark,$lat_dir/lat_tmp.JOB.scp || exit 1 for n in `seq 3`; do - cat $lat_dir/lat_tmp.*.scp | awk -v n=$n '{print "rev"n"_"$1" "$2}' + cat $lat_dir/lat_tmp.*.scp | awk -v n=$n '{print "rev"n"-"$1" "$2}' done > $lat_dir/lat_rvb.scp $train_cmd JOB=1:$nj $lat_dir/log/copy_rvb_lattices.JOB.log \ @@ -151,7 +150,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=40" @@ -309,4 +308,3 @@ if [ $stage -le 17 ]; then fi exit 0; - diff --git a/egs/aspire/s5/local/multi_condition/prepare_impulses_noises.sh b/egs/aspire/s5/local/multi_condition/prepare_impulses_noises.sh index 804de611cae..8297cdee9ca 100755 --- a/egs/aspire/s5/local/multi_condition/prepare_impulses_noises.sh +++ b/egs/aspire/s5/local/multi_condition/prepare_impulses_noises.sh @@ -114,7 +114,7 @@ cp ${output_dir}_non_normalized/info/* $output_dir/info # rename file location in the noise-rir pairing files for file in `ls $output_dir/info/noise_impulse*`; do - sed -i "s/_non_normalized//g" $file + perl -i -pe "s/_non_normalized//g" $file done # generating the rir-list with probabilities alloted for each rir diff --git a/egs/babel/s5c/local/syllab/generate_syllable_lang.sh b/egs/babel/s5c/local/syllab/generate_syllable_lang.sh index 2d1fcb2259e..4a0810b9415 100755 --- a/egs/babel/s5c/local/syllab/generate_syllable_lang.sh +++ b/egs/babel/s5c/local/syllab/generate_syllable_lang.sh @@ -118,8 +118,7 @@ ln -s lex.syllabs2phones.disambig.fst $out/L_disambig.fst echo "Validating the output lang dir" utils/validate_lang.pl $out || exit 1 -sed -i'' 's/#1$//g' $lout/lexicon.txt -sed -i'' 's/#1$//g' $lout/lexiconp.txt +perl -i -pe 's/#1$//g' $lout/lexicon.txt $lout/lexiconp.txt echo "Done OK." 
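The sed-to-perl hunks here (prepare_impulses_noises.sh and generate_syllable_lang.sh) are about portability: GNU and BSD sed disagree on how -i takes its backup-suffix argument, whereas perl -i -pe behaves the same everywhere and can edit several files in a single call. A minimal self-contained sketch of the same in-place edit, using an assumed scratch file:

```bash
# Scratch example only; /tmp/lexicon.txt is a throwaway file, not a recipe path.
printf 'abc a#1\ndef d#1\n' > /tmp/lexicon.txt
perl -i -pe 's/#1$//g' /tmp/lexicon.txt   # drop the trailing #1 disambiguation marker in place
cat /tmp/lexicon.txt                      # -> "abc a" and "def d"
```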
exit 0 diff --git a/egs/babel/s5d/conf/lang/404-georgian.FLP.official.conf b/egs/babel/s5d/conf/lang/404-georgian.FLP.official.conf index a6b22de419f..9cd043716ce 100644 --- a/egs/babel/s5d/conf/lang/404-georgian.FLP.official.conf +++ b/egs/babel/s5d/conf/lang/404-georgian.FLP.official.conf @@ -75,8 +75,8 @@ unsup_data_list=./conf/lists/404-georgian/untranscribed-training.list unsup_nj=32 -lexicon_file= -lexiconFlags="--romanized --oov " +lexicon_file=/export/corpora/LDC/LDC2016S12/IARPA_BABEL_OP3_404/conversational/reference_materials/lexicon.txt +lexiconFlags=" --romanized --oov " diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn.sh index 4f485edf7da..7b4535f8c5e 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn.sh @@ -128,7 +128,7 @@ if [ $stage -le 17 ]; then num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm.sh index 72f7a3c32dd..5fc14dda826 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm.sh @@ -129,7 +129,7 @@ if [ $stage -le 17 ]; then num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" label_delay=5 diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab1.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab1.sh index be0c2cc4b9b..8c7de5d18d4 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab1.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab1.sh @@ -127,7 +127,7 @@ if [ $stage -le 17 ]; then num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" label_delay=5 diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab2.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab2.sh index 8f21a239794..0b3e70b5a04 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab2.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab2.sh @@ -127,7 +127,7 @@ if [ $stage -le 17 ]; then num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" label_delay=5 diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab3.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab3.sh index 7898d172242..45f2907645e 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab3.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab3.sh @@ -128,7 +128,7 @@ if [ $stage -le 17 ]; then num_targets=$(tree-info $tree_dir/tree |grep 
num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" label_delay=5 diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab4.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab4.sh index 49462573245..0d92aff5c28 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab4.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab4.sh @@ -128,7 +128,7 @@ if [ $stage -le 17 ]; then num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" label_delay=5 diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab5.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab5.sh index c888d985f5e..4129c00dcb4 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab5.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab5.sh @@ -128,7 +128,7 @@ if [ $stage -le 17 ]; then num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" label_delay=5 diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab6.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab6.sh index e9a045e113a..1cfa50c1aa1 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab6.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab6.sh @@ -128,7 +128,7 @@ if [ $stage -le 17 ]; then num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" label_delay=5 diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab7.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab7.sh index ce192a91665..ba8ac1e0373 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab7.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab7.sh @@ -129,7 +129,7 @@ if [ $stage -le 17 ]; then num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20 dropout-proportion=0.0" label_delay=5 diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab8.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab8.sh index 3fc0ef2206c..5de285e080e 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab8.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab8.sh @@ -129,7 +129,7 @@ if [ $stage -le 17 ]; then num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) 
lstm_opts="decay-time=20 dropout-proportion=0.0 " label_delay=5 diff --git a/egs/babel/s5d/local/make_L_align.sh b/egs/babel/s5d/local/make_L_align.sh index 50e46a00493..41e9ff32958 100755 --- a/egs/babel/s5d/local/make_L_align.sh +++ b/egs/babel/s5d/local/make_L_align.sh @@ -34,18 +34,24 @@ tmpdir=$1 dir=$2 outdir=$3 +for f in $dir/phones/optional_silence.txt $dir/phones.txt $dir/words.txt ; do + [ ! -f $f ] && echo "$0: The file $f must exist!" exit 1 +fi + silphone=`cat $dir/phones/optional_silence.txt` || exit 1; +if [ ! -f $tmpdir/lexicon.txt ] && [ ! -f $tmpdir/lexiconp.txt ] ; then + echo "$0: At least one of the files $tmpdir/lexicon.txt or $tmpdir/lexiconp.txt must exist" >&2 + exit 1 +fi + # Create lexicon with alignment info if [ -f $tmpdir/lexicon.txt ] ; then cat $tmpdir/lexicon.txt | \ awk '{printf("%s #1 ", $1); for (n=2; n <= NF; n++) { printf("%s ", $n); } print "#2"; }' -elif [ -f $tmpdir/lexiconp.txt ] ; then +else cat $tmpdir/lexiconp.txt | \ awk '{printf("%s #1 ", $1); for (n=3; n <= NF; n++) { printf("%s ", $n); } print "#2"; }' -else - echo "Neither $tmpdir/lexicon.txt nor $tmpdir/lexiconp.txt does not exist" - exit 1 fi | utils/make_lexicon_fst.pl - 0.5 $silphone | \ fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \ --keep_isymbols=false --keep_osymbols=false | \ diff --git a/egs/babel/s5d/local/syllab/generate_phone_lang.sh b/egs/babel/s5d/local/syllab/generate_phone_lang.sh index fc21a23231b..81d8a0acdc7 100755 --- a/egs/babel/s5d/local/syllab/generate_phone_lang.sh +++ b/egs/babel/s5d/local/syllab/generate_phone_lang.sh @@ -122,8 +122,7 @@ ln -s lex.syllabs2phones.disambig.fst $out/L_disambig.fst echo "Validating the output lang dir" utils/validate_lang.pl $out || exit 1 -sed -i'' 's/#1$//g' $lout/lexicon.txt -sed -i'' 's/#1$//g' $lout/lexiconp.txt +perl -i -pe 's/#1$//g' $lout/lexicon.txt $lout/lexiconp.txt echo "Done OK." exit 0 diff --git a/egs/babel/s5d/local/syllab/generate_syllable_lang.sh b/egs/babel/s5d/local/syllab/generate_syllable_lang.sh index db7b0902425..a7bd667027c 100755 --- a/egs/babel/s5d/local/syllab/generate_syllable_lang.sh +++ b/egs/babel/s5d/local/syllab/generate_syllable_lang.sh @@ -122,8 +122,7 @@ ln -s lex.syllabs2phones.disambig.fst $out/L_disambig.fst echo "Validating the output lang dir" utils/validate_lang.pl $out || exit 1 -sed -i'' 's/#1$//g' $lout/lexicon.txt -sed -i'' 's/#1$//g' $lout/lexiconp.txt +perl -i -pe 's/#1$//g' $lout/lexicon.txt $lout/lexiconp.txt echo "Done OK." 
exit 0 diff --git a/egs/bentham/v1/local/chain/tuning/run_cnn_e2eali_1a.sh b/egs/bentham/v1/local/chain/tuning/run_cnn_e2eali_1a.sh index 6bac5a22398..ec530ef1ce4 100755 --- a/egs/bentham/v1/local/chain/tuning/run_cnn_e2eali_1a.sh +++ b/egs/bentham/v1/local/chain/tuning/run_cnn_e2eali_1a.sh @@ -139,7 +139,7 @@ if [ $stage -le 4 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) cnn_opts="l2-regularize=0.03 dropout-proportion=0.0" tdnn_opts="l2-regularize=0.03" output_opts="l2-regularize=0.04" diff --git a/egs/bentham/v1/local/create_splits.sh b/egs/bentham/v1/local/create_splits.sh index 93e8bf1b12e..e8ea2279a49 100755 --- a/egs/bentham/v1/local/create_splits.sh +++ b/egs/bentham/v1/local/create_splits.sh @@ -27,10 +27,8 @@ function split { echo $name $lines_dir"/"$name".png" >> $split_dir/images.scp echo $name $spkid >> $split_dir/utt2spk done < "$line_file" - - sed -i '/^\s*$/d' $split_dir/images.scp - sed -i '/^\s*$/d' $split_dir/text - sed -i '/^\s*$/d' $split_dir/utt2spk + + perl -i -ne 'print if /\S/' $split_dir/images.scp $split_dir/text $split_dir/utt2spk utils/utt2spk_to_spk2utt.pl $split_dir/utt2spk > $split_dir/spk2utt } diff --git a/egs/bn_music_speech/v1/local/make_musan.py b/egs/bn_music_speech/v1/local/make_musan.py deleted file mode 100755 index 942973cfc65..00000000000 --- a/egs/bn_music_speech/v1/local/make_musan.py +++ /dev/null @@ -1,119 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2015 David Snyder -# Apache 2.0. -# -# This file is meant to be invoked by make_musan.sh. - -import os, sys - -def process_music_annotations(path): - utt2spk = {} - utt2vocals = {} - lines = open(path, 'r').readlines() - for line in lines: - utt, genres, vocals, musician = line.rstrip().split()[:4] - # For this application, the musican ID isn't important - utt2spk[utt] = utt - utt2vocals[utt] = vocals == "Y" - return utt2spk, utt2vocals - -def prepare_music(root_dir, use_vocals): - utt2vocals = {} - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - music_dir = os.path.join(root_dir, "music") - for root, dirs, files in os.walk(music_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - elif str(file) == "ANNOTATIONS": - utt2spk_part, utt2vocals_part = process_music_annotations(file_path) - utt2spk.update(utt2spk_part) - utt2vocals.update(utt2vocals_part) - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2vocals: - if utt in utt2wav: - if use_vocals or not utt2vocals[utt]: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" - num_good_files += 1 - else: - print("Missing file {}".format(utt)) - num_bad_files += 1 - print(("In music directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files)) - return utt2spk_str, utt2wav_str - -def prepare_speech(root_dir): - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - speech_dir = os.path.join(root_dir, "speech") - for root, dirs, files in os.walk(speech_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - utt2spk[utt] = utt - utt2spk_str = "" - utt2wav_str = "" 
- for utt in utt2spk: - if utt in utt2wav: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" - num_good_files += 1 - else: - print("Missing file {}".format(utt)) - num_bad_files += 1 - print(("In speech directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files)) - return utt2spk_str, utt2wav_str - -def prepare_noise(root_dir): - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - noise_dir = os.path.join(root_dir, "noise") - for root, dirs, files in os.walk(noise_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - utt2spk[utt] = utt - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2spk: - if utt in utt2wav: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" - num_good_files += 1 - else: - print("Missing file {}".format(utt)) - num_bad_files += 1 - print(("In noise directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files)) - return utt2spk_str, utt2wav_str - -def main(): - in_dir = sys.argv[1] - out_dir = sys.argv[2] - use_vocals = sys.argv[3] == "Y" - utt2spk_music, utt2wav_music = prepare_music(in_dir, use_vocals) - utt2spk_speech, utt2wav_speech = prepare_speech(in_dir) - utt2spk_noise, utt2wav_noise = prepare_noise(in_dir) - utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise - utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise - wav_fi = open(os.path.join(out_dir, "wav.scp"), 'w') - wav_fi.write(utt2wav) - utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), 'w') - utt2spk_fi.write(utt2spk) - - -if __name__=="__main__": - main() diff --git a/egs/bn_music_speech/v1/local/make_musan.sh b/egs/bn_music_speech/v1/local/make_musan.sh deleted file mode 100755 index 694940ad70f..00000000000 --- a/egs/bn_music_speech/v1/local/make_musan.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash -# Copyright 2015 David Snyder -# Apache 2.0. -# -# This script, called by ../run.sh, creates the MUSAN -# data directory. The required dataset is freely available at -# http://www.openslr.org/17/ - -set -e -in_dir=$1 -data_dir=$2 -use_vocals='Y' - -mkdir -p local/musan.tmp - -echo "Preparing ${data_dir}/musan..." 
-mkdir -p ${data_dir}/musan -local/make_musan.py ${in_dir} ${data_dir}/musan ${use_vocals} - -utils/fix_data_dir.sh ${data_dir}/musan - -grep "music" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_music -grep "speech" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_speech -grep "noise" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_noise -utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_music \ - ${data_dir}/musan ${data_dir}/musan_music -utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_speech \ - ${data_dir}/musan ${data_dir}/musan_speech -utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_noise \ - ${data_dir}/musan ${data_dir}/musan_noise - -utils/fix_data_dir.sh ${data_dir}/musan_music -utils/fix_data_dir.sh ${data_dir}/musan_speech -utils/fix_data_dir.sh ${data_dir}/musan_noise - -rm -rf local/musan.tmp - diff --git a/egs/bn_music_speech/v1/run.sh b/egs/bn_music_speech/v1/run.sh index 6cc0531e9d7..08d5c022a9d 100755 --- a/egs/bn_music_speech/v1/run.sh +++ b/egs/bn_music_speech/v1/run.sh @@ -20,7 +20,7 @@ vaddir=`pwd`/mfcc local/make_bn.sh /export/corpora5/LDC/LDC97S44 \ /export/corpora/LDC/LDC97T22 data -local/make_musan.sh /export/corpora/JHU/musan data +steps/data/make_musan.sh --sampling-rate 16000 /export/corpora/JHU/musan data steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 30 --cmd "$train_cmd" \ data/musan_speech exp/make_mfcc $mfccdir diff --git a/egs/callhome_diarization/v1/diarization/VB_diarization.py b/egs/callhome_diarization/v1/diarization/VB_diarization.py new file mode 100644 index 00000000000..31af078efd2 --- /dev/null +++ b/egs/callhome_diarization/v1/diarization/VB_diarization.py @@ -0,0 +1,359 @@ +# Copyright 2013-2017 Lukas Burget (burget@fit.vutbr.cz) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# Revision History +# L. Burget 16/07/13 01:00AM - original version +# L. Burget 20/06/17 12:07AM - np.asarray replaced by .toarray() +# - minor bug fix in initializing q +# - minor bug fix in ELBO calculation +# - few more optimizations + +import numpy as np +from scipy.sparse import coo_matrix +import scipy.linalg as spl +import numexpr as ne # the dependency on this modul can be avoided by replacing + # logsumexp_ne and exp_ne with logsumexp and np.exp + +#[q sp Li] = +def VB_diarization(X, m, iE, w, V, sp=None, q=None, + maxSpeakers = 10, maxIters = 10, + epsilon = 1e-4, loopProb = 0.99, statScale = 1.0, + alphaQInit = 1.0, downsample = None, VtiEV = None, ref=None, + plot=False, sparsityThr=0.001, llScale=1.0, minDur=1): + + """ + This a generalized version of speaker diarization described in: + + Kenny, P. Bayesian Analysis of Speaker Diarization with Eigenvoice Priors, + Montreal, CRIM, May 2008. + + Kenny, P., Reynolds, D., and Castaldo, F. Diarization of Telephone + Conversations using Factor Analysis IEEE Journal of Selected Topics in Signal + Processing, December 2010. 
+ + The generalization introduced in this implementation lies in using an HMM + instead of the simple mixture model when modeling generation of segments + (or even frames) from speakers. HMM limits the probability of switching + between speakers when changing frames, which makes it possible to use + the model on frame-by-frame bases without any need to iterate between + 1) clustering speech segments and 2) re-segmentation (i.e. as it was done in + the paper above). + + Inputs: + X - T x D array, where columns are D dimensional feature vectors for T frames + m - C x D array of GMM component means + iE - C x D array of GMM component inverse covariance matrix diagonals + w - C dimensional column vector of GMM component weights + V - R x C x D array of eigenvoices + maxSpeakers - maximum number of speakers expected in the utterance + maxIters - maximum number of algorithm iterations + epsilon - stop iterating, if obj. fun. improvement is less than epsilon + loopProb - probability of not switching speakers between frames + statScale - scale sufficient statiscits collected using UBM + llScale - scale UBM likelihood (i.e. llScale < 1.0 make atribution of + frames to UBM componets more uncertain) + sparsityThr - set occupations smaller that this threshold to 0.0 (saves memory + as the posteriors are represented by sparse matrix) + alphaQInit - Dirichlet concentraion parameter for initializing q + downsample - perform diarization on input downsampled by this factor + VtiEV - C x (R**2+R)/2 matrix normally calculated by VB_diarization when + VtiEV is None. However, it can be pre-calculated using function + precalculate_VtiEV(V) and used across calls of VB_diarization. + minDur - minimum number of frames between speaker turns imposed by linear + chains of HMM states corresponding to each speaker. All the states + in a chain share the same output distribution + ref - T dim. integer vector with reference speaker ID (0:maxSpeakers) + per frame + plot - if set to True, plot per-frame speaker posteriors. + + Outputs: + q - S x T matrix of posteriors attribution each frame to one of S possible + speakers, where S is given by opts.maxSpeakers + sp - S dimensional column vector of ML learned speaker priors. Ideally, these + should allow to estimate # of speaker in the utterance as the + probabilities of the redundant speaker should converge to zero. + Li - values of auxiliary function (and DER and frame cross-entropy between q + and reference if 'ref' is provided) over iterations. + """ + + # The references to equations corresponds to the technical report: + # Kenny, P. Bayesian Analysis of Speaker Diarization with Eigenvoice Priors, + # Montreal, CRIM, May 2008. + + D=X.shape[1] # feature dimensionality + C=len(w) # number of mixture components + R=V.shape[0] # subspace rank + nframes=X.shape[0] + + if VtiEV is None: + VtiEV = precalculate_VtiEV(V, iE) + + V = V.reshape(V.shape[0],-1) + + if sp is None: + sp = np.ones(maxSpeakers)/maxSpeakers + else: + maxSpeakers = len(sp) + + if q is None: + # initialize q from flat Dirichlet prior with concentrsaion parameter alphaQInit + q = np.random.gamma(alphaQInit, size=(nframes, maxSpeakers)) + q = q / q.sum(1, keepdims=True) + + # calculate UBM mixture frame posteriors (i.e. 
per-frame zero order statistics) + ll = (X**2).dot(-0.5*iE.T) + X.dot(iE.T*m.T)-0.5*((iE * m**2 - np.log(iE)).sum(1) - 2*np.log(w) + D*np.log(2*np.pi)) + ll *= llScale + G = logsumexp_ne(ll, axis=1) + NN = exp_ne(ll - G[:,np.newaxis]) * statScale + NN[NN 0 and L - Li[-2][0] < epsilon: + if L - Li[-1][0] < 0: print('WARNING: Value of auxiliary function has decreased!') + break + + if downsample is not None: + #upsample resulting q to match number of frames in the input utterance + q = downsampler.T.dot(q) + + return q, sp, Li + + +def precalculate_VtiEV(V, iE): + tril_ind = np.tril_indices(V.shape[0]) + VtiEV = np.empty((V.shape[1],len(tril_ind[0])), V.dtype) + for c in range(V.shape[1]): + VtiEV[c,:] = np.dot(V[:,c,:]*iE[np.newaxis,c,:], V[:,c,:].T)[tril_ind] + return VtiEV + + +# Initialize q (per-frame speaker posteriors) from a reference +# (vector of per-frame zero based integer speaker IDs) +def frame_labels2posterior_mx(labels, maxSpeakers): + #initialize from reference + #pmx = np.zeros((len(labels), labels.max()+1)) + pmx = np.zeros((len(labels), maxSpeakers)) + pmx[np.arange(len(labels)), labels] = 1 + return pmx + +# Calculates Diarization Error Rate (DER) or per-frame cross-entropy between +# reference (vector of per-frame zero based integer speaker IDs) and q (per-frame +# speaker posteriors). If expected=False, q is converted into hard labels before +# calculating DER. If expected=TRUE, posteriors in q are used to calculated +# "expected" DER. +def DER(q, ref, expected=True, xentropy=False): + from itertools import permutations + + if not expected: + # replce probabiities in q by zeros and ones + hard_labels = q.argmax(1) + q = np.zeros_like(q) + q[range(len(q)), hard_labels] = 1 + + err_mx = np.empty((ref.max()+1, q.shape[1])) + for s in range(err_mx.shape[0]): + tmpq = q[ref == s,:] + err_mx[s] = (-np.log(tmpq) if xentropy else tmpq).sum(0) + + if err_mx.shape[0] < err_mx.shape[1]: + err_mx = err_mx.T + + # try all alignments (permutations) of reference and detected speaker + #could be written in more efficient way using dynamic programing + acc = [err_mx[perm[:err_mx.shape[1]], range(err_mx.shape[1])].sum() + for perm in permutations(range(err_mx.shape[0]))] + if xentropy: + return min(acc)/float(len(ref)) + else: + return (len(ref) - max(acc))/float(len(ref)) + + +############################################################################### +# Module private functions +############################################################################### +def logsumexp(x, axis=0): + xmax = x.max(axis) + x = xmax + np.log(np.sum(np.exp(x - np.expand_dims(xmax, axis)), axis)) + infs = np.isinf(xmax) + if np.ndim(x) > 0: + x[infs] = xmax[infs] + elif infs: + x = xmax + return x + + +# The folowing two functions are only versions optimized for speed using numexpr +# module and can be replaced by logsumexp and np.exp functions to avoid +# the dependency on the module. 
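The numexpr-backed helpers that follow (`logsumexp_ne`, `exp_ne`) are used near the top of this file to turn per-frame, per-component UBM log-likelihoods into normalized posteriors, i.e. the zero-order statistics, before scaling and sparsifying them. A tiny plain-NumPy sketch of that normalization, with made-up numbers and the naive `log(sum(exp(...)))` standing in for the numerically stable helper:

```python
import numpy as np

ll = np.array([[-1.0, -2.0, -0.5],        # 2 frames x 3 GMM components, made-up log-likelihoods
               [-0.2, -3.0, -1.5]])
G  = np.log(np.exp(ll).sum(axis=1))       # per-frame log-normalizer (what logsumexp computes stably)
NN = np.exp(ll - G[:, np.newaxis])        # per-frame component posteriors; each row sums to 1
print(NN.sum(axis=1))                     # -> [1. 1.]
```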
+def logsumexp_ne(x, axis=0): + xmax = np.array(x).max(axis=axis) + xmax_e = np.expand_dims(xmax, axis) + x = ne.evaluate("sum(exp(x - xmax_e), axis=%d)" % axis) + x = ne.evaluate("xmax + log(x)") + infs = np.isinf(xmax) + if np.ndim(x) > 0: + x[infs] = xmax[infs] + elif infs: + x = xmax + return x + + +def exp_ne(x, out=None): + return ne.evaluate("exp(x)", out=None) + + +# Convert vector with lower-triangular coefficients into symetric matrix +def tril_to_sym(tril): + R = np.sqrt(len(tril)*2).astype(int) + tril_ind = np.tril_indices(R) + S = np.empty((R,R)) + S[tril_ind] = tril + S[tril_ind[::-1]] = tril + return S + + +def logdet(A): + return 2*np.sum(np.log(np.diag(spl.cholesky(A)))) + + +def forward_backward(lls, tr, ip): + """ + Inputs: + lls - matrix of per-frame log HMM state output probabilities + tr - transition probability matrix + ip - vector of initial state probabilities (i.e. statrting in the state) + Outputs: + sp - matrix of per-frame state occupation posteriors + tll - total (forward) log-likelihood + lfw - log forward probabilities + lfw - log backward probabilities + """ + ltr = np.log(tr) + lfw = np.empty_like(lls) + lbw = np.empty_like(lls) + lfw[:] = -np.inf + lbw[:] = -np.inf + lfw[0] = lls[0] + np.log(ip) + lbw[-1] = 0.0 + + for ii in xrange(1,len(lls)): + lfw[ii] = lls[ii] + logsumexp(lfw[ii-1] + ltr.T, axis=1) + + for ii in reversed(xrange(len(lls)-1)): + lbw[ii] = logsumexp(ltr + lls[ii+1] + lbw[ii+1], axis=1) + + tll = logsumexp(lfw[-1]) + sp = np.exp(lfw + lbw - tll) + return sp, tll, lfw, lbw diff --git a/egs/callhome_diarization/v1/diarization/VB_resegmentation.py b/egs/callhome_diarization/v1/diarization/VB_resegmentation.py new file mode 100755 index 00000000000..aa951693615 --- /dev/null +++ b/egs/callhome_diarization/v1/diarization/VB_resegmentation.py @@ -0,0 +1,281 @@ +#!/usr/bin/env python + +import numpy as np +import VB_diarization +import pickle +import kaldi_io +import sys +import argparse +import commands + +def get_utt_list(utt2spk_filename): + utt_list = [] + with open(utt2spk_filename, 'r') as fh: + content = fh.readlines() + for line in content: + line = line.strip('\n') + line_split = line.split() + utt_list.append(line_split[0]) + print("{} UTTERANCES IN TOTAL".format(len(utt_list))) + return utt_list + +def utt_num_frames_mapping(utt2num_frames_filename): + utt2num_frames = {} + with open(utt2num_frames_filename, 'r') as fh: + content = fh.readlines() + for line in content: + line = line.strip('\n') + line_split = line.split() + utt2num_frames[line_split[0]] = int(line_split[1]) + return utt2num_frames + +def create_ref_file(uttname, utt2num_frames, full_rttm_filename, temp_dir, rttm_filename): + utt_rttm_file = open("{}/{}".format(temp_dir, rttm_filename), 'w') + + num_frames = utt2num_frames[uttname] + + # We use 0 to denote silence frames and 1 to denote overlapping frames. 
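The body of create_ref_file, continued below, builds a per-frame reference vector under that convention at 100 frames per second. A toy rendering of the convention with two made-up, partially overlapping RTTM-style segments (not taken from any real corpus):

```python
# 0 = silence, 1 = overlapping speech, speaker indices start at 2.
import numpy as np

num_frames = 10
ref = np.zeros(num_frames, dtype=int)
segments = [(0.00, 0.05, "spkA"),        # (start_sec, duration_sec, speaker)
            (0.03, 0.04, "spkB")]        # overlaps spkA on frames 3 and 4
spk2idx = {}
for start, dur, spk in segments:
    idx = spk2idx.setdefault(spk, len(spk2idx) + 2)
    beg, end = int(start * 100), int((start + dur) * 100)
    for i in range(beg, min(end, num_frames)):
        ref[i] = idx if ref[i] == 0 else 1   # any already-claimed frame becomes overlap
print(ref)   # -> [2 2 2 1 1 3 3 0 0 0]
```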
+ ref = np.zeros(num_frames) + speaker_dict = {} + num_spk = 0 + + with open(full_rttm_filename, 'r') as fh: + content = fh.readlines() + for line in content: + line = line.strip('\n') + line_split = line.split() + uttname_line = line_split[1] + if uttname != uttname_line: + continue + else: + utt_rttm_file.write(line + "\n") + start_time = int(float(line_split[3]) * 100) + duration_time = int(float(line_split[4]) * 100) + end_time = start_time + duration_time + spkname = line_split[7] + if spkname not in speaker_dict.keys(): + spk_idx = num_spk + 2 + speaker_dict[spkname] = spk_idx + num_spk += 1 + + for i in range(start_time, end_time): + if i < 0: + raise ValueError(line) + elif i >= num_frames: + print("{} EXCEED NUM_FRAMES".format(line)) + break + else: + if ref[i] == 0: + ref[i] = speaker_dict[spkname] + else: + ref[i] = 1 # The overlapping speech is marked as 1. + ref = ref.astype(int) + + print("{} SPEAKERS IN {}".format(num_spk, uttname)) + print("{} TOTAL, {} SILENCE({:.0f}%), {} OVERLAPPING({:.0f}%)".format(len(ref), np.sum(ref == 0), 100.0 * np.sum(ref == 0) / len(ref), np.sum(ref == 1), 100.0 * np.sum(ref == 1) / len(ref))) + + duration_list = [] + for i in range(num_spk): + duration_list.append(1.0 * np.sum(ref == (i + 2)) / len(ref)) + duration_list.sort() + duration_list = map(lambda x: '{0:.2f}'.format(x), duration_list) + print("DISTRIBUTION OF SPEAKER {}".format(" ".join(duration_list))) + print("") + sys.stdout.flush() + utt_rttm_file.close() + return ref + +def create_rttm_output(uttname, predicted_label, output_dir, channel): + num_frames = len(predicted_label) + + start_idx = 0 + idx_list = [] + + last_label = predicted_label[0] + for i in range(num_frames): + if predicted_label[i] == last_label: # The speaker label remains the same. + continue + else: # The speaker label is different. + if last_label != 0: # Ignore the silence. 
+ idx_list.append([start_idx, i, last_label]) + start_idx = i + last_label = predicted_label[i] + if last_label != 0: + idx_list.append([start_idx, num_frames, last_label]) + + with open("{}/{}_predict.rttm".format(output_dir, uttname), 'w') as fh: + for i in range(len(idx_list)): + start_frame = (idx_list[i])[0] + end_frame = (idx_list[i])[1] + label = (idx_list[i])[2] + duration = end_frame - start_frame + fh.write("SPEAKER {} {} {:.2f} {:.2f} {} \n".format(uttname, channel, start_frame / 100.0, duration / 100.0, label)) + return 0 + +def match_DER(string): + string_split = string.split('\n') + for line in string_split: + if "OVERALL SPEAKER DIARIZATION ERROR" in line: + return line + return 0 + +def main(): + parser = argparse.ArgumentParser(description='VB Resegmentation') + parser.add_argument('data_dir', type=str, help='Subset data directory') + parser.add_argument('init_rttm_filename', type=str, + help='The rttm file to initialize the VB system, usually the AHC cluster result') + parser.add_argument('output_dir', type=str, help='Output directory') + parser.add_argument('dubm_model', type=str, help='Path of the diagonal UBM model') + parser.add_argument('ie_model', type=str, help='Path of the ivector extractor model') + parser.add_argument('--max-speakers', type=int, default=10, + help='Maximum number of speakers expected in the utterance (default: 10)') + parser.add_argument('--max-iters', type=int, default=10, + help='Maximum number of algorithm iterations (default: 10)') + parser.add_argument('--downsample', type=int, default=25, + help='Perform diarization on input downsampled by this factor (default: 25)') + parser.add_argument('--alphaQInit', type=float, default=100.0, + help='Dirichlet concentraion parameter for initializing q') + parser.add_argument('--sparsityThr', type=float, default=0.001, + help='Set occupations smaller that this threshold to 0.0 (saves memory as \ + the posteriors are represented by sparse matrix)') + parser.add_argument('--epsilon', type=float, default=1e-6, + help='Stop iterating, if obj. fun. improvement is less than epsilon') + parser.add_argument('--minDur', type=int, default=1, + help='Minimum number of frames between speaker turns imposed by linear \ + chains of HMM states corresponding to each speaker. All the states \ + in a chain share the same output distribution') + parser.add_argument('--loopProb', type=float, default=0.9, + help='Probability of not switching speakers between frames') + parser.add_argument('--statScale', type=float, default=0.2, + help='Scale sufficient statiscits collected using UBM') + parser.add_argument('--llScale', type=float, default=1.0, + help='Scale UBM likelihood (i.e. 
llScale < 1.0 make atribution of \ + frames to UBM componets more uncertain)') + parser.add_argument('--channel', type=int, default=0, + help='Channel information in the rttm file') + parser.add_argument('--initialize', type=int, default=1, + help='Whether to initalize the speaker posterior') + + args = parser.parse_args() + print(args) + data_dir = args.data_dir + init_rttm_filename = args.init_rttm_filename + + # The data directory should contain wav.scp, spk2utt, utt2spk and feats.scp + utt2spk_filename = "{}/utt2spk".format(data_dir) + utt2num_frames_filename = "{}/utt2num_frames".format(data_dir) + feats_scp_filename = "{}/feats.scp".format(data_dir) + temp_dir = "{}/tmp".format(args.output_dir) + rttm_dir = "{}/rttm".format(args.output_dir) + + utt_list = get_utt_list(utt2spk_filename) + utt2num_frames = utt_num_frames_mapping(utt2num_frames_filename) + print("------------------------------------------------------------------------") + print("") + sys.stdout.flush() + + # Load the diagonal UBM and i-vector extractor + with open(args.dubm_model, 'rb') as fh: + dubm_para = pickle.load(fh) + with open(args.ie_model, 'rb') as fh: + ie_para = pickle.load(fh) + + DUBM_WEIGHTS = None + DUBM_MEANS_INVVARS = None + DUBM_INV_VARS = None + IE_M = None + + for key in dubm_para.keys(): + if key == "": + DUBM_WEIGHTS = dubm_para[key] + elif key == "": + DUBM_MEANS_INVVARS = dubm_para[key] + elif key == "": + DUBM_INV_VARS = dubm_para[key] + else: + continue + + for key in ie_para.keys(): + if key == "M": + IE_M = np.transpose(ie_para[key], (2, 0, 1)) + m = DUBM_MEANS_INVVARS / DUBM_INV_VARS + iE = DUBM_INV_VARS + w = DUBM_WEIGHTS + V = IE_M + + # Load the MFCC features + feats_dict = {} + for key,mat in kaldi_io.read_mat_scp(feats_scp_filename): + feats_dict[key] = mat + + for utt in utt_list: + # Get the alignments from the clustering result. + # In init_ref, 0 denotes the silence silence frames + # 1 denotes the overlapping speech frames, the speaker + # label starts from 2. + init_ref = create_ref_file(utt, utt2num_frames, init_rttm_filename, temp_dir, "{}.rttm".format(utt)) + # Ground truth of the diarization. + + X = feats_dict[utt] + X = X.astype(np.float64) + + # Keep only the voiced frames (0 denotes the silence + # frames, 1 denotes the overlapping speech frames). Since + # our method predicts single speaker label for each frame + # the init_ref doesn't contain 1. + mask = (init_ref >= 2) + X_voiced = X[mask] + init_ref_voiced = init_ref[mask] - 2 + + if X_voiced.shape[0] == 0: + print("Warning: {} has no voiced frames in the initialization file".format(utt)) + continue + + # Initialize the posterior of each speaker based on the clustering result. + if args.initialize: + q = VB_diarization.frame_labels2posterior_mx(init_ref_voiced, args.max_speakers) + else: + q = None + print("RANDOM INITIALIZATION\n") + + # VB resegmentation + + # q - S x T matrix of posteriors attribution each frame to one of S possible + # speakers, where S is given by opts.maxSpeakers + # sp - S dimensional column vector of ML learned speaker priors. Ideally, these + # should allow to estimate # of speaker in the utterance as the + # probabilities of the redundant speaker should converge to zero. + # Li - values of auxiliary function (and DER and frame cross-entropy between q + # and reference if 'ref' is provided) over iterations. 
+ q_out, sp_out, L_out = VB_diarization.VB_diarization(X_voiced, m, iE, w, V, sp=None, q=q, maxSpeakers=args.max_speakers, maxIters=args.max_iters, VtiEV=None, + downsample=args.downsample, alphaQInit=args.alphaQInit, sparsityThr=args.sparsityThr, epsilon=args.epsilon, minDur=args.minDur, + loopProb=args.loopProb, statScale=args.statScale, llScale=args.llScale, ref=None, plot=False) + + predicted_label_voiced = np.argmax(q_out, 1) + 2 + predicted_label = (np.zeros(len(mask))).astype(int) + predicted_label[mask] = predicted_label_voiced + + duration_list = [] + for i in range(args.max_speakers): + num_frames = np.sum(predicted_label == (i + 2)) + if num_frames == 0: + continue + else: + duration_list.append(1.0 * num_frames / len(predicted_label)) + duration_list.sort() + duration_list = map(lambda x: '{0:.2f}'.format(x), duration_list) + print("PREDICTED {} SPEAKERS".format(len(duration_list))) + print("DISTRIBUTION {}".format(" ".join(duration_list))) + print("sp_out", sp_out) + print("L_out", L_out) + + # Create the output rttm file and compute the DER after re-segmentation + create_rttm_output(utt, predicted_label, rttm_dir, args.channel) + print("") + print("------------------------------------------------------------------------") + print("") + sys.stdout.flush() + return 0 + +if __name__ == "__main__": + main() diff --git a/egs/callhome_diarization/v1/diarization/VB_resegmentation.sh b/egs/callhome_diarization/v1/diarization/VB_resegmentation.sh new file mode 100755 index 00000000000..a677f178ee5 --- /dev/null +++ b/egs/callhome_diarization/v1/diarization/VB_resegmentation.sh @@ -0,0 +1,103 @@ +#!/bin/bash + +# Begin configuration section. +nj=20 +cmd=run.pl +stage=0 +true_rttm_filename=None +max_speakers=10 +max_iters=10 +downsample=25 +alphaQInit=100.0 +sparsityThr=0.001 +epsilon=1e-6 +minDur=1 +loopProb=0.9 +statScale=0.2 +llScale=1.0 +channel=0 +initialize=1 +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ -f $KALDI_ROOT/tools/VB_diarization/VB_diarization.py ]; then + echo "VB_diarization is installed so will use the script" +else + echo "VB_diarization is not installed, Please install + it using extras/install_diarization_VBHMM.sh in tools/" + exit 1; +fi + + +if [ $# != 5 ]; then + echo "Usage: local/VB_resegmentation.sh " + echo "Variational Bayes Re-segmenatation" + echo "Options: " + echo " --cmd (utils/run.pl|utils/queue.pl ) # How to run jobs." + echo " --nj # Number of parallel jobs to run." + echo " --true-rttm-filename # The true rttm label file" + echo " --max-speakers # Maximum number of speakers" + echo " # expected in the utterance" + echo " # (default: 10)" + echo " --max-iters # Maximum number of algorithm" + echo " # iterations (default: 10)" + echo " --downsample # Perform diarization on input" + echo " # downsampled by this factor" + echo " # (default: 25)" + echo " --alphaQInit # Dirichlet concentraion" + echo " # parameter for initializing q" + echo " --sparsityThr # Set occupations smaller that" + echo " # this threshold to 0.0 (saves" + echo " # memory as the posteriors are" + echo " # represented by sparse matrix)" + echo " --epsilon # Stop iterating, if obj. fun." + echo " # improvement is less than" + echo " # epsilon" + echo " --minDur # Minimum number of frames" + echo " # between speaker turns imposed" + echo " # by linear chains of HMM" + echo " # state corresponding to each" + echo " # speaker. 
All the states in" + echo " # a chain share the same output" + echo " # distribution" + echo " --loopProb # Probability of not switching" + echo " # speakers between frames" + echo " --statScale # Scale sufficient statistics" + echo " # collected using UBM" + echo " --llScale # Scale UBM likelihood (i.e." + echo " # llScale < 1.0 make" + echo " # attribution of frames to UBM" + echo " # componets more uncertain)" + echo " --channel # Channel information in the rttm file" + echo " --initialize # Whether to initalize the" + echo " # speaker posterior (if not)" + echo " # the speaker posterior will be" + echo " # randomly initilized" + + exit 1; +fi + +data_dir=$1 +init_rttm_filename=$2 +output_dir=$3 +dubm_model=$4 +ie_model=$5 + +mkdir -p $output_dir/rttm + +sdata=$data_dir/split$nj; +utils/split_data.sh $data_dir $nj || exit 1; + +if [ $stage -le 0 ]; then + $cmd JOB=1:$nj $output_dir/log/VB_resegmentation.JOB.log \ + diarization/VB_resegmentation.py --true-rttm-filename $true_rttm_filename --max-speakers $max_speakers \ + --max-iters $max_iters --downsample $downsample --alphaQInit $alphaQInit \ + --sparsityThr $sparsityThr --epsilon $epsilon --minDur $minDur \ + --loopProb $loopProb --statScale $statScale --llScale $llScale \ + --channel $channel --initialize $initialize \ + $sdata/JOB $init_rttm_filename $output_dir $dubm_model $ie_model || exit 1; +fi diff --git a/egs/callhome_diarization/v1/diarization/cluster.sh b/egs/callhome_diarization/v1/diarization/cluster.sh index fa5ead5b6b9..5e5c6e9dbe5 100755 --- a/egs/callhome_diarization/v1/diarization/cluster.sh +++ b/egs/callhome_diarization/v1/diarization/cluster.sh @@ -14,6 +14,8 @@ stage=0 nj=10 cleanup=true threshold=0.5 +max_spk_fraction=1.0 +first_pass_max_utterances=32767 rttm_channel=0 read_costs=false reco2num_spk= @@ -36,6 +38,15 @@ if [ $# != 2 ]; then echo " --threshold # Cluster stopping criterion. Clusters with scores greater" echo " # than this value will be merged until all clusters" echo " # exceed this value." + echo " --max-spk-fraction # Clusters with total fraction of utterances greater than" + echo " # this value will not be merged. This is active only when" + echo " # reco2num-spk is supplied and" + echo " # 1.0 / num-spk <= max-spk-fraction <= 1.0." + echo " --first-pass-max-utterances # If the number of utterances is larger than first-pass-max-utterances," + echo " # then clustering is done in two passes. In the first pass, input points" + echo " # are divided into contiguous subsets of size first-pass-max-utterances" + echo " # and each subset is clustered separately. In the second pass, the first" + echo " # pass clusters are merged into the final set of clusters." echo " --rttm-channel # The value passed into the RTTM channel field. Only affects" echo " # the format of the RTTM file." echo " --read-costs # If true, interpret input scores as costs, i.e. 
similarity" @@ -78,8 +89,10 @@ if [ $stage -le 0 ]; then echo "$0: clustering scores" $cmd JOB=1:$nj $dir/log/agglomerative_cluster.JOB.log \ agglomerative-cluster --threshold=$threshold --read-costs=$read_costs \ - --reco2num-spk-rspecifier=$reco2num_spk scp:"$feats" \ - ark,t:$sdata/JOB/spk2utt ark,t:$dir/labels.JOB || exit 1; + --reco2num-spk-rspecifier=$reco2num_spk \ + --max-spk-fraction=$max_spk_fraction \ + --first-pass-max-utterances=$first_pass_max_utterances \ + scp:"$feats" ark,t:$sdata/JOB/spk2utt ark,t:$dir/labels.JOB || exit 1; fi if [ $stage -le 1 ]; then diff --git a/egs/callhome_diarization/v1/diarization/dump_model.py b/egs/callhome_diarization/v1/diarization/dump_model.py new file mode 100755 index 00000000000..47a85b114d3 --- /dev/null +++ b/egs/callhome_diarization/v1/diarization/dump_model.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python + +import numpy as np +import pickle +import sys + +def load_dubm(dubm_text): + para_dict = {} + with open(dubm_text, 'r') as fh: + content = fh.readlines() + state = 0 + data_array = [] + + for line in content: + line = line.strip('\n') + line_split = line.split() + if state == 0: + if len(line_split) == 1: + continue + elif len(line_split) == 2 and line_split[1] == "[": + para_name = line_split[0] + state = 1 + data_array = [] + elif len(line_split) >= 3 and line_split[1] == "[" and line_split[-1] == "]": # One line vector + para_name = line_split[0] + data_list = [] + for i in range(2, len(line_split) - 1): + data_list.append(float(line_split[i])) + data_list = np.array(data_list) + para_dict[para_name] = data_list + else: + raise ValueError("Condition not defined.") + elif state == 1: + if line_split[-1] == "]": + data_list = [] + for i in range(len(line_split) - 1): + data_list.append(float(line_split[i])) + data_list = np.array(data_list) + data_array.append(data_list) + data_array = np.array(data_array) + para_dict[para_name] = data_array + state = 0 + else: + data_list = [] + for i in range(len(line_split)): + data_list.append(float(line_split[i])) + data_list = np.array(data_list) + data_array.append(data_list) + else: + raise ValueError("Condition not defined.") + return para_dict + +def load_ivector_extractor(ie_text): + para_dict = {} + with open(ie_text, 'r') as fh: + content = fh.readlines() + state = 0 + data_3dmatrix = [] + data_matrix = [] + data_array = [] + + for line in content: + line = line.strip('\n') + if line == " [": + break + if state == 0: + if line != " 1024 [": + continue + else: + state = 1 + elif state == 1: + line_split = line.split() + if line_split[0] == "[": + continue + elif line_split[-1] == "]": + data_array = [] + for i in range(len(line_split)-1): + data_array.append(float(line_split[i])) + data_matrix.append(data_array) + data_3dmatrix.append(data_matrix) + data_matrix = [] + else: + data_array = [] + for i in range(len(line_split)): + data_array.append(float(line_split[i])) + data_matrix.append(data_array) + else: + raise ValueError("Condition not defined.") + para_dict['M'] = np.array(data_3dmatrix) + return para_dict + +def save_dict(para_dict, output_filename): + with open(output_filename, 'wb') as fh: + pickle.dump(para_dict, fh) + return 0 + +def judge_case(txt_model): + with open(txt_model, 'r') as fh: + first_line = fh.readline() + model_type = first_line.split()[0] + if model_type == "": + return 1 + elif model_type == "": + return 2 + else: + return 0 + +def main(): + # The txt version of diagonal UBM and i-vector extractor. See gmm-global-copy + # and ivector-extractor-copy for details. 
(ivector-extractor-copy is not + # supported in the official kaldi, so you have to use my kaldi) + txt_model = sys.argv[1] + output_dir = sys.argv[2] + model_type = judge_case(txt_model) + + if model_type == 1: # DiagGMM + dubm_para = load_dubm(txt_model) + save_dict(dubm_para, "{}/diag_ubm.pkl".format(output_dir)) + elif model_type == 2: # IvectorExtractor + ie_para = load_ivector_extractor(txt_model) + save_dict(ie_para, "{}/ie.pkl".format(output_dir)) + else: + raise ValueError("Condition not defined.") + return 0 + +if __name__ == "__main__": + main() diff --git a/egs/callhome_diarization/v1/diarization/kaldi_io.py b/egs/callhome_diarization/v1/diarization/kaldi_io.py new file mode 100755 index 00000000000..dae5599b8f1 --- /dev/null +++ b/egs/callhome_diarization/v1/diarization/kaldi_io.py @@ -0,0 +1,627 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# Copyright 2014-2016 Brno University of Technology (author: Karel Vesely) +# Licensed under the Apache License, Version 2.0 (the "License") + +import numpy as np +import sys, os, re, gzip, struct + +################################################# +# Adding kaldi tools to shell path, + +# Select kaldi, +if not 'KALDI_ROOT' in os.environ: + # Default! To change run python with 'export KALDI_ROOT=/some_dir python' + os.environ['KALDI_ROOT']='/mnt/matylda5/iveselyk/Tools/kaldi-trunk' + +# Add kaldi tools to path, +os.environ['PATH'] = os.popen('echo $KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/nnet3bin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/').readline().strip() + ':' + os.environ['PATH'] + + +################################################# +# Define all custom exceptions, +class UnsupportedDataType(Exception): pass +class UnknownVectorHeader(Exception): pass +class UnknownMatrixHeader(Exception): pass + +class BadSampleSize(Exception): pass +class BadInputFormat(Exception): pass + +class SubprocessFailed(Exception): pass + +################################################# +# Data-type independent helper functions, + +def open_or_fd(file, mode='rb'): + """ fd = open_or_fd(file) + Open file, gzipped file, pipe, or forward the file-descriptor. + Eventually seeks in the 'file' argument contains ':offset' suffix. + """ + offset = None + try: + # strip 'ark:' prefix from r{x,w}filename (optional), + if re.search('^(ark|scp)(,scp|,b|,t|,n?f|,n?p|,b?o|,n?s|,n?cs)*:', file): + (prefix,file) = file.split(':',1) + # separate offset from filename (optional), + if re.search(':[0-9]+$', file): + (file,offset) = file.rsplit(':',1) + # input pipe? + if file[-1] == '|': + fd = popen(file[:-1], 'rb') # custom, + # output pipe? + elif file[0] == '|': + fd = popen(file[1:], 'wb') # custom, + # is it gzipped? + elif file.split('.')[-1] == 'gz': + fd = gzip.open(file, mode) + # a normal file... 
+ else: + fd = open(file, mode) + except TypeError: + # 'file' is opened file descriptor, + fd = file + # Eventually seek to offset, + if offset != None: fd.seek(int(offset)) + return fd + +# based on '/usr/local/lib/python3.4/os.py' +def popen(cmd, mode="rb"): + if not isinstance(cmd, str): + raise TypeError("invalid cmd type (%s, expected string)" % type(cmd)) + + import subprocess, io, threading + + # cleanup function for subprocesses, + def cleanup(proc, cmd): + ret = proc.wait() + if ret > 0: + raise SubprocessFailed('cmd %s returned %d !' % (cmd,ret)) + return + + # text-mode, + if mode == "r": + proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) + threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, + return io.TextIOWrapper(proc.stdout) + elif mode == "w": + proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) + threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, + return io.TextIOWrapper(proc.stdin) + # binary, + elif mode == "rb": + proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) + threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, + return proc.stdout + elif mode == "wb": + proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) + threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, + return proc.stdin + # sanity, + else: + raise ValueError("invalid mode %s" % mode) + + +def read_key(fd): + """ [key] = read_key(fd) + Read the utterance-key from the opened ark/stream descriptor 'fd'. + """ + key = '' + while 1: + char = fd.read(1).decode("latin1") + if char == '' : break + if char == ' ' : break + key += char + key = key.strip() + if key == '': return None # end of file, + assert(re.match('^\S+$',key) != None) # check format (no whitespace!) + return key + + +################################################# +# Integer vectors (alignments, ...), + +def read_ali_ark(file_or_fd): + """ Alias to 'read_vec_int_ark()' """ + return read_vec_int_ark(file_or_fd) + +def read_vec_int_ark(file_or_fd): + """ generator(key,vec) = read_vec_int_ark(file_or_fd) + Create generator of (key,vector) tuples, which reads from the ark file/stream. + file_or_fd : ark, gzipped ark, pipe or opened file descriptor. 
+ + Read ark to a 'dictionary': + d = { u:d for u,d in kaldi_io.read_vec_int_ark(file) } + """ + fd = open_or_fd(file_or_fd) + try: + key = read_key(fd) + while key: + ali = read_vec_int(fd) + yield key, ali + key = read_key(fd) + finally: + if fd is not file_or_fd: fd.close() + +def read_vec_int(file_or_fd): + """ [int-vec] = read_vec_int(file_or_fd) + Read kaldi integer vector, ascii or binary input, + """ + fd = open_or_fd(file_or_fd) + binary = fd.read(2).decode() + if binary == '\0B': # binary flag + assert(fd.read(1).decode() == '\4'); # int-size + vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim + # Elements from int32 vector are sored in tuples: (sizeof(int32), value), + vec = np.frombuffer(fd.read(vec_size*5), dtype=[('size','int8'),('value','int32')], count=vec_size) + assert(vec[0]['size'] == 4) # int32 size, + ans = vec[:]['value'] # values are in 2nd column, + else: # ascii, + arr = (binary + fd.readline().decode()).strip().split() + try: + arr.remove('['); arr.remove(']') # optionally + except ValueError: + pass + ans = np.array(arr, dtype=int) + if fd is not file_or_fd : fd.close() # cleanup + return ans + +# Writing, +def write_vec_int(file_or_fd, v, key=''): + """ write_vec_int(f, v, key='') + Write a binary kaldi integer vector to filename or stream. + Arguments: + file_or_fd : filename or opened file descriptor for writing, + v : the vector to be stored, + key (optional) : used for writing ark-file, the utterance-id gets written before the vector. + + Example of writing single vector: + kaldi_io.write_vec_int(filename, vec) + + Example of writing arkfile: + with open(ark_file,'w') as f: + for key,vec in dict.iteritems(): + kaldi_io.write_vec_flt(f, vec, key=key) + """ + fd = open_or_fd(file_or_fd, mode='wb') + if sys.version_info[0] == 3: assert(fd.mode == 'wb') + try: + if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), + fd.write('\0B'.encode()) # we write binary! + # dim, + fd.write('\4'.encode()) # int32 type, + fd.write(struct.pack(np.dtype('int32').char, v.shape[0])) + # data, + for i in range(len(v)): + fd.write('\4'.encode()) # int32 type, + fd.write(struct.pack(np.dtype('int32').char, v[i])) # binary, + finally: + if fd is not file_or_fd : fd.close() + + +################################################# +# Float vectors (confidences, ivectors, ...), + +# Reading, +def read_vec_flt_scp(file_or_fd): + """ generator(key,mat) = read_vec_flt_scp(file_or_fd) + Returns generator of (key,vector) tuples, read according to kaldi scp. + file_or_fd : scp, gzipped scp, pipe or opened file descriptor. + + Iterate the scp: + for key,vec in kaldi_io.read_vec_flt_scp(file): + ... + + Read scp to a 'dictionary': + d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } + """ + fd = open_or_fd(file_or_fd) + try: + for line in fd: + (key,rxfile) = line.decode().split(' ') + vec = read_vec_flt(rxfile) + yield key, vec + finally: + if fd is not file_or_fd : fd.close() + +def read_vec_flt_ark(file_or_fd): + """ generator(key,vec) = read_vec_flt_ark(file_or_fd) + Create generator of (key,vector) tuples, reading from an ark file/stream. + file_or_fd : ark, gzipped ark, pipe or opened file descriptor. 
+ + Read ark to a 'dictionary': + d = { u:d for u,d in kaldi_io.read_vec_flt_ark(file) } + """ + fd = open_or_fd(file_or_fd) + try: + key = read_key(fd) + while key: + ali = read_vec_flt(fd) + yield key, ali + key = read_key(fd) + finally: + if fd is not file_or_fd: fd.close() + +def read_vec_flt(file_or_fd): + """ [flt-vec] = read_vec_flt(file_or_fd) + Read kaldi float vector, ascii or binary input, + """ + fd = open_or_fd(file_or_fd) + binary = fd.read(2).decode() + if binary == '\0B': # binary flag + # Data type, + header = fd.read(3).decode() + if header == 'FV ': sample_size = 4 # floats + elif header == 'DV ': sample_size = 8 # doubles + else: raise UnknownVectorHeader("The header contained '%s'" % header) + assert(sample_size > 0) + # Dimension, + assert(fd.read(1).decode() == '\4'); # int-size + vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim + # Read whole vector, + buf = fd.read(vec_size * sample_size) + if sample_size == 4 : ans = np.frombuffer(buf, dtype='float32') + elif sample_size == 8 : ans = np.frombuffer(buf, dtype='float64') + else : raise BadSampleSize + return ans + else: # ascii, + arr = (binary + fd.readline().decode()).strip().split() + try: + arr.remove('['); arr.remove(']') # optionally + except ValueError: + pass + ans = np.array(arr, dtype=float) + if fd is not file_or_fd : fd.close() # cleanup + return ans + +# Writing, +def write_vec_flt(file_or_fd, v, key=''): + """ write_vec_flt(f, v, key='') + Write a binary kaldi vector to filename or stream. Supports 32bit and 64bit floats. + Arguments: + file_or_fd : filename or opened file descriptor for writing, + v : the vector to be stored, + key (optional) : used for writing ark-file, the utterance-id gets written before the vector. + + Example of writing single vector: + kaldi_io.write_vec_flt(filename, vec) + + Example of writing arkfile: + with open(ark_file,'w') as f: + for key,vec in dict.iteritems(): + kaldi_io.write_vec_flt(f, vec, key=key) + """ + fd = open_or_fd(file_or_fd, mode='wb') + if sys.version_info[0] == 3: assert(fd.mode == 'wb') + try: + if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), + fd.write('\0B'.encode()) # we write binary! + # Data-type, + if v.dtype == 'float32': fd.write('FV '.encode()) + elif v.dtype == 'float64': fd.write('DV '.encode()) + else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % v.dtype) + # Dim, + fd.write('\04'.encode()) + fd.write(struct.pack(np.dtype('uint32').char, v.shape[0])) # dim + # Data, + fd.write(v.tobytes()) + finally: + if fd is not file_or_fd : fd.close() + + +################################################# +# Float matrices (features, transformations, ...), + +# Reading, +def read_mat_scp(file_or_fd): + """ generator(key,mat) = read_mat_scp(file_or_fd) + Returns generator of (key,matrix) tuples, read according to kaldi scp. + file_or_fd : scp, gzipped scp, pipe or opened file descriptor. + + Iterate the scp: + for key,mat in kaldi_io.read_mat_scp(file): + ... + + Read scp to a 'dictionary': + d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } + """ + fd = open_or_fd(file_or_fd) + try: + for line in fd: + (key,rxfile) = line.decode().split(' ') + mat = read_mat(rxfile) + yield key, mat + finally: + if fd is not file_or_fd : fd.close() + +def read_mat_ark(file_or_fd): + """ generator(key,mat) = read_mat_ark(file_or_fd) + Returns generator of (key,matrix) tuples, read from ark file/stream. + file_or_fd : scp, gzipped scp, pipe or opened file descriptor. 
+ + Iterate the ark: + for key,mat in kaldi_io.read_mat_ark(file): + ... + + Read ark to a 'dictionary': + d = { key:mat for key,mat in kaldi_io.read_mat_ark(file) } + """ + fd = open_or_fd(file_or_fd) + try: + key = read_key(fd) + while key: + mat = read_mat(fd) + yield key, mat + key = read_key(fd) + finally: + if fd is not file_or_fd : fd.close() + +def read_mat(file_or_fd): + """ [mat] = read_mat(file_or_fd) + Reads single kaldi matrix, supports ascii and binary. + file_or_fd : file, gzipped file, pipe or opened file descriptor. + """ + fd = open_or_fd(file_or_fd) + try: + binary = fd.read(2).decode() + if binary == '\0B' : + mat = _read_mat_binary(fd) + else: + assert(binary == ' [') + mat = _read_mat_ascii(fd) + finally: + if fd is not file_or_fd: fd.close() + return mat + +def _read_mat_binary(fd): + # Data type + header = fd.read(3).decode() + # 'CM', 'CM2', 'CM3' are possible values, + if header.startswith('CM'): return _read_compressed_mat(fd, header) + elif header == 'FM ': sample_size = 4 # floats + elif header == 'DM ': sample_size = 8 # doubles + else: raise UnknownMatrixHeader("The header contained '%s'" % header) + assert(sample_size > 0) + # Dimensions + s1, rows, s2, cols = np.frombuffer(fd.read(10), dtype='int8,int32,int8,int32', count=1)[0] + # Read whole matrix + buf = fd.read(rows * cols * sample_size) + if sample_size == 4 : vec = np.frombuffer(buf, dtype='float32') + elif sample_size == 8 : vec = np.frombuffer(buf, dtype='float64') + else : raise BadSampleSize + mat = np.reshape(vec,(rows,cols)) + return mat + +def _read_mat_ascii(fd): + rows = [] + while 1: + line = fd.readline().decode() + if (len(line) == 0) : raise BadInputFormat # eof, should not happen! + if len(line.strip()) == 0 : continue # skip empty line + arr = line.strip().split() + if arr[-1] != ']': + rows.append(np.array(arr,dtype='float32')) # not last line + else: + rows.append(np.array(arr[:-1],dtype='float32')) # last line + mat = np.vstack(rows) + return mat + + +def _read_compressed_mat(fd, format): + """ Read a compressed matrix, + see: https://github.com/kaldi-asr/kaldi/blob/master/src/matrix/compressed-matrix.h + methods: CompressedMatrix::Read(...), CompressedMatrix::CopyToMat(...), + """ + assert(format == 'CM ') # The formats CM2, CM3 are not supported... + + # Format of header 'struct', + global_header = np.dtype([('minvalue','float32'),('range','float32'),('num_rows','int32'),('num_cols','int32')]) # member '.format' is not written, + per_col_header = np.dtype([('percentile_0','uint16'),('percentile_25','uint16'),('percentile_75','uint16'),('percentile_100','uint16')]) + + # Read global header, + globmin, globrange, rows, cols = np.frombuffer(fd.read(16), dtype=global_header, count=1)[0] + + # The data is structed as [Colheader, ... , Colheader, Data, Data , .... ] + # { cols }{ size } + col_headers = np.frombuffer(fd.read(cols*8), dtype=per_col_header, count=cols) + col_headers = np.array([np.array([x for x in y]) * globrange * 1.52590218966964e-05 + globmin for y in col_headers], dtype=np.float32) + data = np.reshape(np.frombuffer(fd.read(cols*rows), dtype='uint8', count=cols*rows), newshape=(cols,rows)) # stored as col-major, + + mat = np.zeros((cols,rows), dtype='float32') + p0 = col_headers[:, 0].reshape(-1, 1) + p25 = col_headers[:, 1].reshape(-1, 1) + p75 = col_headers[:, 2].reshape(-1, 1) + p100 = col_headers[:, 3].reshape(-1, 1) + mask_0_64 = (data <= 64) + mask_193_255 = (data > 192) + mask_65_192 = (~(mask_0_64 | mask_193_255)) + + mat += (p0 + (p25 - p0) / 64. 
* data) * mask_0_64.astype(np.float32) + mat += (p25 + (p75 - p25) / 128. * (data - 64)) * mask_65_192.astype(np.float32) + mat += (p75 + (p100 - p75) / 63. * (data - 192)) * mask_193_255.astype(np.float32) + + return mat.T # transpose! col-major -> row-major, + + +# Writing, +def write_mat(file_or_fd, m, key=''): + """ write_mat(f, m, key='') + Write a binary kaldi matrix to filename or stream. Supports 32bit and 64bit floats. + Arguments: + file_or_fd : filename of opened file descriptor for writing, + m : the matrix to be stored, + key (optional) : used for writing ark-file, the utterance-id gets written before the matrix. + + Example of writing single matrix: + kaldi_io.write_mat(filename, mat) + + Example of writing arkfile: + with open(ark_file,'w') as f: + for key,mat in dict.iteritems(): + kaldi_io.write_mat(f, mat, key=key) + """ + fd = open_or_fd(file_or_fd, mode='wb') + if sys.version_info[0] == 3: assert(fd.mode == 'wb') + try: + if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), + fd.write('\0B'.encode()) # we write binary! + # Data-type, + if m.dtype == 'float32': fd.write('FM '.encode()) + elif m.dtype == 'float64': fd.write('DM '.encode()) + else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % m.dtype) + # Dims, + fd.write('\04'.encode()) + fd.write(struct.pack(np.dtype('uint32').char, m.shape[0])) # rows + fd.write('\04'.encode()) + fd.write(struct.pack(np.dtype('uint32').char, m.shape[1])) # cols + # Data, + fd.write(m.tobytes()) + finally: + if fd is not file_or_fd : fd.close() + + +################################################# +# 'Posterior' kaldi type (posteriors, confusion network, nnet1 training targets, ...) +# Corresponds to: vector > > +# - outer vector: time axis +# - inner vector: records at the time +# - tuple: int = index, float = value +# + +def read_cnet_ark(file_or_fd): + """ Alias of function 'read_post_ark()', 'cnet' = confusion network """ + return read_post_ark(file_or_fd) + +def read_post_ark(file_or_fd): + """ generator(key,vec>) = read_post_ark(file) + Returns generator of (key,posterior) tuples, read from ark file. + file_or_fd : ark, gzipped ark, pipe or opened file descriptor. + + Iterate the ark: + for key,post in kaldi_io.read_post_ark(file): + ... + + Read ark to a 'dictionary': + d = { key:post for key,post in kaldi_io.read_post_ark(file) } + """ + fd = open_or_fd(file_or_fd) + try: + key = read_key(fd) + while key: + post = read_post(fd) + yield key, post + key = read_key(fd) + finally: + if fd is not file_or_fd: fd.close() + +def read_post(file_or_fd): + """ [post] = read_post(file_or_fd) + Reads single kaldi 'Posterior' in binary format. + + The 'Posterior' is C++ type 'vector > >', + the outer-vector is usually time axis, inner-vector are the records + at given time, and the tuple is composed of an 'index' (integer) + and a 'float-value'. The 'float-value' can represent a probability + or any other numeric value. + + Returns vector of vectors of tuples. 
+ """ + fd = open_or_fd(file_or_fd) + ans=[] + binary = fd.read(2).decode(); assert(binary == '\0B'); # binary flag + assert(fd.read(1).decode() == '\4'); # int-size + outer_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) + + # Loop over 'outer-vector', + for i in range(outer_vec_size): + assert(fd.read(1).decode() == '\4'); # int-size + inner_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of records for frame (or bin) + data = np.frombuffer(fd.read(inner_vec_size*10), dtype=[('size_idx','int8'),('idx','int32'),('size_post','int8'),('post','float32')], count=inner_vec_size) + assert(data[0]['size_idx'] == 4) + assert(data[0]['size_post'] == 4) + ans.append(data[['idx','post']].tolist()) + + if fd is not file_or_fd: fd.close() + return ans + + +################################################# +# Kaldi Confusion Network bin begin/end times, +# (kaldi stores CNs time info separately from the Posterior). +# + +def read_cntime_ark(file_or_fd): + """ generator(key,vec>) = read_cntime_ark(file_or_fd) + Returns generator of (key,cntime) tuples, read from ark file. + file_or_fd : file, gzipped file, pipe or opened file descriptor. + + Iterate the ark: + for key,time in kaldi_io.read_cntime_ark(file): + ... + + Read ark to a 'dictionary': + d = { key:time for key,time in kaldi_io.read_post_ark(file) } + """ + fd = open_or_fd(file_or_fd) + try: + key = read_key(fd) + while key: + cntime = read_cntime(fd) + yield key, cntime + key = read_key(fd) + finally: + if fd is not file_or_fd : fd.close() + +def read_cntime(file_or_fd): + """ [cntime] = read_cntime(file_or_fd) + Reads single kaldi 'Confusion Network time info', in binary format: + C++ type: vector >. + (begin/end times of bins at the confusion network). + + Binary layout is ' ...' + + file_or_fd : file, gzipped file, pipe or opened file descriptor. + + Returns vector of tuples. 
+ """ + fd = open_or_fd(file_or_fd) + binary = fd.read(2).decode(); assert(binary == '\0B'); # assuming it's binary + + assert(fd.read(1).decode() == '\4'); # int-size + vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) + + data = np.frombuffer(fd.read(vec_size*10), dtype=[('size_beg','int8'),('t_beg','float32'),('size_end','int8'),('t_end','float32')], count=vec_size) + assert(data[0]['size_beg'] == 4) + assert(data[0]['size_end'] == 4) + ans = data[['t_beg','t_end']].tolist() # Return vector of tuples (t_beg,t_end), + + if fd is not file_or_fd : fd.close() + return ans + + +################################################# +# Segments related, +# + +# Segments as 'Bool vectors' can be handy, +# - for 'superposing' the segmentations, +# - for frame-selection in Speaker-ID experiments, +def read_segments_as_bool_vec(segments_file): + """ [ bool_vec ] = read_segments_as_bool_vec(segments_file) + using kaldi 'segments' file for 1 wav, format : ' ' + - t-beg, t-end is in seconds, + - assumed 100 frames/second, + """ + segs = np.loadtxt(segments_file, dtype='object,object,f,f', ndmin=1) + # Sanity checks, + assert(len(segs) > 0) # empty segmentation is an error, + assert(len(np.unique([rec[1] for rec in segs ])) == 1) # segments with only 1 wav-file, + # Convert time to frame-indexes, + start = np.rint([100 * rec[2] for rec in segs]).astype(int) + end = np.rint([100 * rec[3] for rec in segs]).astype(int) + # Taken from 'read_lab_to_bool_vec', htk.py, + frms = np.repeat(np.r_[np.tile([False,True], len(end)), False], + np.r_[np.c_[start - np.r_[0, end[:-1]], end-start].flat, 0]) + assert np.sum(end-start) == np.sum(frms) + return frms + diff --git a/egs/callhome_diarization/v1/diarization/nnet3/xvector/extract_xvectors.sh b/egs/callhome_diarization/v1/diarization/nnet3/xvector/extract_xvectors.sh index d7591a6a3a8..8d579138c73 100755 --- a/egs/callhome_diarization/v1/diarization/nnet3/xvector/extract_xvectors.sh +++ b/egs/callhome_diarization/v1/diarization/nnet3/xvector/extract_xvectors.sh @@ -102,7 +102,7 @@ if [ $stage -le 0 ]; then fi utils/data/get_uniform_subsegments.py \ --max-segment-duration=$window \ - --overlap-duration=$(echo "$window-$period" | bc) \ + --overlap-duration=$(perl -e "print ($window-$period);") \ --max-remaining-duration=$min_segment \ --constant-duration=True \ $segments > $dir/subsegments diff --git a/egs/callhome_diarization/v1/diarization/train_ivector_extractor_diag.sh b/egs/callhome_diarization/v1/diarization/train_ivector_extractor_diag.sh new file mode 100755 index 00000000000..6751fb7dd22 --- /dev/null +++ b/egs/callhome_diarization/v1/diarization/train_ivector_extractor_diag.sh @@ -0,0 +1,166 @@ +#!/bin/bash + +# Copyright 2013 Daniel Povey +# 2014 David Snyder +# Apache 2.0. + +# This script trains the i-vector extractor. Note: there are 3 separate levels +# of parallelization: num_threads, num_processes, and num_jobs. This may seem a +# bit excessive. It has to do with minimizing memory usage and disk I/O, +# subject to various constraints. The "num_threads" is how many threads a +# program uses; the "num_processes" is the number of separate processes a single +# job spawns, and then sums the accumulators in memory. Our recommendation: +# - Set num_threads to the minimum of (4, or how many virtual cores your machine has). +# (because of needing to lock various global quantities, the program can't +# use many more than 4 threads with good CPU utilization). 
+# - Set num_processes to the number of virtual cores on each machine you have, divided by +# num_threads. E.g. 4, if you have 16 virtual cores. If you're on a shared queue +# that's busy with other people's jobs, it may be wise to set it to rather less +# than this maximum though, or your jobs won't get scheduled. And if memory is +# tight you need to be careful; in our normal setup, each process uses about 5G. +# - Set num_jobs to as many of the jobs (each using $num_threads * $num_processes CPUs) +# your queue will let you run at one time, but don't go much more than 10 or 20, or +# summing the accumulators will possibly get slow. If you have a lot of data, you +# may want more jobs, though. + +# Begin configuration section. +nj=10 # this is the number of separate queue jobs we run, but each one + # contains num_processes sub-jobs.. the real number of threads we + # run is nj * num_processes * num_threads, and the number of + # separate pieces of data is nj * num_processes. +num_threads=4 +num_processes=4 # each job runs this many processes, each with --num-threads threads +cmd="queue.pl" +stage=-4 +num_gselect=20 # Gaussian-selection using diagonal model: number of Gaussians to select +ivector_dim=400 # dimension of the extracted i-vector +use_weights=false # set to true to turn on the regression of log-weights on the ivector. +num_iters=10 +min_post=0.025 # Minimum posterior to use (posteriors below this are pruned out) +num_samples_for_weights=3 # smaller than the default for speed (relates to a sampling method) +cleanup=true +apply_cmn=true # If true, apply sliding window cepstral mean normalization +posterior_scale=1.0 # This scale helps to control for successve features being highly + # correlated. E.g. try 0.1 or 0.3 +sum_accs_opt= +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + + +if [ $# != 3 ]; then + echo "Usage: $0 " + echo " e.g.: $0 exp/ubm_2048_male/final.dubm data/train_male exp/extractor_male" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --num-iters <#iters|10> # Number of iterations of E-M" + echo " --nj # Number of jobs (also see num-processes and num-threads)" + echo " --num-processes # Number of processes for each queue job (relates" + echo " # to summing accs in memory)" + echo " --num-threads # Number of threads for each process (can't be usefully" + echo " # increased much above 4)" + echo " --stage # To control partial reruns" + echo " --num-gselect # Number of Gaussians to select using" + echo " # diagonal model." + echo " --sum-accs-opt # Option e.g. '-l hostname=a15' to localize" + echo " # sum-accs process to nfs server." + echo " --apply-cmn # if true, apply sliding window cepstral mean" + echo " # normalization to features" + exit 1; +fi + +gmm_model=$1 +data=$2 +dir=$3 +srcdir=$(dirname $gmm_model) + +for f in $gmm_model $data/feats.scp ; do + [ ! -f $f ] && echo "No such file $f" && exit 1; +done + +# Set various variables. +mkdir -p $dir/log +nj_full=$[$nj*$num_processes] +sdata=$data/split$nj_full; +utils/split_data.sh $data $nj_full || exit 1; + +delta_opts=`cat $srcdir/delta_opts 2>/dev/null` +if [ -f $srcdir/delta_opts ]; then + cp $srcdir/delta_opts $dir/ 2>/dev/null +fi + +parallel_opts="--num-threads $[$num_threads*$num_processes]" +## Set up features. 
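+# The feature pipelines below append deltas, optionally apply sliding-window
+# cepstral mean normalization (when --apply-cmn is true), and keep only the
+# frames marked as speech in vad.scp.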
+if $apply_cmn; then + feats="ark,s,cs:add-deltas $delta_opts scp:$sdata/JOB/feats.scp ark:- | apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 ark:- ark:- | select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- |" +else + feats="ark,s,cs:add-deltas $delta_opts scp:$sdata/JOB/feats.scp ark:- | select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- |" +fi + +# Initialize the i-vector extractor using the FGMM input +if [ $stage -le -2 ]; then + cp $gmm_model $dir/final.dubm || exit 1; + $cmd $dir/log/init.log \ + ivector-extractor-init --ivector-dim=$ivector_dim --use-weights=$use_weights \ + "gmm-global-to-fgmm $dir/final.dubm -|" $dir/0.ie || exit 1 +fi + +# Do Gaussian selection and posterior extracion + +if [ $stage -le -1 ]; then + echo $nj_full > $dir/num_jobs + echo "$0: doing Gaussian selection and posterior computation" + $cmd JOB=1:$nj_full $dir/log/gselect.JOB.log \ + gmm-global-get-post --n=$num_gselect --min-post=$min_post $dir/final.dubm "$feats" ark:- \| \ + scale-post ark:- $posterior_scale "ark:|gzip -c >$dir/post.JOB.gz" || exit 1; +else + if ! [ $nj_full -eq $(cat $dir/num_jobs) ]; then + echo "Num-jobs mismatch $nj_full versus $(cat $dir/num_jobs)" + exit 1 + fi +fi + +x=0 +while [ $x -lt $num_iters ]; do + if [ $stage -le $x ]; then + rm $dir/.error 2>/dev/null + + Args=() # bash array of training commands for 1:nj, that put accs to stdout. + for j in $(seq $nj_full); do + Args[$j]=`echo "ivector-extractor-acc-stats --num-threads=$num_threads --num-samples-for-weights=$num_samples_for_weights $dir/$x.ie '$feats' 'ark,s,cs:gunzip -c $dir/post.JOB.gz|' -|" | sed s/JOB/$j/g` + done + + echo "Accumulating stats (pass $x)" + for g in $(seq $nj); do + start=$[$num_processes*($g-1)+1] + $cmd $parallel_opts $dir/log/acc.$x.$g.log \ + ivector-extractor-sum-accs --parallel=true "${Args[@]:$start:$num_processes}" \ + $dir/acc.$x.$g || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "Error accumulating stats on iteration $x" && exit 1; + accs="" + for j in $(seq $nj); do + accs+="$dir/acc.$x.$j " + done + echo "Summing accs (pass $x)" + $cmd $sum_accs_opt $dir/log/sum_acc.$x.log \ + ivector-extractor-sum-accs $accs $dir/acc.$x || exit 1; + echo "Updating model (pass $x)" + nt=$[$num_threads*$num_processes] # use the same number of threads that + # each accumulation process uses, since we + # can be sure the queue will support this many. 
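+    # ivector-extractor-est re-estimates the extractor from the summed
+    # accumulators, writing $[$x+1].ie for the next iteration.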
+ $cmd $parallel_opts $dir/log/update.$x.log \ + ivector-extractor-est --num-threads=$nt $dir/$x.ie $dir/acc.$x $dir/$[$x+1].ie || exit 1; + rm $dir/acc.$x.* + $cleanup && rm $dir/acc.$x $dir/$x.ie + fi + x=$[$x+1] +done +$cleanup && rm -f $dir/post.*.gz +rm -f $dir/final.ie +ln -s $x.ie $dir/final.ie diff --git a/egs/callhome_diarization/v1/local/make_callhome.sh b/egs/callhome_diarization/v1/local/make_callhome.sh index caa8f679f22..0d25f9f428c 100755 --- a/egs/callhome_diarization/v1/local/make_callhome.sh +++ b/egs/callhome_diarization/v1/local/make_callhome.sh @@ -70,4 +70,10 @@ utils/filter_scp.pl $data_dir/callhome1/wav.scp $data_dir/callhome/reco2num_spk utils/filter_scp.pl $data_dir/callhome2/wav.scp $data_dir/callhome/reco2num_spk \ > $data_dir/callhome2/reco2num_spk +# Here we tweak somethings to make VB resegmentation work for original data/callhome +rm $data_dir/callhome/segments +awk '{print $1, $1}' $data_dir/callhome/wav.scp > $data_dir/callhome/utt2spk +utils/utt2spk_to_spk2utt.pl $data_dir/callhome/utt2spk > $data_dir/callhome/spk2utt +utils/data/get_utt2num_frames.sh $data_dir/callhome + rm -rf $tmp_dir 2> /dev/null diff --git a/egs/callhome_diarization/v1/local/make_musan.py b/egs/callhome_diarization/v1/local/make_musan.py deleted file mode 100755 index 7c50adf7c83..00000000000 --- a/egs/callhome_diarization/v1/local/make_musan.py +++ /dev/null @@ -1,119 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2015 David Snyder -# Apache 2.0. -# -# This file is meant to be invoked by make_musan.sh. - -import os, sys - -def process_music_annotations(path): - utt2spk = {} - utt2vocals = {} - lines = open(path, 'r').readlines() - for line in lines: - utt, genres, vocals, musician = line.rstrip().split()[:4] - # For this application, the musican ID isn't important - utt2spk[utt] = utt - utt2vocals[utt] = vocals == "Y" - return utt2spk, utt2vocals - -def prepare_music(root_dir, use_vocals): - utt2vocals = {} - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - music_dir = os.path.join(root_dir, "music") - for root, dirs, files in os.walk(music_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - elif str(file) == "ANNOTATIONS": - utt2spk_part, utt2vocals_part = process_music_annotations(file_path) - utt2spk.update(utt2spk_part) - utt2vocals.update(utt2vocals_part) - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2vocals: - if utt in utt2wav: - if use_vocals or not utt2vocals[utt]: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n" - num_good_files += 1 - else: - print("Missing file: {}".format(utt)) - num_bad_files += 1 - print("In music directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files)) - return utt2spk_str, utt2wav_str - -def prepare_speech(root_dir): - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - speech_dir = os.path.join(root_dir, "speech") - for root, dirs, files in os.walk(speech_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - utt2spk[utt] = utt - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2spk: - if utt in utt2wav: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - 
|\n" - num_good_files += 1 - else: - print("Missing file: {}".format(utt)) - num_bad_files += 1 - print("In speech directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files)) - return utt2spk_str, utt2wav_str - -def prepare_noise(root_dir): - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - noise_dir = os.path.join(root_dir, "noise") - for root, dirs, files in os.walk(noise_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - utt2spk[utt] = utt - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2spk: - if utt in utt2wav: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n" - num_good_files += 1 - else: - print("Missing file: {}".format(utt)) - num_bad_files += 1 - print("In noise directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files)) - return utt2spk_str, utt2wav_str - -def main(): - in_dir = sys.argv[1] - out_dir = sys.argv[2] - use_vocals = sys.argv[3] == "Y" - utt2spk_music, utt2wav_music = prepare_music(in_dir, use_vocals) - utt2spk_speech, utt2wav_speech = prepare_speech(in_dir) - utt2spk_noise, utt2wav_noise = prepare_noise(in_dir) - utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise - utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise - wav_fi = open(os.path.join(out_dir, "wav.scp"), 'w') - wav_fi.write(utt2wav) - utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), 'w') - utt2spk_fi.write(utt2spk) - - -if __name__=="__main__": - main() diff --git a/egs/callhome_diarization/v1/local/make_musan.sh b/egs/callhome_diarization/v1/local/make_musan.sh deleted file mode 100755 index 694940ad70f..00000000000 --- a/egs/callhome_diarization/v1/local/make_musan.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash -# Copyright 2015 David Snyder -# Apache 2.0. -# -# This script, called by ../run.sh, creates the MUSAN -# data directory. The required dataset is freely available at -# http://www.openslr.org/17/ - -set -e -in_dir=$1 -data_dir=$2 -use_vocals='Y' - -mkdir -p local/musan.tmp - -echo "Preparing ${data_dir}/musan..." -mkdir -p ${data_dir}/musan -local/make_musan.py ${in_dir} ${data_dir}/musan ${use_vocals} - -utils/fix_data_dir.sh ${data_dir}/musan - -grep "music" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_music -grep "speech" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_speech -grep "noise" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_noise -utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_music \ - ${data_dir}/musan ${data_dir}/musan_music -utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_speech \ - ${data_dir}/musan ${data_dir}/musan_speech -utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_noise \ - ${data_dir}/musan ${data_dir}/musan_noise - -utils/fix_data_dir.sh ${data_dir}/musan_music -utils/fix_data_dir.sh ${data_dir}/musan_speech -utils/fix_data_dir.sh ${data_dir}/musan_noise - -rm -rf local/musan.tmp - diff --git a/egs/callhome_diarization/v1/local/make_mx6.sh b/egs/callhome_diarization/v1/local/make_mx6.sh new file mode 100755 index 00000000000..4e0df1350a1 --- /dev/null +++ b/egs/callhome_diarization/v1/local/make_mx6.sh @@ -0,0 +1,41 @@ +#!/bin/bash +# Copyright 2017 David Snyder +# Apache 2.0. +# +# This script prepares both the microphone and telephone portions of the +# Mixer 6 corpus. 
+if [ $# -ne 2 ]; then + echo "Usage: $0 " + echo "e.g.: $0 /export/corpora/LDC/LDC2013S03 data/" + exit 1; +fi + +set -e +in_dir=$1 +out_dir=$2 + +# Mic 01 is the lapel mic for the interviewer, so we don't use it. Mic 02 is +# the lapel mic for the interviewee. All other mics are placed throughout the +# room. In addition to mic 01, we omit mics 03 and 14 as they are often +# silent. +echo "$0: preparing mic speech (excluding 01, 03, and 14)" + +for mic in 02 04 05 06 07 08 09 10 11 12 13; do + local/make_mx6_mic.pl $in_dir $mic $out_dir +done + +utils/combine_data.sh $out_dir/mx6_mic_04_to_13 $out_dir/mx6_mic_{04,05,06,07,08,09,10,11,12,13} + +# Mics 02-13 contain the same content, but recorded from different microphones. +# To get some channel diversity, but not be overwhelmed with duplicated data +# we take a 2k subset from mics 04-13 and combine it with all of mic 02. +echo "$0: selecting a 2k subset of mics 04 through 13 and combining it with mic 02" +utils/subset_data_dir.sh $out_dir/mx6_mic_04_to_13 2000 $out_dir/mx6_mic_04_to_13_2k +utils/combine_data.sh $out_dir/mx6_mic $out_dir/mx6_mic_02 $out_dir/mx6_mic_04_to_13_2k + +echo "$0: preparing telephone portion" +local/make_mx6_calls.pl $in_dir $out_dir + +echo "$0 combining mic and telephone speech in data/mx6" +utils/combine_data.sh $out_dir/mx6 $out_dir/mx6_mic $out_dir/mx6_calls +utils/fix_data_dir.sh $out_dir/mx6 diff --git a/egs/callhome_diarization/v1/local/make_mx6_calls.pl b/egs/callhome_diarization/v1/local/make_mx6_calls.pl new file mode 100755 index 00000000000..0e38a350890 --- /dev/null +++ b/egs/callhome_diarization/v1/local/make_mx6_calls.pl @@ -0,0 +1,106 @@ +#!/usr/bin/perl +use warnings; #sed replacement for -w perl parameter +# Copyright 2017 David Snyder +# Apache 2.0 +# +# Prepares the telephone portion of Mixer 6 (LDC2013S03). + +if (@ARGV != 2) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. $0 /export/corpora5/LDC/LDC2013S03 data/\n"; + exit(1); +} +($db_base, $out_dir) = @ARGV; + +if (! 
-d "$db_base/mx6_speech/data/ulaw_sphere/") { + print STDERR "Directory $db_base/mx6_speech/data/ulaw_sphere/ doesn't exist\n"; + exit(1); +} + +$out_dir = "$out_dir/mx6_calls"; + +$tmp_dir = "$out_dir/tmp"; +if (system("mkdir -p $tmp_dir") != 0) { + die "Error making directory $tmp_dir"; +} + +if (system("mkdir -p $out_dir") != 0) { + print STDERR "Error making directory $out_dir\n"; + exit(1); +} + +%call2sph = (); +open(SUBJECTS, "<$db_base/mx6_speech/docs/mx6_subjs.csv") || die "cannot open $$db_base/mx6_speech/docs/mx6_subjs.csv"; +open(SPKR, ">$out_dir/utt2spk") || die "Could not open the output file $out_dir/utt2spk"; +open(GNDR, ">$out_dir/spk2gender") || die "Could not open the output file $out_dir/spk2gender"; +open(WAV, ">$out_dir/wav.scp") || die "Could not open the output file $out_dir/wav.scp"; +open(META, "<$db_base/mx6_speech/docs/mx6_calls.csv") || die "cannot open $db_base/mx6_speech/docs/mx6_calls.csv"; + +if (system("find $db_base/mx6_speech/data/ulaw_sphere/ -name '*.sph' > $tmp_dir/sph.list") != 0) { + die "Error getting list of sph files"; +} + +open(SPHLIST, "<$tmp_dir/sph.list") or die "cannot open wav list"; + +while() { + chomp; + $sph = $_; + @toks = split("/",$sph); + $sph_id = (split("[./]",$toks[$#toks]))[0]; + $call_id = (split("_", $sph_id))[2]; + $call2sph[$call_id] = $sph; +} + +while () { + chomp; + $line = $_; + @toks = split(",", $line); + $spk = $toks[0]; + $gender = lc $toks[1]; + if ($gender eq "f" or $gender eq "m") { + print GNDR "$spk $gender\n"; + } +} + +$num_good_files = 0; +$num_bad_files = 0; +while () { + chomp; + $line = $_; + @toks = split(",", $line); + $call_id = $toks[0]; + ($call_date, $call_time) = split(/_/, $toks[1]); + $sid_A = $toks[4]; + $sid_B = $toks[12]; + if (-f $call2sph[$call_id]) { + $utt_A = "${sid_A}_MX6_${call_id}_A"; + $utt_B = "${sid_B}_MX6_${call_id}_B"; + print SPKR "${utt_A} $sid_A\n"; + print SPKR "${utt_B} $sid_B\n"; + print WAV "${utt_A} sph2pipe -f wav -p -c 1 $call2sph[$call_id] |\n"; + print WAV "${utt_B} sph2pipe -f wav -p -c 2 $call2sph[$call_id] |\n"; + $num_good_files++; + } else { + print STDERR "Sphere file for $call_id doesn't exist\n"; + $num_bad_files++; + } +} + +print STDERR "Processed $num_good_files utterances; $num_bad_files had missing sphere data.\n"; + +close(SPHLIST) || die; +close(SUBJECTS) || die; +close(GNDR) || die; +close(SPKR) || die; +close(WAV) || die; +close(META) || die; + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} + +system("utils/fix_data_dir.sh $out_dir"); +if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} diff --git a/egs/callhome_diarization/v1/local/make_mx6_mic.pl b/egs/callhome_diarization/v1/local/make_mx6_mic.pl new file mode 100755 index 00000000000..7e1b4046e73 --- /dev/null +++ b/egs/callhome_diarization/v1/local/make_mx6_mic.pl @@ -0,0 +1,92 @@ +#!/usr/bin/perl +use warnings; #sed replacement for -w perl parameter +# Copyright 2017 David Snyder +# Apache 2.0 +# Prepares Mixer 6 (LDC2013S03) speech from a specified microphone and +# downsamples it to 8k. + +if (@ARGV != 3) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. $0 /export/corpora5/LDC/LDC2013S03 02 data/\n"; + exit(1); +} +($db_base, $ch, $out_dir) = @ARGV; + +@bad_channels = ("01", "03", "14"); +if (/$ch/i ~~ @bad_channels) { + print STDERR "Bad channel $ch\n"; + exit(1); +} + +if (! 
-d "$db_base/mx6_speech/data/pcm_flac/CH$ch/") { + print STDERR "Directory $db_base/mx6_speech/data/pcm_flac/CH$ch/ doesn't exist\n"; + exit(1); +} + +$out_dir = "$out_dir/mx6_mic_$ch"; +if (system("mkdir -p $out_dir")) { + print STDERR "Error making directory $out_dir\n"; + exit(1); +} + +if (system("mkdir -p $out_dir") != 0) { + print STDERR "Error making directory $out_dir\n"; + exit(1); +} + +open(SUBJECTS, "<$db_base/mx6_speech/docs/mx6_subjs.csv") || die "cannot open $$db_base/mx6_speech/docs/mx6_subjs.csv"; +open(SPKR, ">$out_dir/utt2spk") || die "Could not open the output file $out_dir/utt2spk"; +open(GNDR, ">$out_dir/spk2gender") || die "Could not open the output file $out_dir/spk2gender"; +open(WAV, ">$out_dir/wav.scp") || die "Could not open the output file $out_dir/wav.scp"; +open(META, "<$db_base/mx6_speech/docs/mx6_ivcomponents.csv") || die "cannot open $db_base/mx6_speech/docs/mx6_ivcomponents.csv"; + +while () { + chomp; + $line = $_; + @toks = split(",", $line); + $spk = $toks[0]; + $gender = lc $toks[1]; + if ($gender eq "f" or $gender eq "m") { + print GNDR "$spk $gender\n"; + } +} + +$num_good_files = 0; +$num_bad_files = 0; +while () { + chomp; + $line = $_; + @toks = split(",", $line); + $flac = "$db_base/mx6_speech/data/pcm_flac/CH$ch/$toks[0]_CH$ch.flac"; + $t1 = $toks[7]; + $t2 = $toks[8]; + @toks2 = split(/_/, $toks[0]); + $spk = $toks2[3]; + $utt = "${spk}_MX6_$toks2[0]_$toks2[1]_$ch"; + if (-f $flac) { + print SPKR "${utt} $spk\n"; + print WAV "${utt} sox -t flac $flac -r 8k -t wav - trim $t1 =$t2 |\n"; + $num_good_files++; + } else { + print STDERR "File $flac doesn't exist\n"; + $num_bad_files++; + } +} + +print STDERR "Processed $num_good_files utterances; $num_bad_files had missing flac data.\n"; + +close(SUBJECTS) || die; +close(GNDR) || die; +close(SPKR) || die; +close(WAV) || die; +close(META) || die; + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} + +system("utils/fix_data_dir.sh $out_dir"); +if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} diff --git a/egs/callhome_diarization/v1/local/make_sre10.pl b/egs/callhome_diarization/v1/local/make_sre10.pl new file mode 100755 index 00000000000..eba9f697760 --- /dev/null +++ b/egs/callhome_diarization/v1/local/make_sre10.pl @@ -0,0 +1,133 @@ +#!/usr/bin/perl +use warnings; #sed replacement for -w perl parameter +# Copyright 2017 David Snyder +# Apache 2.0 +# +# Prepares NIST SRE10 enroll and test data in a single directory. +if (@ARGV != 2) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. $0 /export/corpora5/SRE/SRE2010/eval/ data/\n"; + exit(1); +} +($db_base, $out_dir) = @ARGV; + +if (! 
-d "$db_base/data/") { + print STDERR "Directory $db_base/data/ doesn't exist\n"; + exit(1); +} +$out_dir = "$out_dir/sre10"; +$tmp_dir = "$out_dir/tmp"; +if (system("mkdir -p $tmp_dir") != 0) { + die "Error making directory $tmp_dir"; +} + +if (system("mkdir -p $out_dir") != 0) { + print STDERR "Error making directory $out_dir\n"; + exit(1); +} + +%seg2sph = (); +open(TRIALS, "<$db_base/keys/coreext-coreext.trialkey.csv") || die "Could not open $db_base/keys/coreext-coreext.trialkey.csv"; +open(TRAIN, "<$db_base/train/coreext.trn") || die "Could not open $db_base/train/coreext.trn"; +open(MODELS, "<$db_base/keys/coreext.modelkey.csv") || die "Could not open $db_base/keys/coreext.modelkey.csv"; +open(SPKR, ">$out_dir/utt2spk") || die "Could not open the output file $out_dir/utt2spk"; +open(GNDR, ">$out_dir/spk2gender") || die "Could not open the output file $out_dir/spk2gender"; +open(WAV, ">$out_dir/wav.scp") || die "Could not open the output file $out_dir/wav.scp"; + +if (system("find $db_base/data/ -name '*.sph' > $tmp_dir/sph.list") != 0) { + die "Error getting list of sph files"; +} +open(SPHLIST, "<$tmp_dir/sph.list") or die "cannot open wav list"; +while() { + chomp; + $sph = $_; + @toks = split("/",$sph); + $sph_id = (split("[./]",$toks[$#toks]))[0]; + $seg2sph{$sph_id} = $sph; +} + +%model2sid = (); +while () { + chomp; + $line = $_; + ($model, $sid) = split(",", $line); + if (not $sid eq "NOT_SCORED") { + $model2sid{$model} = $sid; + } +} + +while () { + chomp; + $line = $_; + @toks = split(" ", $line); + $model = $toks[0]; + $gender = $toks[1]; + @toks2 = split("/", $toks[2]); + ($sph, $ch) = split("[:]", $toks2[$#toks2]); + $seg = (split("[./]", $sph))[0]; + if (exists $seg2sph{$seg}) { + $sph = $seg2sph{$seg}; + if (exists $model2sid{$model}) { + $sid = $model2sid{$model}; + print GNDR "$sid $gender\n"; + if ($ch eq "A") { + $utt = "${sid}_SRE10_${seg}_A"; + print WAV "$utt"," sph2pipe -f wav -p -c 1 $sph |\n"; + print SPKR "$utt $sid\n"; + } elsif($ch eq "B") { + $utt = "${sid}_SRE10_${seg}_B"; + print WAV "$utt"," sph2pipe -f wav -p -c 2 $sph |\n"; + print SPKR "$utt $sid\n"; + } else { + print STDERR "Malformed train file\n"; + exit(1); + } + } + } +} + +while () { + chomp; + $line = $_; + @toks = split(",", $line); + $model = $toks[0]; + $seg = $toks[1]; + $ch = $toks[2]; + $target = $toks[3]; + if (exists $seg2sph{$seg} and -f $seg2sph{$seg}) { + $sph = $seg2sph{$seg}; + if ($target eq "target" and exists $model2sid{$model}) { + $sid = $model2sid{$model}; + if ($ch eq "a") { + $utt = "${sid}_SRE10_${seg}_A"; + print WAV "$utt"," sph2pipe -f wav -p -c 1 $sph |\n"; + print SPKR "$utt $sid\n"; + } elsif($ch eq "b") { + $utt = "${sid}_SRE10_${seg}_B"; + print WAV "$utt"," sph2pipe -f wav -p -c 2 $sph |\n"; + print SPKR "$utt $sid\n"; + } else { + print STDERR "Malformed trials file\n"; + exit(1); + } + } + } +} + +close(TRIALS) || die; +close(TRAIN) || die; +close(MODELS) || die; +close(GNDR) || die; +close(SPKR) || die; +close(WAV) || die; + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} + +system("utils/fix_data_dir.sh $out_dir"); +if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} + diff --git a/egs/callhome_diarization/v1/run.sh b/egs/callhome_diarization/v1/run.sh index acc48bd24f9..f4652c0c0ef 100755 --- a/egs/callhome_diarization/v1/run.sh +++ b/egs/callhome_diarization/v1/run.sh @@ -188,7 
+188,7 @@ if [ $stage -le 6 ]; then der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' \ exp/tuning/${dataset}_t${threshold}) - if [ $(echo $der'<'$best_der | bc -l) -eq 1 ]; then + if [ $(perl -e "print ($der < $best_der ? 1 : 0);") -eq 1 ]; then best_der=$der best_threshold=$threshold fi diff --git a/egs/callhome_diarization/v2/run.sh b/egs/callhome_diarization/v2/run.sh index ae05dd9da1c..58a980a6859 100755 --- a/egs/callhome_diarization/v2/run.sh +++ b/egs/callhome_diarization/v2/run.sh @@ -8,22 +8,39 @@ # The scripts are based on the recipe in ../v1/run.sh, but clusters x-vectors # instead of i-vectors. It is similar to the x-vector-based diarization system # described in "Diarization is Hard: Some Experiences and Lessons Learned for -# the JHU Team in the Inaugural DIHARD Challenge" by Sell et al. The main -# difference is that we haven't implemented the VB resegmentation yet. +# the JHU Team in the Inaugural DIHARD Challenge" by Sell et al. +# +# We download and use the VB resegmentation code of Speech@fit team, Brno University of Technology, +# for the VB resegmentation part which is optionally executed +# on the output rttm of the x-vector based diarization to improve precision of speaker segments. +# +# extras/install_diarization_VBHMM.sh in tools/ installs all the required +# dependencies for VB resegmentation scripts. + . ./cmd.sh . ./path.sh + set -e mfccdir=`pwd`/mfcc vaddir=`pwd`/mfcc data_root=/export/corpora5/LDC +sre_root=/export/corpora5/SRE stage=0 nnet_dir=exp/xvector_nnet_1a/ +#Variational Bayes resegmentation options +VB_resegmentation=true +num_gauss=1024 +ivec_dim=400 + +. utils/parse_options.sh + # Prepare datasets if [ $stage -le 0 ]; then # Prepare a collection of NIST SRE data. This will be used to train, - # x-vector DNN and PLDA model. + # x-vector DNN and PLDA model. + # This data doesn't include sre2010. local/make_sre.sh $data_root data # Prepare SWB for x-vector DNN training. @@ -53,7 +70,7 @@ if [ $stage -le 1 ]; then # callhome1 and callhome2. Each partition is treated like a held-out # dataset, and used to estimate various quantities needed to perform # diarization on the other part (and vice versa). - for name in train callhome1 callhome2; do + for name in train callhome callhome1 callhome2; do steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 \ --cmd "$train_cmd" --write-utt2num-frames true \ data/$name exp/make_mfcc $mfccdir @@ -92,7 +109,29 @@ if [ $stage -le 1 ]; then # detection (SAD) system, but this is not necessary. You can replace # this with segments computed from your favorite SAD. diarization/vad_to_segments.sh --nj 40 --cmd "$train_cmd" \ - data/sre_cmn data/sre_cmn_segmented + data/sre_cmn data/sre_cmn_segmented + + if [ $VB_resegmentation ]; then + # Prepare telephone and microphone speech from Mixer6. + local/make_mx6.sh $data_root/LDC2013S03 data/ + + # Prepare SRE10 test and enroll. Includes microphone interview speech. + # NOTE: This corpus is now available through the LDC as LDC2017S06. + local/make_sre10.pl $sre_root/SRE2010/eval/ data/ + + # Preparing features for ivector extractor model training. 
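+    # (mx6 and sre10, prepared above, are later combined with data/train into
+    # data/swbd_sre, which is used to train the diagonal UBM and i-vector
+    # extractor for VB resegmentation.)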
+ for name in mx6 sre10; do + steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd" \ + data/${name} exp/make_mfcc $mfccdir + utils/fix_data_dir.sh data/${name} + sid/compute_vad_decision.sh --nj 40 --cmd "$train_cmd" \ + data/${name} exp/make_vad $vaddir + utils/fix_data_dir.sh data/${name} + done + utils/combine_data.sh data/swbd_sre data/train data/mx6 data/sre10 + # data/train already includes sre and swbd data. + utils/subset_data_dir.sh data/swbd_sre 32000 data/swbd_sre_32k + fi fi # In this section, we augment the training data with reverberation, @@ -130,7 +169,7 @@ if [ $stage -le 2 ]; then # Prepare the MUSAN corpus, which consists of music, speech, and noise # suitable for augmentation. - local/make_musan.sh /export/corpora/JHU/musan data + steps/data/make_musan.sh --sampling-rate 8000 /export/corpora/JHU/musan data # Get the duration of the MUSAN recordings. This will be used by the # script augment_data_dir.py. @@ -297,7 +336,7 @@ if [ $stage -le 10 ]; then der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' \ $nnet_dir/tuning/${dataset}_t${threshold}) - if [ $(echo $der'<'$best_der | bc -l) -eq 1 ]; then + if [ $(perl -e "print ($der < $best_der ? 1 : 0);") -eq 1 ]; then best_der=$der best_threshold=$threshold fi @@ -356,3 +395,82 @@ if [ $stage -le 11 ]; then # Compare to 8.69% in ../v1/run.sh echo "Using the oracle number of speakers, DER: $der%" fi + +if [ $VB_resegmentation ]; then +# Variational Bayes method for smoothing the Speaker segments at frame-level + output_dir=exp/xvec_init_gauss_${num_gauss}_ivec_${ivec_dim} + + if [ $stage -le 12 ]; then + # Apply cmn and adding deltas will harm the performance on the callhome dataset. So we just use the 20-dim raw MFCC feature. + sid/train_diag_ubm.sh --cmd "$train_cmd --mem 20G --max-jobs-run 6" \ + --nj 10 --num-threads 4 --subsample 1 --delta-order 0 --apply-cmn false \ + data/swbd_sre_32k $num_gauss \ + exp/diag_ubm_gauss_${num_gauss}_delta_0_cmn_0 + fi + + if [ $stage -le 13 ]; then + # Train the i-vector extractor. The UBM is assumed to be diagonal. + diarization/train_ivector_extractor_diag.sh --cmd "$train_cmd --mem 45G --max-jobs-run 20" \ + --ivector-dim ${ivec_dim} \ + --num-iters 5 \ + --apply-cmn false \ + --num-threads 1 --num-processes 1 --nj 10 \ + exp/diag_ubm_gauss_${num_gauss}_delta_0_cmn_0/final.dubm data/swbd_sre \ + exp/extractor_gauss_${num_gauss}_delta_0_cmn_0_ivec_${ivec_dim} + fi + + if [ $stage -le 14 ]; then + # Convert the Kaldi UBM and T-matrix model to numpy array. + mkdir -p $output_dir + mkdir -p $output_dir/tmp + mkdir -p $output_dir/log + mkdir -p $output_dir/model + + # Dump the diagonal UBM model into text format. + "$train_cmd" $output_dir/log/convert_diag_ubm.log \ + gmm-global-copy --binary=false \ + exp/diag_ubm_gauss_${num_gauss}_delta_0_cmn_0/final.dubm \ + $output_dir/tmp/dubm.tmp || exit 1; + + # Dump the ivector extractor model into text format. + # This method is not currently supported by Kaldi, + # so please use my kaldi. 
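+    # (Note: the stage below assumes ivector-extractor-copy is available in
+    # your Kaldi build; if it is missing, this stage will fail.)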
+ "$train_cmd" $output_dir/log/convert_ie.log \ + ivector-extractor-copy --binary=false \ + exp/extractor_gauss_${num_gauss}_delta_0_cmn_0_ivec_${ivec_dim}/final.ie \ + $output_dir/tmp/ie.tmp || exit 1; + + diarization/dump_model.py $output_dir/tmp/dubm.tmp $output_dir/model + diarization/dump_model.py $output_dir/tmp/ie.tmp $output_dir/model + fi + + if [ $stage -le 15 ]; then + mkdir -p $output_dir/results + init_rttm_file=callhome_rttm_output_xvec + label_rttm_file=data/callhome/fullref.rttm + cat $nnet_dir/xvectors_callhome1/plda_scores/rttm \ + $nnet_dir/xvectors_callhome2/plda_scores/rttm > $init_rttm_file + + # Compute the DER before VB resegmentation + md-eval.pl -1 -c 0.25 -r $label_rttm_file -s $init_rttm_file 2> $output_dir/log/DER_init.log \ + > $output_dir/results/DER_init.txt + der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' \ + $output_dir/results/DER_init.txt) + + # VB resegmentation. In this script, I use the x-vector result to + # initialize the VB system. You can also use i-vector result or random + # initize the VB system. + diarization/VB_resegmentation.sh --nj 20 --cmd "$train_cmd --mem 20G" \ + --initialize 1 \ + data/callhome $init_rttm_file $output_dir $output_dir/model/diag_ubm.pkl $output_dir/model/ie.pkl || exit 1; + + # Compute the DER after VB resegmentation + cat $output_dir/rttm/* > $output_dir/predict.rttm + md-eval.pl -1 -c 0.25 -r $label_rttm_file -s $output_dir/predict.rttm 2> $output_dir/log/DER.log \ + > $output_dir/results/DER.txt + der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' \ + $output_dir/results/DER.txt) + # After VB resegmentation, DER: 6.15% + echo "After VB resegmentation, DER: $der%" + fi # VB resegmentation part ends here. +fi diff --git a/egs/callhome_egyptian/s5/local/callhome_prepare_dict.sh b/egs/callhome_egyptian/s5/local/callhome_prepare_dict.sh index 62bca974e53..d9faa97f266 100755 --- a/egs/callhome_egyptian/s5/local/callhome_prepare_dict.sh +++ b/egs/callhome_egyptian/s5/local/callhome_prepare_dict.sh @@ -54,9 +54,8 @@ cat $dir/silence_phones.txt| awk '{printf("%s ", $1);} END{printf "\n";}' > \ $dir/extra_questions.txt || exit 1; # Add prons for laughter, noise, oov -for w in `grep -v sil $dir/silence_phones.txt`; do -sed -i "/\[$w\]/d" $tmpdir/lexicon.3 -done +w=$(grep -v sil $dir/silence_phones.txt | tr '\n' '|') +perl -i -ne "print unless /\[(${w%?})\]/" $tmpdir/lexicon.3 for w in `grep -v sil $dir/silence_phones.txt`; do echo "[$w] $w" diff --git a/egs/callhome_egyptian/s5/local/ctm.sh b/egs/callhome_egyptian/s5/local/ctm.sh index 14056b7a44b..64a7cf0d4f6 100755 --- a/egs/callhome_egyptian/s5/local/ctm.sh +++ b/egs/callhome_egyptian/s5/local/ctm.sh @@ -18,9 +18,9 @@ fi steps/get_ctm.sh $data_dir $lang_dir $decode_dir # Make sure that channel markers match -#sed -i "s:\s.*_fsp-([AB]): \1:g" data/dev/stm -#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} sed -i -r 's:fsp\s1\s:fsp A :g' {} -#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} sed -i -r 's:fsp\s2\s:fsp B :g' {} +#perl -i -pe "s:\s.*_fsp-([AB]): \1:g" data/dev/stm +#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} perl -i -pe 's:fsp\s1\s:fsp A :g' {} +#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} perl -i -pe 's:fsp\s2\s:fsp B :g' {} # Get the environment variables . 
/export/babel/data/software/env.sh diff --git a/egs/chime4/s5_1ch/local/chain/tuning/run_tdnn_1a.sh b/egs/chime4/s5_1ch/local/chain/tuning/run_tdnn_1a.sh index d5ad3629cee..3f8b7c60090 100755 --- a/egs/chime4/s5_1ch/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/chime4/s5_1ch/local/chain/tuning/run_tdnn_1a.sh @@ -217,7 +217,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.01" output_opts="l2-regularize=0.005" diff --git a/egs/chime4/s5_1ch/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/chime4/s5_1ch/local/chain/tuning/run_tdnn_lstm_1a.sh index f5c8973ab67..8b4e93cd05b 100755 --- a/egs/chime4/s5_1ch/local/chain/tuning/run_tdnn_lstm_1a.sh +++ b/egs/chime4/s5_1ch/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -180,7 +180,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/chime4/s5_1ch/local/chime4_calc_wers_looped.sh b/egs/chime4/s5_1ch/local/chime4_calc_wers_looped.sh index 9fe4a20f43a..84bb2cb8dbd 100755 --- a/egs/chime4/s5_1ch/local/chime4_calc_wers_looped.sh +++ b/egs/chime4/s5_1ch/local/chime4_calc_wers_looped.sh @@ -82,4 +82,4 @@ for e_d in $tasks; do | utils/int2sym.pl -f 2- $graph_dir/words.txt \ | sed s:\::g done -done \ No newline at end of file +done diff --git a/egs/chime4/s5_1ch/local/run_lmrescore_tdnn_lstm.sh b/egs/chime4/s5_1ch/local/run_lmrescore_tdnn_lstm.sh index 7173dcea78b..0bea4dd7102 100755 --- a/egs/chime4/s5_1ch/local/run_lmrescore_tdnn_lstm.sh +++ b/egs/chime4/s5_1ch/local/run_lmrescore_tdnn_lstm.sh @@ -165,4 +165,4 @@ if [ $stage -le 4 ]; then local/chime4_calc_wers_looped.sh $dir ${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest} $dir/graph_tgpr_5k \ > $dir/best_wer_looped_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest}.result head -n 15 $dir/best_wer_looped_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest}.result -fi \ No newline at end of file +fi diff --git a/egs/chime5/s5/cmd.sh b/egs/chime5/s5/cmd.sh index a697a22cda3..9702501f1a7 100644 --- a/egs/chime5/s5/cmd.sh +++ b/egs/chime5/s5/cmd.sh @@ -10,6 +10,6 @@ # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -export train_cmd="queue.pl --mem 2G" +export train_cmd="retry.pl queue.pl --mem 2G" export decode_cmd="queue.pl --mem 4G" diff --git a/egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh index 45a7fd84bd6..d60e6a4aa04 100755 --- a/egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh @@ -24,21 +24,16 @@ decode_iter= # training options # training chunk-options chunk_width=140,100,160 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 common_egs_dir= xent_regularize=0.1 # training options srand=0 remove_egs=true -reporting_email= #decode options test_online_decoding=false # if true, it will run the last decoding stage. - # End configuration section. 
echo "$0 $@" # Print the command line for logging @@ -133,7 +128,7 @@ if [ $stage -le 13 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.05" output_opts="l2-regularize=0.01 bottleneck-dim=320" @@ -176,7 +171,6 @@ EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ fi - if [ $stage -le 14 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then utils/create_split_dir.pl \ @@ -204,15 +198,10 @@ if [ $stage -le 14 ]; then --trainer.num-chunk-per-minibatch=256,128,64 \ --trainer.optimization.momentum=0.0 \ --egs.chunk-width=$chunk_width \ - --egs.chunk-left-context=$chunk_left_context \ - --egs.chunk-right-context=$chunk_right_context \ - --egs.chunk-left-context-initial=0 \ - --egs.chunk-right-context-final=0 \ --egs.dir="$common_egs_dir" \ --egs.opts="--frames-overlap-per-eg 0" \ --cleanup.remove-egs=$remove_egs \ --use-gpu=true \ - --reporting.email="$reporting_email" \ --feat-dir=$train_data_dir \ --tree-dir=$tree_dir \ --lat-dir=$lat_dir \ @@ -235,10 +224,6 @@ if [ $stage -le 16 ]; then ( steps/nnet3/decode.sh \ --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ --frames-per-chunk $frames_per_chunk \ --nj 8 --cmd "$decode_cmd" --num-threads 4 \ --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \ diff --git a/egs/chime5/s5/local/nnet3/run_ivector_common.sh b/egs/chime5/s5/local/nnet3/run_ivector_common.sh index e28e5ce996d..2b672063be7 100755 --- a/egs/chime5/s5/local/nnet3/run_ivector_common.sh +++ b/egs/chime5/s5/local/nnet3/run_ivector_common.sh @@ -23,7 +23,7 @@ nnet3_affix=_train_worn_u100k gmm_dir=exp/${gmm} ali_dir=exp/${gmm}_ali_${train_set}_sp -for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do +for f in data/${train_set}/utt2spk ${gmm_dir}/final.mdl; do if [ ! 
-f $f ]; then echo "$0: expected file $f to exist" exit 1 diff --git a/egs/chime5/s5/local/run_wpe.py b/egs/chime5/s5/local/run_wpe.py new file mode 100644 index 00000000000..cc9cd41927a --- /dev/null +++ b/egs/chime5/s5/local/run_wpe.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python +# Copyright 2018 Johns Hopkins University (Author: Aswin Shanmugam Subramanian) +# Apache 2.0 +# Works with both python2 and python3 + +import numpy as np +import soundfile as sf +import time +import os, errno +from tqdm import tqdm +import argparse + +from nara_wpe.wpe import wpe +from nara_wpe.utils import stft, istft +from nara_wpe import project_root + +parser = argparse.ArgumentParser() +parser.add_argument('--files', '-f', nargs='+') +args = parser.parse_args() + +input_files = args.files[:len(args.files)//2] +output_files = args.files[len(args.files)//2:] +out_dir = os.path.dirname(output_files[0]) +try: + os.makedirs(out_dir) +except OSError as e: + if e.errno != errno.EEXIST: + raise + +stft_options = dict( + size=512, + shift=128, + window_length=None, + fading=True, + pad=True, + symmetric_window=False +) + +sampling_rate = 16000 +delay = 3 +iterations = 5 +taps = 10 + +signal_list = [ + sf.read(f)[0] + for f in input_files +] +y = np.stack(signal_list, axis=0) +Y = stft(y, **stft_options).transpose(2, 0, 1) +Z = wpe(Y, iterations=iterations, statistics_mode='full').transpose(1, 2, 0) +z = istft(Z, size=stft_options['size'], shift=stft_options['shift']) + +for d in range(len(signal_list)): + sf.write(output_files[d], z[d,:], sampling_rate) diff --git a/egs/chime5/s5/local/run_wpe.sh b/egs/chime5/s5/local/run_wpe.sh new file mode 100755 index 00000000000..8ecbbd6182a --- /dev/null +++ b/egs/chime5/s5/local/run_wpe.sh @@ -0,0 +1,85 @@ +#!/bin/bash +# Copyright 2018 Johns Hopkins University (Author: Aswin Shanmugam Subramanian) +# Apache 2.0 + +. ./cmd.sh +. ./path.sh + +# Config: +nj=4 +cmd=run.pl + +. utils/parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Wrong #arguments ($#, expected 3)" + echo "Usage: local/run_wpe.sh [options] " + echo "main options (for others, see top of script file)" + echo " --cmd # Command to run in parallel with" + echo " --nj 50 # number of jobs for parallel processing" + exit 1; +fi + +sdir=$1 +odir=$2 +array=$3 +task=`basename $sdir` +expdir=exp/wpe/${task}_${array} +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +miniconda_dir=$HOME/miniconda3/ +if [ ! -d $miniconda_dir ]; then + echo "$miniconda_dir does not exist. Please run '../../../tools/extras/install_miniconda.sh' and '../../../tools/extras/install_wpe.sh';" +fi + +# check if WPE is installed +result=`$HOME/miniconda3/bin/python -c "\ +try: + import nara_wpe + print('1') +except ImportError: + print('0')"` + +if [ "$result" == "1" ]; then + echo "WPE is installed" +else + echo "WPE is not installed. 
Please run ../../../tools/extras/install_wpe.sh" + exit 1 +fi + +mkdir -p $odir +mkdir -p $expdir/log + +# wavfiles.list can be used as the name of the output files +output_wavfiles=$expdir/wavfiles.list +find -L ${sdir} | grep -i ${array} > $expdir/channels_input +cat $expdir/channels_input | awk -F '/' '{print $NF}' | sed "s@S@$odir\/S@g" > $expdir/channels_output +paste -d" " $expdir/channels_input $expdir/channels_output > $output_wavfiles + +# split the list for parallel processing +split_wavfiles="" +for n in `seq $nj`; do + split_wavfiles="$split_wavfiles $output_wavfiles.$n" +done +utils/split_scp.pl $output_wavfiles $split_wavfiles || exit 1; + +echo -e "Dereverberation - $task - $array\n" +# making a shell script for each job +for n in `seq $nj`; do +cat <<-EOF > $expdir/log/wpe.$n.sh +while read line; do + $HOME/miniconda3/bin/python local/run_wpe.py \ + --file \$line +done < $output_wavfiles.$n +EOF +done + +chmod a+x $expdir/log/wpe.*.sh +$cmd JOB=1:$nj $expdir/log/wpe.JOB.log \ + $expdir/log/wpe.JOB.sh + +echo "`basename $0` Done." diff --git a/egs/chime5/s5b/RESULTS b/egs/chime5/s5b/RESULTS new file mode 100644 index 00000000000..0dcea1f0031 --- /dev/null +++ b/egs/chime5/s5b/RESULTS @@ -0,0 +1,33 @@ + +# tri2 +%WER 76.40 [ 44985 / 58881, 3496 ins, 17652 del, 23837 sub ] exp/tri2/decode_dev_worn/wer_13_1.0 +%WER 93.56 [ 55091 / 58881, 2132 ins, 35555 del, 17404 sub ] exp/tri2/decode_dev_beamformit_ref/wer_17_1.0 + +# tri3 +%WER 72.81 [ 42869 / 58881, 3629 ins, 15998 del, 23242 sub ] exp/tri3/decode_dev_worn/wer_15_1.0 +%WER 91.73 [ 54013 / 58881, 3519 ins, 27098 del, 23396 sub ] exp/tri3/decode_dev_beamformit_ref/wer_17_1.0 + +# nnet3 tdnn+chain +%WER 47.91 [ 28212 / 58881, 2843 ins, 8957 del, 16412 sub ] exp/chain_train_worn_u100k_cleaned/tdnn1a_sp/decode_dev_worn/wer_9_0.0 +%WER 81.28 [ 47859 / 58881, 4210 ins, 27511 del, 16138 sub ] exp/chain_train_worn_u100k_cleaned/tdnn1a_sp/decode_dev_beamformit_ref/wer_9_0.5 + +# result with the challenge submission format (July 9, 2018) +# before the fix of speaker ID across arrays +session S02 room DINING: #words 8288, #errors 6593, wer 79.54 % +session S02 room KITCHEN: #words 12696, #errors 11096, wer 87.39 % +session S02 room LIVING: #words 15460, #errors 12219, wer 79.03 % +session S09 room DINING: #words 5766, #errors 4651, wer 80.66 % +session S09 room KITCHEN: #words 8911, #errors 7277, wer 81.66 % +session S09 room LIVING: #words 7760, #errors 6023, wer 77.61 % +overall: #words 58881, #errors 47859, wer 81.28 % + +# result with the challenge submission format (July 9, 2018) +# after the fix of speaker ID across arrays +==== development set ==== +session S02 room DINING: #words 8288, #errors 6556, wer 79.10 % +session S02 room KITCHEN: #words 12696, #errors 11096, wer 87.39 % +session S02 room LIVING: #words 15460, #errors 12182, wer 78.79 % +session S09 room DINING: #words 5766, #errors 4648, wer 80.61 % +session S09 room KITCHEN: #words 8911, #errors 7277, wer 81.66 % +session S09 room LIVING: #words 7760, #errors 6022, wer 77.60 % +overall: #words 58881, #errors 47781, wer 81.14 % diff --git a/egs/chime5/s5b/cmd.sh b/egs/chime5/s5b/cmd.sh new file mode 100644 index 00000000000..9702501f1a7 --- /dev/null +++ b/egs/chime5/s5b/cmd.sh @@ -0,0 +1,15 @@ +# you can change cmd.sh depending on what type of queue you are using. 
+# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="retry.pl queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" + diff --git a/egs/chime5/s5b/conf/beamformit.cfg b/egs/chime5/s5b/conf/beamformit.cfg new file mode 100755 index 00000000000..70fdd858651 --- /dev/null +++ b/egs/chime5/s5b/conf/beamformit.cfg @@ -0,0 +1,50 @@ +#BeamformIt sample configuration file for AMI data (http://groups.inf.ed.ac.uk/ami/download/) + +# scrolling size to compute the delays +scroll_size = 250 + +# cross correlation computation window size +window_size = 500 + +#amount of maximum points for the xcorrelation taken into account +nbest_amount = 4 + +#flag wether to apply an automatic noise thresholding +do_noise_threshold = 1 + +#Percentage of frames with lower xcorr taken as noisy +noise_percent = 10 + +######## acoustic modelling parameters + +#transition probabilities weight for multichannel decoding +trans_weight_multi = 25 +trans_weight_nbest = 25 + +### + +#flag wether to print the feaures after setting them, or not +print_features = 1 + +#flag wether to use the bad frames in the sum process +do_avoid_bad_frames = 1 + +#flag to use the best channel (SNR) as a reference +#defined from command line +do_compute_reference = 1 + +#flag wether to use a uem file or not(process all the file) +do_use_uem_file = 0 + +#flag wether to use an adaptative weights scheme or fixed weights +do_adapt_weights = 1 + +#flag wether to output the sph files or just run the system to create the auxiliary files +do_write_sph_files = 1 + +####directories where to store/retrieve info#### +#channels_file = ./cfg-files/channels + +#show needs to be passed as argument normally, here a default one is given just in case +#show_id = Ttmp + diff --git a/egs/chime5/s5b/conf/mfcc.conf b/egs/chime5/s5b/conf/mfcc.conf new file mode 100644 index 00000000000..32988403b00 --- /dev/null +++ b/egs/chime5/s5b/conf/mfcc.conf @@ -0,0 +1,2 @@ +--use-energy=false +--sample-frequency=16000 diff --git a/egs/chime5/s5b/conf/mfcc_hires.conf b/egs/chime5/s5b/conf/mfcc_hires.conf new file mode 100644 index 00000000000..fd64b62eb16 --- /dev/null +++ b/egs/chime5/s5b/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training. +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. 
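+# The options below give 16 kHz high-resolution features: 40 mel bins, all 40
+# cepstra kept, and an analysis band from 40 Hz up to 400 Hz below the Nyquist
+# frequency (a negative --high-freq is an offset from Nyquist).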
+--sample-frequency=16000 +--num-mel-bins=40 +--num-ceps=40 +--low-freq=40 +--high-freq=-400 diff --git a/egs/chime5/s5b/conf/online_cmvn.conf b/egs/chime5/s5b/conf/online_cmvn.conf new file mode 100644 index 00000000000..7748a4a4dd3 --- /dev/null +++ b/egs/chime5/s5b/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/egs/chime5/s5b/local/chain/run_tdnn.sh b/egs/chime5/s5b/local/chain/run_tdnn.sh new file mode 120000 index 00000000000..34499362831 --- /dev/null +++ b/egs/chime5/s5b/local/chain/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1a.sh \ No newline at end of file diff --git a/egs/chime5/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1a.sh b/egs/chime5/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1a.sh new file mode 100755 index 00000000000..95e9d934bd3 --- /dev/null +++ b/egs/chime5/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1a.sh @@ -0,0 +1,304 @@ +#!/bin/bash + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=96 +train_set=train_worn_u400k_cleaned +test_sets="dev_beamformit_ref" +gmm=tri3_cleaned +nnet3_affix=_train_worn_u400k_cleaned +lm_suffix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=_1a # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 # 2 works better than 4 +chunk_width=140,100,160 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj ${nj} --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + idct-layer name=idct input=input dim=40 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat + + conv-relu-batchnorm-layer name=cnn1 input=idct height-in=40 height-out=20 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=256 learning-rate-factor=0.333 max-change=0.25 + conv-relu-batchnorm-layer name=cnn2 input=cnn1 height-in=20 height-out=20 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=128 + + relu-batchnorm-layer name=affine1 input=lda dim=512 + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 input=cnn2 dim=1024 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,affine1) dim=1024 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
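+  # (For example, with xent_regularize=0.025 as set above, the
+  # learning-rate-factor below works out to 0.5/0.025 = 20.)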
+ output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/chime5-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$train_cmd --mem 4G" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.srand=$srand \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=16 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.momentum=0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts="--frames-overlap-per-eg 0" \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang${lm_suffix}/ \ + $tree_dir $tree_dir/graph${lm_suffix} || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + steps/nnet3/decode.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj 8 --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \ + $tree_dir/graph${lm_suffix} data/${data}_hires ${dir}/decode${lm_suffix}_${data} || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + +# Not testing the 'looped' decoding separately, because for +# TDNN systems it would give exactly the same results as the +# normal decoding. + +if $test_online_decoding && [ $stage -le 17 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. 
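+  # (As a rough guide: prepare_online_decoding.sh is expected to bundle the
+  # trained model, the i-vector extractor and an online feature config into
+  # ${dir}_online, so that steps/online/nnet3/decode.sh below can decode
+  # directly from the recordings without precomputed features or i-vectors.)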
+ steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3${nnet3_affix}/extractor ${dir} ${dir}_online + + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l $lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj ${nj} --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.05" + output_opts="l2-regularize=0.01 bottleneck-dim=320" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=512 + relu-batchnorm-layer name=tdnn2 $opts dim=512 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn3 $opts dim=512 + relu-batchnorm-layer name=tdnn4 $opts dim=512 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn5 $opts dim=512 + relu-batchnorm-layer name=tdnn6 $opts dim=512 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn7 $opts dim=512 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn8 $opts dim=512 input=Append(-6,-3,0) + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain $opts dim=512 target-rms=0.5 + output-layer name=output include-log-softmax=false $output_opts dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
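+  # (For example, if xent_regularize were set to 0.1, this would correspond to
+  # a learning-rate factor of 0.5 / 0.1 = 5 for the xent output layer.)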
+ relu-batchnorm-layer name=prefinal-xent input=tdnn8 $opts dim=512 target-rms=0.5 + output-layer name=output-xent $output_opts dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/chime5-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=10 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=256,128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang${lm_suffix}/ \ + $tree_dir $tree_dir/graph${lm_suffix} || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + steps/nnet3/decode.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj 8 --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \ + $tree_dir/graph${lm_suffix} data/${data}_hires ${dir}/decode${lm_suffix}_${data} || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + +# Not testing the 'looped' decoding separately, because for +# TDNN systems it would give exactly the same results as the +# normal decoding. + +if $test_online_decoding && [ $stage -le 17 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. 
+ steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3${nnet3_affix}/extractor ${dir} ${dir}_online + + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l 2792 combine=-0.149->-0.149 (over 2) xent:train/valid[210,316,final]=(-2.50,-1.99,-2.00/-2.36,-1.95,-1.95) logprob:train/valid[210,316,final]=(-0.228,-0.136,-0.136/-0.223,-0.156,-0.155) + +set -e + +# configs for 'chain' +stage=0 +nj=96 +train_set=train_worn_u400k +test_sets="dev_worn dev_beamformit_ref" +gmm=tri3 +nnet3_affix=_train_worn_u400k +lm_suffix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1b # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +num_epochs=4 +common_egs_dir= +# training options +# training chunk-options +chunk_width=140,100,160 +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj ${nj} --cmd "$train_cmd" --generate-ali-from-lats true \ + ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $lat_dir $tree_dir +fi + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + affine_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.002" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + linear-component name=prefinal-l dim=256 $linear_opts + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/chime5-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$train_cmd --mem 4G" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule "$dropout_schedule" \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.00025 \ + --trainer.optimization.final-effective-lrate 0.000025 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang${lm_suffix}/ \ + $tree_dir $tree_dir/graph${lm_suffix} || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + steps/nnet3/decode.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj 8 --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \ + $tree_dir/graph${lm_suffix} data/${data}_hires ${dir}/decode${lm_suffix}_${data} || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + +exit 0; diff --git a/egs/chime5/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/chime5/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh new file mode 100755 index 00000000000..e3d8e6ac4dc --- /dev/null +++ b/egs/chime5/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -0,0 +1,297 @@ +#!/bin/bash + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=96 +train_set=train_worn_u400k_cleaned +test_sets="dev_worn dev_beamformit_ref" +gmm=tri3_cleaned +nnet3_affix=_train_worn_u400k_cleaned +lm_suffix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. 
+affix=_1a # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 # 2 works better than 4 +chunk_width=140,100,160 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj ${nj} --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + 
relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/chime5-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$train_cmd --mem 4G" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.srand=$srand \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=16 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.momentum=0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts="--frames-overlap-per-eg 0" \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). 
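+  # (A note on scales: the graph is built with --self-loop-scale 1.0 as usual
+  # for 'chain' models, and the decoding below uses --acwt 1.0 with
+  # --post-decode-acwt 10.0 so that the lattice scores end up on the customary
+  # scale for the scoring scripts.)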
+ utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang${lm_suffix}/ \ + $tree_dir $tree_dir/graph${lm_suffix} || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + steps/nnet3/decode.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj 8 --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \ + $tree_dir/graph${lm_suffix} data/${data}_hires ${dir}/decode${lm_suffix}_${data} || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + +# Not testing the 'looped' decoding separately, because for +# TDNN systems it would give exactly the same results as the +# normal decoding. + +if $test_online_decoding && [ $stage -le 17 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3${nnet3_affix}/extractor ${dir} ${dir}_online + + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l ) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +[ -f ./path.sh ] && . ./path.sh + +command -v uconv &>/dev/null \ + || { echo >&2 "uconv not found on PATH. You will have to install ICU4C"; exit 1; } + +command -v ngram &>/dev/null \ + || { echo >&2 "srilm not found on PATH. Please use the script $KALDI_ROOT/tools/extras/install_srilm.sh to install it"; exit 1; } + +if [ -z ${LIBLBFGS} ]; then + echo >&2 "SRILM is not compiled with the support of MaxEnt models." + echo >&2 "You should use the script in \$KALDI_ROOT/tools/install_srilm.sh" + echo >&2 "which will take care of compiling the SRILM with MaxEnt support" + exit 1; +fi + +sox=`command -v sox 2>/dev/null` \ + || { echo >&2 "sox not found on PATH. Please install it manually (you will need version 14.4.0 and higher)."; exit 1; } + +# If sox is found on path, check if the version is correct +if [ ! -z "$sox" ]; then + sox_version=`$sox --version 2>&1| head -1 | sed -e 's?.*: ??' -e 's?.* ??'` + if [[ ! $sox_version =~ v14.4.* ]]; then + echo "Unsupported sox version $sox_version found on path. You will need version v14.4.0 and higher." + exit 1 + fi +fi + +command -v phonetisaurus-align &>/dev/null \ + || { echo >&2 "Phonetisaurus not found on PATH. Please use the script $KALDI_ROOT/tools/extras/install_phonetisaurus.sh to install it"; exit 1; } + +command -v BeamformIt &>/dev/null \ + || { echo >&2 "BeamformIt not found on PATH. 
Please use the script $KALDI_ROOT/tools/extras/install_beamformit.sh to install it"; exit 1; } + +miniconda_dir=$HOME/miniconda3/ +if [ ! -d $miniconda_dir ]; then + echo "$miniconda_dir does not exist. Please run '../../../tools/extras/install_miniconda.sh'" +fi + +# check if WPE is installed +result=`$miniconda_dir/bin/python -c "\ +try: + import nara_wpe + print('1') +except ImportError: + print('0')"` + +if [ "$result" == "1" ]; then + echo "WPE is installed" +else + echo "WPE is not installed. Please run ../../../tools/extras/install_wpe.sh" + exit 1 +fi + +exit 0 diff --git a/egs/chime5/s5b/local/copy_lat_dir_parallel.sh b/egs/chime5/s5b/local/copy_lat_dir_parallel.sh new file mode 100755 index 00000000000..82839604c9e --- /dev/null +++ b/egs/chime5/s5b/local/copy_lat_dir_parallel.sh @@ -0,0 +1,97 @@ +#!/bin/bash + +cmd=queue.pl +nj=40 +stage=0 +speed_perturb=true + +. ./path.sh +. utils/parse_options.sh + +if [ $# -ne 4 ]; then + echo "Usage: $0 " + exit 1 +fi + +utt_map=$1 +data=$2 +srcdir=$3 +dir=$4 + +mkdir -p $dir + +cp $srcdir/{phones.txt,tree,final.mdl} $dir || exit 1 +cp $srcdir/{final.alimdl,final.occs,splice_opts,cmvn_opts,delta_opts,final.mat,full.mat} 2>/dev/null || true + +nj_src=$(cat $srcdir/num_jobs) || exit 1 + +if [ $stage -le 1 ]; then + $cmd JOB=1:$nj_src $dir/log/copy_lats_orig.JOB.log \ + lattice-copy "ark:gunzip -c $srcdir/lat.JOB.gz |" \ + ark,scp:$dir/lat_orig.JOB.ark,$dir/lat_orig.JOB.scp || exit 1 +fi + +for n in $(seq $nj_src); do + cat $dir/lat_orig.$n.scp +done > $dir/lat_orig.scp || exit 1 + +if $speed_perturb; then + for s in 0.9 1.1; do + awk -v s=$s '{print "sp"s"-"$1" sp"s"-"$2}' $utt_map + done | cat - $utt_map | sort -k1,1 > $dir/utt_map + utt_map=$dir/utt_map +fi + +if [ $stage -le 2 ]; then + utils/filter_scp.pl -f 2 $dir/lat_orig.scp < $utt_map | \ + utils/apply_map.pl -f 2 $dir/lat_orig.scp > \ + $dir/lat.scp || exit 1 + + if [ ! -s $dir/lat.scp ]; then + echo "$0: $dir/lat.scp is empty. Something went wrong!" + exit 1 + fi +fi + +utils/split_data.sh $data $nj + +if [ $stage -le 3 ]; then + $cmd JOB=1:$nj $dir/log/copy_lats.JOB.log \ + lattice-copy "scp:utils/filter_scp.pl $data/split$nj/JOB/utt2spk $dir/lat.scp |" \ + "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1 +fi + +echo $nj > $dir/num_jobs + +if [ -f $srcdir/ali.1.gz ]; then + if [ $stage -le 4 ]; then + $cmd JOB=1:$nj_src $dir/log/copy_ali_orig.JOB.log \ + copy-int-vector "ark:gunzip -c $srcdir/ali.JOB.gz |" \ + ark,scp:$dir/ali_orig.JOB.ark,$dir/ali_orig.JOB.scp || exit 1 + fi + + for n in $(seq $nj_src); do + cat $dir/ali_orig.$n.scp + done > $dir/ali_orig.scp || exit 1 + + if [ $stage -le 5 ]; then + utils/filter_scp.pl -f 2 $dir/ali_orig.scp < $utt_map | \ + utils/apply_map.pl -f 2 $dir/ali_orig.scp > \ + $dir/ali.scp || exit 1 + + if [ ! -s $dir/ali.scp ]; then + echo "$0: $dir/ali.scp is empty. Something went wrong!" 
+ exit 1 + fi + fi + + utils/split_data.sh $data $nj + + if [ $stage -le 6 ]; then + $cmd JOB=1:$nj $dir/log/copy_ali.JOB.log \ + copy-int-vector "scp:utils/filter_scp.pl $data/split$nj/JOB/utt2spk $dir/ali.scp |" \ + "ark:|gzip -c > $dir/ali.JOB.gz" || exit 1 + fi +fi + +rm $dir/lat_orig.*.{ark,scp} $dir/ali_orig.*.{ark,scp} 2>/dev/null || true diff --git a/egs/chime5/s5b/local/distant_audio_list b/egs/chime5/s5b/local/distant_audio_list new file mode 100644 index 00000000000..fc7aff15cd0 --- /dev/null +++ b/egs/chime5/s5b/local/distant_audio_list @@ -0,0 +1,376 @@ +S03_U01.CH1 +S03_U01.CH2 +S03_U01.CH3 +S03_U01.CH4 +S03_U02.CH1 +S03_U02.CH2 +S03_U02.CH3 +S03_U02.CH4 +S03_U03.CH1 +S03_U03.CH2 +S03_U03.CH3 +S03_U03.CH4 +S03_U04.CH1 +S03_U04.CH2 +S03_U04.CH3 +S03_U04.CH4 +S03_U05.CH1 +S03_U05.CH2 +S03_U05.CH3 +S03_U05.CH4 +S03_U06.CH1 +S03_U06.CH2 +S03_U06.CH3 +S03_U06.CH4 +S04_U01.CH1 +S04_U01.CH2 +S04_U01.CH3 +S04_U01.CH4 +S04_U02.CH1 +S04_U02.CH2 +S04_U02.CH3 +S04_U02.CH4 +S04_U03.CH1 +S04_U03.CH2 +S04_U03.CH3 +S04_U03.CH4 +S04_U04.CH1 +S04_U04.CH2 +S04_U04.CH3 +S04_U04.CH4 +S04_U05.CH1 +S04_U05.CH2 +S04_U05.CH3 +S04_U05.CH4 +S04_U06.CH1 +S04_U06.CH2 +S04_U06.CH3 +S04_U06.CH4 +S05_U01.CH1 +S05_U01.CH2 +S05_U01.CH3 +S05_U01.CH4 +S05_U02.CH1 +S05_U02.CH2 +S05_U02.CH3 +S05_U02.CH4 +S05_U04.CH1 +S05_U04.CH2 +S05_U04.CH3 +S05_U04.CH4 +S05_U05.CH1 +S05_U05.CH2 +S05_U05.CH3 +S05_U05.CH4 +S05_U06.CH1 +S05_U06.CH2 +S05_U06.CH3 +S05_U06.CH4 +S06_U01.CH1 +S06_U01.CH2 +S06_U01.CH3 +S06_U01.CH4 +S06_U02.CH1 +S06_U02.CH2 +S06_U02.CH3 +S06_U02.CH4 +S06_U03.CH1 +S06_U03.CH2 +S06_U03.CH3 +S06_U03.CH4 +S06_U04.CH1 +S06_U04.CH2 +S06_U04.CH3 +S06_U04.CH4 +S06_U05.CH1 +S06_U05.CH2 +S06_U05.CH3 +S06_U05.CH4 +S06_U06.CH1 +S06_U06.CH2 +S06_U06.CH3 +S06_U06.CH4 +S07_U01.CH1 +S07_U01.CH2 +S07_U01.CH3 +S07_U01.CH4 +S07_U02.CH1 +S07_U02.CH2 +S07_U02.CH3 +S07_U02.CH4 +S07_U03.CH1 +S07_U03.CH2 +S07_U03.CH3 +S07_U03.CH4 +S07_U04.CH1 +S07_U04.CH2 +S07_U04.CH3 +S07_U04.CH4 +S07_U05.CH1 +S07_U05.CH2 +S07_U05.CH3 +S07_U05.CH4 +S07_U06.CH1 +S07_U06.CH2 +S07_U06.CH3 +S07_U06.CH4 +S08_U01.CH1 +S08_U01.CH2 +S08_U01.CH3 +S08_U01.CH4 +S08_U02.CH1 +S08_U02.CH2 +S08_U02.CH3 +S08_U02.CH4 +S08_U03.CH1 +S08_U03.CH2 +S08_U03.CH3 +S08_U03.CH4 +S08_U04.CH1 +S08_U04.CH2 +S08_U04.CH3 +S08_U04.CH4 +S08_U05.CH1 +S08_U05.CH2 +S08_U05.CH3 +S08_U05.CH4 +S08_U06.CH1 +S08_U06.CH2 +S08_U06.CH3 +S08_U06.CH4 +S12_U01.CH1 +S12_U01.CH2 +S12_U01.CH3 +S12_U01.CH4 +S12_U02.CH1 +S12_U02.CH2 +S12_U02.CH3 +S12_U02.CH4 +S12_U03.CH1 +S12_U03.CH2 +S12_U03.CH3 +S12_U03.CH4 +S12_U04.CH1 +S12_U04.CH2 +S12_U04.CH3 +S12_U04.CH4 +S12_U05.CH1 +S12_U05.CH2 +S12_U05.CH3 +S12_U05.CH4 +S12_U06.CH1 +S12_U06.CH2 +S12_U06.CH3 +S12_U06.CH4 +S13_U01.CH1 +S13_U01.CH2 +S13_U01.CH3 +S13_U01.CH4 +S13_U02.CH1 +S13_U02.CH2 +S13_U02.CH3 +S13_U02.CH4 +S13_U03.CH1 +S13_U03.CH2 +S13_U03.CH3 +S13_U03.CH4 +S13_U04.CH1 +S13_U04.CH2 +S13_U04.CH3 +S13_U04.CH4 +S13_U05.CH1 +S13_U05.CH2 +S13_U05.CH3 +S13_U05.CH4 +S13_U06.CH1 +S13_U06.CH2 +S13_U06.CH3 +S13_U06.CH4 +S16_U01.CH1 +S16_U01.CH2 +S16_U01.CH3 +S16_U01.CH4 +S16_U02.CH1 +S16_U02.CH2 +S16_U02.CH3 +S16_U02.CH4 +S16_U03.CH1 +S16_U03.CH2 +S16_U03.CH3 +S16_U03.CH4 +S16_U04.CH1 +S16_U04.CH2 +S16_U04.CH3 +S16_U04.CH4 +S16_U05.CH1 +S16_U05.CH2 +S16_U05.CH3 +S16_U05.CH4 +S16_U06.CH1 +S16_U06.CH2 +S16_U06.CH3 +S16_U06.CH4 +S17_U01.CH1 +S17_U01.CH2 +S17_U01.CH3 +S17_U01.CH4 +S17_U02.CH1 +S17_U02.CH2 +S17_U02.CH3 +S17_U02.CH4 +S17_U03.CH1 +S17_U03.CH2 +S17_U03.CH3 +S17_U03.CH4 +S17_U04.CH1 +S17_U04.CH2 +S17_U04.CH3 +S17_U04.CH4 +S17_U05.CH1 
+S17_U05.CH2 +S17_U05.CH3 +S17_U05.CH4 +S17_U06.CH1 +S17_U06.CH2 +S17_U06.CH3 +S17_U06.CH4 +S18_U01.CH1 +S18_U01.CH2 +S18_U01.CH3 +S18_U01.CH4 +S18_U02.CH1 +S18_U02.CH2 +S18_U02.CH3 +S18_U02.CH4 +S18_U03.CH1 +S18_U03.CH2 +S18_U03.CH3 +S18_U03.CH4 +S18_U04.CH1 +S18_U04.CH2 +S18_U04.CH3 +S18_U04.CH4 +S18_U05.CH1 +S18_U05.CH2 +S18_U05.CH3 +S18_U05.CH4 +S18_U06.CH1 +S18_U06.CH2 +S18_U06.CH3 +S18_U06.CH4 +S19_U01.CH1 +S19_U01.CH2 +S19_U01.CH3 +S19_U01.CH4 +S19_U02.CH1 +S19_U02.CH2 +S19_U02.CH3 +S19_U02.CH4 +S19_U03.CH1 +S19_U03.CH2 +S19_U03.CH3 +S19_U03.CH4 +S19_U04.CH1 +S19_U04.CH2 +S19_U04.CH3 +S19_U04.CH4 +S19_U05.CH1 +S19_U05.CH2 +S19_U05.CH3 +S19_U05.CH4 +S19_U06.CH1 +S19_U06.CH2 +S19_U06.CH3 +S19_U06.CH4 +S20_U01.CH1 +S20_U01.CH2 +S20_U01.CH3 +S20_U01.CH4 +S20_U02.CH1 +S20_U02.CH2 +S20_U02.CH3 +S20_U02.CH4 +S20_U03.CH1 +S20_U03.CH2 +S20_U03.CH3 +S20_U03.CH4 +S20_U04.CH1 +S20_U04.CH2 +S20_U04.CH3 +S20_U04.CH4 +S20_U05.CH1 +S20_U05.CH2 +S20_U05.CH3 +S20_U05.CH4 +S20_U06.CH1 +S20_U06.CH2 +S20_U06.CH3 +S20_U06.CH4 +S22_U01.CH1 +S22_U01.CH2 +S22_U01.CH3 +S22_U01.CH4 +S22_U02.CH1 +S22_U02.CH2 +S22_U02.CH3 +S22_U02.CH4 +S22_U04.CH1 +S22_U04.CH2 +S22_U04.CH3 +S22_U04.CH4 +S22_U05.CH1 +S22_U05.CH2 +S22_U05.CH3 +S22_U05.CH4 +S22_U06.CH1 +S22_U06.CH2 +S22_U06.CH3 +S22_U06.CH4 +S23_U01.CH1 +S23_U01.CH2 +S23_U01.CH3 +S23_U01.CH4 +S23_U02.CH1 +S23_U02.CH2 +S23_U02.CH3 +S23_U02.CH4 +S23_U03.CH1 +S23_U03.CH2 +S23_U03.CH3 +S23_U03.CH4 +S23_U04.CH1 +S23_U04.CH2 +S23_U04.CH3 +S23_U04.CH4 +S23_U05.CH1 +S23_U05.CH2 +S23_U05.CH3 +S23_U05.CH4 +S23_U06.CH1 +S23_U06.CH2 +S23_U06.CH3 +S23_U06.CH4 +S24_U01.CH1 +S24_U01.CH2 +S24_U01.CH3 +S24_U01.CH4 +S24_U02.CH1 +S24_U02.CH2 +S24_U02.CH3 +S24_U02.CH4 +S24_U03.CH1 +S24_U03.CH2 +S24_U03.CH3 +S24_U03.CH4 +S24_U04.CH1 +S24_U04.CH2 +S24_U04.CH3 +S24_U04.CH4 +S24_U05.CH1 +S24_U05.CH2 +S24_U05.CH3 +S24_U05.CH4 +S24_U06.CH1 +S24_U06.CH2 +S24_U06.CH3 +S24_U06.CH4 diff --git a/egs/chime5/s5b/local/extract_noises.py b/egs/chime5/s5b/local/extract_noises.py new file mode 100755 index 00000000000..f7b7f752d9e --- /dev/null +++ b/egs/chime5/s5b/local/extract_noises.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 + +import argparse +import json +import logging +import os +import sys +import scipy.io.wavfile as siw +import math +import numpy as np + + +def get_args(): + parser = argparse.ArgumentParser( + """Extract noises from the corpus based on the non-speech regions. + e.g. {} /export/corpora4/CHiME5/audio/train/ \\ + /export/corpora4/CHiME5/transcriptions/train/ \\ + /export/b05/zhiqiw/noise/""".format(sys.argv[0])) + + parser.add_argument("--segment-length", default=20) + parser.add_argument("audio_dir", help="""Location of the CHiME5 Audio files. e.g. /export/corpora4/CHiME5/audio/train/""") + parser.add_argument("trans_dir", help="""Location of the CHiME5 Transcriptions. e.g. /export/corpora4/CHiME5/transcriptions/train/""") + parser.add_argument("audio_list", help="""List of ids of the CHiME5 recordings from which noise is extracted. e.g. local/distant_audio_list""") + parser.add_argument("out_dir", help="Output directory to write noise files. e.g. 
/export/b05/zhiqiw/noise/") + + args = parser.parse_args() + return args + + +def Trans_time(time, fs): + units = time.split(':') + time_second = float(units[0]) * 3600 + float(units[1]) * 60 + float(units[2]) + return int(time_second*fs) + + +def Get_time(conf, tag, mic, fs): + for i in conf: + st = Trans_time(i['start_time'][mic], fs) + ed = Trans_time(i['end_time'][mic], fs) + tag[st:ed] = 0 + return tag + + +def write_noise(out_dir, seg, audio, sig, tag, fs, cnt): + sig_noise = sig[np.nonzero(tag)] + for i in range(math.floor(len(sig_noise)/(seg*fs))): + siw.write(out_dir +'/noise'+str(cnt)+'.wav', fs, sig_noise[i*seg*fs:(i+1)*seg*fs]) + cnt += 1 + return cnt + + +def main(): + args = get_args() + + if not os.path.exists(args.out_dir): + os.makedirs(args.out_dir) + + wav_list = open(args.audio_list).readlines() + + cnt = 1 + for i, audio in enumerate(wav_list): + parts = audio.strip().split('.') + if len(parts) == 2: + # Assuming distant mic with name like S03_U01.CH1 + session, mic = parts[0].split('_') + channel = parts[1] + base_name = session + "_" + mic + "." + channel + else: + # Assuming close talk mic with name like S03_P09 + session, mic = audio.strip().split('_') + base_name = session + "_" + mic + fs, sig = siw.read(args.audio_dir + "/" + base_name + '.wav') + tag = np.ones(len(sig)) + if i == 0 or session != session_p: + with open(args.trans_dir + "/" + session + '.json') as f: + conf = json.load(f) + tag = Get_time(conf, tag, mic, fs) + cnt = write_noise(args.out_dir, args.segment_length, audio, sig, tag, fs, cnt) + session_p = session + + +if __name__ == '__main__': + main() diff --git a/egs/chime5/s5b/local/extract_vad_weights.sh b/egs/chime5/s5b/local/extract_vad_weights.sh new file mode 100755 index 00000000000..250b021bd8f --- /dev/null +++ b/egs/chime5/s5b/local/extract_vad_weights.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +# Copyright 2016 Johns Hopkins University (Author: Daniel Povey, Vijayaditya Peddinti) +# 2019 Vimal Manohar +# Apache 2.0. + +# This script converts lattices available from a first pass decode into a per-frame weights file +# The ctms generated from the lattices are filtered. Silence frames are assigned a low weight (e.g.0.00001) +# and voiced frames have a weight of 1. + +set -e + +stage=1 +cmd=run.pl +silence_weight=0.00001 +#end configuration section. + +. ./cmd.sh + +[ -f ./path.sh ] && . ./path.sh +. utils/parse_options.sh || exit 1; +if [ $# -ne 4 ]; then + echo "Usage: $0 [--cmd (run.pl|queue.pl...)] " + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + exit 1; +fi + +data_dir=$1 +lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied. +decode_dir=$3 +output_wts_file_gz=$4 + +if [ $stage -le 1 ]; then + echo "$0: generating CTM from input lattices" + steps/get_ctm_conf.sh --cmd "$cmd" \ + --use-segments false \ + $data_dir \ + $lang \ + $decode_dir +fi + +if [ $stage -le 2 ]; then + name=`basename $data_dir` + # we just take the ctm from LMWT 10, it doesn't seem to affect the results a lot + ctm=$decode_dir/score_10/$name.ctm + echo "$0: generating weights file from ctm $ctm" + + pad_frames=0 # this did not seem to be helpful but leaving it as an option. + feat-to-len scp:$data_dir/feats.scp ark,t:- >$decode_dir/utt.lengths + if [ ! 
-f $ctm ]; then echo "$0: expected ctm to exist: $ctm"; exit 1; fi + + cat $ctm | awk '$6 == 1.0 && $4 < 1.0' | \ + grep -v -w mm | grep -v -w mhm | grep -v -F '[noise]' | \ + grep -v -F '[laughter]' | grep -v -F '' | \ + perl -e ' $lengths=shift @ARGV; $pad_frames=shift @ARGV; $silence_weight=shift @ARGV; + $pad_frames >= 0 || die "bad pad-frames value $pad_frames"; + open(L, "<$lengths") || die "opening lengths file"; + @all_utts = (); + $utt2ref = { }; + while () { + ($utt, $len) = split(" ", $_); + push @all_utts, $utt; + $array_ref = [ ]; + for ($n = 0; $n < $len; $n++) { ${$array_ref}[$n] = $silence_weight; } + $utt2ref{$utt} = $array_ref; + } + while () { + @A = split(" ", $_); + @A == 6 || die "bad ctm line $_"; + $utt = $A[0]; $beg = $A[2]; $len = $A[3]; + $beg_int = int($beg * 100) - $pad_frames; + $len_int = int($len * 100) + 2*$pad_frames; + $array_ref = $utt2ref{$utt}; + !defined $array_ref && die "No length info for utterance $utt"; + for ($t = $beg_int; $t < $beg_int + $len_int; $t++) { + if ($t >= 0 && $t < @$array_ref) { + ${$array_ref}[$t] = 1; + } + } + } + foreach $utt (@all_utts) { $array_ref = $utt2ref{$utt}; + print $utt, " [ ", join(" ", @$array_ref), " ]\n"; + } ' $decode_dir/utt.lengths $pad_frames $silence_weight | \ + gzip -c > $output_wts_file_gz +fi diff --git a/egs/chime5/s5b/local/json2text.py b/egs/chime5/s5b/local/json2text.py new file mode 100755 index 00000000000..4df0160efb6 --- /dev/null +++ b/egs/chime5/s5b/local/json2text.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Johns Hopkins University (Shinji Watanabe) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +import json +import argparse +import logging +import sys + + +def hms_to_seconds(hms): + hour = hms.split(':')[0] + minute = hms.split(':')[1] + second = hms.split(':')[2].split('.')[0] + + # .xx (10 ms order) + ms10 = hms.split(':')[2].split('.')[1] + + # total seconds + seconds = int(hour) * 3600 + int(minute) * 60 + int(second) + + return '{:07d}'.format(int(str(seconds) + ms10)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('json', type=str, help='JSON transcription file') + parser.add_argument('--mictype', type=str, + choices=['ref', 'worn', 'u01', 'u02', 'u03', 'u04', 'u05', 'u06'], + help='Type of microphones') + args = parser.parse_args() + + # logging info + log_format = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s:%(message)s" + logging.basicConfig(level=logging.INFO, format=log_format) + + logging.debug("reading %s", args.json) + with open(args.json, 'rt', encoding="utf-8") as f: + j = json.load(f) + + for x in j: + if '[redacted]' not in x['words']: + session_id = x['session_id'] + speaker_id = x['speaker'] + if args.mictype == 'ref': + mictype = x['ref'] + elif args.mictype == 'worn': + mictype = 'original' + else: + mictype = args.mictype.upper() # convert from u01 to U01 + + # add location tag for scoring (only for dev and eval sets) + if 'location' in x.keys(): + location = x['location'].upper() + else: + location = 'NOLOCATION' + + start_time = x['start_time'][mictype] + end_time = x['end_time'][mictype] + + # remove meta chars and convert to lower + words = x['words'].replace('"', '')\ + .replace('.', '')\ + .replace('?', '')\ + .replace(',', '')\ + .replace(':', '')\ + .replace(';', '')\ + .replace('!', '').lower() + + # remove multiple spaces + words = " ".join(words.split()) + + # convert to seconds, e.g., 1:10:05.55 -> 3600 + 600 + 5.55 = 4205.55 + start_time = hms_to_seconds(start_time) + 
end_time = hms_to_seconds(end_time) + + uttid = speaker_id + '_' + session_id + if not args.mictype == 'worn': + uttid += '_' + mictype + uttid += '_' + location + '-' + start_time + '-' + end_time + + if end_time > start_time: + sys.stdout.buffer.write((uttid + ' ' + words + '\n').encode("utf-8")) diff --git a/egs/chime5/s5b/local/make_noise_list.py b/egs/chime5/s5b/local/make_noise_list.py new file mode 100755 index 00000000000..5aaf7fa4062 --- /dev/null +++ b/egs/chime5/s5b/local/make_noise_list.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 + +import glob +import os +import sys + + +if len(sys.argv) != 2: + print ("Usage: {} ".format(sys.argv[0])) + raise SystemExit(1) + + +for line in glob.glob("{}/*.wav".format(sys.argv[1])): + fname = os.path.basename(line.strip()) + + print ("--noise-id {} --noise-type point-source " + "--bg-fg-type foreground {}".format(fname, line.strip())) diff --git a/egs/chime5/s5/local/chain/compare_wer.sh b/egs/chime5/s5b/local/nnet3/compare_wer.sh similarity index 84% rename from egs/chime5/s5/local/chain/compare_wer.sh rename to egs/chime5/s5b/local/nnet3/compare_wer.sh index cd6be14ed88..095e85cc338 100755 --- a/egs/chime5/s5/local/chain/compare_wer.sh +++ b/egs/chime5/s5b/local/nnet3/compare_wer.sh @@ -101,31 +101,32 @@ if $used_epochs; then exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. fi - echo -n "# Final train prob " for x in $*; do - prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + prob=$(grep Overall $x/log/compute_prob_train.{final,combined}.log 2>/dev/null | grep log-like | awk '{printf("%.4f", $8)}') printf "% 10s" $prob done echo echo -n "# Final valid prob " for x in $*; do - prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + prob=$(grep Overall $x/log/compute_prob_valid.{final,combined}.log 2>/dev/null | grep log-like | awk '{printf("%.4f", $8)}') printf "% 10s" $prob done echo -echo -n "# Final train prob (xent)" +echo -n "# Final train acc " for x in $*; do - prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + prob=$(grep Overall $x/log/compute_prob_train.{final,combined}.log 2>/dev/null | grep accuracy | awk '{printf("%.4f", $8)}') printf "% 10s" $prob done echo -echo -n "# Final valid prob (xent)" +echo -n "# Final valid acc " for x in $*; do - prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + prob=$(grep Overall $x/log/compute_prob_valid.{final,combined}.log 2>/dev/null | grep accuracy | awk '{printf("%.4f", $8)}') printf "% 10s" $prob done echo + +echo diff --git a/egs/chime5/s5b/local/nnet3/decode.sh b/egs/chime5/s5b/local/nnet3/decode.sh new file mode 100755 index 00000000000..7af09f36a13 --- /dev/null +++ b/egs/chime5/s5b/local/nnet3/decode.sh @@ -0,0 +1,162 @@ +#!/bin/bash + +# Copyright 2016 Johns Hopkins University (Author: Daniel Povey, Vijayaditya Peddinti) +# 2019 Vimal Manohar +# Apache 2.0. + +# This script does 2-stage decoding where the first stage is used to get +# reliable frames for i-vector extraction. 
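+# Roughly, the flow is: run a first decoding pass with i-vectors extracted in
+# the usual online fashion; derive a CTM from those lattices and convert it to
+# per-frame weights (silence frames get weight $silence_weight, speech frames
+# weight 1) via local/extract_vad_weights.sh; re-estimate the i-vectors per
+# speaker using those weights; then run the second, final decoding pass with
+# the re-estimated i-vectors.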
+ +set -e + +# general opts +iter= +stage=0 +nj=30 +affix= # affix for decode directory + +# ivector opts +max_count=75 # parameter for extract_ivectors.sh +sub_speaker_frames=6000 +ivector_scale=0.75 +get_weights_from_ctm=true +weights_file= # use weights from this archive (must be compressed using gunzip) +silence_weight=0.00001 # apply this weight to silence frames during i-vector extraction +ivector_dir=exp/nnet3 + +# decode opts +pass2_decode_opts="--min-active 1000" +lattice_beam=8 +extra_left_context=0 # change for (B)LSTM +extra_right_context=0 # change for BLSTM +frames_per_chunk=50 # change for (B)LSTM +acwt=0.1 # important to change this when using chain models +post_decode_acwt=1.0 # important to change this when using chain models +extra_left_context_initial=0 +extra_right_context_final=0 + +score_opts="--min-lmwt 6 --max-lmwt 13" + +. ./cmd.sh +[ -f ./path.sh ] && . ./path.sh +. utils/parse_options.sh || exit 1; + +if [ $# -ne 4 ]; then + echo "Usage: $0 [options] " + echo " Options:" + echo " --stage (0|1|2) # start scoring script from part-way through." + echo "e.g.:" + echo "$0 data/dev data/lang exp/tri5a/graph_pp exp/nnet3/tdnn" + exit 1; +fi + +data=$1 # data directory +lang=$2 # data/lang +graph=$3 #exp/tri5a/graph_pp +dir=$4 # exp/nnet3/tdnn + +model_affix=`basename $dir` +ivector_affix=${affix:+_$affix}_chain_${model_affix}${iter:+_iter$iter} +affix=${affix:+_${affix}}${iter:+_iter${iter}} + +if [ $stage -le 1 ]; then + if [ ! -s ${data}_hires/feats.scp ]; then + utils/copy_data_dir.sh $data ${data}_hires + steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj $nj --cmd "$train_cmd" ${data}_hires + steps/compute_cmvn_stats.sh ${data}_hires + utils/fix_data_dir.sh ${data}_hires + fi +fi + +data_set=$(basename $data) +if [ $stage -le 2 ]; then + echo "Extracting i-vectors, stage 1" + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $nj \ + --max-count $max_count \ + ${data}_hires $ivector_dir/extractor \ + $ivector_dir/ivectors_${data_set}${ivector_affix}_stage1; + # float comparisons are hard in bash + if [ `bc <<< "$ivector_scale != 1"` -eq 1 ]; then + ivector_scale_affix=_scale$ivector_scale + else + ivector_scale_affix= + fi + + if [ ! -z "$ivector_scale_affix" ]; then + echo "$0: Scaling iVectors, stage 1" + srcdir=$ivector_dir/ivectors_${data_set}${ivector_affix}_stage1 + outdir=$ivector_dir/ivectors_${data_set}${ivector_affix}${ivector_scale_affix}_stage1 + mkdir -p $outdir + $train_cmd $outdir/log/scale_ivectors.log \ + copy-matrix --scale=$ivector_scale scp:$srcdir/ivector_online.scp ark:- \| \ + copy-feats --compress=true ark:- ark,scp:$outdir/ivector_online.ark,$outdir/ivector_online.scp; + cp $srcdir/ivector_period $outdir/ivector_period + fi +fi + +decode_dir=$dir/decode_${data_set}${affix} +# generate the lattices +if [ $stage -le 3 ]; then + echo "Generating lattices, stage 1" + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" \ + --acwt $acwt --post-decode-acwt $post_decode_acwt \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir $ivector_dir/ivectors_${data_set}${ivector_affix}${ivector_scale_affix}_stage1 \ + --skip-scoring true ${iter:+--iter $iter} \ + $graph ${data}_hires ${decode_dir}_stage1; +fi + +if [ $stage -le 4 ]; then + if $get_weights_from_ctm; then + if [ ! 
-z $weights_file ]; then + echo "$0: Using provided vad weights file $weights_file" + ivector_extractor_weights=$weights_file + else + echo "$0 : Generating vad weights file" + ivector_extractor_weights=${decode_dir}_stage1/weights${affix}.gz + local/extract_vad_weights.sh --silence-weight $silence_weight \ + --cmd "$decode_cmd" ${iter:+--iter $iter} \ + ${data}_hires $lang \ + ${decode_dir}_stage1 $ivector_extractor_weights + fi + else + # get weights from best path decoding + ivector_extractor_weights=${decode_dir}_stage1 + fi +fi + +if [ $stage -le 5 ]; then + echo "Extracting i-vectors, stage 2 with weights from $ivector_extractor_weights" + # this does offline decoding, except we estimate the iVectors per + # speaker, excluding silence (based on alignments from a DNN decoding), with a + # different script. This is just to demonstrate that script. + # the --sub-speaker-frames is optional; if provided, it will divide each speaker + # up into "sub-speakers" of at least that many frames... can be useful if + # acoustic conditions drift over time within the speaker's data. + steps/online/nnet2/extract_ivectors.sh --cmd "$train_cmd" --nj $nj \ + --silence-weight $silence_weight \ + --sub-speaker-frames $sub_speaker_frames --max-count $max_count \ + ${data}_hires $lang $ivector_dir/extractor \ + $ivector_extractor_weights $ivector_dir/ivectors_${data_set}${ivector_affix}; +fi + +if [ $stage -le 6 ]; then + echo "Generating lattices, stage 2 with --acwt $acwt" + rm -f ${decode_dir}/.error + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" $pass2_decode_opts \ + --acwt $acwt --post-decode-acwt $post_decode_acwt \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk" \ + --skip-scoring false ${iter:+--iter $iter} --lattice-beam $lattice_beam \ + --online-ivector-dir $ivector_dir/ivectors_${data_set}${ivector_affix} \ + $graph ${data}_hires ${decode_dir} || touch ${decode_dir}/.error + [ -f ${decode_dir}/.error ] && echo "$0: Error decoding" && exit 1; +fi +exit 0 diff --git a/egs/chime5/s5b/local/nnet3/run_ivector_common.sh b/egs/chime5/s5b/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..3910e1812a3 --- /dev/null +++ b/egs/chime5/s5b/local/nnet3/run_ivector_common.sh @@ -0,0 +1,151 @@ +#!/bin/bash + +set -euo pipefail + +# This script is called from local/nnet3/run_tdnn.sh and +# local/chain/run_tdnn.sh (and may eventually be called by more +# scripts). It contains the common feature preparation and +# iVector-related parts of the script. See those scripts for examples +# of usage. + +stage=0 +train_set=train_worn_u100k +test_sets="dev_worn dev_beamformit_ref" +gmm=tri3 +nj=96 + +nnet3_affix=_train_worn_u100k + +. ./cmd.sh +. ./path.sh +. utils/parse_options.sh + +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_ali_${train_set}_sp + +for f in ${gmm_dir}/final.mdl; do + if [ ! 
-f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + +if [ $stage -le 1 ]; then + # Although the nnet will be trained by high resolution data, we still have to + # perturb the normal data to get the alignment _sp stands for speed-perturbed + echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp + echo "$0: making MFCC features for low-resolution speed-perturbed data" + steps/make_mfcc.sh --cmd "$train_cmd" --nj 20 data/${train_set}_sp || exit 1; + steps/compute_cmvn_stats.sh data/${train_set}_sp || exit 1; + utils/fix_data_dir.sh data/${train_set}_sp +fi + +if [ $stage -le 2 ]; then + echo "$0: aligning with the perturbed low-resolution data" + steps/align_fmllr.sh --nj ${nj} --cmd "$train_cmd" \ + data/${train_set}_sp data/lang $gmm_dir $ali_dir || exit 1 +fi + +if [ $stage -le 3 ]; then + # Create high-resolution MFCC features (with 40 cepstra instead of 13). + # this shows how you can split across multiple file-systems. + echo "$0: creating high-resolution MFCC features" + mfccdir=data/${train_set}_sp_hires/data + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b1{5,6,8,9}/$USER/kaldi-data/mfcc/chime5-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp ${test_sets}; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires || exit 1; + + for datadir in ${train_set}_sp ${test_sets}; do + steps/make_mfcc.sh --nj 20 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires || exit 1; + utils/fix_data_dir.sh data/${datadir}_hires || exit 1; + done +fi + +if [ $stage -le 4 ]; then + echo "$0: computing a subset of data to train the diagonal UBM." + # We'll use about a quarter of the data. + mkdir -p exp/nnet3${nnet3_affix}/diag_ubm + temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm + + num_utts_total=$(wc -l &2 "$0" "$@" +if [ $# -ne 3 ] ; then + echo >&2 "$0" "$@" + echo >&2 "$0: Error: wrong number of arguments" + echo -e >&2 "Usage:\n $0 [opts] " + echo -e >&2 "eg:\n $0 /corpora/chime5/audio/train /corpora/chime5/transcriptions/train data/train" + exit 1 +fi + +set -e -o pipefail + +adir=$1 +jdir=$2 +dir=$3 + +json_count=$(find -L $jdir -name "*.json" | wc -l) +wav_count=$(find -L $adir -name "*.wav" | wc -l) + +if [ "$json_count" -eq 0 ]; then + echo >&2 "We expect that the directory $jdir will contain json files." + echo >&2 "That implies you have supplied a wrong path to the data." + exit 1 +fi +if [ "$wav_count" -eq 0 ]; then + echo >&2 "We expect that the directory $adir will contain wav files." + echo >&2 "That implies you have supplied a wrong path to the data." 
+ exit 1 +fi + +echo "$0: Converting transcription to text" + +mkdir -p $dir +for file in $jdir/*json; do + ./local/json2text.py --mictype $mictype $file +done | \ + sed -e "s/\[inaudible[- 0-9]*\]/[inaudible]/g" |\ + sed -e 's/ - / /g' |\ + sed -e 's/mm-/mm/g' > $dir/text.orig + +echo "$0: Creating datadir $dir for type=\"$mictype\"" + +if [ $mictype == "worn" ]; then + # convert the filenames to wav.scp format, use the basename of the file + # as a the wav.scp key, add .L and .R for left and right channel + # i.e. each file will have two entries (left and right channel) + find -L $adir -name "S[0-9]*_P[0-9]*.wav" | \ + perl -ne '{ + chomp; + $path = $_; + next unless $path; + @F = split "/", $path; + ($f = $F[@F-1]) =~ s/.wav//; + @F = split "_", $f; + print "${F[1]}_${F[0]}.L sox $path -t wav - remix 1 |\n"; + print "${F[1]}_${F[0]}.R sox $path -t wav - remix 2 |\n"; + }' | sort > $dir/wav.scp + + # generate the transcripts for both left and right channel + # from the original transcript in the form + # P09_S03-0006072-0006147 gimme the baker + # create left and right channel transcript + # P09_S03.L-0006072-0006147 gimme the baker + # P09_S03.R-0006072-0006147 gimme the baker + sed -n 's/ *$//; h; s/-/\.L-/p; g; s/-/\.R-/p' $dir/text.orig | sort > $dir/text +elif [ $mictype == "ref" ]; then + # fixed reference array + + # first get a text, which will be used to extract reference arrays + perl -ne 's/-/.ENH-/;print;' $dir/text.orig | sort > $dir/text + + find -L $adir | grep "\.wav" | sort > $dir/wav.flist + # following command provide the argument for grep to extract only reference arrays + grep `cut -f 1 -d"-" $dir/text | awk -F"_" '{print $2 "_" $3}' | sed -e "s/\.ENH//" | sort | uniq | sed -e "s/^/ -e /" | tr "\n" " "` $dir/wav.flist > $dir/wav.flist2 + paste -d" " \ + <(awk -F "/" '{print $NF}' $dir/wav.flist2 | sed -e "s/\.wav/.ENH/") \ + $dir/wav.flist2 | sort > $dir/wav.scp +else + # array mic case + # convert the filenames to wav.scp format, use the basename of the file + # as a the wav.scp key + find -L $adir -name "*.wav" -ipath "*${mictype}*" |\ + perl -ne '$p=$_;chomp $_;@F=split "/";$F[$#F]=~s/\.wav//;print "$F[$#F] $p";' |\ + sort -u > $dir/wav.scp + + # convert the transcripts from + # P09_S03-0006072-0006147 gimme the baker + # to the per-channel transcripts + # P09_S03_U01_NOLOCATION.CH1-0006072-0006147 gimme the baker + # P09_S03_U01_NOLOCATION.CH2-0006072-0006147 gimme the baker + # P09_S03_U01_NOLOCATION.CH3-0006072-0006147 gimme the baker + # P09_S03_U01_NOLOCATION.CH4-0006072-0006147 gimme the baker + perl -ne '$l=$_; + for($i=1; $i<=4; $i++) { + ($x=$l)=~ s/-/.CH\Q$i\E-/; + print $x;}' $dir/text.orig | sort > $dir/text + +fi +$cleanup && rm -f $dir/text.* $dir/wav.scp.* $dir/wav.flist + +# Prepare 'segments', 'utt2spk', 'spk2utt' +if [ $mictype == "worn" ]; then + cut -d" " -f 1 $dir/text | \ + awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |\ + sed -e "s/_[A-Z]*\././2" \ + > $dir/segments +elif [ $mictype == "ref" ]; then + cut -d" " -f 1 $dir/text | \ + awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |\ + sed -e "s/_[A-Z]*\././2" |\ + sed -e "s/ P.._/ /" > $dir/segments +else + cut -d" " -f 1 $dir/text | \ + awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |\ + sed -e "s/_[A-Z]*\././2" |\ + sed -e 's/ P.._/ /' > $dir/segments +fi +cut -f 1 -d ' ' $dir/segments | \ + perl -ne 'chomp;$utt=$_;s/_.*//;print "$utt $_\n";' > $dir/utt2spk + +utils/utt2spk_to_spk2utt.pl $dir/utt2spk > 
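The worn-microphone branch above is easiest to see on a single line. The snippet below is a self-contained replay of the sed command that turns one transcript line into a left-channel and a right-channel line (the matching wav.scp entries are the two `sox ... remix 1 |` / `remix 2 |` pipes built just before it); the utterance id and text are the same example used in the script's own comments.

```bash
echo "P09_S03-0006072-0006147 gimme the baker" |
  sed -n 's/ *$//; h; s/-/\.L-/p; g; s/-/\.R-/p'
# P09_S03.L-0006072-0006147 gimme the baker
# P09_S03.R-0006072-0006147 gimme the baker
```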
$dir/spk2utt + +# Check that data dirs are okay! +utils/validate_data_dir.sh --no-feats $dir || exit 1 diff --git a/egs/chime5/s5b/local/prepare_dict.sh b/egs/chime5/s5b/local/prepare_dict.sh new file mode 100755 index 00000000000..09083d0e795 --- /dev/null +++ b/egs/chime5/s5b/local/prepare_dict.sh @@ -0,0 +1,124 @@ +#!/bin/bash +# Copyright (c) 2018, Johns Hopkins University (Jan "Yenda" Trmal) +# License: Apache 2.0 + +# Begin configuration section. +# End configuration section +. ./utils/parse_options.sh + +. ./path.sh + +set -e -o pipefail +set -o nounset # Treat unset variables as an error + + +# The parts of the output of this that will be needed are +# [in data/local/dict/ ] +# lexicon.txt +# extra_questions.txt +# nonsilence_phones.txt +# optional_silence.txt +# silence_phones.txt + + +# check existing directories +[ $# != 0 ] && echo "Usage: $0" && exit 1; + +dir=data/local/dict + +mkdir -p $dir +echo "$0: Getting CMU dictionary" +if [ ! -f $dir/cmudict.done ]; then + [ -d $dir/cmudict ] && rm -rf $dir/cmudict + svn co https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict $dir/cmudict + touch $dir/cmudict.done +fi + +# silence phones, one per line. +for w in sil spn inaudible laughs noise; do + echo $w; +done > $dir/silence_phones.txt +echo sil > $dir/optional_silence.txt + +# For this setup we're discarding stress. +cat $dir/cmudict/cmudict-0.7b.symbols | \ + perl -ne 's:[0-9]::g; s:\r::; print lc($_)' | \ + sort -u > $dir/nonsilence_phones.txt + +# An extra question will be added by including the silence phones in one class. +paste -d ' ' -s $dir/silence_phones.txt > $dir/extra_questions.txt + +grep -v ';;;' $dir/cmudict/cmudict-0.7b |\ + uconv -f latin1 -t utf-8 -x Any-Lower |\ + perl -ne 's:(\S+)\(\d+\) :$1 :; s: : :; print;' |\ + perl -ne '@F = split " ",$_,2; $F[1] =~ s/[0-9]//g; print "$F[0] $F[1]";' \ + > $dir/lexicon1_raw_nosil.txt || exit 1; + +# Add prons for laughter, noise, oov +for w in `grep -v sil $dir/silence_phones.txt`; do + echo "[$w] $w" +done | cat - $dir/lexicon1_raw_nosil.txt > $dir/lexicon2_raw.txt || exit 1; + +# we keep all words from the cmudict in the lexicon +# might reduce OOV rate on dev and eval +cat $dir/lexicon2_raw.txt \ + <( echo "mm m" + echo " spn" + echo "cuz k aa z" + echo "cuz k ah z" + echo "cuz k ao z" + echo "mmm m"; \ + echo "hmm hh m"; \ + ) | sort -u | sed 's/[\t ]/\t/' > $dir/iv_lexicon.txt + + +cat data/train*/text | \ + awk '{for (n=2;n<=NF;n++){ count[$n]++; } } END { for(n in count) { print count[n], n; }}' | \ + sort -nr > $dir/word_counts + +cat $dir/word_counts | awk '{print $2}' > $dir/word_list + +awk '{print $1}' $dir/iv_lexicon.txt | \ + perl -e '($word_counts)=@ARGV; + open(W, "<$word_counts")||die "opening word-counts $word_counts"; + while() { chop; $seen{$_}=1; } + while() { + ($c,$w) = split; + if (!defined $seen{$w}) { print; } + } ' $dir/word_counts > $dir/oov_counts.txt + +echo "*Highest-count OOVs (including fragments) are:" +head -n 10 $dir/oov_counts.txt +echo "*Highest-count OOVs (excluding fragments) are:" +grep -v -E '^-|-$' $dir/oov_counts.txt | head -n 10 || true + +echo "*Training a G2P and generating missing pronunciations" +mkdir -p $dir/g2p/ +phonetisaurus-align --input=$dir/iv_lexicon.txt --ofile=$dir/g2p/aligned_lexicon.corpus +ngram-count -order 4 -kn-modify-counts-at-end -ukndiscount\ + -gt1min 0 -gt2min 0 -gt3min 0 -gt4min 0 \ + -text $dir/g2p/aligned_lexicon.corpus -lm $dir/g2p/aligned_lexicon.arpa +phonetisaurus-arpa2wfst --lm=$dir/g2p/aligned_lexicon.arpa --ofile=$dir/g2p/g2p.fst +awk '{print 
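To make the dictionary clean-up in local/prepare_dict.sh concrete, here is a stand-alone replay of the stress-stripping pipeline on one invented cmudict-style entry. Plain perl lowercasing stands in for the uconv call used in the script, and the entry itself is illustrative rather than copied from cmudict.

```bash
echo "ABANDON(2)  AH0 B AE1 N D AH0 N" |
  perl -ne 'print lc($_)' |
  perl -ne 's:(\S+)\(\d+\) :$1 :; s:  : :; print;' |
  perl -ne '@F = split " ",$_,2; $F[1] =~ s/[0-9]//g; print "$F[0] $F[1]";'
# abandon ah b ae n d ah n
```

The alternate-pronunciation marker "(2)" and the stress digits disappear, which is why the nonsilence phone set can be built from the stress-free symbols.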
$2}' $dir/oov_counts.txt > $dir/oov_words.txt +phonetisaurus-apply --nbest 2 --model $dir/g2p/g2p.fst --thresh 5 --accumulate \ + --word_list $dir/oov_words.txt > $dir/oov_lexicon.txt + +## The next section is again just for debug purposes +## to show words for which the G2P failed +cat $dir/oov_lexicon.txt $dir/iv_lexicon.txt | sort -u > $dir/lexicon.txt +rm -f $dir/lexiconp.txt 2>/dev/null; # can confuse later script if this exists. +awk '{print $1}' $dir/lexicon.txt | \ + perl -e '($word_counts)=@ARGV; + open(W, "<$word_counts")||die "opening word-counts $word_counts"; + while() { chop; $seen{$_}=1; } + while() { + ($c,$w) = split; + if (!defined $seen{$w}) { print; } + } ' $dir/word_counts > $dir/oov_counts.g2p.txt + +echo "*Highest-count OOVs (including fragments) after G2P are:" +head -n 10 $dir/oov_counts.g2p.txt + +utils/validate_dict_dir.pl $dir +exit 0; + diff --git a/egs/chime5/s5b/local/reverberate_lat_dir.sh b/egs/chime5/s5b/local/reverberate_lat_dir.sh new file mode 100755 index 00000000000..f601a37c0e1 --- /dev/null +++ b/egs/chime5/s5b/local/reverberate_lat_dir.sh @@ -0,0 +1,93 @@ +#!/bin/bash + +# Copyright 2018 Vimal Manohar +# Apache 2.0 + +num_data_reps=1 +cmd=run.pl +nj=20 +include_clean=false + +. utils/parse_options.sh +. ./path.sh + +if [ $# -ne 4 ]; then + echo "Usage: $0 " + exit 1 +fi + +train_data_dir=$1 +noisy_latdir=$2 +clean_latdir=$3 +dir=$4 + +clean_nj=$(cat $clean_latdir/num_jobs) + +$cmd JOB=1:$clean_nj $dir/copy_clean_lattices.JOB.log \ + lattice-copy "ark:gunzip -c $clean_latdir/lat.JOB.gz |" \ + ark,scp:$dir/lats_clean.JOB.ark,$dir/lats_clean.JOB.scp || exit 1 + +for n in $(seq $clean_nj); do + cat $dir/lats_clean.$n.scp +done > $dir/lats_clean.scp + +for i in $(seq $num_data_reps); do + cat $dir/lats_clean.scp | awk -vi=$i '{print "rev"i"_"$0}' +done > $dir/lats_rvb.scp + +noisy_nj=$(cat $noisy_latdir/num_jobs) +$cmd JOB=1:$noisy_nj $dir/copy_noisy_lattices.JOB>log \ + lattice-copy "ark:gunzip -c $noisy_latdir/lat.JOB.gz |" \ + ark,scp:$dir/lats_noisy.JOB.ark,$dir/lats_noisy.JOB.scp || exit 1 + +optional_clean= +if $include_clean; then + optional_clean=$dir/lats_clean.scp +fi + +for n in $(seq $noisy_nj); do + cat $dir/lats_noisy.$n.scp +done | cat - $dir/lats_rvb.scp ${optional_clean} | sort -k1,1 > $dir/lats.scp + +utils/split_data.sh $train_data_dir $nj +$cmd JOB=1:$nj $dir/copy_lattices.JOB.log \ + lattice-copy "scp:utils/filter_scp.pl $train_data_dir/split$nj/JOB/utt2spk $dir/lats.scp |" \ + "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1 + +echo $nj > $dir/num_jobs + +if [ -f $clean_latdir/ali.1.gz ]; then + $cmd JOB=1:$clean_nj $dir/copy_clean_alignments.JOB.log \ + copy-int-vector "ark:gunzip -c $clean_latdir/ali.JOB.gz |" \ + ark,scp:$dir/ali_clean.JOB.ark,$dir/ali_clean.JOB.scp + + for n in $(seq $clean_nj); do + cat $dir/ali_clean.$n.scp + done > $dir/ali_clean.scp + + for i in $(seq $num_data_reps); do + cat $dir/ali_clean.scp | awk -vi=$i '{print "rev"i"_"$0}' + done > $dir/ali_rvb.scp + + optional_clean= + if $include_clean; then + optional_clean=$dir/ali_clean.scp + fi + + $cmd JOB=1:$noisy_nj $dir/copy_noisy_alignments.JOB.log \ + copy-int-vector "ark:gunzip -c $noisy_latdir/ali.JOB.gz |" \ + ark,scp:$dir/ali_noisy.JOB.ark,$dir/ali_noisy.JOB.scp + + for n in $(seq $noisy_nj); do + cat $dir/ali_noisy.$n.scp + done | cat - $dir/ali_rvb.scp $optional_clean | sort -k1,1 > $dir/ali.scp + + utils/split_data.sh $train_data_dir $nj || exit 1 + $cmd JOB=1:$nj $dir/copy_rvb_alignments.JOB.log \ + copy-int-vector "scp:utils/filter_scp.pl 
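The re-keying in local/reverberate_lat_dir.sh is what lets lattices computed on the clean data supervise the reverberated copies, whose utterance ids carry a rev<N>_ prefix: every clean lattice entry is duplicated once per replication with the matching prefix. A one-line check (the utterance id and ark offset below are invented):

```bash
echo "P09_S03.L-0006072-0006147 exp/tri3_lats/lats_clean.1.ark:12345" |
  awk -vi=1 '{print "rev"i"_"$0}'
# rev1_P09_S03.L-0006072-0006147 exp/tri3_lats/lats_clean.1.ark:12345
```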
$train_data_dir/split$nj/JOB/utt2spk $dir/ali.scp |" \ + "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1 +fi + +cp $clean_latdir/{final.*,tree,*.mat,*opts,*.txt} $dir || true + +rm $dir/lats_{clean,noisy}.*.{ark,scp} $dir/ali_{clean,noisy}.*.{ark,scp} || true # save space diff --git a/egs/chime5/s5b/local/run_beamformit.sh b/egs/chime5/s5b/local/run_beamformit.sh new file mode 100755 index 00000000000..aa3badd90d8 --- /dev/null +++ b/egs/chime5/s5b/local/run_beamformit.sh @@ -0,0 +1,87 @@ +#!/bin/bash + +# Copyright 2015, Mitsubishi Electric Research Laboratories, MERL (Author: Shinji Watanabe) + +. ./cmd.sh +. ./path.sh + +# Config: +cmd=run.pl +bmf="1 2 3 4" + +. utils/parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Wrong #arguments ($#, expected 3)" + echo "Usage: local/run_beamformit.sh [options] " + echo "main options (for others, see top of script file)" + echo " --cmd # Command to run in parallel with" + echo " --bmf \"1 2 3 4\" # microphones used for beamforming" + exit 1; +fi + +sdir=$1 +odir=$2 +array=$3 +expdir=exp/enhan/`echo $odir | awk -F '/' '{print $NF}'`_`echo $bmf | tr ' ' '_'` + +if ! command -v BeamformIt &>/dev/null ; then + echo "Missing BeamformIt, run 'cd $KALDI_ROOT/tools/; ./extras/install_beamformit.sh; cd -;'" && exit 1 +fi + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +mkdir -p $odir +mkdir -p $expdir/log + +echo "Will use the following channels: $bmf" +# number of channels +numch=`echo $bmf | tr ' ' '\n' | wc -l` +echo "the number of channels: $numch" + +# wavfiles.list can be used as the name of the output files +output_wavfiles=$expdir/wavfiles.list +find -L ${sdir} | grep -i ${array} | awk -F "/" '{print $NF}' | sed -e "s/\.CH.\.wav//" | sort | uniq > $expdir/wavfiles.list + +# this is an input file list of the microphones +# format: 1st_wav 2nd_wav ... nth_wav +input_arrays=$expdir/channels_$numch +for x in `cat $output_wavfiles`; do + echo -n "$x" + for ch in $bmf; do + echo -n " $x.CH$ch.wav" + done + echo "" +done > $input_arrays + +# split the list for parallel processing +# number of jobs are set by the number of WAV files +nj=`wc -l $expdir/wavfiles.list | awk '{print $1}'` +split_wavfiles="" +for n in `seq $nj`; do + split_wavfiles="$split_wavfiles $output_wavfiles.$n" +done +utils/split_scp.pl $output_wavfiles $split_wavfiles || exit 1; + +echo -e "Beamforming\n" +# making a shell script for each job +for n in `seq $nj`; do +cat << EOF > $expdir/log/beamform.$n.sh +while read line; do + $BEAMFORMIT/BeamformIt -s \$line -c $input_arrays \ + --config_file `pwd`/conf/beamformit.cfg \ + --source_dir $sdir \ + --result_dir $odir +done < $output_wavfiles.$n +EOF +done + +chmod a+x $expdir/log/beamform.*.sh +$cmd JOB=1:$nj $expdir/log/beamform.JOB.log \ + $expdir/log/beamform.JOB.sh + +echo "`basename $0` Done." diff --git a/egs/chime5/s5b/local/run_recog.sh b/egs/chime5/s5b/local/run_recog.sh new file mode 100755 index 00000000000..5c74c9ff242 --- /dev/null +++ b/egs/chime5/s5b/local/run_recog.sh @@ -0,0 +1,164 @@ +#!/bin/bash +# +# Based mostly on the TED-LIUM and Switchboard recipe +# +# Copyright 2017 Johns Hopkins University (Author: Shinji Watanabe and Yenda Trmal) +# Apache 2.0 +# +# This is a subset of run.sh to only perform recognition experiments with evaluation data + +# Begin configuration section. 
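The per-array channel list that local/run_beamformit.sh hands to BeamformIt is simply "base name followed by its CH1..CH4 files". The loop below reproduces that construction for one invented CHiME-5 style base name, which is a handy check when changing the --bmf channel selection.

```bash
bmf="1 2 3 4"
echo "S02_U02" | while read x; do
  echo -n "$x"
  for ch in $bmf; do echo -n " $x.CH$ch.wav"; done
  echo ""
done
# S02_U02 S02_U02.CH1.wav S02_U02.CH2.wav S02_U02.CH3.wav S02_U02.CH4.wav
```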
+decode_nj=20 +stage=0 +enhancement=beamformit # for a new enhancement method, + # change this variable and stage 4 +# End configuration section +. ./utils/parse_options.sh + +. ./cmd.sh +. ./path.sh + + +set -e # exit on error + +# chime5 main directory path +# please change the path accordingly +chime5_corpus=/export/corpora4/CHiME5 +json_dir=${chime5_corpus}/transcriptions +audio_dir=${chime5_corpus}/audio + +# training and test data +train_set=train_worn_u100k +test_sets="eval_${enhancement}_ref" + +# This script also needs the phonetisaurus g2p, srilm, beamformit +./local/check_tools.sh || exit 1 + +if [ $stage -le 4 ]; then + # Beamforming using reference arrays + # enhanced WAV directory + enhandir=enhan + for dset in eval; do + for mictype in u01 u02 u03 u04 u05 u06; do + local/run_beamformit.sh --cmd "$train_cmd" \ + ${audio_dir}/${dset} \ + ${enhandir}/${dset}_${enhancement}_${mictype} \ + ${mictype} + done + done + + for dset in eval; do + local/prepare_data.sh --mictype ref "$PWD/${enhandir}/${dset}_${enhancement}_u0*" \ + ${json_dir}/${dset} data/${dset}_${enhancement}_ref + done +fi + +if [ $stage -le 6 ]; then + # fix speaker ID issue (thanks to Dr. Naoyuki Kanda) + # add array ID to the speaker ID to avoid the use of other array information to meet regulations + # Before this fix + # $ head -n 2 data/eval_beamformit_ref_nosplit/utt2spk + # P01_S01_U02_KITCHEN.ENH-0000192-0001278 P01 + # P01_S01_U02_KITCHEN.ENH-0001421-0001481 P01 + # After this fix + # $ head -n 2 data/eval_beamformit_ref_nosplit_fix/utt2spk + # P01_S01_U02_KITCHEN.ENH-0000192-0001278 P01_U02 + # P01_S01_U02_KITCHEN.ENH-0001421-0001481 P01_U02 + for dset in ${test_sets}; do + utils/copy_data_dir.sh data/${dset} data/${dset}_nosplit + mkdir -p data/${dset}_nosplit_fix + cp data/${dset}_nosplit/{segments,text,wav.scp} data/${dset}_nosplit_fix/ + awk -F "_" '{print $0 "_" $3}' data/${dset}_nosplit/utt2spk > data/${dset}_nosplit_fix/utt2spk + utils/utt2spk_to_spk2utt.pl data/${dset}_nosplit_fix/utt2spk > data/${dset}_nosplit_fix/spk2utt + done + + # Split speakers up into 3-minute chunks. This doesn't hurt adaptation, and + # lets us use more jobs for decoding etc. + for dset in ${test_sets}; do + utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 data/${dset}_nosplit_fix data/${dset} + done +fi + +if [ $stage -le 7 ]; then + # Now make MFCC features. + # mfccdir should be some place with a largish disk where you + # want to store MFCC features. + mfccdir=mfcc + for x in ${test_sets}; do + steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" \ + data/$x exp/make_mfcc/$x $mfccdir + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir + utils/fix_data_dir.sh data/$x + done +fi + +if [ $stage -le 17 ]; then + nnet3_affix=_${train_set}_cleaned + for datadir in ${test_sets}; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + done + for datadir in ${test_sets}; do + steps/make_mfcc.sh --nj 20 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires || exit 1; + utils/fix_data_dir.sh data/${datadir}_hires || exit 1; + done + for data in $test_sets; do + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 20 \ + data/${data}_hires exp/nnet3${nnet3_affix}/extractor \ + exp/nnet3${nnet3_affix}/ivectors_${data}_hires + done +fi + +if [ $stage -le 18 ]; then + # First the options that are passed through to run_ivector_common.sh + # (some of which are also used in this script directly). 
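Before applying the speaker-id fix in stage 6 to a whole data directory, it can be reassuring to replay the awk expression on a single utt2spk line; the line below is the same example quoted in the script's comment.

```bash
echo "P01_S01_U02_KITCHEN.ENH-0000192-0001278 P01" | awk -F "_" '{print $0 "_" $3}'
# P01_S01_U02_KITCHEN.ENH-0000192-0001278 P01_U02
```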
+ lm_suffix= + + # The rest are configs specific to this script. Most of the parameters + # are just hardcoded at this level, in the commands below. + affix=1a # affix for the TDNN directory name + tree_affix= + tree_dir=exp/chain${nnet3_affix}/tree_sp${tree_affix:+_$tree_affix} + dir=exp/chain${nnet3_affix}/tdnn${affix}_sp + + # training options + # training chunk-options + chunk_width=140,100,160 + # we don't need extra left/right context for TDNN systems. + chunk_left_context=0 + chunk_right_context=0 + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang${lm_suffix}/ \ + $tree_dir $tree_dir/graph${lm_suffix} || exit 1; + + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + steps/nnet3/decode.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj 8 --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \ + $tree_dir/graph${lm_suffix} data/${data}_hires ${dir}/decode${lm_suffix}_${data} || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + +if [ $stage -le 20 ]; then + # final scoring to get the official challenge result + # please specify both dev and eval set directories so that the search parameters + # (insertion penalty and language model weight) will be tuned using the dev set + local/score_for_submit.sh \ + --dev exp/chain_${train_set}_cleaned/tdnn1a_sp/decode_dev_${enhancement}_ref \ + --eval exp/chain_${train_set}_cleaned/tdnn1a_sp/decode_eval_${enhancement}_ref +fi diff --git a/egs/chime5/s5b/local/run_wpe.py b/egs/chime5/s5b/local/run_wpe.py new file mode 100755 index 00000000000..2f3818f9c42 --- /dev/null +++ b/egs/chime5/s5b/local/run_wpe.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python +# Copyright 2018 Johns Hopkins University (Author: Aswin Shanmugam Subramanian) +# Apache 2.0 +# Works with both python2 and python3 +# This script assumes that WPE (nara_wpe) is installed locally using miniconda. +# ../../../tools/extras/install_miniconda.sh and ../../../tools/extras/install_wpe.sh +# needs to be run and this script needs to be launched run with that version of +# python. +# See local/run_wpe.sh for example. 
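This wrapper (local/run_wpe.py) takes a single flat --files list and splits it in half: the first half are input wav paths, the second half the corresponding outputs, and all inputs passed in one call are stacked and dereverberated jointly as a multi-channel signal. A hypothetical single-channel call is shown below; the paths are placeholders, and local/run_wpe.sh builds these input/output pairs automatically.

```bash
$HOME/miniconda3/bin/python local/run_wpe.py --files \
  /export/corpora4/CHiME5/audio/dev/S02_U02.CH1.wav \
  wav/wpe/dev/S02_U02.CH1.wav
```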
+ +import numpy as np +import soundfile as sf +import time +import os, errno +from tqdm import tqdm +import argparse + +from nara_wpe.wpe import wpe +from nara_wpe.utils import stft, istft +from nara_wpe import project_root + +parser = argparse.ArgumentParser() +parser.add_argument('--files', '-f', nargs='+') +args = parser.parse_args() + +input_files = args.files[:len(args.files)//2] +output_files = args.files[len(args.files)//2:] +out_dir = os.path.dirname(output_files[0]) +try: + os.makedirs(out_dir) +except OSError as e: + if e.errno != errno.EEXIST: + raise + +stft_options = dict( + size=512, + shift=128, + window_length=None, + fading=True, + pad=True, + symmetric_window=False +) + +sampling_rate = 16000 +delay = 3 +iterations = 5 +taps = 10 + +signal_list = [ + sf.read(f)[0] + for f in input_files +] +y = np.stack(signal_list, axis=0) +Y = stft(y, **stft_options).transpose(2, 0, 1) +Z = wpe(Y, iterations=iterations, statistics_mode='full').transpose(1, 2, 0) +z = istft(Z, size=stft_options['size'], shift=stft_options['shift']) + +for d in range(len(signal_list)): + sf.write(output_files[d], z[d,:], sampling_rate) diff --git a/egs/chime5/s5b/local/run_wpe.sh b/egs/chime5/s5b/local/run_wpe.sh new file mode 100755 index 00000000000..1c4b1c80291 --- /dev/null +++ b/egs/chime5/s5b/local/run_wpe.sh @@ -0,0 +1,85 @@ +#!/bin/bash +# Copyright 2018 Johns Hopkins University (Author: Aswin Shanmugam Subramanian) +# Apache 2.0 + +. ./cmd.sh +. ./path.sh + +# Config: +nj=4 +cmd=run.pl + +. utils/parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Wrong #arguments ($#, expected 3)" + echo "Usage: local/run_wpe.sh [options] " + echo "main options (for others, see top of script file)" + echo " --cmd # Command to run in parallel with" + echo " --nj 50 # number of jobs for parallel processing" + exit 1; +fi + +sdir=$1 +odir=$2 +array=$3 +task=`basename $sdir` +expdir=exp/wpe/${task}_${array} +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +miniconda_dir=$HOME/miniconda3/ +if [ ! -d $miniconda_dir ]; then + echo "$miniconda_dir does not exist. Please run '../../../tools/extras/install_miniconda.sh' and '../../../tools/extras/install_wpe.sh';" +fi + +# check if WPE is installed +result=`$miniconda_dir/bin/python -c "\ +try: + import nara_wpe + print('1') +except ImportError: + print('0')"` + +if [ "$result" == "1" ]; then + echo "WPE is installed" +else + echo "WPE is not installed. 
Please run ../../../tools/extras/install_wpe.sh" + exit 1 +fi + +mkdir -p $odir +mkdir -p $expdir/log + +# wavfiles.list can be used as the name of the output files +output_wavfiles=$expdir/wavfiles.list +find -L ${sdir} | grep -i ${array} > $expdir/channels_input +cat $expdir/channels_input | awk -F '/' '{print $NF}' | sed "s@S@$odir\/S@g" > $expdir/channels_output +paste -d" " $expdir/channels_input $expdir/channels_output > $output_wavfiles + +# split the list for parallel processing +split_wavfiles="" +for n in `seq $nj`; do + split_wavfiles="$split_wavfiles $output_wavfiles.$n" +done +utils/split_scp.pl $output_wavfiles $split_wavfiles || exit 1; + +echo -e "Dereverberation - $task - $array\n" +# making a shell script for each job +for n in `seq $nj`; do +cat <<-EOF > $expdir/log/wpe.$n.sh +while read line; do + $miniconda_dir/bin/python local/run_wpe.py \ + --file \$line +done < $output_wavfiles.$n +EOF +done + +chmod a+x $expdir/log/wpe.*.sh +$cmd JOB=1:$nj $expdir/log/wpe.JOB.log \ + $expdir/log/wpe.JOB.sh + +echo "`basename $0` Done." diff --git a/egs/chime5/s5b/local/score.sh b/egs/chime5/s5b/local/score.sh new file mode 120000 index 00000000000..6a200b42ed3 --- /dev/null +++ b/egs/chime5/s5b/local/score.sh @@ -0,0 +1 @@ +../steps/scoring/score_kaldi_wer.sh \ No newline at end of file diff --git a/egs/chime5/s5b/local/score_for_submit.sh b/egs/chime5/s5b/local/score_for_submit.sh new file mode 100755 index 00000000000..23121d68b93 --- /dev/null +++ b/egs/chime5/s5b/local/score_for_submit.sh @@ -0,0 +1,119 @@ +#!/bin/bash +# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey, Yenda Trmal) +# Apache 2.0 +# +# This script provides official CHiME-5 challenge submission scores per room and session. +# It first calculates the best search parameter configurations by using the dev set +# and also create the transcriptions for dev and eval sets to be submitted. +# The default setup does not calculate scores of the evaluation set since +# the evaluation transcription is not distributed (July 9 2018) + +cmd=run.pl +dev=exp/chain_train_worn_u100k_cleaned/tdnn1a_sp/decode_dev_beamformit_ref +eval=exp/chain_train_worn_u100k_cleaned/tdnn1a_sp/decode_eval_beamformit_ref +do_eval=false + +echo "$0 $@" # Print the command line for logging +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -ne 0 ]; then + echo "Usage: $0 [--cmd (run.pl|queue.pl...)]" + echo "This script provides official CHiME-5 challenge submission scores" + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." 
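The input/output lists driving the per-job WPE scripts are built with a small awk/sed re-rooting step. Replayed on one path it looks like the snippet below; the corpus path is the recipe default, the output root mimics the wav/wpe directory used by run.sh, and the sed relies on CHiME-5 basenames starting with the session id S...

```bash
odir=wav/wpe/dev
echo "/export/corpora4/CHiME5/audio/dev/S02_U02.CH1.wav" |
  awk -F '/' '{print $NF}' | sed "s@S@$odir\/S@g"
# wav/wpe/dev/S02_U02.CH1.wav
```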
+ echo " --dev # dev set decoding directory" + echo " --eval # eval set decoding directory" + exit 1; +fi + +# get language model weight and word insertion penalty from the dev set +best_lmwt=`cat $dev/scoring_kaldi/wer_details/lmwt` +best_wip=`cat $dev/scoring_kaldi/wer_details/wip` + +echo "best LM weight: $best_lmwt" +echo "insertion penalty weight: $best_wip" + +echo "==== development set ====" +# development set +# get the scoring result per utterance +score_result=$dev/scoring_kaldi/wer_details/per_utt +for session in S02 S09; do + for room in DINING KITCHEN LIVING; do + # get nerror + nerr=`grep "\#csid" $score_result | grep $room | grep $session | awk '{sum+=$4+$5+$6} END {print sum}'` + # get nwords from references (NF-2 means to exclude utterance id and " ref ") + nwrd=`grep "\#csid" $score_result | grep $room | grep $session | awk '{sum+=$3+$4+$6} END {print sum}'` + # compute wer with scale=2 + wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` + + # report the results + echo -n "session $session " + echo -n "room $room: " + echo -n "#words $nwrd, " + echo -n "#errors $nerr, " + echo "wer $wer %" + done +done +echo -n "overall: " +# get nerror +nerr=`grep "\#csid" $score_result | awk '{sum+=$4+$5+$6} END {print sum}'` +# get nwords from references (NF-2 means to exclude utterance id and " ref ") +nwrd=`grep "\#csid" $score_result | awk '{sum+=$3+$4+$6} END {print sum}'` +# compute wer with scale=2 +wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` +echo -n "#words $nwrd, " +echo -n "#errors $nerr, " +echo "wer $wer %" + +echo "==== evaluation set ====" +# evaluation set +# get the scoring result per utterance. Copied from local/score.sh +mkdir -p $eval/scoring_kaldi/wer_details_devbest +$cmd $eval/scoring_kaldi/log/stats1.log \ + cat $eval/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \| \ + align-text --special-symbol="'***'" ark:$eval/scoring_kaldi/test_filt.txt ark:- ark,t:- \| \ + utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \> $eval/scoring_kaldi/wer_details_devbest/per_utt +score_result=$eval/scoring_kaldi/wer_details_devbest/per_utt +for session in S01 S21; do + for room in DINING KITCHEN LIVING; do + if $do_eval; then + # get nerror + nerr=`grep "\#csid" $score_result | grep $room | grep $session | awk '{sum+=$4+$5+$6} END {print sum}'` + # get nwords from references (NF-2 means to exclude utterance id and " ref ") + nwrd=`grep "\#csid" $score_result | grep $room | grep $session | awk '{sum+=$3+$4+$6} END {print sum}'` + # compute wer with scale=2 + wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` + + # report the results + echo -n "session $session " + echo -n "room $room: " + echo -n "#words $nwrd, " + echo -n "#errors $nerr, " + echo "wer $wer %" + fi + done +done +if $do_eval; then + # get nerror + nerr=`grep "\#csid" $score_result | awk '{sum+=$4+$5+$6} END {print sum}'` + # get nwords from references (NF-2 means to exclude utterance id and " ref ") + nwrd=`grep "\#csid" $score_result | awk '{sum+=$3+$4+$6} END {print sum}'` + # compute wer with scale=2 + wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` + echo -n "overall: " + echo -n "#words $nwrd, " + echo -n "#errors $nerr, " + echo "wer $wer %" +else + echo "skip evaluation scoring" + echo "" + echo "==== when you submit your result to the CHiME-5 challenge ====" + echo "Please rename your recognition results of " + echo "$dev/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt" + echo "$eval/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt" + echo "with {dev,eval}__.txt, e.g., dev_watanabe_jhu.txt and 
eval_watanabe_jhu.txt, " + echo "and submit both of them as your final challenge result" + echo "==================================================================" +fi + diff --git a/egs/chime5/s5b/local/train_lms_srilm.sh b/egs/chime5/s5b/local/train_lms_srilm.sh new file mode 100755 index 00000000000..5a1d56d24b3 --- /dev/null +++ b/egs/chime5/s5b/local/train_lms_srilm.sh @@ -0,0 +1,261 @@ +#!/bin/bash +# Copyright (c) 2017 Johns Hopkins University (Author: Yenda Trmal, Shinji Watanabe) +# Apache 2.0 + +export LC_ALL=C + +# Begin configuration section. +words_file= +train_text= +dev_text= +oov_symbol="" +# End configuration section + +echo "$0 $@" + +[ -f path.sh ] && . ./path.sh +. ./utils/parse_options.sh || exit 1 + +echo "-------------------------------------" +echo "Building an SRILM language model " +echo "-------------------------------------" + +if [ $# -ne 2 ] ; then + echo "Incorrect number of parameters. " + echo "Script has to be called like this:" + echo " $0 [switches] " + echo "For example: " + echo " $0 data data/srilm" + echo "The allowed switches are: " + echo " words_file= word list file -- data/lang/words.txt by default" + echo " train_text= data/train/text is used in case when not specified" + echo " dev_text= last 10 % of the train text is used by default" + echo " oov_symbol=> symbol to use for oov modeling -- by default" + exit 1 +fi + +datadir=$1 +tgtdir=$2 + +##End of configuration +loc=`which ngram-count`; +if [ -z $loc ]; then + echo >&2 "You appear to not have SRILM tools installed, either on your path," + echo >&2 "Use the script \$KALDI_ROOT/tools/install_srilm.sh to install it." + exit 1 +fi + +# Prepare the destination directory +mkdir -p $tgtdir + +for f in $words_file $train_text $dev_text; do + [ ! -s $f ] && echo "No such file $f" && exit 1; +done + +[ -z $words_file ] && words_file=$datadir/lang/words.txt +if [ ! -z "$train_text" ] && [ -z "$dev_text" ] ; then + nr=`cat $train_text | wc -l` + nr_dev=$(($nr / 10 )) + nr_train=$(( $nr - $nr_dev )) + orig_train_text=$train_text + head -n $nr_train $train_text > $tgtdir/train_text + tail -n $nr_dev $train_text > $tgtdir/dev_text + + train_text=$tgtdir/train_text + dev_text=$tgtdir/dev_text + echo "Using words file: $words_file" + echo "Using train text: 9/10 of $orig_train_text" + echo "Using dev text : 1/10 of $orig_train_text" +elif [ ! -z "$train_text" ] && [ ! -z "$dev_text" ] ; then + echo "Using words file: $words_file" + echo "Using train text: $train_text" + echo "Using dev text : $dev_text" + train_text=$train_text + dev_text=$dev_text +else + train_text=$datadir/train/text + dev_text=$datadir/dev2h/text + echo "Using words file: $words_file" + echo "Using train text: $train_text" + echo "Using dev text : $dev_text" + +fi + +[ ! -f $words_file ] && echo >&2 "File $words_file must exist!" && exit 1 +[ ! -f $train_text ] && echo >&2 "File $train_text must exist!" && exit 1 +[ ! -f $dev_text ] && echo >&2 "File $dev_text must exist!" 
&& exit 1 + + +# Extract the word list from the training dictionary; exclude special symbols +sort $words_file | awk '{print $1}' | grep -v '\#0' | grep -v '' | grep -v -F "$oov_symbol" > $tgtdir/vocab +if (($?)); then + echo "Failed to create vocab from $words_file" + exit 1 +else + # wc vocab # doesn't work due to some encoding issues + echo vocab contains `cat $tgtdir/vocab | perl -ne 'BEGIN{$l=$w=0;}{split; $w+=$#_; $w++; $l++;}END{print "$l lines, $w words\n";}'` +fi + +# Kaldi transcript files contain Utterance_ID as the first word; remove it +# We also have to avoid skewing the LM by incorporating the same sentences +# from different channels +sed -e "s/\.CH.//" -e "s/_.\-./_/" -e "s/NOLOCATION\(\.[LR]\)*-//" -e "s/U[0-9][0-9]_//" $train_text | sort -u | \ + perl -ane 'print join(" ", @F[1..$#F]) . "\n" if @F > 1' > $tgtdir/train.txt +if (($?)); then + echo "Failed to create $tgtdir/train.txt from $train_text" + exit 1 +else + echo "Removed first word (uid) from every line of $train_text" + # wc text.train train.txt # doesn't work due to some encoding issues + echo $train_text contains `cat $train_text | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $w--; $s++;}END{print "$w words, $s sentences\n";}'` + echo train.txt contains `cat $tgtdir/train.txt | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $s++;}END{print "$w words, $s sentences\n";}'` +fi + +# Kaldi transcript files contain Utterance_ID as the first word; remove it +sed -e "s/\.CH.//" -e "s/_.\-./_/" $dev_text | sort -u | \ + perl -ane 'print join(" ", @F[1..$#F]) . "\n" if @F > 1' > $tgtdir/dev.txt +if (($?)); then + echo "Failed to create $tgtdir/dev.txt from $dev_text" + exit 1 +else + echo "Removed first word (uid) from every line of $dev_text" + # wc text.train train.txt # doesn't work due to some encoding issues + echo $dev_text contains `cat $dev_text | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $w--; $s++;}END{print "$w words, $s sentences\n";}'` + echo $tgtdir/dev.txt contains `cat $tgtdir/dev.txt | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $s++;}END{print "$w words, $s sentences\n";}'` +fi + + +echo "-------------------" +echo "Good-Turing 3grams" +echo "-------------------" +ngram-count -lm $tgtdir/3gram.gt011.gz -gt1min 0 -gt2min 1 -gt3min 1 -order 3 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.gt012.gz -gt1min 0 -gt2min 1 -gt3min 2 -order 3 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.gt022.gz -gt1min 0 -gt2min 2 -gt3min 2 -order 3 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.gt023.gz -gt1min 0 -gt2min 2 -gt3min 3 -order 3 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +echo "-------------------" +echo "Kneser-Ney 3grams" +echo "-------------------" +ngram-count -lm $tgtdir/3gram.kn011.gz -kndiscount1 -gt1min 0 \ + -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn012.gz -kndiscount1 -gt1min 0 \ + -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn022.gz -kndiscount1 -gt1min 0 \ + -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm 
$tgtdir/3gram.kn023.gz -kndiscount1 -gt1min 0 \ + -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 3 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn111.gz -kndiscount1 -gt1min 1 \ + -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn112.gz -kndiscount1 -gt1min 1 \ + -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn122.gz -kndiscount1 -gt1min 1 \ + -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn123.gz -kndiscount1 -gt1min 1 \ + -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 3 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + + +echo "-------------------" +echo "Good-Turing 4grams" +echo "-------------------" +ngram-count -lm $tgtdir/4gram.gt0111.gz \ + -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 1 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0112.gz \ + -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 2 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0122.gz \ + -gt1min 0 -gt2min 1 -gt3min 2 -gt4min 2 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0123.gz \ + -gt1min 0 -gt2min 1 -gt3min 2 -gt4min 3 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0113.gz \ + -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 3 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0222.gz \ + -gt1min 0 -gt2min 2 -gt3min 2 -gt4min 2 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0223.gz \ + -gt1min 0 -gt2min 2 -gt3min 2 -gt4min 3 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +echo "-------------------" +echo "Kneser-Ney 4grams" +echo "-------------------" +ngram-count -lm $tgtdir/4gram.kn0111.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 1 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0112.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 2 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0113.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 3 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0122.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 2 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0123.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 3 -order 4 \ + -text 
$tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0222.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 2 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0223.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 3 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +if [ ! -z ${LIBLBFGS} ]; then + #please note that if the switch -map-unk "$oov_symbol" is used with -maxent-convert-to-arpa, ngram-count will segfault + #instead of that, we simply output the model in the maxent format and convert it using the "ngram" + echo "-------------------" + echo "Maxent 3grams" + echo "-------------------" + sed 's/'${oov_symbol}'//g' $tgtdir/train.txt | \ + ngram-count -lm - -order 3 -text - -vocab $tgtdir/vocab -unk -sort -maxent -maxent-convert-to-arpa|\ + ngram -lm - -order 3 -unk -map-unk "$oov_symbol" -prune-lowprobs -write-lm - |\ + sed 's//'${oov_symbol}'/g' | gzip -c > $tgtdir/3gram.me.gz || exit 1 + + echo "-------------------" + echo "Maxent 4grams" + echo "-------------------" + sed 's/'${oov_symbol}'//g' $tgtdir/train.txt | \ + ngram-count -lm - -order 4 -text - -vocab $tgtdir/vocab -unk -sort -maxent -maxent-convert-to-arpa|\ + ngram -lm - -order 4 -unk -map-unk "$oov_symbol" -prune-lowprobs -write-lm - |\ + sed 's//'${oov_symbol}'/g' | gzip -c > $tgtdir/4gram.me.gz || exit 1 +else + echo >&2 "SRILM is not compiled with the support of MaxEnt models." + echo >&2 "You should use the script in \$KALDI_ROOT/tools/install_srilm.sh" + echo >&2 "which will take care of compiling the SRILM with MaxEnt support" + exit 1; +fi + + +echo "--------------------" +echo "Computing perplexity" +echo "--------------------" +( + for f in $tgtdir/3gram* ; do ( echo $f; ngram -order 3 -lm $f -unk -map-unk "$oov_symbol" -prune-lowprobs -ppl $tgtdir/dev.txt ) | paste -s -d ' ' ; done + for f in $tgtdir/4gram* ; do ( echo $f; ngram -order 4 -lm $f -unk -map-unk "$oov_symbol" -prune-lowprobs -ppl $tgtdir/dev.txt ) | paste -s -d ' ' ; done +) | sort -r -n -k 15,15g | column -t | tee $tgtdir/perplexities.txt + +echo "The perlexity scores report is stored in $tgtdir/perplexities.txt " +echo "" + +for best_ngram in {3,4}gram ; do + outlm=best_${best_ngram}.gz + lmfilename=$(grep "${best_ngram}" $tgtdir/perplexities.txt | head -n 1 | cut -f 1 -d ' ') + echo "$outlm -> $lmfilename" + (cd $tgtdir; rm -f $outlm; ln -sf $(basename $lmfilename) $outlm ) +done diff --git a/egs/chime5/s5b/local/wer_output_filter b/egs/chime5/s5b/local/wer_output_filter new file mode 100755 index 00000000000..6f4b6400716 --- /dev/null +++ b/egs/chime5/s5b/local/wer_output_filter @@ -0,0 +1,25 @@ +#!/bin/bash +# Copyright (c) 2017 Johns Hopkins University (Author: Yenda Trmal ) +# Apache 2.0 + + +## Filter for scoring of the STT results. Convert everything to lowercase +## and add some ad-hoc fixes for the hesitations + +perl -e ' + while() { + @A = split(" ", $_); + $id = shift @A; print "$id "; + foreach $a (@A) { + print lc($a) . 
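Once the models above are built, any single one can be spot-checked against the held-out text with the same ngram options used in the perplexity loop; in the sketch below the OOV symbol is assumed to be <unk> and the paths assume the data/srilm target directory that run.sh passes to this script.

```bash
ngram -order 3 -lm data/srilm/3gram.kn022.gz -unk -map-unk "<unk>" \
  -prune-lowprobs -ppl data/srilm/dev.txt
```

The best_3gram.gz / best_4gram.gz symlinks created at the end are what the top-level run.sh later hands to utils/format_lm.sh (LM=data/srilm/best_3gram.gz).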
" " unless $a =~ /\[.*\]/; + } + print "\n"; + }' | \ +sed -e ' + s/\/hmm/g; + s/\/hmm/g; + s/\/hmm/g; +' + +#| uconv -f utf-8 -t utf-8 -x Latin-ASCII + diff --git a/egs/chime5/s5b/local/worn_audio_list b/egs/chime5/s5b/local/worn_audio_list new file mode 100644 index 00000000000..fc7a44ad77d --- /dev/null +++ b/egs/chime5/s5b/local/worn_audio_list @@ -0,0 +1,64 @@ +/export/corpora4/CHiME5/audio/train/S03_P09.wav +/export/corpora4/CHiME5/audio/train/S03_P10.wav +/export/corpora4/CHiME5/audio/train/S03_P11.wav +/export/corpora4/CHiME5/audio/train/S03_P12.wav +/export/corpora4/CHiME5/audio/train/S04_P09.wav +/export/corpora4/CHiME5/audio/train/S04_P10.wav +/export/corpora4/CHiME5/audio/train/S04_P11.wav +/export/corpora4/CHiME5/audio/train/S04_P12.wav +/export/corpora4/CHiME5/audio/train/S05_P13.wav +/export/corpora4/CHiME5/audio/train/S05_P14.wav +/export/corpora4/CHiME5/audio/train/S05_P15.wav +/export/corpora4/CHiME5/audio/train/S05_P16.wav +/export/corpora4/CHiME5/audio/train/S06_P13.wav +/export/corpora4/CHiME5/audio/train/S06_P14.wav +/export/corpora4/CHiME5/audio/train/S06_P15.wav +/export/corpora4/CHiME5/audio/train/S06_P16.wav +/export/corpora4/CHiME5/audio/train/S07_P17.wav +/export/corpora4/CHiME5/audio/train/S07_P18.wav +/export/corpora4/CHiME5/audio/train/S07_P19.wav +/export/corpora4/CHiME5/audio/train/S07_P20.wav +/export/corpora4/CHiME5/audio/train/S08_P21.wav +/export/corpora4/CHiME5/audio/train/S08_P22.wav +/export/corpora4/CHiME5/audio/train/S08_P23.wav +/export/corpora4/CHiME5/audio/train/S08_P24.wav +/export/corpora4/CHiME5/audio/train/S12_P33.wav +/export/corpora4/CHiME5/audio/train/S12_P34.wav +/export/corpora4/CHiME5/audio/train/S12_P35.wav +/export/corpora4/CHiME5/audio/train/S12_P36.wav +/export/corpora4/CHiME5/audio/train/S13_P33.wav +/export/corpora4/CHiME5/audio/train/S13_P34.wav +/export/corpora4/CHiME5/audio/train/S13_P35.wav +/export/corpora4/CHiME5/audio/train/S13_P36.wav +/export/corpora4/CHiME5/audio/train/S16_P21.wav +/export/corpora4/CHiME5/audio/train/S16_P22.wav +/export/corpora4/CHiME5/audio/train/S16_P23.wav +/export/corpora4/CHiME5/audio/train/S16_P24.wav +/export/corpora4/CHiME5/audio/train/S17_P17.wav +/export/corpora4/CHiME5/audio/train/S17_P18.wav +/export/corpora4/CHiME5/audio/train/S17_P19.wav +/export/corpora4/CHiME5/audio/train/S17_P20.wav +/export/corpora4/CHiME5/audio/train/S18_P41.wav +/export/corpora4/CHiME5/audio/train/S18_P42.wav +/export/corpora4/CHiME5/audio/train/S18_P43.wav +/export/corpora4/CHiME5/audio/train/S18_P44.wav +/export/corpora4/CHiME5/audio/train/S19_P49.wav +/export/corpora4/CHiME5/audio/train/S19_P50.wav +/export/corpora4/CHiME5/audio/train/S19_P51.wav +/export/corpora4/CHiME5/audio/train/S19_P52.wav +/export/corpora4/CHiME5/audio/train/S20_P49.wav +/export/corpora4/CHiME5/audio/train/S20_P50.wav +/export/corpora4/CHiME5/audio/train/S20_P51.wav +/export/corpora4/CHiME5/audio/train/S20_P52.wav +/export/corpora4/CHiME5/audio/train/S22_P41.wav +/export/corpora4/CHiME5/audio/train/S22_P42.wav +/export/corpora4/CHiME5/audio/train/S22_P43.wav +/export/corpora4/CHiME5/audio/train/S22_P44.wav +/export/corpora4/CHiME5/audio/train/S23_P53.wav +/export/corpora4/CHiME5/audio/train/S23_P54.wav +/export/corpora4/CHiME5/audio/train/S23_P55.wav +/export/corpora4/CHiME5/audio/train/S23_P56.wav +/export/corpora4/CHiME5/audio/train/S24_P53.wav +/export/corpora4/CHiME5/audio/train/S24_P54.wav +/export/corpora4/CHiME5/audio/train/S24_P55.wav +/export/corpora4/CHiME5/audio/train/S24_P56.wav diff --git a/egs/chime5/s5b/path.sh 
b/egs/chime5/s5b/path.sh new file mode 100644 index 00000000000..fb1c0489386 --- /dev/null +++ b/egs/chime5/s5b/path.sh @@ -0,0 +1,7 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C + diff --git a/egs/chime5/s5b/run.sh b/egs/chime5/s5b/run.sh new file mode 100755 index 00000000000..37bc5c2c94e --- /dev/null +++ b/egs/chime5/s5b/run.sh @@ -0,0 +1,297 @@ +#!/bin/bash +# +# Based mostly on the TED-LIUM and Switchboard recipe +# +# Copyright 2017 Johns Hopkins University (Author: Shinji Watanabe and Yenda Trmal) +# Apache 2.0 +# + +# Begin configuration section. +nj=96 +decode_nj=20 +stage=0 +nnet_stage=-10 +num_data_reps=4 +snrs="20:10:15:5:0" +foreground_snrs="20:10:15:5:0" +background_snrs="20:10:15:5:0" +enhancement=beamformit # for a new enhancement method, + # change this variable and stage 4 +# End configuration section +. ./utils/parse_options.sh + +. ./cmd.sh +. ./path.sh + + +set -e # exit on error + +# chime5 main directory path +# please change the path accordingly +chime5_corpus=/export/corpora4/CHiME5 +json_dir=${chime5_corpus}/transcriptions +audio_dir=${chime5_corpus}/audio + +# training and test data +train_set=train_worn_simu_u400k +test_sets="dev_${enhancement}_dereverb_ref" #"dev_worn dev_addition_dereverb_ref" +#test_sets="dev_${enhancement}_ref" #"dev_worn dev_addition_dereverb_ref" + +# This script also needs the phonetisaurus g2p, srilm, beamformit +./local/check_tools.sh || exit 1 + +if [ $stage -le 1 ]; then + # skip u03 as they are missing + for mictype in worn u01 u02 u04 u05 u06; do + local/prepare_data.sh --mictype ${mictype} \ + ${audio_dir}/train ${json_dir}/train data/train_${mictype} + done + for dataset in dev; do + for mictype in worn; do + local/prepare_data.sh --mictype ${mictype} \ + ${audio_dir}/${dataset} ${json_dir}/${dataset} \ + data/${dataset}_${mictype} + done + done +fi + +if [ $stage -le 2 ]; then + local/prepare_dict.sh + + utils/prepare_lang.sh \ + data/local/dict "" data/local/lang data/lang + + local/train_lms_srilm.sh \ + --train-text data/train_worn/text --dev-text data/dev_worn/text \ + --oov-symbol "" --words-file data/lang/words.txt \ + data/ data/srilm +fi + +LM=data/srilm/best_3gram.gz +if [ $stage -le 3 ]; then + # Compiles G for chime5 trigram LM + utils/format_lm.sh \ + data/lang $LM data/local/dict/lexicon.txt data/lang + +fi + +if [ $stage -le 4 ]; then + # Beamforming using reference arrays + # enhanced WAV directory + enhandir=enhan + dereverb_dir=${PWD}/wav/wpe/ + for dset in dev eval; do + for mictype in u01 u02 u03 u04 u06; do + local/run_wpe.sh --nj 4 --cmd "$train_cmd --mem 120G" \ + ${audio_dir}/${dset} \ + ${dereverb_dir}/${dset} \ + ${mictype} + done + done + + for dset in dev eval; do + for mictype in u01 u02 u03 u04 u06; do + local/run_beamformit.sh --cmd "$train_cmd" \ + ${dereverb_dir}/${dset} \ + ${enhandir}/${dset}_${enhancement}_${mictype} \ + ${mictype} + done + done + + for dset in dev eval; do + local/prepare_data.sh --mictype ref "$PWD/${enhandir}/${dset}_${enhancement}_u0*" \ + ${json_dir}/${dset} data/${dset}_${enhancement}_dereverb_ref + done +fi + +if [ $stage -le 5 ]; then + # remove possibly bad sessions (P11_S03, P52_S19, P53_S24, P54_S24) + # see 
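For orientation, here are a few typical ways of launching the recipe; the option names are the ones defined in the configuration block at the top of run.sh and are handled by utils/parse_options.sh, and the chime5_corpus path normally has to be edited for your site before anything will run.

```bash
./run.sh                        # the full pipeline
./run.sh --stage 4              # resume from the WPE + BeamformIt enhancement
./run.sh --nj 32 --decode-nj 8  # smaller parallelization for modest clusters
```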
http://spandh.dcs.shef.ac.uk/chime_challenge/data.html for more details + utils/copy_data_dir.sh data/train_worn data/train_worn_org # back up + grep -v -e "^P11_S03" -e "^P52_S19" -e "^P53_S24" -e "^P54_S24" data/train_worn_org/text > data/train_worn/text + utils/fix_data_dir.sh data/train_worn +fi + +if [ $stage -le 6 ]; then + local/extract_noises.py $chime5_corpus/audio/train $chime5_corpus/transcriptions/train \ + local/distant_audio_list distant_noises + local/make_noise_list.py distant_noises > distant_noise_list + + noise_list=distant_noise_list + + if [ ! -d RIRS_NOISES/ ]; then + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + + # This is the config for the system using simulated RIRs and point-source noises + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list") + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list") + rvb_opts+=(--noise-set-parameters $noise_list) + + steps/data/reverberate_data_dir.py \ + "${rvb_opts[@]}" \ + --prefix "rev" \ + --foreground-snrs $foreground_snrs \ + --background-snrs $background_snrs \ + --speech-rvb-probability 1 \ + --pointsource-noise-addition-probability 1 \ + --isotropic-noise-addition-probability 1 \ + --num-replications $num_data_reps \ + --max-noises-per-minute 1 \ + --source-sampling-rate 16000 \ + data/train_worn data/train_worn_rvb +fi + +if [ $stage -le 7 ]; then + # combine mix array and worn mics + # randomly extract first 100k utterances from all mics + # if you want to include more training data, you can increase the number of array mic utterances + utils/combine_data.sh data/train_uall data/train_u01 data/train_u02 data/train_u04 data/train_u05 data/train_u06 + utils/subset_data_dir.sh data/train_uall 400000 data/train_u400k + utils/combine_data.sh data/${train_set} data/train_worn data/train_worn_rvb data/train_u400k + + # only use left channel for worn mic recognition + # you can use both left and right channels for training + for dset in train dev; do + utils/copy_data_dir.sh data/${dset}_worn data/${dset}_worn_stereo + grep "\.L-" data/${dset}_worn_stereo/text > data/${dset}_worn/text + utils/fix_data_dir.sh data/${dset}_worn + done +fi + +if [ $stage -le 8 ]; then + # fix speaker ID issue (thanks to Dr. Naoyuki Kanda) + # add array ID to the speaker ID to avoid the use of other array information to meet regulations + # Before this fix + # $ head -n 2 data/eval_beamformit_ref_nosplit/utt2spk + # P01_S01_U02_KITCHEN.ENH-0000192-0001278 P01 + # P01_S01_U02_KITCHEN.ENH-0001421-0001481 P01 + # After this fix + # $ head -n 2 data/eval_beamformit_ref_nosplit_fix/utt2spk + # P01_S01_U02_KITCHEN.ENH-0000192-0001278 P01_U02 + # P01_S01_U02_KITCHEN.ENH-0001421-0001481 P01_U02 + for dset in dev_${enhancement}_dereverb_ref eval_${enhancement}_dereverb_ref; do + utils/copy_data_dir.sh data/${dset} data/${dset}_nosplit + mkdir -p data/${dset}_nosplit_fix + cp data/${dset}_nosplit/{segments,text,wav.scp} data/${dset}_nosplit_fix/ + awk -F "_" '{print $0 "_" $3}' data/${dset}_nosplit/utt2spk > data/${dset}_nosplit_fix/utt2spk + utils/utt2spk_to_spk2utt.pl data/${dset}_nosplit_fix/utt2spk > data/${dset}_nosplit_fix/spk2utt + done + + # Split speakers up into 3-minute chunks. This doesn't hurt adaptation, and + # lets us use more jobs for decoding etc. 
+ for dset in ${train_set} dev_worn; do + utils/copy_data_dir.sh data/${dset} data/${dset}_nosplit + utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 data/${dset}_nosplit data/${dset} + done + for dset in dev_${enhancement}_dereverb_ref eval_${enhancement}_dereverb_ref; do + utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 data/${dset}_nosplit_fix data/${dset} + done +fi + +if [ $stage -le 8 ]; then + # Now make MFCC features. + # mfccdir should be some place with a largish disk where you + # want to store MFCC features. + mfccdir=mfcc + for x in ${train_set} ${test_sets}; do + steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" \ + data/$x exp/make_mfcc/$x $mfccdir + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir + utils/fix_data_dir.sh data/$x + done +fi + +if [ $stage -le 9 ]; then + # make a subset for monophone training + utils/subset_data_dir.sh --shortest data/${train_set} 100000 data/${train_set}_100kshort + utils/subset_data_dir.sh data/${train_set}_100kshort 30000 data/${train_set}_30kshort +fi + +if [ $stage -le 10 ]; then + # Starting basic training on MFCC features + steps/train_mono.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set}_30kshort data/lang exp/mono +fi + +if [ $stage -le 11 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set} data/lang exp/mono exp/mono_ali + + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 30000 data/${train_set} data/lang exp/mono_ali exp/tri1 +fi + +if [ $stage -le 12 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set} data/lang exp/tri1 exp/tri1_ali + + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 4000 50000 data/${train_set} data/lang exp/tri1_ali exp/tri2 +fi + +if [ $stage -le 13 ]; then + utils/mkgraph.sh data/lang exp/tri2 exp/tri2/graph + for dset in ${test_sets}; do + steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + exp/tri2/graph data/${dset} exp/tri2/decode_${dset} & + done + wait +fi + +if [ $stage -le 14 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set} data/lang exp/tri2 exp/tri2_ali + + steps/train_sat.sh --cmd "$train_cmd" \ + 5000 100000 data/${train_set} data/lang exp/tri2_ali exp/tri3 +fi + +if [ $stage -le 15 ]; then + utils/mkgraph.sh data/lang exp/tri3 exp/tri3/graph + for dset in ${test_sets}; do + steps/decode_fmllr.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + exp/tri3/graph data/${dset} exp/tri3/decode_${dset} & + done + wait +fi + +if [ $stage -le 16 ]; then + # The following script cleans the data and produces cleaned data + steps/cleanup/clean_and_segment_data.sh --nj ${nj} --cmd "$train_cmd" \ + --segmentation-opts "--min-segment-length 0.3 --min-new-segment-length 0.6" \ + data/${train_set} data/lang exp/tri3 exp/tri3_cleaned data/${train_set}_cleaned +fi + +if [ $stage -le 17 ]; then + # chain TDNN + local/chain/tuning/run_tdnn_1b.sh --nj ${nj} \ + --stage $nnet_stage \ + --train-set ${train_set}_cleaned \ + --test-sets "$test_sets" \ + --gmm tri3_cleaned --nnet3-affix _${train_set}_cleaned_rvb +fi + +if [ $stage -le 18 ]; then + # 2-stage decoding + for test_set in $test_sets; do + local/nnet3/decode.sh --affix 2stage --pass2-decode-opts "--min-active 1000" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 150 --nj $decode_nj \ + --ivector-dir exp/nnet3_${train_set}_cleaned_rvb \ + data/${test_set} data/lang_chain \ + exp/chain_${train_set}_cleaned_rvb/tree_sp/graph \ + exp/chain_${train_set}_cleaned_rvb/tdnn1b_sp + done +fi + +if [ $stage -le 19 ]; 
then + # final scoring to get the official challenge result + # please specify both dev and eval set directories so that the search parameters + # (insertion penalty and language model weight) will be tuned using the dev set + local/score_for_submit.sh \ + --dev exp/chain_${train_set}_cleaned_rvb/tdnn1b_sp/decode_dev_${enhancement}_dereverb_ref \ + --eval exp/chain_${train_set}_cleaned_rvb/tdnn1b_sp/decode_eval_${enhancement}_dereverb_ref +fi diff --git a/egs/chime5/s5b/steps b/egs/chime5/s5b/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/chime5/s5b/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/chime5/s5b/utils b/egs/chime5/s5b/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/chime5/s5b/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file diff --git a/egs/commonvoice/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/commonvoice/s5/local/chain/tuning/run_tdnn_1a.sh index 635e3de1076..d4acd0fed4b 100755 --- a/egs/commonvoice/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/commonvoice/s5/local/chain/tuning/run_tdnn_1a.sh @@ -141,7 +141,7 @@ if [ $stage -le 13 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/csj/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/csj/s5/local/chain/tuning/run_tdnn_1a.sh index a463db77066..75ceb80e3e0 100755 --- a/egs/csj/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/csj/s5/local/chain/tuning/run_tdnn_1a.sh @@ -133,7 +133,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/csj/s5/local/nnet/run_dnn_tandem_uc.sh b/egs/csj/s5/local/nnet/run_dnn_tandem_uc.sh index 4677ff473cb..297aed1f486 100755 --- a/egs/csj/s5/local/nnet/run_dnn_tandem_uc.sh +++ b/egs/csj/s5/local/nnet/run_dnn_tandem_uc.sh @@ -280,4 +280,4 @@ exit 0 %WER 14.88 [ 2557 / 17189, 556 ins, 359 del, 1642 sub ] exp/tandem2uc-tri4/decode_eval3_csj/wer_20_0.5 %WER 17.03 [ 2927 / 17189, 592 ins, 417 del, 1918 sub ] exp/tandem2uc-tri4/decode_eval3_csj.si/wer_20_1.0 %WER 13.44 [ 2311 / 17189, 430 ins, 340 del, 1541 sub ] exp/tandem2uc-tri4_mmi_b0.1/decode_eval3_csj/wer_20_1.0 -EOF \ No newline at end of file +EOF diff --git a/egs/dihard_2018/v1/local/make_voxceleb1_v2.pl b/egs/dihard_2018/v1/local/make_voxceleb1_v2.pl new file mode 100755 index 00000000000..0bc13bea251 --- /dev/null +++ b/egs/dihard_2018/v1/local/make_voxceleb1_v2.pl @@ -0,0 +1,123 @@ +#!/usr/bin/perl +# +# Copyright 2018 Ewald Enzinger +# 2018 David Snyder +# 2019 Soonshin Seo +# +# Usage: make_voxceleb1_v2.pl /export/voxceleb1 dev data/dev +# +# The VoxCeleb1 corpus underwent several updates that changed the directory and speaker ID format. +# The script 'make_voxceleb1.pl' works for the oldest version of the corpus. +# This script should be used if you've downloaded the corpus recently. + +if (@ARGV != 3) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. 
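The two xconfig-related hunks above make the same small fix: wrapping the expression in parentheses so the snippet is valid under both python 2 and python 3 (under python 2, `print (x)` is an ordinary print statement; under python 3 it is a function call). A quick way to convince yourself, with 0.1 standing in for xent_regularize:

```bash
echo "print 0.5/0.1"   | python3   # fails with a SyntaxError under python 3
echo "print (0.5/0.1)" | python3   # prints the learning-rate factor (about 5)
```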
$0 /export/voxceleb1 dev data/dev\n"; + exit(1); +} + +($data_base, $dataset, $out_dir) = @ARGV; + +if ("$dataset" ne "dev" && "$dataset" ne "test") { + die "dataset parameter must be 'dev' or 'test'!"; +} + +if (system("mkdir -p $out_dir") != 0) { + die "Error making directory $out_dir"; +} +print "$data_base/$dataset/wav\n"; +opendir my $dh, "$data_base/$dataset/wav" or die "Cannot open directory: $!"; +my @spkr_dirs = grep {-d "$data_base/$dataset/wav/$_" && ! /^\.{1,2}$/} readdir($dh); +closedir $dh; + +if ($dataset eq "dev"){ + open(SPKR_TRAIN, ">", "$out_dir/utt2spk") or die "could not open the output file $out_dir/utt2spk"; + open(WAV_TRAIN, ">", "$out_dir/wav.scp") or die "could not open the output file $out_dir/wav.scp"; + + foreach (@spkr_dirs) { + my $spkr_id = $_; + opendir my $dh, "$data_base/$dataset/wav/$spkr_id/" or die "Cannot open directory: $!"; + my @rec_dirs = grep {-d "$data_base/$dataset/wav/$spkr_id/$_" && ! /^\.{1,2}$/} readdir($dh); + closedir $dh; + foreach (@rec_dirs) { + my $rec_id = $_; + opendir my $dh, "$data_base/$dataset/wav/$spkr_id/$rec_id/" or die "Cannot open directory: $!"; + my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh); + closedir $dh; + foreach (@files) { + my $name = $_; + my $wav = "$data_base/$dataset/wav/$spkr_id/$rec_id/$name.wav"; + my $utt_id = "$spkr_id-$rec_id-$name"; + print WAV_TRAIN "$utt_id", " $wav", "\n"; + print SPKR_TRAIN "$utt_id", " $spkr_id", "\n"; + } + } + } + close(SPKR_TRAIN) or die; + close(WAV_TRAIN) or die; +} + +if ($dataset eq "test"){ + if (! -e "$data_base/voxceleb1_test_v2.txt") { + system("wget -O $data_base/voxceleb1_test_v2.txt http://www.openslr.org/resources/49/voxceleb1_test_v2.txt"); + } + + open(TRIAL_IN, "<", "$data_base/voxceleb1_test_v2.txt") or die "could not open the verification trials file $data_base/voxceleb1_test_v2.txt"; + open(TRIAL_OUT, ">", "$out_dir/trials") or die "Could not open the output file $out_test_dir/trials"; + open(SPKR_TEST, ">", "$out_dir/utt2spk") or die "could not open the output file $out_dir/utt2spk"; + open(WAV_TEST, ">", "$out_dir/wav.scp") or die "could not open the output file $out_dir/wav.scp"; + + my $test_spkrs = (); + while () { + chomp; + my ($tar_or_non, $path1, $path2) = split; + # Create entry for left-hand side of trial + my ($spkr_id, $rec_id, $name) = split('/', $path1); + my $utt_id1 = "$spkr_id-$rec_id-$name"; + $test_spkrs{$spkr_id} = (); + + # Create entry for right-hand side of trial + my ($spkr_id, $rec_id, $name) = split('/', $path2); + my $utt_id2 = "$spkr_id-$rec_id-$name"; + $test_spkrs{$spkr_id} = (); + + my $target = "nontarget"; + if ($tar_or_non eq "1") { + $target = "target"; + } + print TRIAL_OUT "$utt_id1 $utt_id2 $target\n"; + } + + foreach (@spkr_dirs) { + my $spkr_id = $_; + opendir my $dh, "$data_base/$dataset/wav/$spkr_id/" or die "Cannot open directory: $!"; + my @rec_dirs = grep {-d "$data_base/$dataset/wav/$spkr_id/$_" && ! 
/^\.{1,2}$/} readdir($dh); + closedir $dh; + foreach (@rec_dirs) { + my $rec_id = $_; + opendir my $dh, "$data_base/$dataset/wav/$spkr_id/$rec_id/" or die "Cannot open directory: $!"; + my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh); + closedir $dh; + foreach (@files) { + my $name = $_; + my $wav = "$data_base/$dataset/wav/$spkr_id/$rec_id/$name.wav"; + my $utt_id = "$spkr_id-$rec_id-$name"; + print WAV_TEST "$utt_id", " $wav", "\n"; + print SPKR_TEST "$utt_id", " $spkr_id", "\n"; + } + } + } + close(SPKR_TEST) or die; + close(WAV_TEST) or die; + close(TRIAL_OUT) or die; + close(TRIAL_IN) or die; +} + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} +system("env LC_COLLATE=C utils/fix_data_dir.sh $out_dir"); +if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} diff --git a/egs/dihard_2018/v1/run.sh b/egs/dihard_2018/v1/run.sh index 429a1231975..eb23ac500cd 100755 --- a/egs/dihard_2018/v1/run.sh +++ b/egs/dihard_2018/v1/run.sh @@ -28,9 +28,14 @@ stage=0 if [ $stage -le 0 ]; then local/make_voxceleb2.pl $voxceleb2_root dev data/voxceleb2_train local/make_voxceleb2.pl $voxceleb2_root test data/voxceleb2_test - # This script creates data/voxceleb1_test and data/voxceleb1_train. - # Our evaluation set is the test portion of VoxCeleb1. - local/make_voxceleb1.pl $voxceleb1_root data + + # Now prepare the VoxCeleb1 train and test data. If you downloaded the corpus soon + # after it was first released, you may need to use an older version of the script, which + # can be invoked as follows: + # local/make_voxceleb1.pl $voxceleb1_root data + local/make_voxceleb1_v2.pl $voxceleb1_root dev data/voxceleb1_train + local/make_voxceleb1_v2.pl $voxceleb1_root test data/voxceleb1_test + # We'll train on all of VoxCeleb2, plus the training portion of VoxCeleb1. # This should give 7,351 speakers and 1,277,503 utterances. utils/combine_data.sh data/train data/voxceleb2_train data/voxceleb2_test data/voxceleb1_train @@ -186,7 +191,7 @@ if [ $stage -le 7 ]; then der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' \ $ivec_dir/tuning/dihard_2018_dev_t${threshold}) - if [ $(echo $der'<'$best_der | bc -l) -eq 1 ]; then + if [ $(perl -e "print ($der < $best_der ? 1 : 0);") -eq 1 ]; then best_der=$der best_threshold=$threshold fi diff --git a/egs/dihard_2018/v2/local/make_musan.py b/egs/dihard_2018/v2/local/make_musan.py deleted file mode 100755 index c4b5c9359b4..00000000000 --- a/egs/dihard_2018/v2/local/make_musan.py +++ /dev/null @@ -1,123 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2015 David Snyder -# 2018 Ewald Enzinger -# Apache 2.0. -# -# Modified version of egs/sre16/v1/local/make_musan.py (commit e3fb7c4a0da4167f8c94b80f4d3cc5ab4d0e22e8). -# This version uses the raw MUSAN audio files (16 kHz) and does not use sox to resample at 8 kHz. -# -# This file is meant to be invoked by make_musan.sh. 
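Aside on the data layout these preparation scripts produce: both make_voxceleb1_v2.pl above and the make_musan.py removed here (superseded by steps/data/make_musan.sh later in this patch) emit the same minimal Kaldi data directory, a wav.scp mapping utterance IDs to audio paths and a utt2spk mapping utterance IDs to speaker IDs, after which utils/utt2spk_to_spk2utt.pl inverts the latter. A rough Python rendering of that inversion, with invented IDs purely for illustration:

```python
# Illustrative sketch of what utils/utt2spk_to_spk2utt.pl does; the
# utterance/speaker IDs below are made up for the example.
utt2spk_lines = [
    "id10001-1zcIwhmdeo4-00001 id10001",
    "id10001-1zcIwhmdeo4-00002 id10001",
    "id10002-6WO410QOeuo-00001 id10002",
]

spk2utt = {}
for line in utt2spk_lines:
    utt, spk = line.split()
    spk2utt.setdefault(spk, []).append(utt)

for spk, utts in spk2utt.items():
    print(spk, " ".join(utts))
# id10001 id10001-1zcIwhmdeo4-00001 id10001-1zcIwhmdeo4-00002
# id10002 id10002-6WO410QOeuo-00001
```

The fix_data_dir.sh and validate_data_dir.sh calls at the end of the Perl script then sort and sanity-check these files.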
- -import os, sys - -def process_music_annotations(path): - utt2spk = {} - utt2vocals = {} - lines = open(path, 'r').readlines() - for line in lines: - utt, genres, vocals, musician = line.rstrip().split()[:4] - # For this application, the musican ID isn't important - utt2spk[utt] = utt - utt2vocals[utt] = vocals == "Y" - return utt2spk, utt2vocals - -def prepare_music(root_dir, use_vocals): - utt2vocals = {} - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - music_dir = os.path.join(root_dir, "music") - for root, dirs, files in os.walk(music_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - elif str(file) == "ANNOTATIONS": - utt2spk_part, utt2vocals_part = process_music_annotations(file_path) - utt2spk.update(utt2spk_part) - utt2vocals.update(utt2vocals_part) - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2vocals: - if utt in utt2wav: - if use_vocals or not utt2vocals[utt]: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" - num_good_files += 1 - else: - print("Missing file {}".format(utt)) - num_bad_files += 1 - print(("In music directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files)) - return utt2spk_str, utt2wav_str - -def prepare_speech(root_dir): - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - speech_dir = os.path.join(root_dir, "speech") - for root, dirs, files in os.walk(speech_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - utt2spk[utt] = utt - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2spk: - if utt in utt2wav: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" - num_good_files += 1 - else: - print("Missing file {}".format(utt)) - num_bad_files += 1 - print(("In speech directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files)) - return utt2spk_str, utt2wav_str - -def prepare_noise(root_dir): - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - noise_dir = os.path.join(root_dir, "noise") - for root, dirs, files in os.walk(noise_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - utt2spk[utt] = utt - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2spk: - if utt in utt2wav: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" - num_good_files += 1 - else: - print("Missing file {}".format(utt)) - num_bad_files += 1 - print(("In noise directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files)) - return utt2spk_str, utt2wav_str - -def main(): - in_dir = sys.argv[1] - out_dir = sys.argv[2] - use_vocals = sys.argv[3] == "Y" - utt2spk_music, utt2wav_music = prepare_music(in_dir, use_vocals) - utt2spk_speech, utt2wav_speech = prepare_speech(in_dir) - utt2spk_noise, utt2wav_noise = prepare_noise(in_dir) - utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise - utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise - wav_fi = open(os.path.join(out_dir, "wav.scp"), 'w') - wav_fi.write(utt2wav) - utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), 'w') - 
utt2spk_fi.write(utt2spk) - - -if __name__=="__main__": - main() diff --git a/egs/dihard_2018/v2/local/make_musan.sh b/egs/dihard_2018/v2/local/make_musan.sh deleted file mode 100755 index 1565ef0d85c..00000000000 --- a/egs/dihard_2018/v2/local/make_musan.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -# Copyright 2015 David Snyder -# Apache 2.0. -# -# Copy of egs/sre16/v1/local/make_musan.sh (commit e3fb7c4a0da4167f8c94b80f4d3cc5ab4d0e22e8). -# -# This script, called by ../run.sh, creates the MUSAN -# data directory. The required dataset is freely available at -# http://www.openslr.org/17/ - -set -e -in_dir=$1 -data_dir=$2 -use_vocals='Y' - -mkdir -p local/musan.tmp - -echo "Preparing ${data_dir}/musan..." -mkdir -p ${data_dir}/musan -local/make_musan.py ${in_dir} ${data_dir}/musan ${use_vocals} - -utils/fix_data_dir.sh ${data_dir}/musan - -grep "music" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_music -grep "speech" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_speech -grep "noise" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_noise -utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_music \ - ${data_dir}/musan ${data_dir}/musan_music -utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_speech \ - ${data_dir}/musan ${data_dir}/musan_speech -utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_noise \ - ${data_dir}/musan ${data_dir}/musan_noise - -utils/fix_data_dir.sh ${data_dir}/musan_music -utils/fix_data_dir.sh ${data_dir}/musan_speech -utils/fix_data_dir.sh ${data_dir}/musan_noise - -rm -rf local/musan.tmp - diff --git a/egs/dihard_2018/v2/local/make_voxceleb1_v2.pl b/egs/dihard_2018/v2/local/make_voxceleb1_v2.pl new file mode 120000 index 00000000000..2e7a22eaadc --- /dev/null +++ b/egs/dihard_2018/v2/local/make_voxceleb1_v2.pl @@ -0,0 +1 @@ +../../v1/local/make_voxceleb1_v2.pl \ No newline at end of file diff --git a/egs/dihard_2018/v2/run.sh b/egs/dihard_2018/v2/run.sh index 1c018dfcc55..6cd6630a838 100755 --- a/egs/dihard_2018/v2/run.sh +++ b/egs/dihard_2018/v2/run.sh @@ -27,9 +27,14 @@ stage=0 if [ $stage -le 0 ]; then local/make_voxceleb2.pl $voxceleb2_root dev data/voxceleb2_train local/make_voxceleb2.pl $voxceleb2_root test data/voxceleb2_test - # This script creates data/voxceleb1_test and data/voxceleb1_train. - # Our evaluation set is the test portion of VoxCeleb1. - local/make_voxceleb1.pl $voxceleb1_root data + + # Now prepare the VoxCeleb1 train and test data. If you downloaded the corpus soon + # after it was first released, you may need to use an older version of the script, which + # can be invoked as follows: + # local/make_voxceleb1.pl $voxceleb1_root data + local/make_voxceleb1_v2.pl $voxceleb1_root dev data/voxceleb1_train + local/make_voxceleb1_v2.pl $voxceleb1_root test data/voxceleb1_test + # We'll train on all of VoxCeleb2, plus the training portion of VoxCeleb1. # This should give 7,351 speakers and 1,277,503 utterances. utils/combine_data.sh data/train data/voxceleb2_train data/voxceleb2_test data/voxceleb1_train @@ -70,6 +75,8 @@ if [ $stage -le 1 ]; then utils/fix_data_dir.sh data/${name}_cmn done + echo "0.01" > data/dihard_2018_dev_cmn/frame_shift + echo "0.01" > data/dihard_2018_eval_cmn/frame_shift echo "0.01" > data/train_cmn/frame_shift # Create segments to extract x-vectors from for PLDA training data. 
# The segments are created using an energy-based speech activity @@ -113,7 +120,7 @@ if [ $stage -le 2 ]; then # Prepare the MUSAN corpus, which consists of music, speech, and noise # suitable for augmentation. - local/make_musan.sh $musan_root data + steps/data/make_musan.sh --sampling-rate 16000 $musan_root data # Get the duration of the MUSAN recordings. This will be used by the # script augment_data_dir.py. @@ -246,7 +253,7 @@ if [ $stage -le 12 ]; then # The threshold is in terms of the log likelihood ratio provided by the # PLDA scores. In a perfectly calibrated system, the threshold is 0. - # In the following loop, we evaluate DER performance on DIHARD 2018 development + # In the following loop, we evaluate DER performance on DIHARD 2018 development # set using some reasonable thresholds for a well-calibrated system. for threshold in -0.5 -0.4 -0.3 -0.2 -0.1 -0.05 0 0.05 0.1 0.2 0.3 0.4 0.5; do diarization/cluster.sh --cmd "$train_cmd --mem 4G" --nj 20 \ @@ -260,7 +267,7 @@ if [ $stage -le 12 ]; then der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' \ $nnet_dir/tuning/dihard_2018_dev_t${threshold}) - if [ $(echo $der'<'$best_der | bc -l) -eq 1 ]; then + if [ $(perl -e "print ($der < $best_der ? 1 : 0);") -eq 1 ]; then best_der=$der best_threshold=$threshold fi @@ -271,23 +278,23 @@ if [ $stage -le 12 ]; then --threshold $(cat $nnet_dir/tuning/dihard_2018_dev_best) --rttm-channel 1 \ $nnet_dir/xvectors_dihard_2018_dev/plda_scores $nnet_dir/xvectors_dihard_2018_dev/plda_scores - # Cluster DIHARD 2018 evaluation set using the best threshold found for the DIHARD - # 2018 development set. The DIHARD 2018 development set is used as the validation - # set to tune the parameters. + # Cluster DIHARD 2018 evaluation set using the best threshold found for the DIHARD + # 2018 development set. The DIHARD 2018 development set is used as the validation + # set to tune the parameters. diarization/cluster.sh --cmd "$train_cmd --mem 4G" --nj 20 \ --threshold $(cat $nnet_dir/tuning/dihard_2018_dev_best) --rttm-channel 1 \ $nnet_dir/xvectors_dihard_2018_eval/plda_scores $nnet_dir/xvectors_dihard_2018_eval/plda_scores mkdir -p $nnet_dir/results - # Compute the DER on the DIHARD 2018 evaluation set. We use the official metrics of - # the DIHARD challenge. The DER is calculated with no unscored collars and including + # Compute the DER on the DIHARD 2018 evaluation set. We use the official metrics of + # the DIHARD challenge. The DER is calculated with no unscored collars and including # overlapping speech. md-eval.pl -r data/dihard_2018_eval/rttm \ -s $nnet_dir/xvectors_dihard_2018_eval/plda_scores/rttm 2> $nnet_dir/results/threshold.log \ > $nnet_dir/results/DER_threshold.txt der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' \ $nnet_dir/results/DER_threshold.txt) - # Using supervised calibration, DER: 26.47% + # Using supervised calibration, DER: 26.30% echo "Using supervised calibration, DER: $der%" fi @@ -304,6 +311,6 @@ if [ $stage -le 13 ]; then > $nnet_dir/results/DER_num_spk.txt der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' 
\ $nnet_dir/results/DER_num_spk.txt) - # Using the oracle number of speakers, DER: 23.90% + # Using the oracle number of speakers, DER: 23.42% echo "Using the oracle number of speakers, DER: $der%" fi diff --git a/egs/farsdat/s5/local/nnet/run_dnn.sh b/egs/farsdat/s5/local/nnet/run_dnn.sh index fbb3db72e3e..a02894a7322 100755 --- a/egs/farsdat/s5/local/nnet/run_dnn.sh +++ b/egs/farsdat/s5/local/nnet/run_dnn.sh @@ -53,7 +53,7 @@ if [ $stage -le 1 ]; then # Pre-train DBN, i.e. a stack of RBMs (small database, smaller DNN) dir=exp/dnn4_pretrain-dbn (tail --pid=$$ -F $dir/log/pretrain_dbn.log 2>/dev/null)& # forward log - $cuda_cmd $dir/log/pretrain_dbn.log \ + "$train_cmd" --gpu 1 $dir/log/pretrain_dbn.log \ steps/nnet/pretrain_dbn.sh --hid-dim 1024 --rbm-iter 20 $data_fmllr/train $dir || exit 1; fi @@ -65,7 +65,7 @@ if [ $stage -le 2 ]; then dbn=exp/dnn4_pretrain-dbn/6.dbn (tail --pid=$$ -F $dir/log/train_nnet.log 2>/dev/null)& # forward log # Train - $cuda_cmd $dir/log/train_nnet.log \ + "$train_cmd" --gpu 1 $dir/log/train_nnet.log \ steps/nnet/train.sh --feature-transform $feature_transform --dbn $dbn --hid-layers 0 --learn-rate 0.008 \ $data_fmllr/train_tr90 $data_fmllr/train_cv10 data/lang $ali $ali $dir || exit 1; # Decode (reuse HCLG graph) @@ -93,7 +93,7 @@ fi if [ $stage -le 4 ]; then # Re-train the DNN by 6 iterations of sMBR - steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 6 --acwt $acwt --do-smbr true \ + steps/nnet/train_mpe.sh --cmd ""$train_cmd" --gpu 1" --num-iters 6 --acwt $acwt --do-smbr true \ $data_fmllr/train data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir || exit 1 # Decode for ITER in 1 6; do diff --git a/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh b/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh index c487f1bd222..7f407552c2e 100755 --- a/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh +++ b/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh @@ -156,7 +156,7 @@ if [ $stage -le 19 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) tdnn_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim-continuous=true" tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" diff --git a/egs/fisher_callhome_spanish/s5/local/ctm.sh b/egs/fisher_callhome_spanish/s5/local/ctm.sh index 7d09f574580..62860a10b7b 100755 --- a/egs/fisher_callhome_spanish/s5/local/ctm.sh +++ b/egs/fisher_callhome_spanish/s5/local/ctm.sh @@ -19,9 +19,9 @@ fi steps/get_ctm.sh $data_dir $lang_dir $decode_dir # Make sure that channel markers match -#sed -i "s:\s.*_fsp-([AB]): \1:g" data/dev/stm -#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} sed -i -r 's:fsp\s1\s:fsp A :g' {} -#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} sed -i -r 's:fsp\s2\s:fsp B :g' {} +#perl -i -pe "s:\s.*_fsp-([AB]): \1:g" data/dev/stm +#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} perl -i -pe 's:fsp\s1\s:fsp A :g' {} +#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} perl -i -pe 's:fsp\s2\s:fsp B :g' {} # Get the environment variables . 
/export/babel/data/software/env.sh diff --git a/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh b/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh index 7b2de2db392..779298305c4 100755 --- a/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh +++ b/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh @@ -105,9 +105,8 @@ if [ $stage -le 4 ]; then cp "$tmpdir/lexicon.1" "$tmpdir/lexicon.2" # Add prons for laughter, noise, oov - for w in `grep -v sil $dir/silence_phones.txt`; do - sed -i "/\[$w\]/d" $tmpdir/lexicon.2 - done + w=$(grep -v sil $dir/silence_phones.txt | tr '\n' '|') + perl -i -ne "print unless /\[(${w%?})\]/" $tmpdir/lexicon.2 for w in `grep -v sil $dir/silence_phones.txt`; do echo "[$w] $w" diff --git a/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py b/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py index 864b76b671b..c7aa6affb11 100755 --- a/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py +++ b/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py @@ -1,11 +1,11 @@ #!/usr/bin/env python -# Copyright 2014 Gaurav Kumar. Apache 2.0 # -*- coding: utf-8 -*- # +# Copyright 2014 Gaurav Kumar. Apache 2.0 +# 2018 Nagendra Kumar Goel, Saikiran Valluri, GoVivace inc., Avaaya # Merges unique words from Spanish Fisher, Gigaword and the LDC spanish lexicon - from __future__ import print_function -import sys +import sys, re import json import codecs import operator @@ -17,6 +17,7 @@ uw_gigaword = tmpdir + "/es_wordlist.json" uw_LDC = ldc_lexicon + "/callhome_spanish_lexicon_970908/preferences" +filtered_letters = re.compile(u'[¡¥ª°º¿àçèëìîôö0123456789]') merged_lexicon = [] # All three lexicons are in different formats # First add the data from lexicon_fisher (A) into the dictionary @@ -55,7 +56,8 @@ ltuples = sorted(merged_lexicon) for item in ltuples: - lf.write(item + "\n") + if not item==u'ñ' and not re.search(filtered_letters, item): + lf.write(item + "\n") lf.close() diff --git a/egs/fisher_english/s5/local/chain/run_tdnn.sh b/egs/fisher_english/s5/local/chain/run_tdnn.sh index 14174e617c4..1fd0f1fdf3a 100755 --- a/egs/fisher_english/s5/local/chain/run_tdnn.sh +++ b/egs/fisher_english/s5/local/chain/run_tdnn.sh @@ -112,7 +112,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_1a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_1a.sh index e95de232304..b76efc4f1de 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_1a.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_1a.sh @@ -231,7 +231,7 @@ if [ $stage -le 11 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $sup_tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_1a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_1a.sh index e76df666e8a..b1c133942ef 100755 --- 
a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_1a.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_1a.sh @@ -142,7 +142,7 @@ if [ $stage -le 13 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_1a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_1a.sh index 2d5b2f8480e..53aac8c08ea 100755 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_1a.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_1a.sh @@ -250,7 +250,7 @@ if [ $stage -le 11 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $sup_tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh b/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh index cbf0ef6cb6c..c12f604f26b 100755 --- a/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh +++ b/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh @@ -133,7 +133,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_7c.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_7c.sh index 12b3187a5fa..efcd1eced4a 100644 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_7c.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_7c.sh @@ -129,7 +129,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_7d.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_7d.sh index 7d640c3262a..e4a555abfdd 100644 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_7d.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_7d.sh @@ -134,7 +134,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.002" linear_opts="orthonormal-constraint=1.0" output_opts="l2-regularize=0.0005 bottleneck-dim=256" diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh index 07e88b59ddc..5650cedca28 100755 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh @@ -142,7 +142,7 @@ if [ $stage 
-le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" mkdir -p $dir/configs diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a_svd.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a_svd.sh new file mode 100644 index 00000000000..5beb2e74a9a --- /dev/null +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a_svd.sh @@ -0,0 +1,448 @@ +#!/bin/bash +# +# Copyright 2018 Nagendra Kumar Goel, +# Saikiran Valluri, Govivace.Inc - Apache 2.0 + +# The script is organized as below. +# First we train the baseline LSTMP-TDNN config chain model for few epochs on the (Fisher+swbd)-english data, +# Then, we perform SVD based refactoring of all the Affine components in this baseline final.mdl, +# in order to reduce the overall model parameters size, +# as determined by the bottleneck dim value or Energy and Shrinkage threshold values. +# Then, we finetune the weight parameters of the refactored model using entire Fisher + switchboard data for single epoch. + +# Command used for comparing WERs of decoding on different testsets using pre-SVD and SVD models: +# ./local/chain/compare_wer_general.sh --looped tdnn_lstm_1a_sp tdnn_lstm_1a_svd_sp +# +# Please run this entire script till the end before running the above WER compare command... + + +# System tdnn_lstm_1a_sp +# WER on eval2000(tg) 12.3 +# [looped:] 12.2 +# WER on eval2000(fg) 12.1 +# [looped:] 12.1 +# WER on eval2000(fg) +# [SVD retrained + looped] 12.1 +# WER on rt03(tg) 11.6 +# [looped:] 11.6 +# WER on rt03(tg) +# [SVD retrained] 12 +# WER on rt03(fg) 11.3 +# [looped:] 11.3 +# Final train prob -0.074 +# Final valid prob -0.084 +# Final train prob (xent) -0.882 +# Final valid prob (xent) -0.9393 + +# WER stats for eval2000 using tdnn_lstm_1a_sp +# | #Snt #Wrd | Corr Sub Del Ins Err S.Err | +# %WER 16.0 | 2628 21594 | 86.3 9.0 4.7 2.3 16.0 54.4 | exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_tg/score_7_0.0/eval2000_hires.ctm.callhm.filt.sys +# %WER 12.3 | 4459 42989 | 89.4 7.1 3.5 1.7 12.3 49.8 | exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_tg/score_8_0.0/eval2000_hires.ctm.filt.sys +# %WER 8.4 | 1831 21395 | 92.7 5.1 2.2 1.1 8.4 42.3 | exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_tg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +# %WER 15.9 | 2628 21594 | 86.4 8.9 4.7 2.3 15.9 54.3 | exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_fg/score_7_0.0/eval2000_hires.ctm.callhm.filt.sys +# %WER 12.1 | 4459 42989 | 89.6 6.9 3.5 1.7 12.1 49.2 | exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_fg/score_8_0.0/eval2000_hires.ctm.filt.sys +# %WER 8.2 | 1831 21395 | 93.1 5.1 1.8 1.3 8.2 41.7 | exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_fg/score_8_0.0/eval2000_hires.ctm.swbd.filt.sys + +# WER stats for rt03 using tdnn_lstm_1a_sp +# %WER 9.6 | 3970 36721 | 91.5 5.5 3.0 1.1 9.6 41.2 | exp/chain/tdnn_lstm_1a_sp/decode_rt03_fsh_sw1_tg/score_7_0.0/rt03_hires.ctm.fsh.filt.sys +# %WER 11.6 | 8420 76157 | 89.7 6.8 3.4 1.4 11.6 43.0 | exp/chain/tdnn_lstm_1a_sp/decode_rt03_fsh_sw1_tg/score_7_0.0/rt03_hires.ctm.filt.sys +# %WER 13.3 | 4450 39436 | 88.0 7.4 4.6 1.3 13.3 44.5 | exp/chain/tdnn_lstm_1a_sp/decode_rt03_fsh_sw1_tg/score_9_0.0/rt03_hires.ctm.swbd.filt.sys +# %WER 9.4 | 3970 36721 | 91.8 5.3 2.9 1.1 9.4 40.3 | 
exp/chain/tdnn_lstm_1a_sp/decode_rt03_fsh_sw1_fg/score_7_0.0/rt03_hires.ctm.fsh.filt.sys +# %WER 11.3 | 8420 76157 | 89.9 6.4 3.7 1.2 11.3 42.4 | exp/chain/tdnn_lstm_1a_sp/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.filt.sys +# %WER 13.1 | 4450 39436 | 88.3 7.5 4.2 1.4 13.1 44.0 | exp/chain/tdnn_lstm_1a_sp/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.swbd.filt.sys + +# WER stats for rt03 using tdnn_lstm_1a_svd_sp +# %WER 9.7 | 3970 36721 | 91.3 5.9 2.8 1.0 9.7 40.0 | exp/chain/tdnn_lstm_1a_svd_sp/decode_rt03_fsh_sw1_tg/score_8_0.0/rt03_hires.ctm.fsh.filt.sys +# %WER 12 | 8420 76157 | 89.3 7.3 3.4 1.3 12.0 42.0 | exp/chain/tdnn_lstm_1a_svd_sp/decode_rt03_fsh_sw1_tg/score_8_0.0/rt03_hires.ctm.filt.sys +# %WER 14.1 | 4450 39436 | 87.4 8.2 4.3 1.5 14.1 44.6 | exp/chain/tdnn_lstm_1a_svd_sp/decode_rt03_fsh_sw1_tg/score_9_0.0/rt03_hires.ctm.swbd.filt.sys + + +set -e + +# configs for 'chain' +stage=0 +train_stage=-20 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_lstm_1a # Note: _sp will get added to this if $speed_perturb == true. +svd_dir=${dir}_svd # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_dir_affix= + +# training options +leftmost_questions_truncate=-1 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 +# decode options +extra_left_context=50 +extra_right_context=0 +frames_per_chunk= + +remove_egs=false +common_egs_dir= + +affix= + +# config for svd +apply_svd=true +energy_threshold=0.81 +shrinkage_threshold=0.64 +primary_lr_factor=0.25 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 11000 data/$build_tree_train_set $lang $build_tree_ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + lstm_opts="decay-time=20" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2, ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1200000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri5a_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +src_mdl=${dir}/final.mdl +if $apply_svd && [ $stage -le 14 ]; then + # model compression using SVD + + # threshold configs for tdnn layers + mkdir -p $svd_dir/configs + edits_config=$svd_dir/configs/final.config + common_egs_dir=$dir/egs + cat < ${edits_config} + set-learning-rate-factor learning-rate-factor=$primary_lr_factor + apply-svd name=* energy-threshold=$energy_threshold shrinkage-threshold=$shrinkage_threshold +EOF + + # Copy files / directories from source directory + cp ${dir}/{cmvn_opts,tree,frame_subsampling_factor,0.trans_mdl,normalization.fst,den.fst} $svd_dir/. 
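For readers unfamiliar with the apply-svd edit written into the edits config just above (and applied by the nnet3-am-copy call in the next step): roughly speaking, each selected affine component's weight matrix W is replaced by a low-rank product A * B, keeping just enough singular values to retain the requested fraction of squared-singular-value energy, and only when that actually shrinks the layer below the shrinkage threshold. A numpy sketch of the idea, purely illustrative and not Kaldi's implementation:

```python
# Illustrative sketch of SVD-based weight compression; not Kaldi code.
import numpy as np

def svd_compress(W, energy_threshold=0.81, shrinkage_threshold=0.64):
    """Return (A, B) with W ~= A @ B, or (W, None) if factoring would not shrink the layer."""
    U, s, Vt = np.linalg.svd(W, full_matrices=False)
    energy = np.cumsum(s ** 2) / np.sum(s ** 2)
    k = int(np.searchsorted(energy, energy_threshold)) + 1  # smallest rank reaching the energy target
    m, n = W.shape
    if k * (m + n) > shrinkage_threshold * m * n:
        return W, None
    A = U[:, :k] * s[:k]      # shape (m, k)
    B = Vt[:k, :]             # shape (k, n)
    return A, B

# A roughly low-rank 1024 x 512 weight matrix compresses well:
rng = np.random.default_rng(0)
W = rng.standard_normal((1024, 64)) @ rng.standard_normal((64, 512))
A, B = svd_compress(W)
if B is None:
    print("left unfactored")
else:
    print("kept rank", A.shape[1], "params", W.size, "->", A.size + B.size,
          "rel. error %.3f" % (np.linalg.norm(W - A @ B) / np.linalg.norm(W)))
```

The set-learning-rate-factor line with primary_lr_factor=0.25 in the same config appears intended to let the existing layers adapt more slowly during the single fine-tuning epoch that follows.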
+ + # Generate initial model from trained model + $train_cmd $svd_dir/log/generate_input_mdl.log \ + nnet3-am-copy --edits-config=$edits_config $src_mdl $svd_dir/input.raw + + # Retrain the model for 1 epoch + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --trainer.input-model $svd_dir/input.raw \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1200000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 1 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri5a_lats_nodup$suffix \ + --dir ${svd_dir} || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg +fi + +decode_suff=fsh_sw1_tg +graph_dir=$dir/graph_fsh_sw1_tg + +if [ $stage -le 16 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in rt03 eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; + fi + ) & + done +fi + +test_online_decoding=true +lang=data/lang_fsh_sw1_tg +if $test_online_decoding && [ $stage -le 17 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. 
+ steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in rt03 eval2000; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. + + steps/online/nnet3/decode.sh --nj 50 --cmd "$decode_cmd" $iter_opts \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in online decoding" + exit 1 + fi +fi + +if $apply_svd; then + # Decoding the svd retrained model. + dir=$svd_dir +fi + +if [ $stage -le 18 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in rt03 eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; + fi + ) & + done +fi + +test_online_decoding=true +lang=data/lang_fsh_sw1_tg +if $test_online_decoding && [ $stage -le 19 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in rt03 eval2000; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. 
+ + steps/online/nnet3/decode.sh --nj 50 --cmd "$decode_cmd" $iter_opts \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in online decoding" + exit 1 + fi +fi + +exit 0; diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh index c9d50d1f7bd..f3cc869e6de 100755 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh @@ -151,7 +151,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20 dropout-proportion=0.0" mkdir -p $dir/configs diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh index 1cce08abeee..059a81e15fc 100755 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh @@ -148,7 +148,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) gru_opts="dropout-per-frame=true dropout-proportion=0.0 " mkdir -p $dir/configs diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh index 2334c6a1bc1..d86b699d6f6 100755 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh @@ -149,7 +149,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) gru_opts="dropout-per-frame=true dropout-proportion=0.0 " mkdir -p $dir/configs diff --git a/egs/formosa/README.txt b/egs/formosa/README.txt new file mode 100644 index 00000000000..3b9d78dad92 --- /dev/null +++ b/egs/formosa/README.txt @@ -0,0 +1,22 @@ +### Welcome to the demo recipe of the Formosa Speech in the Wild (FSW) Project ### + +The language habits of Taiwanese people are different from other Mandarin speakers (both accents and cultures) [1]. Especially Tainwaese use tranditional Chinese characters, i.e., 繁體中文). To address this issue, a Taiwanese speech corpus collection project "Formosa Speech in the Wild (FSW)" was initiated in 2017 to improve the development of Taiwanese-specific speech recognition techniques. + +FSW corpus will be a large-scale database of real-Life/multi-gene Taiwanese Spontaneous speech collected and transcribed from various sources (radio, TV, open courses, etc.). 
To demostrate that this database is a reasonable data resource for Taiwanese spontaneous speech recognition research, a baseline recipe is provied here for everybody, especially students, to develop their own systems easily and quickly. + +This recipe is based on the "NER-Trs-Vol1" corpus (about 150 hours broadcast radio speech selected from FSW). For more details, please visit: +* Formosa Speech in the Wild (FSW) project (https://sites.google.com/speech.ntut.edu.tw/fsw) + +If you want to apply the NER-Trs-Vol1 corpus, please contact Yuan-Fu Liao (廖元甫) via "yfliao@mail.ntut.edu.tw". This corpus is only for non-commercial research/education use and will be distributed via our GitLab server in https://speech.nchc.org.tw. + +Any bug, errors, comments or suggestions are very welcomed. + +Yuan-Fu Liao (廖元甫) +Associate Professor +Department of electronic Engineering, +National Taipei University of Technology +http://www.ntut.edu.tw/~yfliao +yfliao@mail.ntut.edu.tw + +............ +[1] The languages of Taiwan consist of several varieties of languages under families of the Austronesian languages and the Sino-Tibetan languages. Taiwanese Mandarin, Hokkien, Hakka and Formosan languages are used by 83.5%, 81.9%, 6.6% and 1.4% of the population respectively (2010). Given the prevalent use of Taiwanese Hokkien, the Mandarin spoken in Taiwan has been to a great extent influenced by it. diff --git a/egs/formosa/s5/RESULTS b/egs/formosa/s5/RESULTS new file mode 100644 index 00000000000..b047e5cefe4 --- /dev/null +++ b/egs/formosa/s5/RESULTS @@ -0,0 +1,43 @@ +# +# Reference results +# +# Experimental settings: +# +# training set: show CS, BG, DA, QG, SR, SY and WK, in total 18977 utt., 1,088,948 words +# test set: show JZ, GJ, KX and YX, in total 2112 utt., 135,972 words +# eval set: show JX, TD and WJ, in total 2222 utt., 104,648 words +# +# lexicon: 274,036 words +# phones (IPA): 196 (tonal) +# + +# WER: test + +%WER 61.32 [ 83373 / 135972, 5458 ins, 19156 del, 58759 sub ] exp/mono/decode_test/wer_11_0.0 +%WER 41.00 [ 55742 / 135972, 6725 ins, 12763 del, 36254 sub ] exp/tri1/decode_test/wer_15_0.0 +%WER 40.41 [ 54948 / 135972, 7366 ins, 11505 del, 36077 sub ] exp/tri2/decode_test/wer_14_0.0 +%WER 38.67 [ 52574 / 135972, 6855 ins, 11250 del, 34469 sub ] exp/tri3a/decode_test/wer_15_0.0 +%WER 35.70 [ 48546 / 135972, 7197 ins, 9717 del, 31632 sub ] exp/tri4a/decode_test/wer_17_0.0 +%WER 32.11 [ 43661 / 135972, 6112 ins, 10185 del, 27364 sub ] exp/tri5a/decode_test/wer_17_0.5 +%WER 31.36 [ 42639 / 135972, 6846 ins, 8860 del, 26933 sub ] exp/tri5a_cleaned/decode_test/wer_17_0.5 +%WER 24.43 [ 33218 / 135972, 5524 ins, 7583 del, 20111 sub ] exp/nnet3/tdnn_sp/decode_test/wer_12_0.0 +%WER 23.95 [ 32568 / 135972, 4457 ins, 10271 del, 17840 sub ] exp/chain/tdnn_1a_sp/decode_test/wer_10_0.0 +%WER 23.54 [ 32006 / 135972, 4717 ins, 8644 del, 18645 sub ] exp/chain/tdnn_1b_sp/decode_test/wer_10_0.0 +%WER 20.64 [ 28067 / 135972, 4434 ins, 7946 del, 15687 sub ] exp/chain/tdnn_1c_sp/decode_test/wer_11_0.0 +%WER 20.98 [ 28527 / 135972, 4706 ins, 7816 del, 16005 sub ] exp/chain/tdnn_1d_sp/decode_test/wer_10_0.0 + +# CER: test + +%WER 54.09 [ 116688 / 215718, 4747 ins, 24510 del, 87431 sub ] exp/mono/decode_test/cer_10_0.0 +%WER 32.61 [ 70336 / 215718, 5866 ins, 16282 del, 48188 sub ] exp/tri1/decode_test/cer_13_0.0 +%WER 32.10 [ 69238 / 215718, 6186 ins, 15772 del, 47280 sub ] exp/tri2/decode_test/cer_13_0.0 +%WER 30.40 [ 65583 / 215718, 6729 ins, 13115 del, 45739 sub ] exp/tri3a/decode_test/cer_12_0.0 +%WER 27.53 [ 59389 / 
215718, 6311 ins, 13008 del, 40070 sub ] exp/tri4a/decode_test/cer_15_0.0 +%WER 24.21 [ 52232 / 215718, 6425 ins, 11543 del, 34264 sub ] exp/tri5a/decode_test/cer_15_0.0 +%WER 23.41 [ 50492 / 215718, 6645 ins, 10997 del, 32850 sub ] exp/tri5a_cleaned/decode_test/cer_17_0.0 +%WER 17.07 [ 36829 / 215718, 4734 ins, 9938 del, 22157 sub ] exp/nnet3/tdnn_sp/decode_test/cer_12_0.0 +%WER 16.83 [ 36305 / 215718, 4772 ins, 10810 del, 20723 sub ] exp/chain/tdnn_1a_sp/decode_test/cer_9_0.0 +%WER 16.44 [ 35459 / 215718, 4216 ins, 11278 del, 19965 sub ] exp/chain/tdnn_1b_sp/decode_test/cer_10_0.0 +%WER 13.72 [ 29605 / 215718, 4678 ins, 8066 del, 16861 sub ] exp/chain/tdnn_1c_sp/decode_test/cer_10_0.0 +%WER 14.08 [ 30364 / 215718, 5182 ins, 7588 del, 17594 sub ] exp/chain/tdnn_1d_sp/decode_test/cer_9_0.0 + diff --git a/egs/formosa/s5/cmd.sh b/egs/formosa/s5/cmd.sh new file mode 100755 index 00000000000..66ae9090820 --- /dev/null +++ b/egs/formosa/s5/cmd.sh @@ -0,0 +1,27 @@ +# "queue.pl" uses qsub. The options to it are +# options to qsub. If you have GridEngine installed, +# change this to a queue you have access to. +# Otherwise, use "run.pl", which will run jobs locally +# (make sure your --num-jobs options are no more than +# the number of cpus on your machine. + +# Run locally: +#export train_cmd=run.pl +#export decode_cmd=run.pl + +# JHU cluster (or most clusters using GridEngine, with a suitable +# conf/queue.conf). +export train_cmd="queue.pl" +export decode_cmd="queue.pl --mem 4G" + +host=$(hostname -f) +if [ ${host#*.} == "fit.vutbr.cz" ]; then + queue_conf=$HOME/queue_conf/default.conf # see example /homes/kazi/iveselyk/queue_conf/default.conf, + export train_cmd="queue.pl --config $queue_conf --mem 2G --matylda 0.2" + export decode_cmd="queue.pl --config $queue_conf --mem 3G --matylda 0.1" + export cuda_cmd="queue.pl --config $queue_conf --gpu 1 --mem 10G --tmp 40G" +elif [ ${host#*.} == "cm.cluster" ]; then + # MARCC bluecrab cluster: + export train_cmd="slurm.pl --time 4:00:00 " + export decode_cmd="slurm.pl --mem 4G --time 4:00:00 " +fi diff --git a/egs/formosa/s5/conf/decode.config b/egs/formosa/s5/conf/decode.config new file mode 100644 index 00000000000..d91f86183af --- /dev/null +++ b/egs/formosa/s5/conf/decode.config @@ -0,0 +1,5 @@ +beam=11.0 # beam for decoding. Was 13.0 in the scripts. +first_beam=8.0 # beam for 1st-pass decoding in SAT. + + + diff --git a/egs/formosa/s5/conf/mfcc.conf b/egs/formosa/s5/conf/mfcc.conf new file mode 100644 index 00000000000..a1aa3d6c158 --- /dev/null +++ b/egs/formosa/s5/conf/mfcc.conf @@ -0,0 +1,2 @@ +--use-energy=false # only non-default option. +--sample-frequency=16000 diff --git a/egs/formosa/s5/conf/mfcc_hires.conf b/egs/formosa/s5/conf/mfcc_hires.conf new file mode 100644 index 00000000000..ca067e77b37 --- /dev/null +++ b/egs/formosa/s5/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training. +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--sample-frequency=16000 # Switchboard is sampled at 8kHz +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. 
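Stepping back to the RESULTS file above: each %WER line reports (insertions + deletions + substitutions) over the number of reference words, and the CER block uses the same scoring (hence the literal %WER prefix) on character-split text. A quick self-contained check of the monophone line, with the numbers copied from the table:

```python
# %WER 61.32 [ 83373 / 135972, 5458 ins, 19156 del, 58759 sub ]  (exp/mono, test set)
ins, dele, sub = 5458, 19156, 58759
ref_words = 135972

errors = ins + dele + sub
print(errors)                                   # 83373
print(round(100.0 * errors / ref_words, 2))     # 61.32
```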
+--low-freq=40 # low cutoff frequency for mel bins +--high-freq=-200 # high cutoff frequently, relative to Nyquist of 8000 (=3800) diff --git a/egs/formosa/s5/conf/online_cmvn.conf b/egs/formosa/s5/conf/online_cmvn.conf new file mode 100644 index 00000000000..591367e7ae9 --- /dev/null +++ b/egs/formosa/s5/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used when invoking online2-wav-nnet3-latgen-faster. diff --git a/egs/formosa/s5/conf/pitch.conf b/egs/formosa/s5/conf/pitch.conf new file mode 100644 index 00000000000..e959a19d5b8 --- /dev/null +++ b/egs/formosa/s5/conf/pitch.conf @@ -0,0 +1 @@ +--sample-frequency=16000 diff --git a/egs/formosa/s5/local/chain/run_tdnn.sh b/egs/formosa/s5/local/chain/run_tdnn.sh new file mode 120000 index 00000000000..e1adaa9346d --- /dev/null +++ b/egs/formosa/s5/local/chain/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1d.sh \ No newline at end of file diff --git a/egs/formosa/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/formosa/s5/local/chain/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..66c5ad3335f --- /dev/null +++ b/egs/formosa/s5/local/chain/tuning/run_tdnn_1a.sh @@ -0,0 +1,181 @@ +#!/bin/bash + +# This script is based on run_tdnn_7h.sh in swbd chain recipe. + +set -e + +# configs for 'chain' +affix=1a +stage=0 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn # Note: _sp will get added to this +decode_iter= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=2 +num_jobs_final=12 +minibatch_size=128 +frames_per_eg=150,110,90 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 9 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. 
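To expand on the comment above about what is "critically different": the tree in the next command is built with --frame-subsampling-factor 3 to match the reduced-frame-rate "chain" topology written to $lang/topo earlier, so the network emits one output for every three input frames. A back-of-the-envelope illustration, assuming the conventional 10 ms frame shift (consistent with the 0.01 s frame_shift used elsewhere in this patch):

```python
# Rough arithmetic for the chain-model frame rate; values assumed, not read from the recipe.
frame_shift_ms = 10            # conventional MFCC frame shift (0.01 s)
frame_subsampling_factor = 3   # as passed to steps/nnet3/chain/build_tree.sh below

print("output frame shift:", frame_shift_ms * frame_subsampling_factor, "ms")           # 30 ms
print("a 150-frame training chunk yields", 150 // frame_subsampling_factor, "outputs")  # 50
```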
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 5000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 10 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=625 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=625 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=625 + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=625 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=625 + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=625 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 11 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri5a_sp_lats \ + --use-gpu wait \ + --dir $dir || exit 1; +fi + +if [ $stage -le 12 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph +fi + +graph_dir=$dir/graph +if [ $stage -le 13 ]; then + for test_set in test eval; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 10 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_$test_set \ + $graph_dir data/${test_set}_hires $dir/decode_${test_set} || exit 1; + done + wait; +fi + +exit 0; diff --git a/egs/formosa/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/formosa/s5/local/chain/tuning/run_tdnn_1b.sh new file mode 100755 index 00000000000..1981bb0530d --- /dev/null +++ b/egs/formosa/s5/local/chain/tuning/run_tdnn_1b.sh @@ -0,0 +1,188 @@ +#!/bin/bash + +# This script shows improvement arising from data cleaning. + +# CER: +# %WER 16.83 [ 36305 / 215718, 4772 ins, 10810 del, 20723 sub ] exp/chain/tdnn_1a_sp/decode_test/cer_9_0.0 +# %WER 16.44 [ 35459 / 215718, 4216 ins, 11278 del, 19965 sub ] exp/chain/tdnn_1b_sp/decode_test/cer_10_0.0 + +# steps/info/chain_dir_info.pl exp/chain/tdnn_1b_sp +# exp/chain/tdnn_1b_sp: num-iters=133 nj=2..12 num-params=12.5M dim=43+100->4528 combine=-0.073->-0.073 (over 2) xent:train/valid[87,132,final]=(-1.05,-0.964,-0.963/-1.10,-1.06,-1.05) logprob:train/valid[87,132,final]=(-0.079,-0.065,-0.065/-0.094,-0.092,-0.092) + +set -e + +# configs for 'chain' +affix=1b +nnet3_affix=_1b +stage=0 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn # Note: _sp will get added to this +decode_iter= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=2 +num_jobs_final=12 +minibatch_size=128 +frames_per_eg=150,110,90 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + +# End configuration section. 
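+# Every option above can be overridden from the command line via
+# utils/parse_options.sh; a purely illustrative invocation:
+#   local/chain/tuning/run_tdnn_1b.sh --stage 11 --num-epochs 6
+# (--stage 11 would skip straight to the neural-net training stage below).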
+echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 9 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 5000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 10 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=625 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=625 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=625 + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=625 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=625 + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=625 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 11 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri5a_sp_lats \ + --use-gpu wait \ + --dir $dir || exit 1; +fi + +if [ $stage -le 12 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph +fi + +graph_dir=$dir/graph +if [ $stage -le 13 ]; then + for test_set in test eval; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 10 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_$test_set \ + $graph_dir data/${test_set}_hires $dir/decode_${test_set} || exit 1; + done + wait; +fi +exit 0; diff --git a/egs/formosa/s5/local/chain/tuning/run_tdnn_1c.sh b/egs/formosa/s5/local/chain/tuning/run_tdnn_1c.sh new file mode 100755 index 00000000000..6fa10344cfc --- /dev/null +++ b/egs/formosa/s5/local/chain/tuning/run_tdnn_1c.sh @@ -0,0 +1,191 @@ +#!/bin/bash + +# CER: +# %WER 16.44 [ 35459 / 215718, 4216 ins, 11278 del, 19965 sub ] exp/chain/tdnn_1b_sp/decode_test/cer_10_0.0 +# %WER 13.72 [ 29605 / 215718, 4678 ins, 8066 del, 16861 sub ] exp/chain/tdnn_1c_sp/decode_test/cer_10_0.0 + +# steps/info/chain_dir_info.pl exp/chain/tdnn_1c_sp +# exp/chain/tdnn_1c_sp: num-iters=147 nj=3..16 num-params=17.9M dim=43+100->4528 combine=-0.041->-0.041 (over 2) xent:train/valid[97,146,final]=(-0.845,-0.625,-0.618/-0.901,-0.710,-0.703) logprob:train/valid[97,146,final]=(-0.064,-0.040,-0.039/-0.072,-0.058,-0.057) + +set -e + +# configs for 'chain' +affix=1c +nnet3_affix=_1b +stage=0 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn # Note: _sp will get added to this +decode_iter= + +# training options +num_epochs=6 +initial_effective_lrate=0.00025 +final_effective_lrate=0.000025 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=64 +frames_per_eg=150,110,90 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +# End configuration section. 
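+# The dropout schedule above is a piecewise-linear function of the fraction
+# of training completed: '0,0@0.20,0.5@0.50,0' keeps dropout at 0 until 20%
+# of training, ramps it up to 0.5 at the 50% point, and decays it back to 0
+# by the end; values between the listed points are interpolated linearly.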
+echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 9 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 5000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 10 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + affine_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.002" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + linear-component name=prefinal-l dim=256 $linear_opts + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 11 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3$nnet3_affix/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + 
--chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --constrained false" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri5a_sp_lats \ + --use-gpu wait \ + --dir $dir || exit 1; +fi + +if [ $stage -le 12 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph +fi + +graph_dir=$dir/graph +if [ $stage -le 13 ]; then + for test_set in test eval; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 10 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3${nnet3_affix:+_$nnet3_affix}/ivectors_$test_set \ + $graph_dir data/${test_set}_hires $dir/decode_${test_set} || exit 1; + done + wait; +fi + +exit 0; diff --git a/egs/formosa/s5/local/chain/tuning/run_tdnn_1d.sh b/egs/formosa/s5/local/chain/tuning/run_tdnn_1d.sh new file mode 100755 index 00000000000..1f4b7e12850 --- /dev/null +++ b/egs/formosa/s5/local/chain/tuning/run_tdnn_1d.sh @@ -0,0 +1,190 @@ +#!/bin/bash + +# CER: +# 1a: %WER 16.83 [ 36305 / 215718, 4772 ins, 10810 del, 20723 sub ] exp/chain/tdnn_1a_sp/decode_test/cer_9_0.0 +# 1d: %WER 14.08 [ 30364 / 215718, 5182 ins, 7588 del, 17594 sub ] exp/chain/tdnn_1d_sp/decode_test/cer_9_0.0 + +# steps/info/chain_dir_info.pl exp/chain/tdnn_1d_sp +# exp/chain/tdnn_1d_sp: num-iters=157 nj=3..16 num-params=18.6M dim=43+100->5792 combine=-0.050->-0.050 (over 1) xent:train/valid[103,156,final]=(-0.977,-0.735,-0.725/-0.953,-0.772,-0.768) logprob:train/valid[103,156,final]=(-0.077,-0.052,-0.052/-0.079,-0.065,-0.066) + +set -e + +# configs for 'chain' +affix=1d +stage=0 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn # Note: _sp will get added to this +decode_iter= + +# training options +num_epochs=6 +initial_effective_lrate=0.00025 +final_effective_lrate=0.000025 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=64 +frames_per_eg=150,110,90 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 9 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 10 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + affine_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.002" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + linear-component name=prefinal-l dim=256 $linear_opts + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 11 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3$nnet3_affix/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + 
--trainer.add-option="--optimization.memory-compression-level=2" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --constrained false" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri5a_sp_lats \ + --use-gpu wait \ + --dir $dir || exit 1; +fi + +if [ $stage -le 12 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph +fi + +graph_dir=$dir/graph +if [ $stage -le 13 ]; then + for test_set in test eval; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 10 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3${nnet3_affix:+_$nnet3_affix}/ivectors_$test_set \ + $graph_dir data/${test_set}_hires $dir/decode_${test_set} || exit 1; + done + wait; +fi + +exit 0; diff --git a/egs/formosa/s5/local/nnet3/run_ivector_common.sh b/egs/formosa/s5/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..723589ddd2e --- /dev/null +++ b/egs/formosa/s5/local/nnet3/run_ivector_common.sh @@ -0,0 +1,145 @@ +#!/bin/bash + +set -euo pipefail + +# This script is modified based on mini_librispeech/s5/local/nnet3/run_ivector_common.sh + +# This script is called from local/nnet3/run_tdnn.sh and +# local/chain/run_tdnn.sh (and may eventually be called by more +# scripts). It contains the common feature preparation and +# iVector-related parts of the script. See those scripts for examples +# of usage. + +stage=0 +train_set=train +test_sets="test eval" +gmm=tri5a + +nnet3_affix= + +. ./cmd.sh +. ./path.sh +. utils/parse_options.sh + +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_sp_ali + +for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do + if [ ! -f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + +if [ $stage -le 1 ]; then + # Although the nnet will be trained by high resolution data, we still have to + # perturb the normal data to get the alignment _sp stands for speed-perturbed + echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp + echo "$0: making MFCC features for low-resolution speed-perturbed data" + steps/make_mfcc_pitch.sh --cmd "$train_cmd" --nj 70 data/${train_set}_sp \ + exp/make_mfcc/${train_set}_sp mfcc_perturbed || exit 1; + steps/compute_cmvn_stats.sh data/${train_set}_sp \ + exp/make_mfcc/${train_set}_sp mfcc_perturbed || exit 1; + utils/fix_data_dir.sh data/${train_set}_sp +fi + +if [ $stage -le 2 ]; then + echo "$0: aligning with the perturbed low-resolution data" + steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ + data/${train_set}_sp data/lang $gmm_dir $ali_dir || exit 1 +fi + +if [ $stage -le 3 ]; then + # Create high-resolution MFCC features (with 40 cepstra instead of 13). 
+ # this shows how you can split across multiple file-systems. + echo "$0: creating high-resolution MFCC features" + mfccdir=mfcc_perturbed_hires + + for datadir in ${train_set}_sp ${test_sets}; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires || exit 1; + + for datadir in ${train_set}_sp ${test_sets}; do + steps/make_mfcc_pitch.sh --nj 10 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1; + utils/fix_data_dir.sh data/${datadir}_hires || exit 1; + # create MFCC data dir without pitch to extract iVector + utils/data/limit_feature_dim.sh 0:39 data/${datadir}_hires data/${datadir}_hires_nopitch || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires_nopitch exp/make_hires/$datadir $mfccdir || exit 1; + done +fi + +if [ $stage -le 4 ]; then + echo "$0: computing a subset of data to train the diagonal UBM." + # We'll use about a quarter of the data. + mkdir -p exp/nnet3${nnet3_affix}/diag_ubm + temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm + + num_utts_total=$(wc -l $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=850 + relu-batchnorm-layer name=tdnn2 dim=850 input=Append(-1,0,2) + relu-batchnorm-layer name=tdnn3 dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn4 dim=850 input=Append(-7,0,2) + relu-batchnorm-layer name=tdnn5 dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn6 dim=850 + output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 8 ]; then + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval 500 \ + --use-gpu wait \ + --feat-dir=data/${train_set}_hires \ + --ali-dir $ali_dir \ + --lang data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 9 ]; then + # this version of the decoding treats each utterance separately + # without carrying forward speaker information. 
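+  # one decode job per speaker: num_jobs below just counts the distinct
+  # speaker ids in each test set's utt2spk.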
+ + for decode_set in test eval; do + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + decode_dir=${dir}/decode_$decode_set + steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $decode_dir || exit 1; + done + wait; +fi + +exit 0; diff --git a/egs/formosa/s5/local/prepare_data.sh b/egs/formosa/s5/local/prepare_data.sh new file mode 100755 index 00000000000..68f342e1549 --- /dev/null +++ b/egs/formosa/s5/local/prepare_data.sh @@ -0,0 +1,60 @@ +#!/bin/bash +# Copyright 2015-2016 Sarah Flora Juan +# Copyright 2016 Johns Hopkins University (Author: Yenda Trmal) +# Copyright 2018 Yuan-Fu Liao, National Taipei University of Technology +# AsusTek Computer Inc. (Author: Alex Hung) + +# Apache 2.0 + +set -e -o pipefail + +train_dir=NER-Trs-Vol1/Train +eval_dir=NER-Trs-Vol1-Eval +eval_key_dir=NER-Trs-Vol1-Eval-Key + +. ./path.sh +. parse_options.sh + +for x in $train_dir $eval_dir; do + if [ ! -d "$x" ] ; then + echo >&2 "The directory $x does not exist" + fi +done + +if [ -z "$(command -v dos2unix 2>/dev/null)" ]; then + echo "dos2unix not found on PATH. Please install it manually." + exit 1; +fi + +# have to remvoe previous files to avoid filtering speakers according to cmvn.scp and feats.scp +rm -rf data/all data/train data/test data/eval data/local/train +mkdir -p data/all data/train data/test data/eval data/local/train + + +# make utt2spk, wav.scp and text +find $train_dir -name *.wav -exec sh -c 'x={}; y=$(basename -s .wav $x); printf "%s %s\n" $y $y' \; | dos2unix > data/all/utt2spk +find $train_dir -name *.wav -exec sh -c 'x={}; y=$(basename -s .wav $x); printf "%s %s\n" $y $x' \; | dos2unix > data/all/wav.scp +find $train_dir -name *.txt -exec sh -c 'x={}; y=$(basename -s .txt $x); printf "%s " $y; cat $x' \; | dos2unix > data/all/text + +# fix_data_dir.sh fixes common mistakes (unsorted entries in wav.scp, +# duplicate entries and so on). Also, it regenerates the spk2utt from +# utt2spk +utils/fix_data_dir.sh data/all + +echo "Preparing train and test data" +# test set: JZ, GJ, KX, YX +grep -E "(JZ|GJ|KX|YX)_" data/all/utt2spk | awk '{print $1}' > data/all/cv.spk +utils/subset_data_dir_tr_cv.sh --cv-spk-list data/all/cv.spk data/all data/train data/test + +# for LM training +echo "cp data/train/text data/local/train/text for language model training" +cat data/train/text | awk '{$1=""}1;' | awk '{$1=$1}1;' > data/local/train/text + +# preparing EVAL set. +find $eval_dir -name *.wav -exec sh -c 'x={}; y=$(basename -s .wav $x); printf "%s %s\n" $y $y' \; | dos2unix > data/eval/utt2spk +find $eval_dir -name *.wav -exec sh -c 'x={}; y=$(basename -s .wav $x); printf "%s %s\n" $y $x' \; | dos2unix > data/eval/wav.scp +find $eval_key_dir -name *.txt -exec sh -c 'x={}; y=$(basename -s .txt $x); printf "%s " $y; cat $x' \; | dos2unix > data/eval/text +utils/fix_data_dir.sh data/eval + +echo "Data preparation completed." 
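+# At this point data/train, data/test and data/eval contain the usual Kaldi
+# files; entries look roughly like the following (utterance ids here are
+# purely illustrative, and note that each utterance is treated as its own
+# "speaker"):
+#   wav.scp : DEV001_0001 NER-Trs-Vol1/Train/.../DEV001_0001.wav
+#   utt2spk : DEV001_0001 DEV001_0001
+#   text    : DEV001_0001 <word sequence>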
+exit 0; diff --git a/egs/formosa/s5/local/prepare_dict.sh b/egs/formosa/s5/local/prepare_dict.sh new file mode 100755 index 00000000000..4e580f5f6e8 --- /dev/null +++ b/egs/formosa/s5/local/prepare_dict.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# Copyright 2015-2016 Sarah Flora Juan +# Copyright 2016 Johns Hopkins University (Author: Yenda Trmal) +# Copyright 2018 Yuan-Fu Liao, National Taipei University of Technology +# Apache 2.0 + +source_dir=NER-Trs-Vol1/Language +dict_dir=data/local/dict +rm -rf $dict_dir +mkdir -p $dict_dir + +# +# +# +rm -f $dict_dir/lexicon.txt +touch $dict_dir/lexicon.txt +cat $source_dir/lexicon.txt > $dict_dir/lexicon.txt +echo " SIL" >> $dict_dir/lexicon.txt + +# +# define silence phone +# +rm -f $dict_dir/silence_phones.txt +touch $dict_dir/silence_phones.txt + +echo "SIL" > $dict_dir/silence_phones.txt + +# +# find nonsilence phones +# +rm -f $dict_dir/nonsilence_phones.txt +touch $dict_dir/nonsilence_phones.txt + +cat $source_dir/lexicon.txt | grep -v -F -f $dict_dir/silence_phones.txt | \ + perl -ane 'print join("\n", @F[1..$#F]) . "\n"; ' | \ + sort -u > $dict_dir/nonsilence_phones.txt + +# +# add optional silence phones +# + +rm -f $dict_dir/optional_silence.txt +touch $dict_dir/optional_silence.txt +echo "SIL" > $dict_dir/optional_silence.txt + +# +# extra questions +# +rm -f $dict_dir/extra_questions.txt +touch $dict_dir/extra_questions.txt +cat $dict_dir/silence_phones.txt | awk '{printf("%s ", $1);} END{printf "\n";}' > $dict_dir/extra_questions.txt || exit 1; +cat $dict_dir/nonsilence_phones.txt | awk '{printf("%s ", $1);} END{printf "\n";}' >> $dict_dir/extra_questions.txt || exit 1; + +echo "Dictionary preparation succeeded" +exit 0; diff --git a/egs/formosa/s5/local/prepare_lm.sh b/egs/formosa/s5/local/prepare_lm.sh new file mode 100755 index 00000000000..59fe1529658 --- /dev/null +++ b/egs/formosa/s5/local/prepare_lm.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# Copyright 2015-2016 Sarah Flora Juan +# Copyright 2016 Johns Hopkins University (Author: Yenda Trmal) +# Apache 2.0 + +set -e -o pipefail + +# To create G.fst from ARPA language model +. 
./path.sh || die "path.sh expected"; + +local/train_lms_srilm.sh --train-text data/train/text data/ data/srilm + +#nl -nrz -w10 corpus/LM/iban-bp-2012.txt | utils/shuffle_list.pl > data/local/external_text +local/train_lms_srilm.sh --train-text data/local/external_text data/ data/srilm_external + +# let's do ngram interpolation of the previous two LMs +# the lm.gz is always symlink to the model with the best perplexity, so we use that + +mkdir -p data/srilm_interp +for w in 0.9 0.8 0.7 0.6 0.5; do + ngram -lm data/srilm/lm.gz -mix-lm data/srilm_external/lm.gz \ + -lambda $w -write-lm data/srilm_interp/lm.${w}.gz + echo -n "data/srilm_interp/lm.${w}.gz " + ngram -lm data/srilm_interp/lm.${w}.gz -ppl data/srilm/dev.txt | paste -s - +done | sort -k15,15g > data/srilm_interp/perplexities.txt + +# for basic decoding, let's use only a trigram LM +[ -d data/lang_test/ ] && rm -rf data/lang_test +cp -R data/lang data/lang_test +lm=$(cat data/srilm/perplexities.txt | grep 3gram | head -n1 | awk '{print $1}') +local/arpa2G.sh $lm data/lang_test data/lang_test + +# for decoding using bigger LM let's find which interpolated gave the most improvement +[ -d data/lang_big ] && rm -rf data/lang_big +cp -R data/lang data/lang_big +lm=$(cat data/srilm_interp/perplexities.txt | head -n1 | awk '{print $1}') +local/arpa2G.sh $lm data/lang_big data/lang_big + +# for really big lm, we should only decode using small LM +# and resocre using the big lm +utils/build_const_arpa_lm.sh $lm data/lang_big data/lang_big +exit 0; diff --git a/egs/formosa/s5/local/run_cleanup_segmentation.sh b/egs/formosa/s5/local/run_cleanup_segmentation.sh new file mode 100755 index 00000000000..b72cd89b4d1 --- /dev/null +++ b/egs/formosa/s5/local/run_cleanup_segmentation.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +# Copyright 2016 Vimal Manohar +# 2016 Johns Hopkins University (author: Daniel Povey) +# 2017 Nagendra Kumar Goel +# 2019 AsusTek Computer Inc. (author: Alex Hung) +# Apache 2.0 + +# This script demonstrates how to re-segment training data selecting only the +# "good" audio that matches the transcripts. +# The basic idea is to decode with an existing in-domain acoustic model, and a +# biased language model built from the reference, and then work out the +# segmentation from a ctm like file. + +# For nnet3 and chain results after cleanup, see the scripts in +# local/nnet3/run_tdnn.sh and local/chain/run_tdnn.sh + +# GMM Results for speaker-independent (SI) and speaker adaptive training (SAT) systems on dev and test sets +# [will add these later]. + +set -e +set -o pipefail +set -u + +stage=0 +cleanup_stage=0 +data=data/train +cleanup_affix=cleaned +srcdir=exp/tri5a +langdir=data/lang_test +nj=20 +decode_nj=20 +decode_num_threads=1 + +. ./cmd.sh +if [ -f ./path.sh ]; then . ./path.sh; fi +. utils/parse_options.sh + +cleaned_data=${data}_${cleanup_affix} + +dir=${srcdir}_${cleanup_affix}_work +cleaned_dir=${srcdir}_${cleanup_affix} + +if [ $stage -le 1 ]; then + # This does the actual data cleanup. 
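+  # Of the five positional arguments, the fourth (${srcdir}_${cleanup_affix}_work)
+  # is a working directory for the biased-decoding/ctm intermediates and the
+  # fifth ($cleaned_data) is the re-segmented output data directory.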
+ steps/cleanup/clean_and_segment_data.sh --stage $cleanup_stage \ + --nj $nj --cmd "$train_cmd" \ + $data $langdir $srcdir $dir $cleaned_data +fi + +if [ $stage -le 2 ]; then + steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ + $cleaned_data $langdir $srcdir ${srcdir}_ali_${cleanup_affix} +fi + +if [ $stage -le 3 ]; then + steps/train_sat.sh --cmd "$train_cmd" \ + 3500 100000 $cleaned_data $langdir ${srcdir}_ali_${cleanup_affix} ${cleaned_dir} +fi + +utils/data/get_utt2dur.sh data/train_cleaned +ori_avg_dur=$(awk 'BEGIN{total=0}{total += $2}END{printf("%.2f", total/NR)}' ${data}/utt2dur) +new_avg_dur=$(awk 'BEGIN{total=0}{total += $2}END{printf("%.2f", total/NR)}' ${cleaned_data}/utt2dur) +echo "average duration was reduced from ${ori_avg_dur}s to ${new_avg_dur}s." +# average duration was reduced from 21.68s to 10.97s. +exit 0; diff --git a/egs/formosa/s5/local/score.sh b/egs/formosa/s5/local/score.sh new file mode 100755 index 00000000000..a9786169973 --- /dev/null +++ b/egs/formosa/s5/local/score.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +set -e -o pipefail +set -x +steps/score_kaldi.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" + +echo "$0: Done" diff --git a/egs/formosa/s5/local/train_lms.sh b/egs/formosa/s5/local/train_lms.sh new file mode 100755 index 00000000000..efc5b92c573 --- /dev/null +++ b/egs/formosa/s5/local/train_lms.sh @@ -0,0 +1,63 @@ +#!/bin/bash + + +# To be run from one directory above this script. +. ./path.sh + +text=data/local/train/text +lexicon=data/local/dict/lexicon.txt + +for f in "$text" "$lexicon"; do + [ ! -f $x ] && echo "$0: No such file $f" && exit 1; +done + +# This script takes no arguments. It assumes you have already run +# aishell_data_prep.sh. +# It takes as input the files +# data/local/train/text +# data/local/dict/lexicon.txt +dir=data/local/lm +mkdir -p $dir + +kaldi_lm=`which train_lm.sh` +if [ -z $kaldi_lm ]; then + echo "$0: train_lm.sh is not found. That might mean it's not installed" + echo "$0: or it is not added to PATH" + echo "$0: Use the script tools/extra/install_kaldi_lm.sh to install it" + exit 1 +fi + +cleantext=$dir/text.no_oov + +cat $text | awk -v lex=$lexicon 'BEGIN{while((getline0){ seen[$1]=1; } } + {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf(" ");} } printf("\n");}' \ + > $cleantext || exit 1; + +cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \ + sort -nr > $dir/word.counts || exit 1; + +# Get counts from acoustic training transcripts, and add one-count +# for each word in the lexicon (but not silence, we don't want it +# in the LM-- we'll add it optionally later). +cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \ + cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \ + sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1; + +# note: we probably won't really make use of as there aren't any OOVs +cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "" "" "" > $dir/word_map \ + || exit 1; + +# note: ignore 1st field of train.txt, it's the utterance-id. +cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline0)map[$1]=$2;} + { for(n=2;n<=NF;n++) { printf map[$n]; if(n$dir/train.gz \ + || exit 1; + +train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1; + +# LM is small enough that we don't need to prune it (only about 0.7M N-grams). 
+# Perplexity over 128254.000000 words is 90.446690 + +# note: output is +# data/local/lm/3gram-mincount/lm_unpruned.gz + +exit 0; diff --git a/egs/formosa/s5/local/wer_hyp_filter b/egs/formosa/s5/local/wer_hyp_filter new file mode 100755 index 00000000000..519d92ee80d --- /dev/null +++ b/egs/formosa/s5/local/wer_hyp_filter @@ -0,0 +1,19 @@ +#!/usr/bin/env perl + +@filters=(''); + +foreach $w (@filters) { + $bad{$w} = 1; +} + +while() { + @A = split(" ", $_); + $id = shift @A; + print "$id "; + foreach $a (@A) { + if (!defined $bad{$a}) { + print "$a "; + } + } + print "\n"; +} diff --git a/egs/formosa/s5/local/wer_output_filter b/egs/formosa/s5/local/wer_output_filter new file mode 100755 index 00000000000..06a99a43e34 --- /dev/null +++ b/egs/formosa/s5/local/wer_output_filter @@ -0,0 +1,25 @@ +#!/usr/bin/env perl +# Copyright 2012-2014 Johns Hopkins University (Author: Yenda Trmal) +# Apache 2.0 +use utf8; + +use open qw(:encoding(utf8)); +binmode STDIN, ":utf8"; +binmode STDOUT, ":utf8"; +binmode STDERR, ":utf8"; + +while (<>) { + @F = split " "; + print $F[0] . " "; + foreach $s (@F[1..$#F]) { + if (($s =~ /\[.*\]/) || ($s =~ /\<.*\>/) || ($s =~ "")) { + print ""; + } else { + print "$s" + } + print " "; + } + print "\n"; +} + + diff --git a/egs/formosa/s5/local/wer_ref_filter b/egs/formosa/s5/local/wer_ref_filter new file mode 100755 index 00000000000..519d92ee80d --- /dev/null +++ b/egs/formosa/s5/local/wer_ref_filter @@ -0,0 +1,19 @@ +#!/usr/bin/env perl + +@filters=(''); + +foreach $w (@filters) { + $bad{$w} = 1; +} + +while() { + @A = split(" ", $_); + $id = shift @A; + print "$id "; + foreach $a (@A) { + if (!defined $bad{$a}) { + print "$a "; + } + } + print "\n"; +} diff --git a/egs/formosa/s5/path.sh b/egs/formosa/s5/path.sh new file mode 100755 index 00000000000..2d17b17a84a --- /dev/null +++ b/egs/formosa/s5/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/formosa/s5/run.sh b/egs/formosa/s5/run.sh new file mode 100755 index 00000000000..a4d0f2dcd1d --- /dev/null +++ b/egs/formosa/s5/run.sh @@ -0,0 +1,217 @@ +#!/bin/bash +# +# Copyright 2018, Yuan-Fu Liao, National Taipei University of Technology, yfliao@mail.ntut.edu.tw +# +# Before you run this recipe, please apply, download and put or make a link of the corpus under this folder (folder name: "NER-Trs-Vol1"). +# For more detail, please check: +# 1. Formosa Speech in the Wild (FSW) project (https://sites.google.com/speech.ntut.edu.tw/fsw/home/corpus) +# 2. Formosa Speech Recognition Challenge (FSW) 2018 (https://sites.google.com/speech.ntut.edu.tw/fsw/home/challenge) +stage=-2 +num_jobs=20 + +train_dir=NER-Trs-Vol1/Train +eval_dir=NER-Trs-Vol1-Eval +eval_key_dir=NER-Trs-Vol1-Eval-Key + +# shell options +set -eo pipefail + +. ./cmd.sh +. 
./utils/parse_options.sh + +# configure number of jobs running in parallel, you should adjust these numbers according to your machines +# data preparation +if [ $stage -le -2 ]; then + # Lexicon Preparation, + echo "$0: Lexicon Preparation" + local/prepare_dict.sh || exit 1; + + # Data Preparation + echo "$0: Data Preparation" + local/prepare_data.sh --train-dir $train_dir --eval-dir $eval_dir --eval-key-dir $eval_key_dir || exit 1; + + # Phone Sets, questions, L compilation + echo "$0: Phone Sets, questions, L compilation Preparation" + rm -rf data/lang + utils/prepare_lang.sh --position-dependent-phones false data/local/dict \ + "" data/local/lang data/lang || exit 1; + + # LM training + echo "$0: LM training" + rm -rf data/local/lm/3gram-mincount + local/train_lms.sh || exit 1; + + # G compilation, check LG composition + echo "$0: G compilation, check LG composition" + utils/format_lm.sh data/lang data/local/lm/3gram-mincount/lm_unpruned.gz \ + data/local/dict/lexicon.txt data/lang_test || exit 1; + +fi + +# Now make MFCC plus pitch features. +# mfccdir should be some place with a largish disk where you +# want to store MFCC features. +mfccdir=mfcc + +# mfcc +if [ $stage -le -1 ]; then + echo "$0: making mfccs" + for x in train test eval; do + steps/make_mfcc_pitch.sh --cmd "$train_cmd" --nj $num_jobs data/$x exp/make_mfcc/$x $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1; + utils/fix_data_dir.sh data/$x || exit 1; + done +fi + +# mono +if [ $stage -le 0 ]; then + echo "$0: train mono model" + # Make some small data subsets for early system-build stages. + echo "$0: make training subsets" + utils/subset_data_dir.sh --shortest data/train 3000 data/train_mono + + # train mono + steps/train_mono.sh --boost-silence 1.25 --cmd "$train_cmd" --nj $num_jobs \ + data/train_mono data/lang exp/mono || exit 1; + + # Get alignments from monophone system. 
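+  # --boost-silence 1.25 scales up the likelihood of the silence phones
+  # slightly while aligning, which makes these early alignments a bit more
+  # robust.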
+ steps/align_si.sh --boost-silence 1.25 --cmd "$train_cmd" --nj $num_jobs \ + data/train data/lang exp/mono exp/mono_ali || exit 1; + + # Monophone decoding + ( + utils/mkgraph.sh data/lang_test exp/mono exp/mono/graph || exit 1; + steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj $num_jobs \ + exp/mono/graph data/test exp/mono/decode_test + )& +fi + +# tri1 +if [ $stage -le 1 ]; then + echo "$0: train tri1 model" + # train tri1 [first triphone pass] + steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \ + 2500 20000 data/train data/lang exp/mono_ali exp/tri1 || exit 1; + + # align tri1 + steps/align_si.sh --cmd "$train_cmd" --nj $num_jobs \ + data/train data/lang exp/tri1 exp/tri1_ali || exit 1; + + # decode tri1 + ( + utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph || exit 1; + steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj $num_jobs \ + exp/tri1/graph data/test exp/tri1/decode_test + )& +fi + +# tri2 +if [ $stage -le 2 ]; then + echo "$0: train tri2 model" + # train tri2 [delta+delta-deltas] + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 20000 data/train data/lang exp/tri1_ali exp/tri2 || exit 1; + + # align tri2b + steps/align_si.sh --cmd "$train_cmd" --nj $num_jobs \ + data/train data/lang exp/tri2 exp/tri2_ali || exit 1; + + # decode tri2 + ( + utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph + steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj $num_jobs \ + exp/tri2/graph data/test exp/tri2/decode_test + )& +fi + +# tri3a +if [ $stage -le 3 ]; then + echo "$-: train tri3 model" + # Train tri3a, which is LDA+MLLT, + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 2500 20000 data/train data/lang exp/tri2_ali exp/tri3a || exit 1; + + # decode tri3a + ( + utils/mkgraph.sh data/lang_test exp/tri3a exp/tri3a/graph || exit 1; + steps/decode.sh --cmd "$decode_cmd" --nj $num_jobs --config conf/decode.config \ + exp/tri3a/graph data/test exp/tri3a/decode_test + )& +fi + +# tri4 +if [ $stage -le 4 ]; then + echo "$0: train tri4 model" + # From now, we start building a more serious system (with SAT), and we'll + # do the alignment with fMLLR. + steps/align_fmllr.sh --cmd "$train_cmd" --nj $num_jobs \ + data/train data/lang exp/tri3a exp/tri3a_ali || exit 1; + + steps/train_sat.sh --cmd "$train_cmd" \ + 2500 20000 data/train data/lang exp/tri3a_ali exp/tri4a || exit 1; + + # align tri4a + steps/align_fmllr.sh --cmd "$train_cmd" --nj $num_jobs \ + data/train data/lang exp/tri4a exp/tri4a_ali + + # decode tri4a + ( + utils/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph + steps/decode_fmllr.sh --cmd "$decode_cmd" --nj $num_jobs --config conf/decode.config \ + exp/tri4a/graph data/test exp/tri4a/decode_test + )& +fi + +# tri5 +if [ $stage -le 5 ]; then + echo "$0: train tri5 model" + # Building a larger SAT system. 
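+  # 3500 here is the number of tree leaves and 100000 the total number of
+  # Gaussians -- a larger model than the 2500/20000 used for the earlier
+  # triphone systems.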
+ steps/train_sat.sh --cmd "$train_cmd" \ + 3500 100000 data/train data/lang exp/tri4a_ali exp/tri5a || exit 1; + + # align tri5a + steps/align_fmllr.sh --cmd "$train_cmd" --nj $num_jobs \ + data/train data/lang exp/tri5a exp/tri5a_ali || exit 1; + + # decode tri5 + ( + utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph || exit 1; + steps/decode_fmllr.sh --cmd "$decode_cmd" --nj $num_jobs --config conf/decode.config \ + exp/tri5a/graph data/test exp/tri5a/decode_test || exit 1; + )& +fi + +# nnet3 tdnn models +# commented out by default, since the chain model is usually faster and better +#if [ $stage -le 6 ]; then + # echo "$0: train nnet3 model" + # local/nnet3/run_tdnn.sh +#fi + +# chain model +if [ $stage -le 7 ]; then + # The iVector-extraction and feature-dumping parts coulb be skipped by setting "--train_stage 7" + echo "$0: train chain model" + local/chain/run_tdnn.sh +fi + +# getting results (see RESULTS file) +if [ $stage -le 8 ]; then + echo "$0: extract the results" + for test_set in test eval; do + echo "WER: $test_set" + for x in exp/*/decode_${test_set}*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done 2>/dev/null + for x in exp/*/*/decode_${test_set}*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done 2>/dev/null + echo + + echo "CER: $test_set" + for x in exp/*/decode_${test_set}*; do [ -d $x ] && grep WER $x/cer_* | utils/best_wer.sh; done 2>/dev/null + for x in exp/*/*/decode_${test_set}*; do [ -d $x ] && grep WER $x/cer_* | utils/best_wer.sh; done 2>/dev/null + echo + done +fi + +# finish +echo "$0: all done" + +exit 0; diff --git a/egs/formosa/s5/steps b/egs/formosa/s5/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/formosa/s5/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/formosa/s5/utils b/egs/formosa/s5/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/formosa/s5/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/egs/gale_arabic/s5/local/gale_format_data.sh b/egs/gale_arabic/s5/local/gale_format_data.sh index 85a946a58d9..053323dc194 100755 --- a/egs/gale_arabic/s5/local/gale_format_data.sh +++ b/egs/gale_arabic/s5/local/gale_format_data.sh @@ -57,4 +57,4 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \ echo gale_format_data succeeded. 
-exit 0 \ No newline at end of file +exit 0 diff --git a/egs/gale_arabic/s5/local/gale_prep_dict.sh b/egs/gale_arabic/s5/local/gale_prep_dict.sh index 74ef789eda7..f6fd83378d0 100755 --- a/egs/gale_arabic/s5/local/gale_prep_dict.sh +++ b/egs/gale_arabic/s5/local/gale_prep_dict.sh @@ -25,9 +25,8 @@ echo SIL > $dir/optional_silence.txt cat $dir/lexicon.txt | cut -d ' ' -f2- | tr -s ' ' '\n' |\ sort -u > $dir/nonsilence_phones.txt || exit 1; +perl -i -pe 'print " SIL\n" if $.==1' $dir/lexicon.txt - sed -i '1i SIL' $dir/lexicon.txt - echo Dictionary preparation succeeded exit 0 diff --git a/egs/gale_arabic/s5/local/gale_train_lms.sh b/egs/gale_arabic/s5/local/gale_train_lms.sh index 1b5d4665a19..8f8e715390f 100755 --- a/egs/gale_arabic/s5/local/gale_train_lms.sh +++ b/egs/gale_arabic/s5/local/gale_train_lms.sh @@ -113,4 +113,4 @@ fi echo train lm succeeded -exit 0 \ No newline at end of file +exit 0 diff --git a/egs/gale_arabic/s5b/RESULTS b/egs/gale_arabic/s5b/RESULTS index 2260a106654..a485240ff6b 100644 --- a/egs/gale_arabic/s5b/RESULTS +++ b/egs/gale_arabic/s5b/RESULTS @@ -2,13 +2,7 @@ # This file is generated using local/split_wer.sh $galeData //galeData is a local folder to keep intermediate gale data # look at the end of run.sh in the same folder ## -##### RESULTS generated by amali at 2017-01-01-08-05-59 - Report Results WER: -%WER 9.50 [ 2124 / 22363, 160 ins, 275 del, 1689 sub ] exp/chain_cleaned/tdnn_lstm1a_sp_bi/decode/wer_report_9 -%WER 10.72 [ 2398 / 22363, 163 ins, 313 del, 1922 sub ] exp/chain_cleaned/tdnn1b_sp_bi/decode/wer_report_9 -%WER 12.04 [ 2693 / 22363, 226 ins, 271 del, 2196 sub ] exp/nnet3_cleaned/lstm_ld5_sp/decode/wer_report_9 -%WER 12.29 [ 2749 / 22363, 273 ins, 266 del, 2210 sub ] exp/nnet3_cleaned/tdnn_sp/decode/wer_report_10 %WER 17.82 [ 3986 / 22363, 315 ins, 618 del, 3053 sub ] exp/sgmm_5a_mmi_b0.1/decode/wer_report_12 %WER 18.15 [ 4059 / 22363, 335 ins, 589 del, 3135 sub ] exp/sgmm_5a_mmi_b0.1/decode4/wer_report_11 %WER 18.42 [ 4119 / 22363, 346 ins, 590 del, 3183 sub ] exp/sgmm_5a_mmi_b0.1/decode3/wer_report_11 @@ -27,10 +21,6 @@ Report Results WER: %WER 25.66 [ 5738 / 22363, 478 ins, 838 del, 4422 sub ] exp/tri2a/decode/wer_report_14 %WER 26.38 [ 5900 / 22363, 435 ins, 929 del, 4536 sub ] exp/tri1/decode/wer_report_15 Conversational Results WER: -%WER 21.59 [ 10213 / 47305, 944 ins, 3092 del, 6177 sub ] exp/chain_cleaned/tdnn_lstm1a_sp_bi/decode/wer_conversational_9 -%WER 24.77 [ 11716 / 47305, 1098 ins, 3579 del, 7039 sub ] exp/chain_cleaned/tdnn1b_sp_bi/decode/wer_conversational_9 -%WER 26.78 [ 12670 / 47305, 1741 ins, 2434 del, 8495 sub ] exp/nnet3_cleaned/lstm_ld5_sp/decode/wer_conversational_9 -%WER 27.55 [ 13032 / 47305, 1800 ins, 2666 del, 8566 sub ] exp/nnet3_cleaned/tdnn_sp/decode/wer_conversational_11 %WER 34.10 [ 16133 / 47305, 1903 ins, 3245 del, 10985 sub ] exp/sgmm_5a_mmi_b0.1/decode/wer_conversational_11 %WER 34.81 [ 16466 / 47305, 2077 ins, 3037 del, 11352 sub ] exp/sgmm_5a_mmi_b0.1/decode4/wer_conversational_10 %WER 35.19 [ 16648 / 47305, 1933 ins, 3264 del, 11451 sub ] exp/sgmm_5a_mmi_b0.1/decode3/wer_conversational_11 @@ -49,10 +39,6 @@ Conversational Results WER: %WER 45.92 [ 21724 / 47305, 1995 ins, 5213 del, 14516 sub ] exp/tri2a/decode/wer_conversational_14 %WER 46.86 [ 22166 / 47305, 2212 ins, 4819 del, 15135 sub ] exp/tri1/decode/wer_conversational_13 Combined Results for Reports and Conversational WER: -%WER 17.64 [ 12286 / 69668, 1310 ins, 2807 del, 8169 sub ] exp/chain_cleaned/tdnn_lstm1a_sp_bi/decode/wer_8 -%WER 20.26 [ 14114 / 
69668, 1261 ins, 3892 del, 8961 sub ] exp/chain_cleaned/tdnn1b_sp_bi/decode/wer_9 -%WER 22.05 [ 15363 / 69668, 1967 ins, 2705 del, 10691 sub ] exp/nnet3_cleaned/lstm_ld5_sp/decode/wer_9 -%WER 22.66 [ 15786 / 69668, 2047 ins, 2955 del, 10784 sub ] exp/nnet3_cleaned/tdnn_sp/decode/wer_11 %WER 28.89 [ 20127 / 69668, 2244 ins, 3829 del, 14054 sub ] exp/sgmm_5a_mmi_b0.1/decode/wer_11 %WER 29.48 [ 20541 / 69668, 2243 ins, 3860 del, 14438 sub ] exp/sgmm_5a_mmi_b0.1/decode4/wer_11 %WER 29.81 [ 20767 / 69668, 2279 ins, 3854 del, 14634 sub ] exp/sgmm_5a_mmi_b0.1/decode3/wer_11 @@ -65,8 +51,30 @@ Combined Results for Reports and Conversational WER: %WER 32.36 [ 22542 / 69668, 2156 ins, 4184 del, 16202 sub ] exp/tri2b_mmi/decode_it4/wer_11 %WER 32.50 [ 22640 / 69668, 2393 ins, 3956 del, 16291 sub ] exp/tri2b_mmi/decode_it3/wer_11 %WER 32.79 [ 22847 / 69668, 2407 ins, 4760 del, 15680 sub ] exp/tri2b_mpe/decode_it3/wer_13 +# WER with train_sat_basis +%WER 33.35 [ 23233 / 69668, 2385 ins, 5274 del, 15574 sub ] exp/tri3b/decode/wer_16_0.5 +# WER with train_sat %WER 33.61 [ 23413 / 69668, 2817 ins, 4577 del, 16019 sub ] exp/tri3b/decode/wer_17 %WER 35.73 [ 24894 / 69668, 2630 ins, 4944 del, 17320 sub ] exp/tri3b/decode.si/wer_15 %WER 36.17 [ 25196 / 69668, 2429 ins, 5393 del, 17374 sub ] exp/tri2b/decode/wer_16 %WER 39.42 [ 27462 / 69668, 2473 ins, 6051 del, 18938 sub ] exp/tri2a/decode/wer_14 %WER 40.35 [ 28113 / 69668, 2713 ins, 5635 del, 19765 sub ] exp/tri1/decode/wer_13 + + +# Effect of GMM seed model (tri2b instead of tri3b). Using tri3b give a slightly better result +# as compared to using tri2b as seed. +%WER 16.66 [ 11610 / 69668, 1233 ins, 2747 del, 7630 sub ] exp/chain/tdnn_1a_3b_sp/decode_test/wer_10_0.0 +%WER 16.71 [ 11642 / 69668, 1145 ins, 2908 del, 7589 sub ] exp/chain/tdnn_1a_2b_sp/decode_test/wer_9_0.0 + +# Effect of Tree-size (3500, 4500, 7000, 11000) +%WER 16.66 [ 11610 / 69668, 1233 ins, 2747 del, 7630 sub ] exp/chain/tdnn_1a_3500_sp/decode_test/wer_10_0.0 +%WER 16.59 [ 11557 / 69668, 1234 ins, 2646 del, 7677 sub ] exp/chain/tdnn_1a_4500_sp/decode_test/wer_10_0.0 +%WER 16.47 [ 11474 / 69668, 1421 ins, 2207 del, 7846 sub ] exp/chain/tdnn_1a_7000_sp/decode_test/wer_9_0.0 +%WER 16.62 [ 11580 / 69668, 1164 ins, 2789 del, 7627 sub ] exp/chain/tdnn_1a_11000_sp/decode_test/wer_10_0.0 + +# Effect of l2-regularization on the output with tree-size=7000. l2 on the output (0.005,0.002) +%WER 16.54 [ 11522 / 69668, 1123 ins, 2739 del, 7660 sub ] exp/chain/tdnn_1a_7000_005_sp/decode_test/wer_9_0.5 +%WER 16.47 [ 11474 / 69668, 1421 ins, 2207 del, 7846 sub ] exp/chain/tdnn_1a_7000_002_sp/decode_test/wer_9_0.0 + +#current best 'chain' models (see local/chain/tuning/run_tdnn_1a.sh) +%WER 16.47 [ 11474 / 69668, 1421 ins, 2207 del, 7846 sub ] exp/chain/tdnn_1a_sp/decode_test/wer_9_0.0 diff --git a/egs/gale_arabic/s5b/cmd.sh b/egs/gale_arabic/s5b/cmd.sh index 71dd849a93b..ea341c98d4a 100755 --- a/egs/gale_arabic/s5b/cmd.sh +++ b/egs/gale_arabic/s5b/cmd.sh @@ -10,6 +10,6 @@ # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
-export train_cmd="queue.pl --mem 2G" -export decode_cmd="queue.pl --mem 4G" -export mkgraph_cmd="queue.pl --mem 8G" +export train_cmd="retry.pl queue.pl --mem 2G" +export decode_cmd="retry.pl queue.pl --mem 4G" +export mkgraph_cmd="retry.pl queue.pl --mem 8G" diff --git a/egs/gale_arabic/s5b/local/chain/compare_wer.sh b/egs/gale_arabic/s5b/local/chain/compare_wer.sh new file mode 100755 index 00000000000..1a40523355a --- /dev/null +++ b/egs/gale_arabic/s5b/local/chain/compare_wer.sh @@ -0,0 +1,72 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b} + +# ./local/chain/compare_wer.sh exp/chain/cnn1a +# System cnn1a +# WER 0.61 +# CER 0.15 +# Final train prob -0.0377 +# Final valid prob -0.0380 +# Final train prob (xent) -0.0830 +# Final valid prob (xent) -0.0838 + +if [ $# == 0 ]; then + echo "Usage: $0: [ ... ]" + echo "e.g.: $0 exp/chain/cnn{1a,1b}" + exit 1 +fi + +echo "# $0 $*" +used_epochs=false + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +echo -n "# WER " +for x in $*; do + wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# CER " +for x in $*; do + cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo diff --git a/egs/gale_arabic/s5b/local/chain/run_chain_common.sh b/egs/gale_arabic/s5b/local/chain/run_chain_common.sh new file mode 100755 index 00000000000..da37e148441 --- /dev/null +++ b/egs/gale_arabic/s5b/local/chain/run_chain_common.sh @@ -0,0 +1,82 @@ +#!/bin/bash + +# this script has common stages shared across librispeech chain recipes. +# It generates a new topology in a new lang directory, gets the alignments as +# lattices, and builds a tree for the new topology +set -e + +stage=11 + +# input directory names. These options are actually compulsory, and they have +# been named for convenience +gmm_dir= +ali_dir= +lores_train_data_dir= + +num_leaves=6000 + +# output directory names. They are also compulsory. +lang= +lat_dir= +tree_dir= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + +[ -z $lang ] && echo "Set --lang, this specifies the new lang directory which will have the new topology" && exit 1; +[ -z $lat_dir ] && echo "Set --lat-dir, this specifies the experiment directory to store lattice" && exit 1; +[ -z $tree_dir ] && echo "Set --tree-dir, this specifies the directory to store new tree " && exit 1; + +for f in $gmm_dir/final.mdl $ali_dir/ali.1.gz $lores_train_data_dir/feats.scp; do + [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1 +done + +if [ $stage -le 11 ]; then + echo "$0: creating lang directory with one state per phone." + # Create a version of the lang/ directory that has one state per phone in the + # topo file. [note, it really has two states.. the first one is only repeated + # once, the second one has zero or more repeats.] + if [ -d $lang ]; then + if [ $lang/L.fst -nt data/lang/L.fst ]; then + echo "$0: $lang already exists, not overwriting it; continuing" + else + echo "$0: $lang already exists and seems to be older than data/lang..." + echo " ... not sure what to do. Exiting." + exit 1; + fi + else + cp -r data/lang $lang + silphonelist=$(cat $lang/phones/silence.csl) || exit 1; + nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1; + # Use our special topology... note that later on may have to tune this + # topology. + steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo + fi +fi + +if [ $stage -le 12 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + nj=$(cat ${ali_dir}/num_jobs) || exit 1; + steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" ${lores_train_data_dir} \ + $lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 13 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" $num_leaves ${lores_train_data_dir} $lang $ali_dir $tree_dir +fi + +exit 0; diff --git a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh index 7afafb31ff6..bf2e45c9914 100755 --- a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh @@ -1,31 +1,51 @@ #!/bin/bash -#started from tedlium recipe with few edits +# ./local/chain/compare_wer.sh exp/chain/tdnn_1a_sp +# System tdnn_1a_sp +# WER 16.47 +# CER 6.68 +# Final train prob -0.0652 +# Final valid prob -0.0831 +# Final train prob (xent) -0.8965 +# Final valid prob (xent) -0.9964 +# steps/info/chain_dir_info.pl exp/chain/tdnn_1a_sp/ +# exp/chain/tdnn_1a_sp/: num-iters=441 nj=3..16 num-params=18.6M dim=40+100->5816 combine=-0.063->-0.062 (over 6) xent:train/valid[293,440,final]=(-1.22,-0.912,-0.896/-1.29,-1.01,-0.996) logprob:train/valid[293,440,final]=(-0.097,-0.066,-0.065/-0.108,-0.084,-0.083) -set -e -o pipefail -# First the options that are passed through to run_ivector_common.sh -# (some of which are also used in this script directly). 
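Since run_tdnn_1a.sh below drives this new helper entirely through variables, a spelled-out invocation may be easier to follow. The directory names in the sketch are the ones the updated TDNN recipe ends up passing (tri3b alignments, data/lang_chain, a 7000-leaf tree); treat it as an illustration of the interface rather than an extra step of the recipe.

```bash
# Standalone sketch of local/chain/run_chain_common.sh; --lang, --lat-dir and
# --tree-dir are compulsory, and the three stages mirror what run_tdnn_1a.sh
# used to do inline.
local/chain/run_chain_common.sh --stage 11 \
  --gmm-dir exp/tri3b \
  --ali-dir exp/tri3b_ali_train_sp \
  --lores-train-data-dir data/train_sp \
  --lang data/lang_chain \
  --lat-dir exp/chain/tri3b_train_sp_lats \
  --num-leaves 7000 \
  --tree-dir exp/chain/tree_a_sp
# Quick sanity check on the result: the tree should report roughly the
# requested number of pdfs.
tree-info exp/chain/tree_a_sp/tree | grep num-pdfs
```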
+set -e -o pipefail stage=0 nj=30 -decode_nj=30 -min_seg_len=1.55 -xent_regularize=0.1 train_set=train -gmm=tri2b # the gmm for the target data +test_set=test +gmm=tri3b # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. num_threads_ubm=32 -nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned - -# The rest are configs specific to this script. Most of the parameters -# are just hardcoded at this level, in the commands below. -train_stage=-10 #default -10 -tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. -tdnn_affix=1b #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. -common_egs_dir= # you can set this to use previously dumped egs. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +# training chunk-options +chunk_width=150,110,100 +get_egs_stage=-10 + +# training options +srand=0 +remove_egs=true +run_ivector_common=true +run_chain_common=true # End configuration section. echo "$0 $@" # Print the command line for logging + . ./cmd.sh . ./path.sh . ./utils/parse_options.sh @@ -39,169 +59,162 @@ where "nvcc" is installed. EOF fi -local/nnet3/run_ivector_common.sh --stage $stage \ - --nj $nj \ - --min-seg-len $min_seg_len \ - --train-set $train_set \ - --gmm $gmm \ - --num-threads-ubm $num_threads_ubm \ - --nnet3-affix "$nnet3_affix" - - -gmm_dir=exp/$gmm -ali_dir=exp/${gmm}_ali_${train_set}_sp_comb -tree_dir=exp/chain${nnet3_affix}/tree_bi${tree_affix} -lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_comb_lats -dir=exp/chain${nnet3_affix}/tdnn${tdnn_affix}_sp_bi -train_data_dir=data/${train_set}_sp_hires_comb -lores_train_data_dir=data/${train_set}_sp_comb -train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires_comb - +if $run_ivector_common; then + local/nnet3/run_ivector_common.sh \ + --stage $stage --nj $nj \ + --train-set $train_set --gmm $gmm \ + --num-threads-ubm $num_threads_ubm \ + --nnet3-affix "$nnet3_affix" +fi -for f in $gmm_dir/final.mdl $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \ - $lores_train_data_dir/feats.scp $ali_dir/ali.1.gz $gmm_dir/final.mdl; do +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_ali_${train_set}_sp +lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats +dir=exp/chain${nnet3_affix}/tdnn${affix}_sp +train_data_dir=data/${train_set}_sp_hires +train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires +lores_train_data_dir=data/${train_set}_sp + +# note: you don't necessarily have to change the treedir name +# each time you do a new experiment-- only if you change the +# configuration in a way that affects the tree. +tree_dir=exp/chain${nnet3_affix}/tree_a_sp +# the 'lang' directory is created by this script. +# If you create such a directory with a non-standard topology +# you should probably name it differently. +lang=data/lang_chain + +for f in $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \ + $lores_train_data_dir/feats.scp $gmm_dir/final.mdl \ + $ali_dir/ali.1.gz $gmm_dir/final.mdl; do [ ! 
-f $f ] && echo "$0: expected file $f to exist" && exit 1 done -if [ $stage -le 14 ]; then - echo "$0: creating lang directory with one state per phone." - # Create a version of the lang/ directory that has one state per phone in the - # topo file. [note, it really has two states.. the first one is only repeated - # once, the second one has zero or more repeats.] - if [ -d data/lang_chain ]; then - if [ data/lang_chain/L.fst -nt data/lang/L.fst ]; then - echo "$0: data/lang_chain already exists, not overwriting it; continuing" - else - echo "$0: data/lang_chain already exists and seems to be older than data/lang..." - echo " ... not sure what to do. Exiting." - exit 1; - fi - else - cp -r data/lang data/lang_chain - silphonelist=$(cat data/lang_chain/phones/silence.csl) || exit 1; - nonsilphonelist=$(cat data/lang_chain/phones/nonsilence.csl) || exit 1; - # Use our special topology... note that later on may have to tune this - # topology. - steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >data/lang_chain/topo - fi +# Please take this as a reference on how to specify all the options of +# local/chain/run_chain_common.sh +if $run_chain_common; then + local/chain/run_chain_common.sh --stage $stage \ + --gmm-dir $gmm_dir \ + --ali-dir $ali_dir \ + --lores-train-data-dir ${lores_train_data_dir} \ + --lang $lang \ + --lat-dir $lat_dir \ + --num-leaves 7000 \ + --tree-dir $tree_dir || exit 1; fi if [ $stage -le 15 ]; then - # Get the alignments as lattices (gives the chain training more freedom). - # use the same num-jobs as the alignments - steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ - data/lang $gmm_dir $lat_dir - rm $lat_dir/fsts.*.gz # save space -fi - -if [ $stage -le 16 ]; then - # Build a tree using our new topology. We know we have alignments for the - # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use - # those. - if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
- exit 1; - fi - steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ - --context-opts "--context-width=2 --central-position=1" \ - --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir -fi - -if [ $stage -le 17 ]; then mkdir -p $dir - echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + affine_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.002" mkdir -p $dir/configs + cat < $dir/configs/network.xconfig input dim=100 name=ivector input dim=40 name=input - # please note that it is important to have input layer with the name=input # as the layer immediately preceding the fixed-affine-layer to enable # the use of short notation for the descriptor fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat - # the first splicing is moved before the lda layer, so no splicing here - relu-renorm-layer name=tdnn1 dim=450 - relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=450 - relu-renorm-layer name=tdnn3 input=Append(-1,0,1,2) dim=450 - relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=450 - relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=450 - relu-renorm-layer name=tdnn6 input=Append(-6,-3,0) dim=450 - - ## adding the layers for chain branch - relu-renorm-layer name=prefinal-chain input=tdnn6 dim=450 target-rms=0.5 - output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 - - # adding the layers for xent branch - # This block prints the configs for a separate output that will be - # trained with a cross-entropy objective in the 'chain' models... this - # has the effect of regularizing the hidden parts of the model. we use - # 0.5 / args.xent_regularize as the learning rate factor- the factor of - # 0.5 / args.xent_regularize is suitable as it means the xent - # final-layer learns at a rate independent of the regularization - # constant; and the 0.5 was tuned so as to make the relative progress - # similar in the xent and regular final layers. 
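A small but easy-to-miss change in this hunk: the learning-rate factor is now computed with a parenthesised print, so the same line works whether python resolves to Python 2 or Python 3. With the default xent_regularize=0.1 it comes out as 5.0, which keeps the effective learning rate of the xent output layer independent of the regularization constant, exactly as the deleted comment explains:

```bash
# Both Python 2 and Python 3 accept the parenthesised form.
xent_regularize=0.1
echo "print (0.5/$xent_regularize)" | python
# prints: 5.0
```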
- relu-renorm-layer name=prefinal-xent input=tdnn6 dim=450 target-rms=0.5 - output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 - + relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + linear-component name=prefinal-l dim=256 $linear_opts + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ - fi -if [ $stage -le 18 ]; then + +if [ $stage -le 16 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/gale_arabic-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage fi - steps/nnet3/chain/train.py --stage $train_stage \ + steps/nnet3/chain/train.py --stage $train_stage \ --cmd "$decode_cmd" \ --feat.online-ivector-dir $train_ivector_dir \ --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ --chain.xent-regularize $xent_regularize \ --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.00005 \ + --chain.l2-regularize 0.0 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --egs.dir "$common_egs_dir" \ - --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width 150 \ - --trainer.num-chunk-per-minibatch 128 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs 6 \ --trainer.frames-per-iter 1500000 \ - --trainer.num-epochs 4 \ - --trainer.optimization.num-jobs-initial 2 \ - --trainer.optimization.num-jobs-final 2 \ - --trainer.optimization.initial-effective-lrate 0.001 \ - --trainer.optimization.final-effective-lrate 0.0001 \ - --trainer.max-param-change 2.0 \ - --cleanup.remove-egs true \ - --feat-dir $train_data_dir \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.00025 \ + --trainer.optimization.final-effective-lrate 0.000025 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0 --constrained false" \ + --egs.stage $get_egs_stage \ + --reporting.email="$reporting_email" \ + --cleanup.remove-egs=$remove_egs \ + --feat-dir=$train_data_dir \ --tree-dir $tree_dir \ - --lat-dir $lat_dir \ - --dir $dir -fi - + --lat-dir=$lat_dir \ + --dir $dir || exit 1; +fi -if [ $stage -le 19 ]; then - # Note: it might appear that this data/lang_chain directory is mismatched, and it is as - # far as the 'topo' is concerned, but this script doesn't read the 'topo' from - # the lang directory. - utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_test $dir $dir/graph +if [ $stage -le 17 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. 
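To make that point concrete: a graph built from some other LM would use exactly the same two commands, with only the phone table needing to match the one used in training. In the sketch below, data/lang_test_big is a hypothetical lang directory standing in for "a wordlist and LM of your choice"; the recipe's actual calls for data/lang_test follow right after.

```bash
# Hypothetical second decoding graph from a bigger LM; the tree directory
# supplies the model and topology, so only phones.txt has to be compatible.
utils/lang/check_phones_compatible.sh \
  data/lang_test_big/phones.txt data/lang_chain/phones.txt
utils/mkgraph.sh --self-loop-scale 1.0 \
  data/lang_test_big exp/chain/tree_a_sp exp/chain/tree_a_sp/graph_big
```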
+ + utils/lang/check_phones_compatible.sh \ + data/lang_test/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test \ + $tree_dir $tree_dir/graph || exit 1; fi -if [ $stage -le 20 ]; then +if [ $stage -le 18 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) rm $dir/.error 2>/dev/null || true - steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ - --acwt 1.0 --post-decode-acwt 10.0 \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_test_hires \ - --scoring-opts "--min-lmwt 5 " \ - $dir/graph data/test_hires $dir/decode || exit 1; + + steps/nnet3/decode.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 0 --extra-right-context 0 \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${test_set}_hires \ + $tree_dir/graph data/${test_set}_hires ${dir}/decode_${test_set} || exit 1 fi -exit 0 diff --git a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh index 604f32a1de4..deebafc95e4 100755 --- a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh +++ b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -120,7 +120,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/gale_arabic/s5b/local/gale_data_prep_audio.sh b/egs/gale_arabic/s5b/local/gale_data_prep_audio.sh deleted file mode 100755 index 0125272d06c..00000000000 --- a/egs/gale_arabic/s5b/local/gale_data_prep_audio.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash - -# Copyright 2014 QCRI (author: Ahmed Ali) -# Apache 2.0 - - -galeData=$(utils/make_absolute.sh "${@: -1}" ); # last argumnet; the local folder -audio_dvds=${@:1:${#}-1} # all the audio dvds for GALE corpus; ; check audio=( in ../run.sh - -mkdir -p $galeData - -# check that sox is installed -which sox &>/dev/null -if [[ $? != 0 ]]; then - echo "sox is not installed"; exit 1 -fi - -for dvd in $audio_dvds; do - dvd_full_path=$(utils/make_absolute.sh $dvd) - if [[ ! 
-e $dvd_full_path ]]; then - echo missing $dvd_full_path; exit 1; - fi - find $dvd_full_path \( -name "*.wav" -o -name "*.flac" \) | while read file; do - id=$(basename $file | awk '{gsub(".wav","");gsub(".flac","");print}') - echo "$id sox $file -r 16000 -t wav - |" - done -done | sort -u > $galeData/wav.scp - -echo data prep audio succeded - -exit 0 - diff --git a/egs/gale_arabic/s5b/local/gale_data_prep_split.sh b/egs/gale_arabic/s5b/local/gale_data_prep_split.sh deleted file mode 100755 index b18a4e5b105..00000000000 --- a/egs/gale_arabic/s5b/local/gale_data_prep_split.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash - -# Copyright 2014 QCRI (author: Ahmed Ali) -# Apache 2.0 - -if [ $# -ne 1 ]; then - echo "Arguments should be the "; exit 1 -fi - - -#data will data/local - -galeData=$(utils/make_absolute.sh $1) -mkdir -p data/local -dir=$(utils/make_absolute.sh data/local) - - -grep -f local/test_list $galeData/all | grep -v -f local/bad_segments > $galeData/all.test -grep -v -f local/test_list $galeData/all | grep -v -f local/bad_segments > $galeData/all.train - -for x in test train; do - outdir=$dir/$x - file=$galeData/all.$x - mkdir -p $outdir - awk '{print $2 " " $2}' $file | sort -u > $outdir/utt2spk - cp -pr $outdir/utt2spk $outdir/spk2utt - awk '{print $2 " " $1 " " $3 " " $4}' $file | sort -u > $outdir/segments - awk '{printf $2 " "; for (i=5; i<=NF; i++) {printf $i " "} printf "\n"}' $file | sort -u > $outdir/text -done - - -grep -f local/test_list $galeData/wav.scp > $dir/test/wav.scp - -cat $galeData/wav.scp | awk -v seg=$dir/train/segments 'BEGIN{while((getline0) {seen[$2]=1;}} - {if (seen[$1]) { print $0}}' > $dir/train/wav.scp - -echo data prep split succeeded - -exit 0 diff --git a/egs/gale_arabic/s5b/local/gale_data_prep_txt.sh b/egs/gale_arabic/s5b/local/gale_data_prep_txt.sh deleted file mode 100755 index 04529d88ac0..00000000000 --- a/egs/gale_arabic/s5b/local/gale_data_prep_txt.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash - -# Copyright 2014 QCRI (author: Ahmed Ali) -# Apache 2.0 - -galeData=$(utils/make_absolute.sh "${@: -1}" ); # last argumnet; the local folder -txt_dvds=${@:1:${#}-1} # all the txt cds correspoding to the audio corpus; check text=( in ../run.sh - - -top_pwd=`pwd` -txtdir=$galeData/txt -mkdir -p $txtdir; cd $txtdir - -for cdx in $txt_dvds; do - echo "Preparing $cdx" - if [[ $cdx == *.tgz ]] ; then - tar -xvf $cdx - elif [ -d "$cdx" ]; then - ln -s $cdx `basename $cdx` - else - echo "I don't really know what I shall do with $cdx " >&2 - fi -done - -find -L . 
-type f -name "*.tdf" | while read file; do -sed '1,3d' $file # delete the first 3 lines -done > all.tmp$$ - -perl -e ' - ($inFile,$idFile,$txtFile)= split /\s+/, $ARGV[0]; - open(IN, "$inFile"); - open(ID, ">$idFile"); - open(TXT, ">$txtFile"); - while () { - @arr= split /\t/,$_; - $start=sprintf ("%0.3f",$arr[2]);$rStart=$start;$start=~s/\.//; $start=~s/^0+$/0/; $start=~s/^0+([^0])/$1/; # remove zeros at the beginning - $end=sprintf ("%0.3f",$arr[3]);$rEnd=$end;$end=~s/^0+([^0])/$1/;$end=~s/\.//; - if ( ($arr[11] !~ m/report/) && ($arr[11] !~ m/conversational/) ){$arr[11]="UNK";} - $id="$arr[11] $arr[0] $arr[0]_${start}_${end} $rStart $rEnd\n"; - next if ($rStart == $rEnd); - $id =~ s/.sph//g; - print ID $id; - print TXT "$arr[7]\n"; - }' "all.tmp$$ allid.tmp$$ contentall.tmp$$" - - -perl ${top_pwd}/local/normalize_transcript_BW.pl contentall.tmp$$ contentall.buck.tmp$$ - -paste allid.tmp$$ contentall.buck.tmp$$ | sed 's: $::' | awk '{if (NF>5) {print $0}}' > all_1.tmp$$ - -awk '{$1="";print $0}' all_1.tmp$$ | sed 's:^ ::' > $galeData/all -awk '{if ($1 == "report") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' > $galeData/report -awk '{if ($1 == "conversational") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' > $galeData/conversational - -cd ..; -rm -fr $txtdir -cd $top_pwd -echo data prep text succeeded - -exit 0 diff --git a/egs/gale_arabic/s5b/local/gale_prep_grapheme_dict.sh b/egs/gale_arabic/s5b/local/gale_prep_grapheme_dict.sh deleted file mode 100755 index 5f101f8245b..00000000000 --- a/egs/gale_arabic/s5b/local/gale_prep_grapheme_dict.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/bash - -# Copyright 2017 QCRI (author: Ahmed Ali) -# Apache 2.0 - - -# run this from ../ -dir=$(utils/make_absolute.sh data/local/dict) -mkdir -p $dir - - -# (1) Get all avaialble dictionaries, since this is a grapheme model, so we mainly need the most frequent word lists -wget http://alt.qcri.org//resources/speech/dictionary/ar-ar_grapheme_lexicon_2016-02-09.bz2 || exit 1; -wget http://alt.qcri.org//resources/speech/dictionary/ar-ar_lexicon_2014-03-17.txt.bz2 || exit 1; -bzcat ar-ar_grapheme_lexicon_2016-02-09.bz2 | sed '1,3d' | awk '{print $1}' > tmp$$ -bzcat ar-ar_lexicon_2014-03-17.txt.bz2 | sed '1,3d' | awk '{print $1}' >> tmp$$ -# (2) Now we add all the words appeared in the training data -cat data/local/train/text | cut -d ' ' -f 2- | tr -s " " "\n" | sort -u >> tmp$$ -grep -v [0-9] tmp$$ | sed -e 's:[FNKaui\~o\`]::g' -e 's:{:}:g' | sort -u > tmp1.$$ # remove vowels and rare alef wasla -cat tmp1.$$ | sed 's:\(\):\1 :g' | sed -e 's: : :g' -e 's: : :g' -e 's:\s*: :g' -e 's:\*:V:g' > tmp2.$$ -paste -d ' ' tmp1.$$ tmp2.$$ > $dir/lexicon.txt - -#(2) Dictionary preparation: - -# silence phones, one per line. -echo SIL > $dir/silence_phones.txt -echo SIL > $dir/optional_silence.txt - -# nonsilence phones; on each line is a list of phones that correspond -# really to the same base phone. 
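The sed pipeline this deleted dictionary script used for the grapheme mapping is reimplemented by local/prepare_lexicon.py, added later in this diff: each word becomes its space-separated characters, with '*' rewritten as 'V'. A one-line shell sketch of the same mapping (h*A is just an illustrative Buckwalter-encoded token):

```bash
# Same character splitting the new local/prepare_lexicon.py performs, shown
# with sed for illustration.
echo "h*A" | sed -e 's/./& /g' -e 's/\*/V/g' -e 's/ $//'
# prints: h V A
```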
-cat tmp2.$$ | tr -s ' ' '\n' | grep -v ^$ | sort -u > $dir/nonsilence_phones.txt || exit 1; - -sed -i '1i SIL' $dir/lexicon.txt # insert word with phone sil at the begining of the dictionary - -rm -fr ar-ar_lexicon_2014-03-17.txt.bz2 ar-ar_grapheme_lexicon_2016-02-09.bz2 tmp$$ tmp1.$$ tmp2.$$ -echo Dictionary preparation succeeded - -# The script is still missing dates and numbers - -exit 0 - diff --git a/egs/gale_arabic/s5b/local/gale_train_lms.sh b/egs/gale_arabic/s5b/local/gale_train_lms.sh deleted file mode 100755 index 3988ec3818f..00000000000 --- a/egs/gale_arabic/s5b/local/gale_train_lms.sh +++ /dev/null @@ -1,81 +0,0 @@ -#!/bin/bash - - -# To be run from one directory above this script. - - -lexicon=data/local/dict/lexicon.txt -[ ! -f $lexicon ] && echo "$0: No such file $lexicon" && exit 1; - - -# This script takes no arguments. It assumes you have already run -# previus steps successfully -# It takes as input the files -#data/local/train.*/text -#data/local/dict/lexicon.txt - - -export LC_ALL=C # You'll get errors about things being not sorted, if you -# have a different locale. -export PATH=$PATH:./../../../tools/kaldi_lm -( # First make sure the kaldi_lm toolkit is installed. - cd $KALDI_ROOT/tools || exit 1; - if [ -d kaldi_lm ]; then - echo Not installing the kaldi_lm toolkit since it is already there. - else - echo Downloading and installing the kaldi_lm tools - if [ ! -f kaldi_lm.tar.gz ]; then - wget http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz || exit 1; - fi - tar -xvzf kaldi_lm.tar.gz || exit 1; - cd kaldi_lm - make || exit 1; - echo Done making the kaldi_lm tools - fi -) || exit 1; - - -dir=data/local/lm - mkdir -p $dir - text=data/local/train/text - [ ! -f $text ] && echo "$0: No such file $text" && exit 1; - - cleantext=$dir/text.no_oov - - cat $text | awk -v lex=$lexicon 'BEGIN{while((getline0){ seen[$1]=1; } } - {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf(" ",$n);} } printf("\n");}' \ - > $cleantext || exit 1; - - - cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \ - sort -nr > $dir/word.counts || exit 1; - - -# Get counts from acoustic training transcripts, and add one-count -# for each word in the lexicon (but not silence, we don't want it -# in the LM-- we'll add it optionally later). - cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \ - cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \ - sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1; - -# note: we probably won't really make use of as there aren't any OOVs - cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "" "" "" > $dir/word_map \ - || exit 1; - -# note: ignore 1st field of train.txt, it's the utterance-id. - cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline0)map[$1]=$2;} - { for(n=2;n<=NF;n++) { printf map[$n]; if(n$dir/train.gz \ - || exit 1; - - train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1; - -# LM is small enough that we don't need to prune it (only about 0.7M N-grams). 
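The kaldi_lm pipeline being deleted here recorded its perplexity in the comment just below. Its replacement, local/prepare_lm.sh (added later in this diff), builds a 3-gram with SRILM instead and leaves the corresponding check commented out; run by hand it would look roughly like this, with dev.txt standing for any held-out transcript file you prepare yourself:

```bash
# Hypothetical perplexity check for the SRILM model built by prepare_lm.sh.
ngram -order 3 -unk -lm data/local/lm/lm.gz -ppl data/local/lm/dev.txt
```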
-# Perplexity over 128254.000000 words is 90.446690 - -# note: output is -# data/local/lm/3gram-mincount/lm_unpruned.gz - - -echo train lm succeeded - -exit 0 diff --git a/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh b/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh index f14c8441869..a03cc5b2fa3 100755 --- a/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh +++ b/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh @@ -2,31 +2,29 @@ set -e -o pipefail -# This script is called from local/nnet3/run_tdnn.sh and local/chain/run_tdnn.sh (and may eventually -# be called by more scripts). It contains the common feature preparation and iVector-related parts -# of the script. See those scripts for examples of usage. +# This script is called from scripts like local/nnet3/run_tdnn.sh and +# local/chain/run_tdnn.sh (and may eventually be called by more scripts). It +# contains the common feature preparation and iVector-related parts of the +# script. See those scripts for examples of usage. stage=0 nj=100 -min_seg_len=1.55 # min length in seconds... we do this because chain training - # will discard segments shorter than 1.5 seconds. Must remain in sync - # with the same option given to prepare_lores_feats_and_alignments.sh train_set=train # you might set this to e.g. train. -gmm=tri2b # This specifies a GMM-dir from the features of the type you're training the system on; +test_sets="test" +gmm=tri3b # This specifies a GMM-dir from the features of the type you're training the system on; # it should contain alignments for 'train_set'. num_threads_ubm=32 -nnet3_affix=_cleaned # affix for exp/nnet3 directory to put iVector stuff in, so it - # becomes exp/nnet3_cleaned or whatever. +nnet3_affix= # affix for exp/nnet3 directory to put iVector stuff . ./cmd.sh . ./path.sh -. ./utils/parse_options.sh +. utils/parse_options.sh gmm_dir=exp/${gmm} -ali_dir=exp/${gmm}_ali_${train_set}_sp_comb +ali_dir=exp/${gmm}_ali_${train_set}_sp for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do if [ ! -f $f ]; then @@ -61,7 +59,7 @@ if [ $stage -le 2 ]; then utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/mfcc/gale_arabic-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage fi - for datadir in ${train_set}_sp test; do + for datadir in ${train_set}_sp ${test_sets}; do utils/copy_data_dir.sh data/$datadir data/${datadir}_hires done @@ -69,7 +67,7 @@ if [ $stage -le 2 ]; then # features; this helps make trained nnets more invariant to test data volume. utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires - for datadir in ${train_set}_sp test; do + for datadir in ${train_set}_sp ${test_sets}; do steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ --cmd "$train_cmd" data/${datadir}_hires steps/compute_cmvn_stats.sh data/${datadir}_hires @@ -78,75 +76,33 @@ if [ $stage -le 2 ]; then fi if [ $stage -le 3 ]; then - echo "$0: combining short segments of speed-perturbed high-resolution MFCC training data" - # we have to combine short segments or we won't be able to train chain models - # on those segments. - utils/data/combine_short_segments.sh \ - data/${train_set}_sp_hires $min_seg_len data/${train_set}_sp_hires_comb - - # just copy over the CMVN to avoid having to recompute it. - cp data/${train_set}_sp_hires/cmvn.scp data/${train_set}_sp_hires_comb/ - utils/fix_data_dir.sh data/${train_set}_sp_hires_comb/ -fi - -if [ $stage -le 4 ]; then - echo "$0: selecting segments of hires training data that were also present in the" - echo " ... 
original training data." - - # note, these data-dirs are temporary; we put them in a sub-directory - # of the place where we'll make the alignments. - temp_data_root=exp/nnet3${nnet3_affix}/tri5 - mkdir -p $temp_data_root - - utils/data/subset_data_dir.sh --utt-list data/${train_set}/feats.scp \ - data/${train_set}_sp_hires $temp_data_root/${train_set}_hires - - # note: essentially all the original segments should be in the hires data. - n1=$(wc -l /dev/null +if [[ $? != 0 ]]; then + echo "$0: sox is not installed"; exit 1 +fi + +for dvd in $dir1 $dir2 $dir3; do + dvd_full_path=$(utils/make_absolute.sh $dvd) + if [[ ! -e $dvd_full_path ]]; then + echo "$0: missing $dvd_full_path"; exit 1; + fi + find $dvd_full_path \( -name "*.wav" -o -name "*.flac" \) | while read file; do + id=$(basename $file | awk '{gsub(".wav","");gsub(".flac","");print}') + echo "$id sox $file -r 16000 -t wav - |" + done +done | sort -u > $gale_data/wav.scp +echo "$0:data prep audio succeded" + +gale_data=$(utils/make_absolute.sh "GALE" ); +top_pwd=`pwd` +txtdir=$gale_data/txt +mkdir -p $txtdir; cd $txtdir + +for cdx in $text1 $text2 $text3; do + echo "$0:Preparing $cdx" + if [[ $cdx == *.tgz ]] ; then + tar -xvf $cdx + elif [ -d "$cdx" ]; then + ln -s $cdx `basename $cdx` + else + echo "$0:I don't really know what I shall do with $cdx " >&2 + fi +done + +find -L . -type f -name "*.tdf" | while read file; do +sed '1,3d' $file # delete the first 3 lines +done > all.tmp$$ + +perl -e ' + ($inFile,$idFile,$txtFile)= split /\s+/, $ARGV[0]; + open(IN, "$inFile"); + open(ID, ">$idFile"); + open(TXT, ">$txtFile"); + while () { + @arr= split /\t/,$_; + $start=sprintf ("%0.3f",$arr[2]);$rStart=$start;$start=~s/\.//; $start=~s/^0+$/0/; $start=~s/^0+([^0])/$1/; # remove zeros at the beginning + $end=sprintf ("%0.3f",$arr[3]);$rEnd=$end;$end=~s/^0+([^0])/$1/;$end=~s/\.//; + if ( ($arr[11] !~ m/report/) && ($arr[11] !~ m/conversational/) ){$arr[11]="UNK";} + $id="$arr[11] $arr[0] $arr[0]_${start}_${end} $rStart $rEnd\n"; + next if ($rStart == $rEnd); + $id =~ s/.sph//g; + print ID $id; + print TXT "$arr[7]\n"; + }' "all.tmp$$ allid.tmp$$ contentall.tmp$$" + +perl ${top_pwd}/local/normalize_transcript_BW.pl contentall.tmp$$ contentall.buck.tmp$$ +paste allid.tmp$$ contentall.buck.tmp$$ | sed 's: $::' | awk '{if (NF>5) {print $0}}' > all_1.tmp$$ + + +awk '{$1="";print $0}' all_1.tmp$$ | sed 's:^ ::' > $gale_data/all +awk '{if ($1 == "report") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' > $gale_data/report +awk '{if ($1 == "conversational") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' > $gale_data/conversational + +cd ..; +rm -fr $txtdir +cd $top_pwd +echo "$0:dat a prep text succeeded" + +mkdir -p data +dir=$(utils/make_absolute.sh data/) +grep -f local/test_list $gale_data/all | grep -v -f local/bad_segments > $gale_data/all.test +grep -v -f local/test_list $gale_data/all | grep -v -f local/bad_segments > $gale_data/all.train + +for x in test train; do + outdir=data/$x + file=$gale_data/all.$x + mkdir -p $outdir + awk '{print $2 " " $2}' $file | sort -u > $outdir/utt2spk + cp -pr $outdir/utt2spk $outdir/spk2utt + awk '{print $2 " " $1 " " $3 " " $4}' $file | sort -u > $outdir/segments + awk '{printf $2 " "; for (i=5; i<=NF; i++) {printf $i " "} printf "\n"}' $file | sort -u > $outdir/text +done + +grep -f local/test_list $gale_data/wav.scp > $dir/test/wav.scp + +cat $gale_data/wav.scp | awk -v seg=$dir/train/segments 'BEGIN{while((getline0) {seen[$2]=1;}} + {if (seen[$1]) { print $0}}' > $dir/train/wav.scp + +echo "$0:data prep 
split succeeded" +exit 0 diff --git a/egs/gale_arabic/s5b/local/prepare_dict.sh b/egs/gale_arabic/s5b/local/prepare_dict.sh new file mode 100755 index 00000000000..47b5869fdf1 --- /dev/null +++ b/egs/gale_arabic/s5b/local/prepare_dict.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash + +# Copyright 2017 QCRI (author: Ahmed Ali) +# Apache 2.0 +# This script prepares the dictionary. + +set -e +dir=data/local/dict +lexicon_url1="http://alt.qcri.org//resources/speech/dictionary/ar-ar_grapheme_lexicon_2016-02-09.bz2"; +lexicon_url2="http://alt.qcri.org//resources/speech/dictionary/ar-ar_lexicon_2014-03-17.txt.bz2"; +stage=0 +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; +mkdir -p $dir data/local/lexicon_data + +if [ $stage -le 0 ]; then + echo "$0: Downloading text for lexicon... $(date)." + wget -P data/local/lexicon_data $lexicon_url1 + wget -P data/local/lexicon_data $lexicon_url2 + bzcat data/local/lexicon_data/ar-ar_grapheme_lexicon_2016-02-09.bz2 | sed '1,3d' | awk '{print $1}' > data/local/lexicon_data/grapheme_lexicon + bzcat data/local/lexicon_data/ar-ar_lexicon_2014-03-17.txt.bz2 | sed '1,3d' | awk '{print $1}' >> data/local/lexicon_data/grapheme_lexicon + cat data/train/text | cut -d ' ' -f 2- | tr -s " " "\n" | sort -u >> data/local/lexicon_data/grapheme_lexicon +fi + + +if [ $stage -le 0 ]; then + echo "$0: processing lexicon text and creating lexicon... $(date)." + # remove vowels and rare alef wasla + grep -v [0-9] data/local/lexicon_data/grapheme_lexicon | sed -e 's:[FNKaui\~o\`]::g' -e 's:{:}:g' | sort -u > data/local/lexicon_data/processed_lexicon + local/prepare_lexicon.py +fi + +cut -d' ' -f2- $dir/lexicon.txt | sed 's/SIL//g' | tr ' ' '\n' | sort -u | sed '/^$/d' >$dir/nonsilence_phones.txt || exit 1; + +sed -i '1i UNK' $dir/lexicon.txt + +echo UNK >> $dir/nonsilence_phones.txt + +echo ' SIL' >> $dir/lexicon.txt + +echo SIL > $dir/silence_phones.txt + +echo SIL >$dir/optional_silence.txt + +echo -n "" >$dir/extra_questions.txt + +echo "$0: Dictionary preparation succeeded" diff --git a/egs/gale_arabic/s5b/local/prepare_lexicon.py b/egs/gale_arabic/s5b/local/prepare_lexicon.py new file mode 100755 index 00000000000..215541585eb --- /dev/null +++ b/egs/gale_arabic/s5b/local/prepare_lexicon.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python3 + +# Copyright 2018 Ashish Arora +# Apache 2.0 + +# This script prepares lexicon. + +import argparse +import os + +parser = argparse.ArgumentParser(description="""Creates the list of characters and words in lexicon""") +args = parser.parse_args() + +### main ### +lex = {} +text_path = os.path.join('data','local', 'lexicon_data', 'processed_lexicon') +with open(text_path, 'r', encoding='utf-8') as f: + for line in f: + line = line.strip() + characters = list(line) + characters = " ".join(['V' if char == '*' else char for char in characters]) + lex[line] = characters + +with open(os.path.join('data','local','dict', 'lexicon.txt'), 'w', encoding='utf-8') as fp: + for key in sorted(lex): + fp.write(key + " " + lex[key] + "\n") diff --git a/egs/gale_arabic/s5b/local/prepare_lm.sh b/egs/gale_arabic/s5b/local/prepare_lm.sh new file mode 100755 index 00000000000..6fdf35f471a --- /dev/null +++ b/egs/gale_arabic/s5b/local/prepare_lm.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +# Copyright 2012 Vassil Panayotov +# 2017 Ewald Enzinger +# Apache 2.0 + +. ./path.sh || exit 1 + +echo "=== Building a language model ..." + +dir=data/local/lm/ +text=data/train/text +lexicon=data/local/dict/lexicon.txt +# Language model order +order=3 + +. 
utils/parse_options.sh + +# Prepare a LM training corpus from the transcripts +mkdir -p $dir + +for f in "$text" "$lexicon"; do + [ ! -f $f ] && echo "$0: No such file $f" && exit 1; +done + +loc=`which ngram-count`; +if [ -z $loc ]; then + if uname -a | grep 64 >/dev/null; then # some kind of 64 bit... + sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64 + else + sdir=$KALDI_ROOT/tools/srilm/bin/i686 + fi + if [ -f $sdir/ngram-count ]; then + echo Using SRILM tools from $sdir + export PATH=$PATH:$sdir + else + echo You appear to not have SRILM tools installed, either on your path, + echo or installed in $sdir. See tools/install_srilm.sh for installation + echo instructions. + exit 1 + fi +fi + +cat data/train/text | cut -d " " -f 2- > $dir/text.txt +cut -d' ' -f1 $lexicon > $dir/wordlist + +ngram-count -text $dir/text.txt -order $order -limit-vocab -vocab $dir/wordlist \ + -unk -map-unk "" -kndiscount -interpolate -lm $dir/lm.gz + +#ngram -lm $dir/lm.gz -ppl $dir/dev.txt +echo "*** Finished building the LM model!" diff --git a/egs/gale_arabic/s5b/local/score.sh b/egs/gale_arabic/s5b/local/score.sh index 83366f7c7fc..1d84815fc69 100755 --- a/egs/gale_arabic/s5b/local/score.sh +++ b/egs/gale_arabic/s5b/local/score.sh @@ -1,60 +1,6 @@ -#!/bin/bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0 - -[ -f ./path.sh ] && . ./path.sh - -# begin configuration section. -cmd=run.pl -stage=0 -decode_mbr=true -word_ins_penalty=0.0 -min_lmwt=7 -max_lmwt=17 -iter= #some of the scripts from steps/ seem to use it -#end configuration section. - -echo "$0 $#" - -[ -f ./path.sh ] && . ./path.sh -. parse_options.sh || exit 1; - -if [ $# -ne 3 ]; then - echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] " - echo " Options:" - echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." - echo " --stage (0|1|2) # start scoring script from part-way through." - echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)." - echo " --min_lmwt # minumum LM-weight for lattice rescoring " - echo " --max_lmwt # maximum LM-weight for lattice rescoring " - exit 1; -fi -data=$1 -lang_or_graph=$2 -dir=$3 - -symtab=$lang_or_graph/words.txt - -for f in $symtab $dir/lat.1.gz $data/text; do - [ ! 
-f $f ] && echo "score.sh: no such file $f" && exit 1; -done - -mkdir -p $dir/scoring/log - -cat $data/text | sed 's:::g' | sed 's:::g' > $dir/scoring/test_filt.txt - -$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \ - lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ - lattice-add-penalty --word-ins-penalty=$word_ins_penalty ark:- ark:- \| \ - lattice-best-path --word-symbol-table=$symtab \ - ark:- ark,t:$dir/scoring/LMWT.tra || exit 1; +#!/bin/bash -# Note: the double level of quoting for the sed command -$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \ - cat $dir/scoring/LMWT.tra \| \ - utils/int2sym.pl -f 2- $symtab \| sed 's:\::g' \| \ - compute-wer --text --mode=present \ - ark:$dir/scoring/test_filt.txt ark,p:- ">&" $dir/wer_LMWT || exit 1; -exit 0; +steps/scoring/score_kaldi_wer.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" diff --git a/egs/gale_arabic/s5b/local/wer_output_filter b/egs/gale_arabic/s5b/local/wer_output_filter new file mode 100755 index 00000000000..cf48b434144 --- /dev/null +++ b/egs/gale_arabic/s5b/local/wer_output_filter @@ -0,0 +1,19 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Hossein Hadian + +# Apache 2.0 +# This script converts a BPE-encoded text to normal text. It is used in scoring + +import sys, io +import string + +infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') + +for line in infile: + words = line.strip().split() + words = [word for word in words if '' not in word] + uttid = words[0] + transcript = ' '.join(words[1:]) + output.write(uttid + ' ' + transcript + '\n') diff --git a/egs/gale_arabic/s5b/run.sh b/egs/gale_arabic/s5b/run.sh index c45f5119949..3f12d22495e 100755 --- a/egs/gale_arabic/s5b/run.sh +++ b/egs/gale_arabic/s5b/run.sh @@ -3,177 +3,121 @@ # Copyright 2014 QCRI (author: Ahmed Ali) # Apache 2.0 -. ./path.sh -. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. - ## This relates to the queue. num_jobs=120 num_decode_jobs=40 +decode_gmm=true +stage=0 +overwrite=false -#NB: You can add whatever number of copora you like. The supported extensions -#NB: (formats) are wav and flac. Flac will be converted using sox and in contrast -#NB: with the old approach, the conversion will be on-the-fly and one-time-only -#NB: during the parametrization. - -#NB: Text corpora scpecification. We support either tgz files, which are unpacked -#NB: or just plain (already unpacked) directories. The list of transcript is then -#NB: obtained using find command - -#Make sure you edit this section to reflect whers you keep the LDC data on your cluster - -#This is CLSP configuration. We add the 2014 GALE data. We got around 2 % -#improvement just by including it. The gain might be large if someone would tweak -# the number of leaves and states and so on. 
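Stepping back from the removed corpus configuration for a moment: because local/score.sh now just delegates to steps/scoring/score_kaldi_wer.sh and score_kaldi_cer.sh, every decode directory ends up with both a best-WER and a best-CER summary, which is what compare_wer.sh and the numbers quoted earlier in this patch are read from. To look at them by hand (the decode directory below is the one the chain recipe produces):

```bash
# Best WER/CER over all LM weights and word-insertion penalties, as written by
# the standard Kaldi scoring scripts.
cat exp/chain/tdnn_1a_sp/decode_test/scoring_kaldi/best_wer
cat exp/chain/tdnn_1a_sp/decode_test/scoring_kaldi/best_cer
```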
- -#audio=( -# /export/corpora/LDC/LDC2013S02/ -# /export/corpora/LDC/LDC2013S07/ -# /export/corpora/LDC/LDC2014S07/ -#) -#text=( -# /export/corpora/LDC/LDC2013T17 -# /export/corpora/LDC/LDC2013T04 -# /export/corpora/LDC/LDC2014T17 -#) - -audio=( - /data/sls/scratch/amali/data/GALE/LDC2013S02 - /data/sls/scratch/amali/data/GALE/LDC2013S07 - /data/sls/scratch/amali/data/GALE/LDC2014S07 -) -text=( - /data/sls/scratch/amali/data/GALE/LDC2013T17.tgz - /data/sls/scratch/amali/data/GALE/LDC2013T04.tgz - /data/sls/scratch/amali/data/GALE/LDC2014T17.tgz -) +dir1=/export/corpora/LDC/LDC2013S02/ +dir2=/export/corpora/LDC/LDC2013S07/ +dir3=/export/corpora/LDC/LDC2014S07/ +text1=/export/corpora/LDC/LDC2013T17/ +text2=/export/corpora/LDC/LDC2013T04/ +text3=/export/corpora/LDC/LDC2014T17/ galeData=GALE -#prepare the data -#split train dev test -#prepare lexicon and LM - -# You can run the script from here automatically, but it is recommended to run the data preparation, -# and features extraction manually and and only once. -# By copying and pasting into your shell. - -#copy the audio files to local folder wav and convet flac files to wav -local/gale_data_prep_audio.sh "${audio[@]}" $galeData || exit 1; +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. ./path.sh +. ./utils/parse_options.sh # e.g. this parses the above options + # if supplied. -#get the transcription and remove empty prompts and all noise markers -local/gale_data_prep_txt.sh "${text[@]}" $galeData || exit 1; +if [ $stage -le 0 ]; then -# split the data to reports and conversational and for each class will have rain/dev and test -local/gale_data_prep_split.sh $galeData || exit 1; + if [ -f data/train/text ] && ! $overwrite; then + echo "$0: Not processing, probably script have run from wrong stage" + echo "Exiting with status 1 to avoid data corruption" + exit 1; + fi -# get all Arabic grapheme dictionaries and add silence and UNK -local/gale_prep_grapheme_dict.sh || exit 1; + echo "$0: Preparing data..." + local/prepare_data.sh --dir1 $dir1 --dir2 $dir2 --dir3 $dir3 \ + --text1 $text1 --text2 $text2 --text3 $text3 + echo "$0: Preparing lexicon and LM..." + local/prepare_dict.sh -#prepare the langauge resources -utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang || exit 1; + utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang -# LM training -local/gale_train_lms.sh || exit 1; + local/prepare_lm.sh -local/gale_format_data.sh || exit 1; -# G compilation, check LG composition + utils/format_lm.sh data/lang data/local/lm/lm.gz \ + data/local/dict/lexicon.txt data/lang_test +fi -# Now make MFCC features. -# mfccdir should be some place with a largish disk where you -# want to store MFCC features. mfccdir=mfcc - -for x in train test ; do - steps/make_mfcc.sh --cmd "$train_cmd" --nj $num_jobs \ - data/$x exp/make_mfcc/$x $mfccdir - utils/fix_data_dir.sh data/$x # some files fail to get mfcc for many reasons - steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir -done - - -# Here we start the AM - -# Let's create a subset with 10k segments to make quick flat-start training: -utils/subset_data_dir.sh data/train 10000 data/train.10K || exit 1; - -# Train monophone models on a subset of the data, 10K segment -# Note: the --boost-silence option should probably be omitted by default -steps/train_mono.sh --nj 40 --cmd "$train_cmd" \ - data/train.10K data/lang exp/mono || exit 1; - - -# Get alignments from monophone system. 
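All of the flat, always-run commands being removed in this region come back below inside stage-guarded blocks, which is the main structural change to run.sh. Because the script sources utils/parse_options.sh, the new stage, decode_gmm and overwrite variables double as command-line flags, so a run can be resumed or trimmed without editing the file; for example (stage numbers as defined further down):

```bash
# Resume from the LDA+MLLT stage after a failure, reusing earlier data prep:
./run.sh --stage 5
# Skip the GMM decodes while iterating on the chain model only:
./run.sh --stage 9 --decode-gmm false
```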
-steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ - data/train data/lang exp/mono exp/mono_ali || exit 1; - -# train tri1 [first triphone pass] -steps/train_deltas.sh --cmd "$train_cmd" \ - 2500 30000 data/train data/lang exp/mono_ali exp/tri1 || exit 1; - -# First triphone decoding -utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph -steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \ - exp/tri1/graph data/test exp/tri1/decode - -steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ - data/train data/lang exp/tri1 exp/tri1_ali || exit 1; - -# Train tri2a, which is deltas+delta+deltas -steps/train_deltas.sh --cmd "$train_cmd" \ - 3000 40000 data/train data/lang exp/tri1_ali exp/tri2a || exit 1; - -# tri2a decoding -utils/mkgraph.sh data/lang_test exp/tri2a exp/tri2a/graph -steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \ - exp/tri2a/graph data/test exp/tri2a/decode - -# train and decode tri2b [LDA+MLLT] -steps/train_lda_mllt.sh --cmd "$train_cmd" 4000 50000 \ - data/train data/lang exp/tri1_ali exp/tri2b || exit 1; - -utils/mkgraph.sh data/lang_test exp/tri2b exp/tri2b/graph -steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \ - exp/tri2b/graph data/test exp/tri2b/decode - -# Align all data with LDA+MLLT system (tri2b) -steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ - --use-graphs true data/train data/lang exp/tri2b exp/tri2b_ali || exit 1; - - -# From 2b system, train 3b which is LDA + MLLT + SAT. -steps/train_sat.sh --cmd "$train_cmd" \ - 5000 100000 data/train data/lang exp/tri2b_ali exp/tri3b || exit 1; - -utils/mkgraph.sh data/lang_test exp/tri3b exp/tri3b/graph -steps/decode_fmllr.sh --nj $num_decode_jobs --cmd \ - "$decode_cmd" exp/tri3b/graph data/test exp/tri3b/decode - -# From 3b system, align all data. -steps/align_fmllr.sh --nj $num_jobs --cmd "$train_cmd" \ - data/train data/lang exp/tri3b exp/tri3b_ali || exit 1; - - -# nnet3 cross-entropy -local/nnet3/run_tdnn.sh #tdnn recipe: -local/nnet3/run_lstm.sh --stage 12 #lstm recipe (we skip ivector training) - -# chain lattice-free -local/chain/run_tdnn.sh #tdnn recipe: -local/chain/run_tdnn_lstm.sh #tdnn-lstm recipe: - -time=$(date +"%Y-%m-%d-%H-%M-%S") - -#get detailed WER; reports, conversational and combined -local/split_wer.sh $galeData > RESULTS.details.$USER.$time # to make sure you keep the results timed and owned - -echo training succedded +if [ $stage -le 1 ]; then + echo "$0: Preparing the test and train feature files..." 
+ for x in train test ; do + steps/make_mfcc.sh --cmd "$train_cmd" --nj $num_jobs \ + data/$x exp/make_mfcc/$x $mfccdir + utils/fix_data_dir.sh data/$x # some files fail to get mfcc for many reasons + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir + done +fi + +if [ $stage -le 2 ]; then + echo "$0: creating sub-set and training monophone system" + utils/subset_data_dir.sh data/train 10000 data/train.10K || exit 1; + + steps/train_mono.sh --nj 40 --cmd "$train_cmd" \ + data/train.10K data/lang exp/mono || exit 1; +fi + +if [ $stage -le 3 ]; then + echo "$0: Aligning data using monophone system" + steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ + data/train data/lang exp/mono exp/mono_ali || exit 1; + + echo "$0: training triphone system with delta features" + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 30000 data/train data/lang exp/mono_ali exp/tri1 || exit 1; +fi + +if [ $stage -le 4 ] && $decode_gmm; then + utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph + steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \ + exp/tri1/graph data/test exp/tri1/decode +fi + +if [ $stage -le 5 ]; then + echo "$0: Aligning data and retraining and realigning with lda_mllt" + steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ + data/train data/lang exp/tri1 exp/tri1_ali || exit 1; + + steps/train_lda_mllt.sh --cmd "$train_cmd" 4000 50000 \ + data/train data/lang exp/tri1_ali exp/tri2b || exit 1; +fi + +if [ $stage -le 6 ] && $decode_gmm; then + utils/mkgraph.sh data/lang_test exp/tri2b exp/tri2b/graph + steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \ + exp/tri2b/graph data/test exp/tri2b/decode +fi + +if [ $stage -le 7 ]; then + echo "$0: Aligning data and retraining and realigning with sat_basis" + steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ + data/train data/lang exp/tri2b exp/tri2b_ali || exit 1; + + steps/train_sat_basis.sh --cmd "$train_cmd" \ + 5000 100000 data/train data/lang exp/tri2b_ali exp/tri3b || exit 1; + + steps/align_fmllr.sh --nj $num_jobs --cmd "$train_cmd" \ + data/train data/lang exp/tri3b exp/tri3b_ali || exit 1; +fi + +if [ $stage -le 8 ] && $decode_gmm; then + utils/mkgraph.sh data/lang_test exp/tri3b exp/tri3b/graph + steps/decode_fmllr.sh --nj $num_decode_jobs --cmd \ + "$decode_cmd" exp/tri3b/graph data/test exp/tri3b/decode +fi + +if [ $stage -le 9 ]; then + echo "$0: Training a regular chain model using the e2e alignments..." + local/chain/run_tdnn.sh +fi + +echo "$0: training succedded" exit 0 - -#TODO: -#LM (4-gram and RNN) rescoring -#combine lattices -#dialect detection - - - - - diff --git a/egs/gale_arabic/s5c/RESULT b/egs/gale_arabic/s5c/RESULT new file mode 100644 index 00000000000..d56c9e2dbc6 --- /dev/null +++ b/egs/gale_arabic/s5c/RESULT @@ -0,0 +1,4 @@ +%WER 41.98 [ 29249 / 69668, 2672 ins, 5990 del, 20587 sub ] exp/tri1_subword/decode/wer_15_0.0 +%WER 37.66 [ 26239 / 69668, 2660 ins, 5255 del, 18324 sub ] exp/tri2b_subword/decode/wer_17_0.0 +%WER 35.26 [ 24565 / 69668, 2879 ins, 4892 del, 16794 sub ] exp/tri3b_subword/decode/wer_17_0.5 +%WER 17.29 [ 12049 / 69668, 1244 ins, 2758 del, 8047 sub ] exp/chain/tdnn_1a_sp/decode_test/wer_10_0.5 diff --git a/egs/gale_arabic/s5c/cmd.sh b/egs/gale_arabic/s5c/cmd.sh new file mode 100755 index 00000000000..ea341c98d4a --- /dev/null +++ b/egs/gale_arabic/s5c/cmd.sh @@ -0,0 +1,15 @@ +# you can change cmd.sh depending on what type of queue you are using. 
+# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="retry.pl queue.pl --mem 2G" +export decode_cmd="retry.pl queue.pl --mem 4G" +export mkgraph_cmd="retry.pl queue.pl --mem 8G" diff --git a/egs/gale_arabic/s5c/conf/decode.config b/egs/gale_arabic/s5c/conf/decode.config new file mode 100644 index 00000000000..6f503eab35e --- /dev/null +++ b/egs/gale_arabic/s5c/conf/decode.config @@ -0,0 +1 @@ +link decode_dnn.config \ No newline at end of file diff --git a/egs/gale_arabic/s5c/conf/mfcc.conf b/egs/gale_arabic/s5c/conf/mfcc.conf new file mode 100644 index 00000000000..7361509099f --- /dev/null +++ b/egs/gale_arabic/s5c/conf/mfcc.conf @@ -0,0 +1 @@ +--use-energy=false # only non-default option. diff --git a/egs/gale_arabic/s5c/conf/mfcc_hires.conf b/egs/gale_arabic/s5c/conf/mfcc_hires.conf new file mode 100644 index 00000000000..c45f2b691a9 --- /dev/null +++ b/egs/gale_arabic/s5c/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training. +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--sample-frequency=16000 +--num-mel-bins=40 +--num-ceps=40 +--low-freq=40 # low cutoff frequency for mel bins +--high-freq=-400 # high cutoff frequently, relative to Nyquist of 8000 (=7600) diff --git a/egs/gale_arabic/s5c/conf/online_cmvn.conf b/egs/gale_arabic/s5c/conf/online_cmvn.conf new file mode 100644 index 00000000000..cbdaf5f281c --- /dev/null +++ b/egs/gale_arabic/s5c/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/online/run_online_decoding_nnet2.sh diff --git a/egs/gale_arabic/s5c/local/bad_segments b/egs/gale_arabic/s5c/local/bad_segments new file mode 100644 index 00000000000..c3413f0714c --- /dev/null +++ b/egs/gale_arabic/s5c/local/bad_segments @@ -0,0 +1,10 @@ +ARABIYA_FROMIRAQ_ARB_20070302_175801_2326286_2327450 +ARABIYA_BILARABI_ARB_20061005_201400_221375_223694 +LBC_NAHAR_ARB_20060911_142800_3683267_3685290 +LBC_NAHAR_ARB_20070303_145800_3249800_3251128 +LBC_NAHAR_ARB_20070303_145800_3623646_3624152 +LBC_NAHAR_ARB_20070305_035800_481003_484069 +ALAM_WITHEVENT_ARB_20070227_205800_3141876_3144152 +ALAM_NEWSRPT_ARB_20070130_015801_2875054_2876396 +ALJZ_TODHARV_ARB_20060914_155800_2947717_2949041 +ALJZ_TODHARV_ARB_20070107_145800_2417848_2419238 diff --git a/egs/gale_arabic/s5c/local/chain/compare_wer.sh b/egs/gale_arabic/s5c/local/chain/compare_wer.sh new file mode 100755 index 00000000000..1a40523355a --- /dev/null +++ b/egs/gale_arabic/s5c/local/chain/compare_wer.sh @@ -0,0 +1,72 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. 
local/chain/compare_wer.sh exp/chain/cnn{1a,1b} + +# ./local/chain/compare_wer.sh exp/chain/cnn1a +# System cnn1a +# WER 0.61 +# CER 0.15 +# Final train prob -0.0377 +# Final valid prob -0.0380 +# Final train prob (xent) -0.0830 +# Final valid prob (xent) -0.0838 + +if [ $# == 0 ]; then + echo "Usage: $0: [ ... ]" + echo "e.g.: $0 exp/chain/cnn{1a,1b}" + exit 1 +fi + +echo "# $0 $*" +used_epochs=false + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +echo -n "# WER " +for x in $*; do + wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# CER " +for x in $*; do + cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo diff --git a/egs/gale_arabic/s5c/local/chain/run_chain_common.sh b/egs/gale_arabic/s5c/local/chain/run_chain_common.sh new file mode 100755 index 00000000000..da37e148441 --- /dev/null +++ b/egs/gale_arabic/s5c/local/chain/run_chain_common.sh @@ -0,0 +1,82 @@ +#!/bin/bash + +# this script has common stages shared across librispeech chain recipes. +# It generates a new topology in a new lang directory, gets the alignments as +# lattices, and builds a tree for the new topology +set -e + +stage=11 + +# input directory names. These options are actually compulsory, and they have +# been named for convenience +gmm_dir= +ali_dir= +lores_train_data_dir= + +num_leaves=6000 + +# output directory names. They are also compulsory. +lang= +lat_dir= +tree_dir= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +[ -z $lang ] && echo "Set --lang, this specifies the new lang directory which will have the new topology" && exit 1; +[ -z $lat_dir ] && echo "Set --lat-dir, this specifies the experiment directory to store lattice" && exit 1; +[ -z $tree_dir ] && echo "Set --tree-dir, this specifies the directory to store new tree " && exit 1; + +for f in $gmm_dir/final.mdl $ali_dir/ali.1.gz $lores_train_data_dir/feats.scp; do + [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1 +done + +if [ $stage -le 11 ]; then + echo "$0: creating lang directory with one state per phone." + # Create a version of the lang/ directory that has one state per phone in the + # topo file. [note, it really has two states.. the first one is only repeated + # once, the second one has zero or more repeats.] 
+ if [ -d $lang ]; then + if [ $lang/L.fst -nt data/lang/L.fst ]; then + echo "$0: $lang already exists, not overwriting it; continuing" + else + echo "$0: $lang already exists and seems to be older than data/lang..." + echo " ... not sure what to do. Exiting." + exit 1; + fi + else + cp -r data/lang $lang + silphonelist=$(cat $lang/phones/silence.csl) || exit 1; + nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1; + # Use our special topology... note that later on may have to tune this + # topology. + steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo + fi +fi + +if [ $stage -le 12 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + nj=$(cat ${ali_dir}/num_jobs) || exit 1; + steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" ${lores_train_data_dir} \ + $lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 13 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" $num_leaves ${lores_train_data_dir} $lang $ali_dir $tree_dir +fi + +exit 0; diff --git a/egs/gale_arabic/s5c/local/chain/run_tdnn.sh b/egs/gale_arabic/s5c/local/chain/run_tdnn.sh new file mode 120000 index 00000000000..34499362831 --- /dev/null +++ b/egs/gale_arabic/s5c/local/chain/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1a.sh \ No newline at end of file diff --git a/egs/gale_arabic/s5c/local/chain/run_tdnn_lstm.sh b/egs/gale_arabic/s5c/local/chain/run_tdnn_lstm.sh new file mode 120000 index 00000000000..8e647598556 --- /dev/null +++ b/egs/gale_arabic/s5c/local/chain/run_tdnn_lstm.sh @@ -0,0 +1 @@ +tuning/run_tdnn_lstm_1a.sh \ No newline at end of file diff --git a/egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_1a.sh b/egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..bf2e45c9914 --- /dev/null +++ b/egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_1a.sh @@ -0,0 +1,220 @@ +#!/bin/bash + +# ./local/chain/compare_wer.sh exp/chain/tdnn_1a_sp +# System tdnn_1a_sp +# WER 16.47 +# CER 6.68 +# Final train prob -0.0652 +# Final valid prob -0.0831 +# Final train prob (xent) -0.8965 +# Final valid prob (xent) -0.9964 + +# steps/info/chain_dir_info.pl exp/chain/tdnn_1a_sp/ +# exp/chain/tdnn_1a_sp/: num-iters=441 nj=3..16 num-params=18.6M dim=40+100->5816 combine=-0.063->-0.062 (over 6) xent:train/valid[293,440,final]=(-1.22,-0.912,-0.896/-1.29,-1.01,-0.996) logprob:train/valid[293,440,final]=(-0.097,-0.066,-0.065/-0.108,-0.084,-0.083) + + +set -e -o pipefail +stage=0 +nj=30 +train_set=train +test_set=test +gmm=tri3b # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. 
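+# Example usage (a sketch; the directory name exp/chain/tdnn_1b_sp is only
+# hypothetical and assumes the default speed-perturbed naming):
+#   local/chain/tuning/run_tdnn_1a.sh --stage 12 --train-stage -10 --affix _1b
+#   local/chain/compare_wer.sh exp/chain/tdnn_1a_sp exp/chain/tdnn_1b_sp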
+common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +# training chunk-options +chunk_width=150,110,100 +get_egs_stage=-10 + +# training options +srand=0 +remove_egs=true +run_ivector_common=true +run_chain_common=true +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + linear-component name=prefinal-l dim=256 $linear_opts + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs 6 \ + --trainer.frames-per-iter 1500000 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.00025 \ + --trainer.optimization.final-effective-lrate 0.000025 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0 --constrained false" \ + --egs.stage $get_egs_stage \ + --reporting.email="$reporting_email" \ + --cleanup.remove-egs=$remove_egs \ + --feat-dir=$train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir=$lat_dir \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 17 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/lang/check_phones_compatible.sh \ + data/lang_test/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test \ + $tree_dir $tree_dir/graph || exit 1; +fi + +if [ $stage -le 18 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + steps/nnet3/decode.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 0 --extra-right-context 0 \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${test_set}_hires \ + $tree_dir/graph data/${test_set}_hires ${dir}/decode_${test_set} || exit 1 +fi diff --git a/egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh new file mode 100755 index 00000000000..deebafc95e4 --- /dev/null +++ b/egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -0,0 +1,222 @@ +#!/bin/bash + +#started from tedlium recipe with few edits + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=17 +nj=30 +decode_nj=30 +min_seg_len=1.55 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +xent_regularize=0.1 +train_set=train +gmm=tri2b # the gmm for the target data gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# decode options +extra_left_context=50 +extra_right_context=0 +frames_per_chunk=150 + +# The rest are configs specific to this script. 
Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1a #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 3 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_test $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_test_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/test_hires $dir/decode || exit 1; +fi +exit 0 diff --git a/egs/gale_arabic/s5c/local/nnet3/run_ivector_common.sh b/egs/gale_arabic/s5c/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..a03cc5b2fa3 --- /dev/null +++ b/egs/gale_arabic/s5c/local/nnet3/run_ivector_common.sh @@ -0,0 +1,182 @@ +#!/bin/bash + +set -e -o pipefail + +# This script is called from scripts like local/nnet3/run_tdnn.sh and +# local/chain/run_tdnn.sh (and may eventually be called by more scripts). It +# contains the common feature preparation and iVector-related parts of the +# script. See those scripts for examples of usage. + + +stage=0 +nj=100 +train_set=train # you might set this to e.g. train. 
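+# Example invocation (a sketch using the option names defined below; the chain
+# and nnet3 recipes normally call this script for you):
+#   local/nnet3/run_ivector_common.sh --stage 0 --nj 30 --train-set train \
+#     --gmm tri3b --nnet3-affix ""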
+test_sets="test" +gmm=tri3b # This specifies a GMM-dir from the features of the type you're training the system on; + # it should contain alignments for 'train_set'. + +num_threads_ubm=32 +nnet3_affix= # affix for exp/nnet3 directory to put iVector stuff + +. ./cmd.sh +. ./path.sh +. utils/parse_options.sh + + +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_ali_${train_set}_sp + +for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do + if [ ! -f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + + + +if [ $stage -le 2 ] && [ -f data/${train_set}_sp_hires/feats.scp ]; then + echo "$0: data/${train_set}_sp_hires/feats.scp already exists." + echo " ... Please either remove it, or rerun this script with stage > 2." + exit 1 +fi + + +if [ $stage -le 1 ]; then + echo "$0: preparing directory for speed-perturbed data" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp +fi + +if [ $stage -le 2 ]; then + echo "$0: creating high-resolution MFCC features" + + # this shows how you can split across multiple file-systems. we'll split the + # MFCC dir across multiple locations. You might want to be careful here, if you + # have multiple copies of Kaldi checked out and run the same recipe, not to let + # them overwrite each other. + mfccdir=data/${train_set}_sp_hires/data + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/mfcc/gale_arabic-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp ${test_sets}; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires + + for datadir in ${train_set}_sp ${test_sets}; do + steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires + steps/compute_cmvn_stats.sh data/${datadir}_hires + utils/fix_data_dir.sh data/${datadir}_hires + done +fi + +if [ $stage -le 3 ]; then + echo "$0: computing a subset of data to train the diagonal UBM." + mkdir -p exp/nnet3${nnet3_affix}/diag_ubm + temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm + + # train a diagonal UBM using a subset of about a quarter of the data + num_utts_total=$(wc -l /dev/null || true + steps/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_test_hires \ + ${graph_dir} data/test_hires ${dir}/decode || exit 1 + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/test_hires ${dir}/decode_test ${dir}/decode_test_rescore || exit 1 +fi + +exit 0; diff --git a/egs/gale_arabic/s5c/local/nnet3/tuning/run_tdnn_1a.sh b/egs/gale_arabic/s5c/local/nnet3/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..6619df668ef --- /dev/null +++ b/egs/gale_arabic/s5c/local/nnet3/tuning/run_tdnn_1a.sh @@ -0,0 +1,88 @@ +#!/bin/bash + +# started from tedlium recipe with few edits + +set -e -o pipefail -u + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). 
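+# Example usage (illustrative; note that the defaults below, e.g. gmm=tri2b and
+# nnet3_affix=_cleaned, come from the tedlium recipe and may need to be
+# overridden to match the GMM system trained in this setup):
+#   local/nnet3/tuning/run_tdnn_1a.sh --stage 0 --gmm tri3b --nnet3-affix ""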
+stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +train_set=train +gmm=tri2b # this is the source gmm-dir for the data-type of interest; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for exp dirs, e.g. _cleaned +tdnn_affix= #affix for TDNN directory e.g. "a" or "b", in case we change the configuration. + +# Options which are not passed through to run_ivector_common.sh +train_stage=-10 +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -7,2 -3,3 0 0" +remove_egs=true +relu_dim=850 +num_epochs=3 + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < \n"; + exit (1); + } + +# <\check usage> +my $inFile = shift (@ARGV); +my $ouFile = shift(@ARGV); + + +open INFILE, "<$inFile" || die "unable to open the input file $inFile\n"; +binmode INFILE, ":encoding(utf8)"; + + +open OUTPUTFILE, ">$ouFile" or die "unable to open the output mlf file $ouFile\n"; +binmode OUTPUTFILE, ":encoding(utf8)"; + + +while () { + s/[^اأإآبتثجحخدذرزسشصضطظعغفقكلمنهويىئءؤة0-9]+/ /g; ## Removes non Arabic or numbers + my $BW = convertUTF8ToBuckwalter ($_); + print OUTPUTFILE "$BW"."\n"; +} +close INFILE; +close OUTPUTFILE; + + + +# this function is copied from MADATools.pm: MADA Tools + sub convertUTF8ToBuckwalter { + + my ($line)= (@_); + #$line = $UTF8_ENCODING_OBJ->decode($line); ## Same as Encode::decode("utf8",$line), but faster since object already created + $line =~ s/\x{0621}/\'/g; ## HAMZA + $line =~ s/\x{0622}/\|/g; ## ALEF WITH MADDA ABOVE + $line =~ s/\x{0623}/\>/g; ## ALEF WITH HAMZA ABOVE + $line =~ s/\x{0624}/\&/g; ## WAW WITH HAMZA ABOVE + $line =~ s/\x{0625}/\/dev/null +if [[ $? != 0 ]]; then + echo "$0: sox is not installed"; exit 1 +fi + +for dvd in $dir1 $dir2 $dir3; do + dvd_full_path=$(utils/make_absolute.sh $dvd) + if [[ ! -e $dvd_full_path ]]; then + echo "$0: missing $dvd_full_path"; exit 1; + fi + find $dvd_full_path \( -name "*.wav" -o -name "*.flac" \) | while read file; do + id=$(basename $file | awk '{gsub(".wav","");gsub(".flac","");print}') + echo "$id sox $file -r 16000 -t wav - |" + done +done | sort -u > $gale_data/wav.scp +echo "$0:data prep audio succeded" + +gale_data=$(utils/make_absolute.sh "GALE" ); +top_pwd=`pwd` +txtdir=$gale_data/txt +mkdir -p $txtdir; cd $txtdir + +for cdx in $text1 $text2 $text3; do + echo "$0:Preparing $cdx" + if [[ $cdx == *.tgz ]] ; then + tar -xvf $cdx + elif [ -d "$cdx" ]; then + ln -s $cdx `basename $cdx` + else + echo "$0:I don't really know what I shall do with $cdx " >&2 + fi +done + +find -L . 
-type f -name "*.tdf" | while read file; do +sed '1,3d' $file # delete the first 3 lines +done > all.tmp$$ + +perl -e ' + ($inFile,$idFile,$txtFile)= split /\s+/, $ARGV[0]; + open(IN, "$inFile"); + open(ID, ">$idFile"); + open(TXT, ">$txtFile"); + while () { + @arr= split /\t/,$_; + $start=sprintf ("%0.3f",$arr[2]);$rStart=$start;$start=~s/\.//; $start=~s/^0+$/0/; $start=~s/^0+([^0])/$1/; # remove zeros at the beginning + $end=sprintf ("%0.3f",$arr[3]);$rEnd=$end;$end=~s/^0+([^0])/$1/;$end=~s/\.//; + if ( ($arr[11] !~ m/report/) && ($arr[11] !~ m/conversational/) ){$arr[11]="UNK";} + $id="$arr[11] $arr[0] $arr[0]_${start}_${end} $rStart $rEnd\n"; + next if ($rStart == $rEnd); + $id =~ s/.sph//g; + print ID $id; + print TXT "$arr[7]\n"; + }' "all.tmp$$ allid.tmp$$ contentall.tmp$$" + +perl ${top_pwd}/local/normalize_transcript_BW.pl contentall.tmp$$ contentall.buck.tmp$$ +paste allid.tmp$$ contentall.buck.tmp$$ | sed 's: $::' | awk '{if (NF>5) {print $0}}' > all_1.tmp$$ + + +awk '{$1="";print $0}' all_1.tmp$$ | sed 's:^ ::' > $gale_data/all +awk '{if ($1 == "report") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' > $gale_data/report +awk '{if ($1 == "conversational") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' > $gale_data/conversational + +cd ..; +rm -fr $txtdir +cd $top_pwd +echo "$0:dat a prep text succeeded" + +mkdir -p data +dir=$(utils/make_absolute.sh data/) +grep -f local/test_list $gale_data/all | grep -v -f local/bad_segments > $gale_data/all.test +grep -v -f local/test_list $gale_data/all | grep -v -f local/bad_segments > $gale_data/all.train + +for x in test train; do + outdir=data/$x + file=$gale_data/all.$x + mkdir -p $outdir + awk '{print $2 " " $2}' $file | sort -u > $outdir/utt2spk + cp -pr $outdir/utt2spk $outdir/spk2utt + awk '{print $2 " " $1 " " $3 " " $4}' $file | sort -u > $outdir/segments + awk '{printf $2 " "; for (i=5; i<=NF; i++) {printf $i " "} printf "\n"}' $file | sort -u > $outdir/text +done + +grep -f local/test_list $gale_data/wav.scp > $dir/test/wav.scp + +cat $gale_data/wav.scp | awk -v seg=$dir/train/segments 'BEGIN{while((getline0) {seen[$2]=1;}} + {if (seen[$1]) { print $0}}' > $dir/train/wav.scp + +echo "$0:data prep split succeeded" +exit 0 diff --git a/egs/gale_arabic/s5c/local/prepare_dict.sh b/egs/gale_arabic/s5c/local/prepare_dict.sh new file mode 100755 index 00000000000..47b5869fdf1 --- /dev/null +++ b/egs/gale_arabic/s5c/local/prepare_dict.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash + +# Copyright 2017 QCRI (author: Ahmed Ali) +# Apache 2.0 +# This script prepares the dictionary. + +set -e +dir=data/local/dict +lexicon_url1="http://alt.qcri.org//resources/speech/dictionary/ar-ar_grapheme_lexicon_2016-02-09.bz2"; +lexicon_url2="http://alt.qcri.org//resources/speech/dictionary/ar-ar_lexicon_2014-03-17.txt.bz2"; +stage=0 +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; +mkdir -p $dir data/local/lexicon_data + +if [ $stage -le 0 ]; then + echo "$0: Downloading text for lexicon... $(date)." 
+ wget -P data/local/lexicon_data $lexicon_url1 + wget -P data/local/lexicon_data $lexicon_url2 + bzcat data/local/lexicon_data/ar-ar_grapheme_lexicon_2016-02-09.bz2 | sed '1,3d' | awk '{print $1}' > data/local/lexicon_data/grapheme_lexicon + bzcat data/local/lexicon_data/ar-ar_lexicon_2014-03-17.txt.bz2 | sed '1,3d' | awk '{print $1}' >> data/local/lexicon_data/grapheme_lexicon + cat data/train/text | cut -d ' ' -f 2- | tr -s " " "\n" | sort -u >> data/local/lexicon_data/grapheme_lexicon +fi + + +if [ $stage -le 0 ]; then + echo "$0: processing lexicon text and creating lexicon... $(date)." + # remove vowels and rare alef wasla + grep -v [0-9] data/local/lexicon_data/grapheme_lexicon | sed -e 's:[FNKaui\~o\`]::g' -e 's:{:}:g' | sort -u > data/local/lexicon_data/processed_lexicon + local/prepare_lexicon.py +fi + +cut -d' ' -f2- $dir/lexicon.txt | sed 's/SIL//g' | tr ' ' '\n' | sort -u | sed '/^$/d' >$dir/nonsilence_phones.txt || exit 1; + +sed -i '1i UNK' $dir/lexicon.txt + +echo UNK >> $dir/nonsilence_phones.txt + +echo ' SIL' >> $dir/lexicon.txt + +echo SIL > $dir/silence_phones.txt + +echo SIL >$dir/optional_silence.txt + +echo -n "" >$dir/extra_questions.txt + +echo "$0: Dictionary preparation succeeded" diff --git a/egs/gale_arabic/s5c/local/prepare_dict_subword.sh b/egs/gale_arabic/s5c/local/prepare_dict_subword.sh new file mode 100755 index 00000000000..330de664349 --- /dev/null +++ b/egs/gale_arabic/s5c/local/prepare_dict_subword.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash + +# Copyright 2017 QCRI (author: Ahmed Ali) +# 2019 Dongji Gao +# Apache 2.0 +# This script prepares the subword dictionary. + +set -e +dir=data/local/dict +lexicon_url1="http://alt.qcri.org//resources/speech/dictionary/ar-ar_grapheme_lexicon_2016-02-09.bz2"; +lexicon_url2="http://alt.qcri.org//resources/speech/dictionary/ar-ar_lexicon_2014-03-17.txt.bz2"; +num_merges=1000 +stage=0 +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; +mkdir -p $dir data/local/lexicon_data + +if [ $stage -le 0 ]; then + echo "$0: Downloading text for lexicon... $(date)." + wget -P data/local/lexicon_data $lexicon_url1 + wget -P data/local/lexicon_data $lexicon_url2 + bzcat data/local/lexicon_data/ar-ar_grapheme_lexicon_2016-02-09.bz2 | sed '1,3d' | awk '{print $1}' > data/local/lexicon_data/grapheme_lexicon + bzcat data/local/lexicon_data/ar-ar_lexicon_2014-03-17.txt.bz2 | sed '1,3d' | awk '{print $1}' >> data/local/lexicon_data/grapheme_lexicon + cat data/train/text | cut -d ' ' -f 2- | tr -s " " "\n" | sort -u >> data/local/lexicon_data/grapheme_lexicon +fi + + +if [ $stage -le 0 ]; then + echo "$0: processing lexicon text and creating lexicon... $(date)." + # remove vowels and rare alef wasla + grep -v [0-9] data/local/lexicon_data/grapheme_lexicon | sed -e 's:[FNKaui\~o\`]::g' -e 's:{:}:g' | sort -u > data/local/lexicon_data/processed_lexicon + local/prepare_lexicon.py +fi + +cut -d' ' -f2- $dir/lexicon.txt | sed 's/SIL//g' | tr ' ' '\n' | sort -u | sed '/^$/d' >$dir/nonsilence_phones.txt || exit 1; + +echo UNK >> $dir/nonsilence_phones.txt + +echo SIL > $dir/silence_phones.txt + +echo SIL >$dir/optional_silence.txt + +echo -n "" >$dir/extra_questions.txt + +# Make a subword lexicon based on current word lexicon +glossaries=" " +if [ $stage -le 0 ]; then + echo "$0: making subword lexicon... $(date)." 
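+  # Illustration of the BPE convention used below (the units are hypothetical;
+  # the actual merges depend on the training text and --num-merges):
+  # apply_bpe.py marks non-final subword pieces with "@@", so a word like
+  #   ktAb
+  # may be split into something like
+  #   kt@@ Ab
+  # and the "@@ " markers are removed by local/wer_output_filter at scoring time.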
+ # get pair_code file + cut -d ' ' -f2- data/train/text | sed 's/<[^>]*>//g' | utils/lang/bpe/learn_bpe.py -s $num_merges > data/local/pair_code.txt + mv $dir/lexicon.txt $dir/lexicon_word.txt + # get words + cut -d ' ' -f1 $dir/lexicon_word.txt > $dir/words.txt + utils/lang/bpe/apply_bpe.py -c data/local/pair_code.txt --glossaries $glossaries < $dir/words.txt | \ + sed 's/ /\n/g' | sort -u > $dir/subwords.txt + sed 's/./& /g' $dir/subwords.txt | sed 's/@ @ //g' | sed 's/*/V/g' | paste -d ' ' $dir/subwords.txt - > $dir/lexicon.txt +fi + +sed -i '1i UNK' $dir/lexicon.txt + +echo ' SIL' >> $dir/lexicon.txt + +echo "$0: Dictionary preparation succeeded" diff --git a/egs/gale_arabic/s5c/local/prepare_lexicon.py b/egs/gale_arabic/s5c/local/prepare_lexicon.py new file mode 100755 index 00000000000..215541585eb --- /dev/null +++ b/egs/gale_arabic/s5c/local/prepare_lexicon.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python3 + +# Copyright 2018 Ashish Arora +# Apache 2.0 + +# This script prepares lexicon. + +import argparse +import os + +parser = argparse.ArgumentParser(description="""Creates the list of characters and words in lexicon""") +args = parser.parse_args() + +### main ### +lex = {} +text_path = os.path.join('data','local', 'lexicon_data', 'processed_lexicon') +with open(text_path, 'r', encoding='utf-8') as f: + for line in f: + line = line.strip() + characters = list(line) + characters = " ".join(['V' if char == '*' else char for char in characters]) + lex[line] = characters + +with open(os.path.join('data','local','dict', 'lexicon.txt'), 'w', encoding='utf-8') as fp: + for key in sorted(lex): + fp.write(key + " " + lex[key] + "\n") diff --git a/egs/gale_arabic/s5c/local/prepare_lm.sh b/egs/gale_arabic/s5c/local/prepare_lm.sh new file mode 100755 index 00000000000..6fdf35f471a --- /dev/null +++ b/egs/gale_arabic/s5c/local/prepare_lm.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +# Copyright 2012 Vassil Panayotov +# 2017 Ewald Enzinger +# Apache 2.0 + +. ./path.sh || exit 1 + +echo "=== Building a language model ..." + +dir=data/local/lm/ +text=data/train/text +lexicon=data/local/dict/lexicon.txt +# Language model order +order=3 + +. utils/parse_options.sh + +# Prepare a LM training corpus from the transcripts +mkdir -p $dir + +for f in "$text" "$lexicon"; do + [ ! -f $f ] && echo "$0: No such file $f" && exit 1; +done + +loc=`which ngram-count`; +if [ -z $loc ]; then + if uname -a | grep 64 >/dev/null; then # some kind of 64 bit... + sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64 + else + sdir=$KALDI_ROOT/tools/srilm/bin/i686 + fi + if [ -f $sdir/ngram-count ]; then + echo Using SRILM tools from $sdir + export PATH=$PATH:$sdir + else + echo You appear to not have SRILM tools installed, either on your path, + echo or installed in $sdir. See tools/install_srilm.sh for installation + echo instructions. + exit 1 + fi +fi + +cat data/train/text | cut -d " " -f 2- > $dir/text.txt +cut -d' ' -f1 $lexicon > $dir/wordlist + +ngram-count -text $dir/text.txt -order $order -limit-vocab -vocab $dir/wordlist \ + -unk -map-unk "" -kndiscount -interpolate -lm $dir/lm.gz + +#ngram -lm $dir/lm.gz -ppl $dir/dev.txt +echo "*** Finished building the LM model!" diff --git a/egs/gale_arabic/s5c/local/prepare_lm_subword.sh b/egs/gale_arabic/s5c/local/prepare_lm_subword.sh new file mode 100755 index 00000000000..a5d5c1d1c94 --- /dev/null +++ b/egs/gale_arabic/s5c/local/prepare_lm_subword.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +# Copyright 2012 Vassil Panayotov +# 2017 Ewald Enzinger +# 2019 Dongji Gao +# Apache 2.0 + +. 
./path.sh || exit 1 + +echo "=== Building a language model ..." + +dir=data/local/lm/ +text=data/train/text +lexicon=data/local/dict/lexicon.txt +# Language model order +order=6 + +. utils/parse_options.sh + +# Prepare a LM training corpus from the transcripts +mkdir -p $dir + +for f in "$text" "$lexicon"; do + [ ! -f $f ] && echo "$0: No such file $f" && exit 1; +done + +loc=`which ngram-count`; +if [ -z $loc ]; then + if uname -a | grep 64 >/dev/null; then # some kind of 64 bit... + sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64 + else + sdir=$KALDI_ROOT/tools/srilm/bin/i686 + fi + if [ -f $sdir/ngram-count ]; then + echo Using SRILM tools from $sdir + export PATH=$PATH:$sdir + else + echo You appear to not have SRILM tools installed, either on your path, + echo or installed in $sdir. See tools/install_srilm.sh for installation + echo instructions. + exit 1 + fi +fi + +cat data/train/text | cut -d " " -f 2- > $dir/text.txt +cat data/test/text | cut -d ' ' -f2- > $dir/dev.txt +cut -d' ' -f1 $lexicon > $dir/wordlist + +ngram-count -text $dir/text.txt -order $order -vocab $dir/wordlist \ + -unk -map-unk "" -wbdiscount1 -kndiscount2 -kndiscount3 -kndiscount4 -kndiscount5 -kndiscount6 -interpolate -lm $dir/lm.gz + +ngram -order $order -lm $dir/lm.gz -ppl $dir/dev.txt +echo "*** Finished building the LM model!" diff --git a/egs/gale_arabic/s5c/local/score.sh b/egs/gale_arabic/s5c/local/score.sh new file mode 100755 index 00000000000..1d84815fc69 --- /dev/null +++ b/egs/gale_arabic/s5c/local/score.sh @@ -0,0 +1,6 @@ + +#!/bin/bash + + +steps/scoring/score_kaldi_wer.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" diff --git a/egs/gale_arabic/s5c/local/split_wer.sh b/egs/gale_arabic/s5c/local/split_wer.sh new file mode 100755 index 00000000000..d83a0f79e8c --- /dev/null +++ b/egs/gale_arabic/s5c/local/split_wer.sh @@ -0,0 +1,72 @@ +#!/bin/bash + +# Report WER for reports and conversational +# Copyright 2014 QCRI (author: Ahmed Ali) +# Apache 2.0 + +if [ $# -ne 1 ]; then + echo "Arguments should be the gale folder, see ../run.sh for example." + exit 1; +fi + +[ -f ./path.sh ] && . 
./path.sh + + +galeFolder=$(utils/make_absolute.sh $1) +symtab=./data/lang/words.txt +find exp/ -maxdepth 3 -type d -name decode\* > list_decode$$ + +#split the test set per type: +awk '{print $2}' $galeFolder/all.test | sort -u > $galeFolder/test_id$$ + +# generate the report test set +awk '{print $2}' $galeFolder/report | sort -u > $galeFolder/report_id$$ +comm -1 -2 $galeFolder/test_id$$ $galeFolder/report_id$$ > $galeFolder/report.test + +# generate the conversational test set +awk '{print $2}' $galeFolder/conversational | sort -u > $galeFolder/conversational_id$$ + +comm -1 -2 $galeFolder/test_id$$ $galeFolder/conversational_id$$ > $galeFolder/conversational.test + +rm -fr $galeFolder/test_id$$ $galeFolder/report_id$$ $galeFolder/conversational_id$$ + +min_lmwt=7 +max_lmwt=20 +cat list_decode$$ | while read dir; do + for type in report conversational; do + #echo "Processing: $dir $type" + rm -fr $dir/scoring_$type + cp -pr $dir/scoring $dir/scoring_$type + ( cd $dir/scoring_$type; + for x in *.tra test_filt.txt; do + sort -u $x > tmp$$ + join tmp$$ $galeFolder/${type}.test > $x + rm -fr tmp$$ + done + ) + +utils/run.pl LMWT=$min_lmwt:$max_lmwt $dir/scoring_$type/log/score.LMWT.log \ + cat $dir/scoring_${type}/LMWT.tra \| \ + utils/int2sym.pl -f 2- $symtab \| sed 's:\::g' \| \ + compute-wer --text --mode=present \ + ark:$dir/scoring_${type}/test_filt.txt ark,p:- ">&" $dir/wer_${type}_LMWT +done +done + + +time=$(date +"%Y-%m-%d-%H-%M-%S") +echo "RESULTS generated by $USER at $time" + +echo "Report Results WER:" +cat list_decode$$ | while read x; do [ -d $x ] && grep WER $x/wer_report_* | utils/best_wer.sh; done | sort -n -k2 + +echo "Conversational Results WER:" +cat list_decode$$ | while read x; do [ -d $x ] && grep WER $x/wer_conversational_* | utils/best_wer.sh; done | sort -n -k2 + +echo "Combined Results for Reports and Conversational WER:" +cat list_decode$$ | while read x; do [ -d $x ] && grep WER $x/wer_?? $x/wer_?| utils/best_wer.sh; done | sort -n -k2 + +rm list_decode$$ + + + diff --git a/egs/gale_arabic/s5c/local/test_list b/egs/gale_arabic/s5c/local/test_list new file mode 100644 index 00000000000..d82cf498804 --- /dev/null +++ b/egs/gale_arabic/s5c/local/test_list @@ -0,0 +1,11 @@ +ALAM_WITHEVENT_ARB_20070116_205800 +ALAM_WITHEVENT_ARB_20070130_205800 +ALAM_WITHEVENT_ARB_20070206_205801 +ALAM_WITHEVENT_ARB_20070213_205800 +ALAM_WITHEVENT_ARB_20070227_205800 +ALAM_WITHEVENT_ARB_20070306_205800 +ALAM_WITHEVENT_ARB_20070313_205800 +ARABIYA_FROMIRAQ_ARB_20070216_175800 +ARABIYA_FROMIRAQ_ARB_20070223_175801 +ARABIYA_FROMIRAQ_ARB_20070302_175801 +ARABIYA_FROMIRAQ_ARB_20070309_175800 diff --git a/egs/gale_arabic/s5c/local/wer_output_filter b/egs/gale_arabic/s5c/local/wer_output_filter new file mode 100755 index 00000000000..fcd40539e7f --- /dev/null +++ b/egs/gale_arabic/s5c/local/wer_output_filter @@ -0,0 +1,4 @@ +#!/bin/sed -f +s/@@ //g +s///g +s///g diff --git a/egs/gale_arabic/s5c/path.sh b/egs/gale_arabic/s5c/path.sh new file mode 100755 index 00000000000..be11b34cbc6 --- /dev/null +++ b/egs/gale_arabic/s5c/path.sh @@ -0,0 +1,5 @@ +export KALDI_ROOT=$(pwd)/../../.. +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. 
$KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/gale_arabic/s5c/run.sh b/egs/gale_arabic/s5c/run.sh new file mode 100755 index 00000000000..3e363816812 --- /dev/null +++ b/egs/gale_arabic/s5c/run.sh @@ -0,0 +1,131 @@ +#!/bin/bash -e + +# Copyright 2014 QCRI (author: Ahmed Ali) +# 2019 Dongji Gao +# Apache 2.0 + +# This is an example script for subword implementation + +num_jobs=120 +num_decode_jobs=40 +decode_gmm=true +stage=0 +overwrite=false +num_merges=1000 + +dir1=/export/corpora/LDC/LDC2013S02/ +dir2=/export/corpora/LDC/LDC2013S07/ +dir3=/export/corpora/LDC/LDC2014S07/ +text1=/export/corpora/LDC/LDC2013T17/ +text2=/export/corpora/LDC/LDC2013T04/ +text3=/export/corpora/LDC/LDC2014T17/ + +galeData=GALE +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. ./path.sh +. ./utils/parse_options.sh # e.g. this parses the above options + # if supplied. + +if [ $stage -le 0 ]; then + + if [ -f data/train/text ] && ! $overwrite; then + echo "$0: Not processing, probably script have run from wrong stage" + echo "Exiting with status 1 to avoid data corruption" + exit 1; + fi + + echo "$0: preparing data..." + local/prepare_data.sh --dir1 $dir1 --dir2 $dir2 --dir3 $dir3 \ + --text1 $text1 --text2 $text2 --text3 $text3 + + echo "$0: Preparing lexicon and LM..." + local/prepare_dict_subword.sh --num_merges $num_merges + + utils/subword/prepare_lang_subword.sh data/local/dict "" data/local/lang data/lang + + for set in train test; do + utils/subword/prepare_subword_text.sh data/$set/text data/local/pair_code.txt data/$set/text + done + + local/prepare_lm_subword.sh + + utils/format_lm.sh data/lang data/local/lm/lm.gz \ + data/local/dict/lexicon.txt data/lang_test +fi + +mfccdir=mfcc +if [ $stage -le 1 ]; then + echo "$0: Preparing the test and train feature files..." 
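+  # (Optional sanity check, not required by the recipe: once the features exist,
+  # their dimension can be inspected with, e.g.,
+  #   feat-to-dim scp:data/train/feats.scp -
+  # which should print 13 for the default conf/mfcc.conf.)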
+ for x in train test ; do + steps/make_mfcc.sh --cmd "$train_cmd" --nj $num_jobs \ + data/$x exp/make_mfcc/$x $mfccdir + utils/fix_data_dir.sh data/$x # some files fail to get mfcc for many reasons + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir + done +fi + +if [ $stage -le 2 ]; then + echo "$0: creating sub-set and training monophone system" + utils/subset_data_dir.sh data/train 10000 data/train.10K || exit 1; + + steps/train_mono.sh --nj 40 --cmd "$train_cmd" \ + data/train.10K data/lang exp/mono_subword || exit 1; +fi + +if [ $stage -le 3 ]; then + echo "$0: Aligning data using monophone system" + steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ + data/train data/lang exp/mono_subword exp/mono_ali_subword || exit 1; + + echo "$0: training triphone system with delta features" + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 30000 data/train data/lang exp/mono_ali_subword exp/tri1_subword || exit 1; +fi + +if [ $stage -le 4 ] && $decode_gmm; then + utils/mkgraph.sh data/lang_test exp/tri1_subword exp/tri1_subword/graph + steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \ + exp/tri1_subword/graph data/test exp/tri1_subword/decode +fi + +if [ $stage -le 5 ]; then + echo "$0: Aligning data and retraining and realigning with lda_mllt" + steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ + data/train data/lang exp/tri1_subword exp/tri1_ali_subword || exit 1; + + steps/train_lda_mllt.sh --cmd "$train_cmd" 4000 50000 \ + data/train data/lang exp/tri1_ali_subword exp/tri2b_subword || exit 1; +fi + +if [ $stage -le 6 ] && $decode_gmm; then + utils/mkgraph.sh data/lang_test exp/tri2b_subword exp/tri2b_subword/graph + steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \ + exp/tri2b_subword/graph data/test exp/tri2b_subword/decode +fi + +if [ $stage -le 7 ]; then + echo "$0: Aligning data and retraining and realigning with sat_basis" + steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ + data/train data/lang exp/tri2b_subword exp/tri2b_ali_subword || exit 1; + + steps/train_sat_basis.sh --cmd "$train_cmd" \ + 5000 100000 data/train data/lang exp/tri2b_ali_subword exp/tri3b_subword || exit 1; + + steps/align_fmllr.sh --nj $num_jobs --cmd "$train_cmd" \ + data/train data/lang exp/tri3b_subword exp/tri3b_ali_subword || exit 1; +fi + +if [ $stage -le 8 ] && $decode_gmm; then + utils/mkgraph.sh data/lang_test exp/tri3b_subword exp/tri3b_subword/graph + steps/decode_fmllr.sh --nj $num_decode_jobs --cmd \ + "$decode_cmd" exp/tri3b_subword/graph data/test exp/tri3b_subword/decode +fi + +if [ $stage -le 9 ]; then + echo "$0: Training a regular chain model using the e2e alignments..." 
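+  # run_tdnn.sh points to tuning/run_tdnn_1a.sh; the --gmm option below makes the
+  # chain system build on the tri3b_subword system trained in stage 7. After
+  # training, systems can be compared with, e.g.,
+  #   local/chain/compare_wer.sh exp/chain/tdnn_1a_sp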
+ local/chain/run_tdnn.sh --gmm tri3b_subword +fi + +echo "$0: training succeed" +exit 0 diff --git a/egs/gale_arabic/s5c/steps b/egs/gale_arabic/s5c/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/gale_arabic/s5c/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/gale_arabic/s5c/utils b/egs/gale_arabic/s5c/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/gale_arabic/s5c/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file diff --git a/egs/gale_mandarin/s5/local/gale_prep_dict.sh b/egs/gale_mandarin/s5/local/gale_prep_dict.sh index 2e2810bb713..c6a80240754 100755 --- a/egs/gale_mandarin/s5/local/gale_prep_dict.sh +++ b/egs/gale_mandarin/s5/local/gale_prep_dict.sh @@ -130,7 +130,9 @@ unset LC_ALL # are equal cat $dict_dir/ch-dict.txt |\ perl -e ' - use encoding utf8; + use utf8; + binmode(STDIN,":encoding(utf8)"); + binmode(STDOUT,":encoding(utf8)"); while () { @A = split(" ", $_); $word_len = length($A[0]); @@ -299,4 +301,3 @@ cat $dict_dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", export LC_ALL=C echo "$0: Done" - diff --git a/egs/gp/s1/local/gp_convert_audio.sh b/egs/gp/s1/local/gp_convert_audio.sh index a7c2d7285c4..b3db909c9b6 100755 --- a/egs/gp/s1/local/gp_convert_audio.sh +++ b/egs/gp/s1/local/gp_convert_audio.sh @@ -108,4 +108,4 @@ done < "$INLIST" echo "sox: error converting following $nsoxerr file(s):" >&2 [ -f "$soxerr" ] && cat "$soxerr" >&2 -exit 0; \ No newline at end of file +exit 0; diff --git a/egs/gp/s1/utils/mkgraph.sh b/egs/gp/s1/utils/mkgraph.sh index 2e45296593b..3aba742832d 100755 --- a/egs/gp/s1/utils/mkgraph.sh +++ b/egs/gp/s1/utils/mkgraph.sh @@ -131,4 +131,4 @@ cp $lang/silphones.csl $dir/ # to make const fst: # fstconvert --fst_type=const $dir/HCLG.fst $dir/HCLG_c.fst -echo "Finished making decoding graphs in $dir" \ No newline at end of file +echo "Finished making decoding graphs in $dir" diff --git a/egs/heroico/s5/cmd.sh b/egs/heroico/s5/cmd.sh index a427f3c16a5..533aad25db1 100755 --- a/egs/heroico/s5/cmd.sh +++ b/egs/heroico/s5/cmd.sh @@ -10,6 +10,7 @@ # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
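+# For example, on a single machine with no grid software the exports below could
+# instead use run.pl (a sketch; run.pl runs jobs locally and typically ignores
+# resource options such as --mem):
+#   export train_cmd="run.pl"
+#   export decode_cmd="run.pl --mem 2G"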
+export cmd="retry.pl queue.pl" export train_cmd="retry.pl queue.pl" export decode_cmd="retry.pl queue.pl --mem 2G" diff --git a/egs/heroico/s5/local/chain/tuning/run_cnn_tdnn_1a.sh b/egs/heroico/s5/local/chain/tuning/run_cnn_tdnn_1a.sh index 1112f0ec08b..361879b4142 100755 --- a/egs/heroico/s5/local/chain/tuning/run_cnn_tdnn_1a.sh +++ b/egs/heroico/s5/local/chain/tuning/run_cnn_tdnn_1a.sh @@ -149,7 +149,7 @@ if [ $stage -le 13 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) cnn_opts="l2-regularize=0.03" ivector_layer_opts="l2-regularize=0.03" diff --git a/egs/heroico/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/heroico/s5/local/chain/tuning/run_tdnn_1a.sh index 6dde42bef79..290bd4c7970 100755 --- a/egs/heroico/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/heroico/s5/local/chain/tuning/run_tdnn_1a.sh @@ -150,7 +150,7 @@ if [ $stage -le 13 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.01" output_opts="l2-regularize=0.0025" diff --git a/egs/heroico/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/heroico/s5/local/chain/tuning/run_tdnn_1b.sh index d255d85327f..cfb4dc1f697 100755 --- a/egs/heroico/s5/local/chain/tuning/run_tdnn_1b.sh +++ b/egs/heroico/s5/local/chain/tuning/run_tdnn_1b.sh @@ -151,7 +151,7 @@ if [ $stage -le 13 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) affine_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" diff --git a/egs/heroico/s5/local/heroico_download.sh b/egs/heroico/s5/local/heroico_download.sh deleted file mode 100755 index 9c58fe37537..00000000000 --- a/egs/heroico/s5/local/heroico_download.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -# Copyright 2018 John Morgan -# Apache 2.0. - -speech=$1 -lexicon=$2 - -download_dir=$(pwd) -tmpdir=data/local/tmp -data_dir=$tmpdir/LDC2006S37/data - -mkdir -p $tmpdir - -# download the corpus from openslr - -if [ ! -f $download_dir/heroico.tar.gz ]; then - wget -O $download_dir/heroico.tar.gz $speech - - ( - cd $download_dir - tar -xzf heroico.tar.gz - ) -fi - -mkdir -p data/local/dict $tmpdir/dict - -# download the dictionary from openslr - -if [ ! 
-f $download_dir/santiago.tar.gz ]; then - wget -O $download_dir/santiago.tar.gz $lexicon -fi - -( - cd $download_dir - tar -xzf santiago.tar.gz -) diff --git a/egs/heroico/s5/local/subs_prepare_data.pl b/egs/heroico/s5/local/subs_prepare_data.pl index a7e0cfb0c6e..e39db79f610 100755 --- a/egs/heroico/s5/local/subs_prepare_data.pl +++ b/egs/heroico/s5/local/subs_prepare_data.pl @@ -19,7 +19,7 @@ # input and output files -my $corpus = "OpenSubtitles2018.en-es.es"; +my $corpus = "OpenSubtitles.en-es.es"; my $symbol_table = "data/lang/words.txt"; my $filtered = "data/local/tmp/subs/lm/es.txt"; my $oovs = "data/local/tmp/subs/lm/oovs.txt"; diff --git a/egs/heroico/s5/run.sh b/egs/heroico/s5/run.sh index 67ad87e55f9..4cc5617e985 100755 --- a/egs/heroico/s5/run.sh +++ b/egs/heroico/s5/run.sh @@ -9,11 +9,11 @@ stage=0 datadir=/export/corpora5/LDC/LDC2006S37 # The corpus and lexicon are on openslr.org -speech="http://www.openslr.org/resources/39/LDC2006S37.tar.gz" -lexicon="http://www.openslr.org/resources/34/santiago.tar.gz" +#speech_url="http://www.openslr.org/resources/39/LDC2006S37.tar.gz" +lexicon_url="http://www.openslr.org/resources/34/santiago.tar.gz" # Location of the Movie subtitles text corpus -subs_src="http://opus.lingfil.uu.se/download.php?f=OpenSubtitles2018/en-es.txt.zip" +subtitles_url="http://opus.lingfil.uu.se/download.php?f=OpenSubtitles2018/en-es.txt.zip" . utils/parse_options.sh @@ -26,14 +26,22 @@ set -u tmpdir=data/local/tmp if [ $stage -le 0 ]; then - # download the corpus from openslr - local/heroico_download.sh $speech $lexicon + if [ ! -d $datadir ]; then + echo "$0: please download and un-tar http://www.openslr.org/resources/39/LDC2006S37.tar.gz" + echo " and set $datadir to the directory where it is located." + exit 1 + fi + if [ ! -s santiago.txt ]; then + echo "$0: downloading the lexicon" + wget -c http://www.openslr.org/resources/34/santiago.tar.gz + tar -xvzf santiago.tar.gz + fi # Get data for lm training - local/subs_download.sh $subs_src + local/subs_download.sh $subtitles_url fi if [ $stage -le 1 ]; then - echo "Makin lists for building models." + echo "Making lists for building models." 
local/prepare_data.sh $datadir fi diff --git a/egs/hkust/s5/local/chain/tuning/run_tdnn_2a.sh b/egs/hkust/s5/local/chain/tuning/run_tdnn_2a.sh index f771387785c..c62b776de2b 100755 --- a/egs/hkust/s5/local/chain/tuning/run_tdnn_2a.sh +++ b/egs/hkust/s5/local/chain/tuning/run_tdnn_2a.sh @@ -109,7 +109,7 @@ if [ $stage -le 12 ]; then ivector_dim=$(feat-to-dim scp:exp/nnet3/ivectors_${train_set}/ivector_online.scp -) feat_dim=$(feat-to-dim scp:data/${train_set}_hires/feats.scp -) num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.004 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" linear_opts="orthonormal-constraint=-1.0 l2-regularize=0.004" output_opts="l2-regularize=0.002" diff --git a/egs/hkust/s5/local/create_oov_char_lexicon.pl b/egs/hkust/s5/local/create_oov_char_lexicon.pl index 0c146c9a123..33e2e8061c3 100755 --- a/egs/hkust/s5/local/create_oov_char_lexicon.pl +++ b/egs/hkust/s5/local/create_oov_char_lexicon.pl @@ -25,15 +25,17 @@ exit; } -use encoding utf8; +use utf8; my %prons; open(DICT, $ARGV[0]) || die("Can't open dict ".$ARGV[0]."\n"); +binmode(DICT,":encoding(utf8)"); foreach () { chomp; @A = split(" ", $_); $prons{$A[0]} = $A[1]; } close DICT; open(WORDS, $ARGV[1]) || die("Can't open oov word list ".$ARGV[1]."\n"); +binmode(WORDS,":encoding(utf8)"); while () { chomp; print $_; diff --git a/egs/hkust/s5/local/hkust_data_prep.sh b/egs/hkust/s5/local/hkust_data_prep.sh index 207f03af36b..6342ccfe861 100755 --- a/egs/hkust/s5/local/hkust_data_prep.sh +++ b/egs/hkust/s5/local/hkust_data_prep.sh @@ -1,5 +1,5 @@ #!/bin/bash - + . ./path.sh || exit 1; if [ $# != 2 ]; then @@ -14,6 +14,11 @@ hkust_text_dir=$2 train_dir=data/local/train dev_dir=data/local/dev +# transcripts normalization and segmentation +# needs external tools +python2 -c "import mmseg" 2>/dev/null || { + echo "Python module mmseg is not found. To install it, run tools/extra/install_mmseg.sh"; exit 1; } + mkdir -p $train_dir mkdir -p $dev_dir @@ -35,7 +40,7 @@ n=`cat $train_dir/sph.flist $dev_dir/sph.flist | wc -l` #collect all trans, convert encodings to utf-8, find $hkust_text_dir -iname "*.txt" | grep -i "trans/train" | xargs cat |\ - iconv -f GBK -t utf-8 - | perl -e ' + iconv -f GBK -t UTF-8 | perl -e ' while () { @A = split(" ", $_); if (@A <= 1) { next; } @@ -50,7 +55,7 @@ find $hkust_text_dir -iname "*.txt" | grep -i "trans/train" | xargs cat |\ ' | sort -k1 > $train_dir/transcripts.txt || exit 1; find $hkust_text_dir -iname "*.txt" | grep -i "trans/dev" | xargs cat |\ - iconv -f GBK -t utf-8 - | perl -e ' + iconv -f GBK -t UTF-8 | perl -e ' while () { @A = split(" ", $_); if (@A <= 1) { next; } @@ -65,17 +70,13 @@ find $hkust_text_dir -iname "*.txt" | grep -i "trans/dev" | xargs cat |\ ' | sort -k1 > $dev_dir/transcripts.txt || exit 1; #transcripts normalization and segmentation -#(this needs external tools), -python -c "import mmseg" 2>/dev/null || \ - (echo "mmseg is not found. 
Checkout tools/extra/install_mmseg.sh" && exit 1;) - cat $train_dir/transcripts.txt |\ sed -e 's// /g' |\ sed -e 's/<\/foreign>/ /g' |\ sed -e 's/\(.\+\)<\/noise>/\1/g' |\ sed -e 's/((\([^)]\{0,\}\)))/\1/g' |\ local/hkust_normalize.pl |\ - python local/hkust_segment.py |\ + local/hkust_segment.py |\ awk '{if (NF > 1) print $0;}' > $train_dir/text || exit 1; cat $dev_dir/transcripts.txt |\ @@ -84,7 +85,7 @@ cat $dev_dir/transcripts.txt |\ sed -e 's/\(.\+\)<\/noise>/\1/g' |\ sed -e 's/((\([^)]\{0,\}\)))/\1/g' |\ local/hkust_normalize.pl |\ - python local/hkust_segment.py |\ + local/hkust_segment.py |\ awk '{if (NF > 1) print $0;}' > $dev_dir/text || exit 1; # some data is corrupted. Delete them diff --git a/egs/hkust/s5/local/hkust_prepare_dict.sh b/egs/hkust/s5/local/hkust_prepare_dict.sh index 27d1060e945..49f27f2f868 100755 --- a/egs/hkust/s5/local/hkust_prepare_dict.sh +++ b/egs/hkust/s5/local/hkust_prepare_dict.sh @@ -176,7 +176,9 @@ wc -l $dict_dir/lexicon-ch/lexicon-ch-iv.txt # dictionary in order to get OOV pronunciations cat $dict_dir/cedict/ch-dict.txt |\ perl -e ' - use encoding utf8; + use utf8; + binmode(STDIN,":encoding(utf8)"); + binmode(STDOUT,":encoding(utf8)"); while () { @A = split(" ", $_); $word_len = length($A[0]); @@ -188,7 +190,9 @@ cat $dict_dir/cedict/ch-dict.txt |\ # extract chars cat $dict_dir/cedict/ch-dict-1.txt | awk '{print $1}' |\ perl -e ' - use encoding utf8; + use utf8; + binmode(STDIN,":encoding(utf8)"); + binmode(STDOUT,":encoding(utf8)"); while () { @A = split(" ", $_); @chars = split("", $A[0]); diff --git a/egs/hkust/s5/local/hkust_segment.py b/egs/hkust/s5/local/hkust_segment.py index 92d3add0e3e..d4c2b35a668 100755 --- a/egs/hkust/s5/local/hkust_segment.py +++ b/egs/hkust/s5/local/hkust_segment.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python2.7 #coding:utf-8 from __future__ import print_function diff --git a/egs/hub4_spanish/s5/local/chain/tuning/run_cnn_tdnn_1a.sh b/egs/hub4_spanish/s5/local/chain/tuning/run_cnn_tdnn_1a.sh index 81915fec5a6..d1b657a2d74 100755 --- a/egs/hub4_spanish/s5/local/chain/tuning/run_cnn_tdnn_1a.sh +++ b/egs/hub4_spanish/s5/local/chain/tuning/run_cnn_tdnn_1a.sh @@ -147,7 +147,7 @@ if [ $stage -le 13 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) cnn_opts="l2-regularize=0.03" ivector_affine_opts="l2-regularize=0.03" diff --git a/egs/hub4_spanish/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/hub4_spanish/s5/local/chain/tuning/run_tdnn_1a.sh index 23a55f93023..40bbbe1ae79 100755 --- a/egs/hub4_spanish/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/hub4_spanish/s5/local/chain/tuning/run_tdnn_1a.sh @@ -136,7 +136,7 @@ if [ $stage -le 13 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/hub4_spanish/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/hub4_spanish/s5/local/chain/tuning/run_tdnn_1b.sh index 724bb1e0794..a498d8157f3 100755 --- a/egs/hub4_spanish/s5/local/chain/tuning/run_tdnn_1b.sh +++ b/egs/hub4_spanish/s5/local/chain/tuning/run_tdnn_1b.sh @@ -147,7 +147,7 @@ if [ $stage 
-le 13 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" diff --git a/egs/iam/v1/RESULTS b/egs/iam/v1/RESULTS new file mode 100644 index 00000000000..b25cb3cd772 --- /dev/null +++ b/egs/iam/v1/RESULTS @@ -0,0 +1,42 @@ +Run_end2end.sh (WER using lang_test, lang_unk) +flat_start: + • %WER 14.41 [ 2671 / 18542, 262 ins, 561 del, 1848 sub ] exp/chain/e2e_cnn_1a/decode_test/wer_11_1.0 + • %WER 15.21 [ 2821 / 18542, 375 ins, 500 del, 1946 sub ] exp/chain/e2e_cnn_1a/decode_test/wer_11_1.0 + +cnn_e2eali_1a: + • %WER 11.94 [ 2214 / 18542, 267 ins, 380 del, 1567 sub ] exp/chain/cnn_e2eali_1a/decode_test/wer_9_1.0 + • %WER 13.30 [ 2467 / 18542, 441 ins, 330 del, 1696 sub ] exp/chain/cnn_e2eali_1a/decode_test/wer_9_0.5 + +cnn_e2eali_1b: + • %WER 11.20 [ 2076 / 18542, 260 ins, 335 del, 1481 sub ] exp/chain/cnn_e2eali_1b/decode_test/wer_9_1.0 + • %WER 12.46 [ 2311 / 18542, 371 ins, 326 del, 1614 sub ] exp/chain/cnn_e2eali_1b/decode_test/wer_9_1.0 + +cnn_e2eali_1c: + • %WER 9.90 [ 1836 / 18542, 257 ins, 227 del, 1352 sub ] exp/chain/cnn_e2eali_1c/decode_test/wer_10_1.0 + • %WER 12.10 [ 2243 / 18542, 411 ins, 269 del, 1563 sub ] exp/chain/cnn_e2eali_1c/decode_test/wer_12_0.5 + + +Run.sh (WER using lang_test, lang_unk) +cnn_1a: + • %WER 15.18 [ 2815 / 18542, 285 ins, 509 del, 2021 sub ] exp/chain/cnn_1a/decode_test/wer_11_0.0 + • %WER 16.88 [ 3130 / 18542, 444 ins, 611 del, 2075 sub ] exp/chain/cnn_1a/decode_test/wer_11_0.0 + +cnn_chainali_1a: + • %WER 14.09 [ 2612 / 18542, 245 ins, 505 del, 1862 sub ] exp/chain/cnn_chainali_1a/decode_test/wer_13_0.0 + • %WER 15.93 [ 2954 / 18542, 454 ins, 470 del, 2030 sub ] exp/chain/cnn_chainali_1a/decode_test/wer_10_0.0 + +cnn_chainali_1b: + • %WER 13.29 [ 2465 / 18542, 221 ins, 499 del, 1745 sub ] exp/chain/cnn_chainali_1b/decode_test/wer_12_0.5 + • %WER 15.09 [ 2798 / 18542, 418 ins, 468 del, 1912 sub ] exp/chain/cnn_chainali_1b/decode_test/wer_10_0.5 + +cnn_chainali_1c: + • %WER 11.59 [ 2149 / 18542, 276 ins, 362 del, 1511 sub ] exp/chain/cnn_chainali_1c/decode_test/wer_9_0.0 + • %WER 13.75 [ 2550 / 18542, 465 ins, 368 del, 1717 sub ] exp/chain/cnn_chainali_1c/decode_test/wer_8_0.0 + +cnn_chainali_1d: + • %WER 11.07 [ 2053 / 18542, 261 ins, 311 del, 1481 sub ] exp/chain/cnn_chainali_1c/decode_test/wer_9_0.0 + • %WER 12.95 [ 2402 / 18542, 436 ins, 313 del, 1653 sub ] exp/chain/cnn_chainali_1c/decode_test/wer_8_0.0 + +cnn_chainali_1e: + • %WER 10.03 [ 1859 / 18542, 226 ins, 291 del, 1342 sub ] exp/chain/cnn_chainali_1e/decode_test/wer_11_0.5 + %WER 12.15 [ 2253 / 18542, 406 ins, 282 del, 1565 sub ] exp/chain/cnn_chainali_1e/decode_test/wer_10_0.5 diff --git a/egs/iam/v1/local/chain/tuning/run_cnn_1a.sh b/egs/iam/v1/local/chain/tuning/run_cnn_1a.sh index 1253bbe5aa3..ef1273f3961 100755 --- a/egs/iam/v1/local/chain/tuning/run_cnn_1a.sh +++ b/egs/iam/v1/local/chain/tuning/run_cnn_1a.sh @@ -128,7 +128,7 @@ if [ $stage -le 4 ]; then mkdir -p $dir echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo 
"print (0.5/$xent_regularize)" | python) common1="height-offsets=-2,-1,0,1,2 num-filters-out=36" common2="height-offsets=-2,-1,0,1,2 num-filters-out=70" mkdir -p $dir/configs diff --git a/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1a.sh b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1a.sh index a8d7f6c6091..bbcc55aa2b0 100755 --- a/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1a.sh +++ b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1a.sh @@ -125,7 +125,7 @@ if [ $stage -le 4 ]; then mkdir -p $dir echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) common1="height-offsets=-2,-1,0,1,2 num-filters-out=36" common2="height-offsets=-2,-1,0,1,2 num-filters-out=70" common3="height-offsets=-1,0,1 num-filters-out=70" diff --git a/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1b.sh b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1b.sh index f5dbb93e7b7..401ffa14e19 100755 --- a/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1b.sh +++ b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1b.sh @@ -124,7 +124,7 @@ if [ $stage -le 4 ]; then mkdir -p $dir echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) common1="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" common2="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" common3="required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" diff --git a/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1c.sh b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1c.sh index 1dd83c5078f..17209b9204f 100755 --- a/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1c.sh +++ b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1c.sh @@ -122,7 +122,7 @@ if [ $stage -le 4 ]; then mkdir -p $dir echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) cnn_opts="l2-regularize=0.075" tdnn_opts="l2-regularize=0.075" output_opts="l2-regularize=0.1" diff --git a/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1d.sh b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1d.sh index 3979b3d2da0..89a40ed2a13 100755 --- a/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1d.sh +++ b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1d.sh @@ -127,7 +127,7 @@ if [ $stage -le 4 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) cnn_opts="l2-regularize=0.075" tdnn_opts="l2-regularize=0.075" output_opts="l2-regularize=0.1" diff --git a/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1a.sh b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1a.sh index f95f6a90ca1..703d404159a 100755 --- a/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1a.sh +++ b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1a.sh @@ -121,7 +121,7 @@ if [ $stage -le 4 ]; then mkdir -p $dir echo "$0: 
creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) cnn_opts="l2-regularize=0.075" tdnn_opts="l2-regularize=0.075" output_opts="l2-regularize=0.1" diff --git a/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1b.sh b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1b.sh index 81700ce2180..905c4661477 100755 --- a/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1b.sh +++ b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1b.sh @@ -117,7 +117,7 @@ if [ $stage -le 4 ]; then mkdir -p $dir echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) cnn_opts="l2-regularize=0.075" tdnn_opts="l2-regularize=0.075" output_opts="l2-regularize=0.1" diff --git a/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1c.sh b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1c.sh index 047d673db17..26b1aca0929 100755 --- a/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1c.sh +++ b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1c.sh @@ -119,7 +119,7 @@ if [ $stage -le 4 ]; then mkdir -p $dir echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) cnn_opts="l2-regularize=0.03 dropout-proportion=0.0" tdnn_opts="l2-regularize=0.03" output_opts="l2-regularize=0.04" diff --git a/egs/iam/v1/local/prepare_dict.sh b/egs/iam/v1/local/prepare_dict.sh index f691d577fba..7451f6b85f7 100755 --- a/egs/iam/v1/local/prepare_dict.sh +++ b/egs/iam/v1/local/prepare_dict.sh @@ -38,7 +38,7 @@ while(<>){ }' | sort -u > $dir/lexicon.txt -sed -i "s/#//" $dir/nonsilence_phones.txt +perl -i -pe "s/#//" $dir/nonsilence_phones.txt echo ' SIL' >> $dir/lexicon.txt echo ' SIL' >> $dir/lexicon.txt diff --git a/egs/iam/v1/local/train_lm.sh b/egs/iam/v1/local/train_lm.sh index 911f54c5439..3e8c838efdb 100755 --- a/egs/iam/v1/local/train_lm.sh +++ b/egs/iam/v1/local/train_lm.sh @@ -60,7 +60,7 @@ if [ $stage -le 0 ]; then # Using LOB and brown corpus. if [ ! 
-f data/local/lob-train-only.txt ]; then cat data/local/lobcorpus/0167/download/LOB_COCOA/lob.txt | \ - local/remove_test_utterances_from_lob.py data/test/text.old data/val/text.old \ + local/remove_test_utterances_from_lob.py data/test/text data/val/text \ > data/local/lob-train-only.txt fi cat data/local/lob-train-only.txt > ${dir}/data/text/lob.txt diff --git a/egs/iam/v1/local/unk_arc_post_to_transcription.py b/egs/iam/v1/local/unk_arc_post_to_transcription.py index f8b69820601..1f1404b5165 100755 --- a/egs/iam/v1/local/unk_arc_post_to_transcription.py +++ b/egs/iam/v1/local/unk_arc_post_to_transcription.py @@ -25,6 +25,7 @@ data/lang/oov.int """ import argparse +import io import os import sys parser = argparse.ArgumentParser(description="""uses phones to convert unk to word""") @@ -42,17 +43,17 @@ args = parser.parse_args() ### main ### -phone_handle = open(args.phones, 'r', encoding='latin-1') # Create file handles -word_handle = open(args.words, 'r', encoding='latin-1') -unk_handle = open(args.unk,'r', encoding='latin-1') +phone_handle = open(args.phones, 'r', encoding='utf8') # Create file handles +word_handle = open(args.words, 'r', encoding='utf8') +unk_handle = open(args.unk,'r', encoding='utf8') if args.one_best_arc_post == '-': - arc_post_handle = sys.stdin + arc_post_handle = io.TextIOWrapper(sys.stdin.buffer, encoding='utf8') else: - arc_post_handle = open(args.one_best_arc_post, 'r', encoding='latin-1') + arc_post_handle = open(args.one_best_arc_post, 'r', encoding='utf8') if args.output_text == '-': - output_text_handle = sys.stdout + output_text_handle = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8') else: - output_text_handle = open(args.output_text, 'w', encoding='latin-1') + output_text_handle = open(args.output_text, 'w', encoding='utf8') id2phone = dict() # Stores the mapping from phone_id (int) to phone (char) phones_data = phone_handle.read().strip().split("\n") diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh index a80bb02290b..9a01688ba35 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh @@ -135,7 +135,7 @@ if [ $stage -le 4 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) cnn_opts="l2-regularize=0.075" tdnn_opts="l2-regularize=0.075" output_opts="l2-regularize=0.1" diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh index 6615c4669d6..28aa246f334 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh @@ -137,7 +137,7 @@ if [ $stage -le 4 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) cnn_opts="l2-regularize=0.075" tdnn_opts="l2-regularize=0.075" output_opts="l2-regularize=0.1" diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh index f44c073635e..f158317950a 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh +++ 
b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh @@ -139,7 +139,7 @@ if [ $stage -le 4 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) cnn_opts="l2-regularize=0.03 dropout-proportion=0.0" tdnn_opts="l2-regularize=0.03" output_opts="l2-regularize=0.04" diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh index e7d9246fb89..1c44057454a 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh @@ -137,7 +137,7 @@ if [ $stage -le 4 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) cnn_opts="l2-regularize=0.03 dropout-proportion=0.0" tdnn_opts="l2-regularize=0.03" output_opts="l2-regularize=0.04" diff --git a/egs/iam/v2/local/prepare_dict.sh b/egs/iam/v2/local/prepare_dict.sh index e21a59c7e92..714b5b51788 100755 --- a/egs/iam/v2/local/prepare_dict.sh +++ b/egs/iam/v2/local/prepare_dict.sh @@ -39,7 +39,7 @@ while(<>){ }' | sort -u > $dir/lexicon.txt -sed -i "s/#//" $dir/nonsilence_phones.txt +perl -i -pe "s/#//" $dir/nonsilence_phones.txt echo ' SIL' >> $dir/lexicon.txt diff --git a/egs/iban/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/iban/s5/local/chain/tuning/run_tdnn_1a.sh index d320f49d3aa..10650a18269 100755 --- a/egs/iban/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/iban/s5/local/chain/tuning/run_tdnn_1a.sh @@ -136,7 +136,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.08 dropout-per-dim-continuous=true" output_opts="l2-regularize=0.02 bottleneck-dim=256" diff --git a/egs/iban/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/iban/s5/local/chain/tuning/run_tdnn_1b.sh index 56f5255288c..db62e6f8a55 100755 --- a/egs/iban/s5/local/chain/tuning/run_tdnn_1b.sh +++ b/egs/iban/s5/local/chain/tuning/run_tdnn_1b.sh @@ -136,7 +136,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.08 dropout-per-dim=true dropout-per-dim-continuous=true" linear_opts="orthonormal-constraint=-1.0" output_opts="l2-regularize=0.04" diff --git a/egs/ifnenit/v1/local/chain/run_cnn_1a.sh b/egs/ifnenit/v1/local/chain/run_cnn_1a.sh index b0e147d157b..b0ecd547741 100755 --- a/egs/ifnenit/v1/local/chain/run_cnn_1a.sh +++ b/egs/ifnenit/v1/local/chain/run_cnn_1a.sh @@ -123,7 +123,7 @@ if [ $stage -le 4 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | 
python) common1="required-time-offsets=0 height-offsets=-2,-1,0,1,2 num-filters-out=36" common2="required-time-offsets=0 height-offsets=-2,-1,0,1,2 num-filters-out=70" mkdir -p $dir/configs diff --git a/egs/ifnenit/v1/local/chain/run_cnn_chainali_1a.sh b/egs/ifnenit/v1/local/chain/run_cnn_chainali_1a.sh index b1f33b41a0c..7f3132d657e 100755 --- a/egs/ifnenit/v1/local/chain/run_cnn_chainali_1a.sh +++ b/egs/ifnenit/v1/local/chain/run_cnn_chainali_1a.sh @@ -128,7 +128,7 @@ if [ $stage -le 4 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) common1="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" common2="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" common3="required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" diff --git a/egs/librispeech/s5/local/chain/tuning/run_cnn_tdnn_1a.sh b/egs/librispeech/s5/local/chain/tuning/run_cnn_tdnn_1a.sh index 2a60587fc35..8ebca6fd650 100755 --- a/egs/librispeech/s5/local/chain/tuning/run_cnn_tdnn_1a.sh +++ b/egs/librispeech/s5/local/chain/tuning/run_cnn_tdnn_1a.sh @@ -112,7 +112,7 @@ if [ $stage -le 14 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) cnn_opts="l2-regularize=0.01" ivector_affine_opts="l2-regularize=0.0" affine_opts="l2-regularize=0.008 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" diff --git a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1b.sh index 7129827fe19..57f50df761d 100755 --- a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1b.sh +++ b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1b.sh @@ -122,7 +122,7 @@ if [ $stage -le 14 ]; then # create the config files for nnet initialization num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1c.sh b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1c.sh index 29ebe62ddde..3970fa8c4d9 100755 --- a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1c.sh +++ b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1c.sh @@ -112,7 +112,7 @@ if [ $stage -le 14 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.002" linear_opts="orthonormal-constraint=1.0" output_opts="l2-regularize=0.0005 bottleneck-dim=256" diff --git a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1d.sh b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1d.sh index 81b621ef86f..5c488362e59 100755 --- a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1d.sh +++ b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1d.sh @@ -207,7 +207,7 @@ if [ $stage -le 14 ]; then echo "$0: creating neural net configs using the 
xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) affine_opts="l2-regularize=0.008 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" tdnnf_opts="l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.75" linear_opts="l2-regularize=0.008 orthonormal-constraint=-1.0" diff --git a/egs/librispeech/s5/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/librispeech/s5/local/chain/tuning/run_tdnn_lstm_1a.sh index 812bf5e7fc5..4277f769119 100755 --- a/egs/librispeech/s5/local/chain/tuning/run_tdnn_lstm_1a.sh +++ b/egs/librispeech/s5/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -85,7 +85,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.002" linear_opts="orthonormal-constraint=1.0" diff --git a/egs/librispeech/s5/local/chain/tuning/run_tdnn_lstm_1b.sh b/egs/librispeech/s5/local/chain/tuning/run_tdnn_lstm_1b.sh index d9f20fae011..383cc533270 100755 --- a/egs/librispeech/s5/local/chain/tuning/run_tdnn_lstm_1b.sh +++ b/egs/librispeech/s5/local/chain/tuning/run_tdnn_lstm_1b.sh @@ -120,7 +120,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.002" linear_opts="orthonormal-constraint=1.0" diff --git a/egs/librispeech/s5/local/lm/train_lm.sh b/egs/librispeech/s5/local/lm/train_lm.sh index 04badd95b26..6e6ae5970fb 100755 --- a/egs/librispeech/s5/local/lm/train_lm.sh +++ b/egs/librispeech/s5/local/lm/train_lm.sh @@ -50,7 +50,7 @@ if [ "$stage" -le 1 ]; then split_files=$(eval "echo $split_prefix-{$(seq -s',' $normjobs | sed 's/,$//')}") find $corpus_dir -mindepth 1 -maxdepth 1 -type d |\ tee $tmp_dir/all_texts.txt |\ - utils/split_scp.pl - $split_files + utils/split_scp.pl /dev/stdin $split_files echo "Checking the splits ..." total_count=$(wc -l <$tmp_dir/all_texts.txt) split_count=$(cat $split_files | wc -l | awk 'BEGIN{c=0} {c+=$1;} END{print c}') diff --git a/egs/librispeech/s5/local/prepare_dict.sh b/egs/librispeech/s5/local/prepare_dict.sh index f798a804355..f9efb2ee46b 100755 --- a/egs/librispeech/s5/local/prepare_dict.sh +++ b/egs/librispeech/s5/local/prepare_dict.sh @@ -75,7 +75,7 @@ if [ $stage -le 1 ]; then auto_vocab_splits=$(eval "echo $auto_vocab_prefix.{$(seq -s',' $nj | sed 's/,$//')}") awk 'NR==FNR{a[$1] = 1; next} !($1 in a)' $cmudict_plain $vocab |\ sort | tee $g2p_dir/vocab_autogen.full |\ - utils/split_scp.pl - $auto_vocab_splits || exit 1 + utils/split_scp.pl /dev/stdin $auto_vocab_splits || exit 1 echo "Autogenerating pronunciations for the words in $auto_vocab_prefix.* ..." 
$cmd JOB=1:$nj $g2p_dir/log/g2p.JOB.log \ local/g2p.sh $auto_vocab_prefix.JOB $g2p_model_dir $auto_lexicon_prefix.JOB || exit 1 diff --git a/egs/madcat_ar/v1/RESULTS b/egs/madcat_ar/v1/RESULTS new file mode 100644 index 00000000000..357d209f6b9 --- /dev/null +++ b/egs/madcat_ar/v1/RESULTS @@ -0,0 +1,18 @@ + +Subset - Dev: 852, Train: 23564, Test: 923 + +BPE: (subset) (run_end2end.sh) + • %WER 19.34 [ 932 / 4819, 71 ins, 75 del, 786 sub ] exp/chain/e2e_cnn_1a/decode_test/wer_7_0.0 + • %WER 13.70 [ 660 / 4819, 52 ins, 65 del, 543 sub ] exp/chain/cnn_e2eali_1b/decode_test/wer_8_1.0 + +word-based: (subset) (run_end2end.sh.word) + • %WER 27.39 [ 1320 / 4819, 209 ins, 50 del, 1061 sub ] exp/chain/e2e_cnn_1a/decode_test/wer_10_1.0 + • %WER 24.26 [ 1169 / 4819, 123 ins, 80 del, 966 sub ] exp/chain/cnn_e2eali_1b/decode_test/wer_13_1.0 + +BPE: (subset) (run.sh) + • %WER 15.98 [ 770 / 4819, 64 ins, 48 del, 658 sub ] exp/chain/cnn_1a/decode_test/wer_8_0.5 + + +word-based: (subset) (run.sh.word) + • %WER 24.20 [ 1166 / 4819, 121 ins, 69 del, 976 sub ] exp/chain/cnn_1a/decode_test/wer_11_1.0 + %WER 24.28 [ 1170 / 4819, 126 ins, 104 del, 940 sub ] exp/chain/cnn_chainali_1a/decode_test/wer_10_1.0 diff --git a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_1a.sh b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_1a.sh index d449805be1d..892ee441516 100755 --- a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_1a.sh +++ b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_1a.sh @@ -115,7 +115,7 @@ if [ $stage -le 4 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) common1="height-offsets=-2,-1,0,1,2 num-filters-out=36" common2="height-offsets=-2,-1,0,1,2 num-filters-out=70" mkdir -p $dir/configs diff --git a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_chainali_1a.sh b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_chainali_1a.sh index 23c4d5c2036..7ca7c652fd2 100755 --- a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_chainali_1a.sh +++ b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_chainali_1a.sh @@ -112,7 +112,7 @@ if [ $stage -le 4 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) common1="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" common2="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" common3="required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" diff --git a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1a.sh b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1a.sh index ee84ea0d83f..a8bc1836ffe 100755 --- a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1a.sh +++ b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1a.sh @@ -116,7 +116,7 @@ if [ $stage -le 4 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) common1="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" common2="required-time-offsets= height-offsets=-2,-1,0,1,2 
num-filters-out=70" common3="required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" diff --git a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1b.sh b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1b.sh index c6052b76e7f..0828e051dcc 100755 --- a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1b.sh +++ b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1b.sh @@ -129,7 +129,7 @@ if [ $stage -le 4 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) common1="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" common2="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" common3="required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" diff --git a/egs/madcat_ar/v1/local/tl/chain/run_cnn_e2eali.sh b/egs/madcat_ar/v1/local/tl/chain/run_cnn_e2eali.sh index e0cca104f50..ccbb7119674 100755 --- a/egs/madcat_ar/v1/local/tl/chain/run_cnn_e2eali.sh +++ b/egs/madcat_ar/v1/local/tl/chain/run_cnn_e2eali.sh @@ -124,7 +124,7 @@ if [ $stage -le 4 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) cnn_opts="l2-regularize=0.075" tdnn_opts="l2-regularize=0.075" output_opts="l2-regularize=0.1" diff --git a/egs/madcat_zh/v1/local/chain/tuning/run_cnn_1a.sh b/egs/madcat_zh/v1/local/chain/tuning/run_cnn_1a.sh index d17b3e3c9c5..164d62a7ad9 100755 --- a/egs/madcat_zh/v1/local/chain/tuning/run_cnn_1a.sh +++ b/egs/madcat_zh/v1/local/chain/tuning/run_cnn_1a.sh @@ -122,7 +122,7 @@ if [ $stage -le 4 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) common1="height-offsets=-2,-1,0,1,2 num-filters-out=36" common2="height-offsets=-2,-1,0,1,2 num-filters-out=70" mkdir -p $dir/configs diff --git a/egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1a.sh b/egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1a.sh index d53949dd3de..be51bdcc3d1 100755 --- a/egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1a.sh +++ b/egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1a.sh @@ -119,7 +119,7 @@ if [ $stage -le 4 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) common1="height-offsets=-2,-1,0,1,2 num-filters-out=36" common2="height-offsets=-2,-1,0,1,2 num-filters-out=70" common3="height-offsets=-1,0,1 num-filters-out=70" diff --git a/egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1b.sh b/egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1b.sh index 5a3b85422f6..aa61620a92f 100755 --- a/egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1b.sh +++ b/egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1b.sh @@ -123,7 +123,7 @@ if [ $stage -le 4 ]; then echo "$0: creating neural net configs using the xconfig 
parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) common1="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" common2="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" common3="required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" diff --git a/egs/material/README b/egs/material/README new file mode 100644 index 00000000000..e69de29bb2d diff --git a/egs/material/s5/README b/egs/material/s5/README new file mode 100644 index 00000000000..0eb112493a4 --- /dev/null +++ b/egs/material/s5/README @@ -0,0 +1,35 @@ +About the MATERIAL corpus: + +The MATERIAL project: +https://www.iarpa.gov/index.php/research-programs/material +https://www.nist.gov/itl/iad/mig/openclir-evaluation + +The speech data in the MATERIAL corpus consist of four data sets for each +language: train (BUILD), development (BUILD-dev), test (ANALYSIS1 and ANALYSIS2), +and unlabeled evaluation audio (EVAL{1,2,3}). The train, development, test, and +evaluation data contain around 40, 10, 20, and 250 hours of audio respectively. +The train set is transcribed conversational audio that can be used for training +an ASR system. It consists of some in 8-bit a-law .sph (Sphere) files and some +in .wav files with 24-bit samples. The development set is transcribed +conversational audio that can be used as development data for training to tune +model parameters. The test data come in long unsegmented files. The reference +transcripts for the test set is provided, hence, one can measure WER on the test +set. The evaluation set is untranscribed audio that can be used for +semi-supervised training of the acoustic model. +Conversational speech data in the train and test sets are two-channel audio with +the two channels temporally aligned. Each audio channel is provided and +transcribed as a separate file, identified as inLine or outLine channel. Both +audio channels are interleaved in a single file and a there is a single +interleaved transcript that reflects the temporal alignments. In addition to +conversational speech, the test and evlatuion sets also contain other +genres of speech, namely news broadcast and topical broadcast, which are +single channel files. + + +Running the recipe: + +In s5) +./run.sh --language +./local/chain/run_tdnn.sh +./local/chain/decode_test.sh --language +./local/rnnlm/run_tdnn_lstm.sh diff --git a/egs/material/s5/RESULTS b/egs/material/s5/RESULTS new file mode 100644 index 00000000000..546f1630698 --- /dev/null +++ b/egs/material/s5/RESULTS @@ -0,0 +1,51 @@ +WER results for supervised and semi-supervised acoustic model training + +Baseline: GMM training to create alignments and lattice-free MMI-trained neural +network with factorized TDNN. The BUILD package labeled audio is used for +supervised acoustic model training, the EVALs unlabeled audio is added for +semi-supervised acoustic model training. + +Source-side bitext on the BUILD package and crawled monolingual data are used in +building the n-gram LM, RNNLM re-scoring, as well as extending the baseline lexicon. 
+ + +Results for *supervised* acoustic model training: + +Swahili + Baseline +RNNLM +RNNLM-nbest +BUILD-dev 36.8 36.7 38.9 +ANALYSIS1 42.5 41.3 41.4 +ANALYSIS2 38.1 36.8 36.9 + +Tagalog + Baseline +RNNLM +RNNLM-nbest +BUILD-dev 46.4 46.1 47.5 +ANALYSIS1 52.1 51.0 50.9 +ANALYSIS2 53.6 52.3 52.2 + +Somali + Baseline +RNNLM +RNNLM-nbest +BUILD-dev 57.4 56.5 57.8 +ANALYSIS1 61.6 57.8 57.7 +ANALYSIS2 59.3 55.5 55.3 + + +Results for *semi-supervised* acoustic model training: + +Swahili + Baseline +RNNLM +RNNLM-nbest +BUILD-dev 35.3 35.1 36.7 +ANALYSIS1 35.2 34.5 34.7 +ANALYSIS2 30.8 30.0 30.1 + +Tagalog + Baseline +RNNLM +RNNLM-nbest +BUILD-dev 45.0 45.2 46.6 +ANALYSIS1 40.8 40.1 40.1 +ANALYSIS2 41.1 40.6 40.6 + +Somali + Baseline +RNNLM +RNNLM-nbest +BUILD-dev 56.8 56.3 57.7 +ANALYSIS1 50.6 48.8 48.6 +ANALYSIS2 49.8 48.2 48.2 diff --git a/egs/material/s5/cmd.sh b/egs/material/s5/cmd.sh new file mode 100644 index 00000000000..2bb1c6d24f5 --- /dev/null +++ b/egs/material/s5/cmd.sh @@ -0,0 +1,14 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="retry.pl --num-tries 3 queue.pl --mem 8G" diff --git a/egs/material/s5/conf/decode.config b/egs/material/s5/conf/decode.config new file mode 100644 index 00000000000..7ba966f2b83 --- /dev/null +++ b/egs/material/s5/conf/decode.config @@ -0,0 +1 @@ +# empty config, just use the defaults. 
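A note on the cmd.sh added above: as its comments say, on a machine without GridEngine or slurm every 'queue.pl' can be replaced by 'run.pl'. A minimal local-machine variant might look like the sketch below (illustrative only, not part of this patch; retry.pl and the --mem options are simply dropped, since they matter mainly for grid scheduling):

```bash
# cmd.sh for running everything on the local machine (no queueing system).
# run.pl executes jobs directly; keep the number of parallel jobs modest,
# since large decodes can exhaust the memory of a single machine.
export train_cmd="run.pl"
export decode_cmd="run.pl"
```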
diff --git a/egs/material/s5/conf/lang/somali.conf b/egs/material/s5/conf/lang/somali.conf new file mode 100755 index 00000000000..999c4c0ef14 --- /dev/null +++ b/egs/material/s5/conf/lang/somali.conf @@ -0,0 +1,26 @@ +# speech corpora files location +# the user should replace the values with the ones that work for their location +corpus=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1S/BUILD/ +# test audio files to decode +audio_path_analysis1=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1S/ANALYSIS1/audio/ +audio_path_analysis2=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1S/ANALYSIS2/audio/ +audio_path_dev=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1S/DEV/audio/ +audio_path_eval1=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1S/EVAL1/audio/ +audio_path_eval2=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1S/EVAL2/audio/ +audio_path_eval3=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1S/EVAL3/audio/ +# bitext file location +bitext=$corpus/bitext/MATERIAL_BASE-1S-BUILD_bitext.txt +mono=/home/pkoehn/statmt/data/site-crawl/corpus/paracrawl-release3.2018-11-05.en-so.zipporah-20-dedup.lang-filtered.so +mono2=/home/pkoehn/statmt/data/data.statmt.org/lm/so.filtered.tok.gz +# number_mapping is a 2-column file consisting of the numbers written as digits (1st column) and letters (2nd column) +number_mapping=/home/pkoehn/experiment/material-asr-so-en/scripts/somali_1_9999.txt +# Acoustic model parameters +numShorestUtts=40000 +numLeavesTri1=2000 +numGaussTri1=30000 +numLeavesTri2=3000 +numGaussTri2=60000 +numLeavesTri3=6000 +numGaussTri3=80000 + + diff --git a/egs/material/s5/conf/lang/swahili.conf b/egs/material/s5/conf/lang/swahili.conf new file mode 100755 index 00000000000..d90f4c2abd7 --- /dev/null +++ b/egs/material/s5/conf/lang/swahili.conf @@ -0,0 +1,26 @@ +# speech corpora files location +# the user should replace the values with the ones that work for their location +corpus=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1A-BUILD_v1.0/ +# test audio files to decode +audio_path_analysis1=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1A/ANALYSIS1/audio/ +audio_path_analysis2=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1A/ANALYSIS2/audio/ +audio_path_dev=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1A/DEV/audio/ +audio_path_eval1=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1A/EVAL1/audio/ +audio_path_eval2=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1A/EVAL2/audio/ +audio_path_eval3=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1A/EVAL3/audio/ +# bitext file location +bitext=$corpus/bitext/MATERIAL_BASE-1A-BUILD_bitext.txt +mono=/home/pkoehn/statmt/data/site-crawl/mono-corpus/mono.2018-04-24.sw +mono2= +# number_mapping is a 2-column file consisting of the numbers written as digits (1st column) and letters (2nd column) +number_mapping=/home/pkoehn/experiment/material-asr-so-en/scripts/swahili_1_9999.txt +# Acoustic model parameters +numShorestUtts=40000 +numLeavesTri1=2000 +numGaussTri1=30000 +numLeavesTri2=3000 +numGaussTri2=60000 +numLeavesTri3=6000 +numGaussTri3=80000 + + diff --git a/egs/material/s5/conf/lang/tagalog.conf b/egs/material/s5/conf/lang/tagalog.conf new file mode 100644 index 00000000000..238979feb3f --- /dev/null +++ b/egs/material/s5/conf/lang/tagalog.conf @@ -0,0 +1,26 @@ +# speech corpora files location +# the user should replace the values with the ones that work for their location +corpus=/home/pkoehn/experiment/material-asr-so-en/scripts/swahili_1_9999.txt +# test audio files to decode 
+audio_path_analysis1=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1B/ANALYSIS1/audio/ +audio_path_analysis2=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1B/ANALYSIS2/audio/ +audio_path_dev=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1B/DEV/audio/ +audio_path_eval1=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1B/EVAL1/audio/ +audio_path_eval2=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1B/EVAL2/audio/ +audio_path_eval3=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1B/EVAL3/audio/ +# bitext file location +bitext=$corpus/bitext/MATERIAL_BASE-1B-BUILD_bitext.txt +mono=/home/pkoehn/statmt/data/site-crawl/mono-corpus/mono.2018-04-24.tl +mono2= +# number_mapping is a 2-column file consisting of the numbers written as digits (1st column) and letters (2nd column) +number_mapping= +# Acoustic model parameters +numShorestUtts=45000 +numLeavesTri1=4000 +numGaussTri1=60000 +numLeavesTri2=5000 +numGaussTri2=80000 +numLeavesTri3=7000 +numGaussTri3=100000 + + diff --git a/egs/material/s5/conf/mfcc.conf b/egs/material/s5/conf/mfcc.conf new file mode 100644 index 00000000000..e6defc10078 --- /dev/null +++ b/egs/material/s5/conf/mfcc.conf @@ -0,0 +1,2 @@ +--use-energy=false +--sample-frequency=8000 diff --git a/egs/material/s5/conf/mfcc_hires.conf b/egs/material/s5/conf/mfcc_hires.conf new file mode 100644 index 00000000000..f218143e78a --- /dev/null +++ b/egs/material/s5/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training. +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--sample-frequency=8000 # most of the files are 8kHz +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=40 # low cutoff frequency for mel bins +--high-freq=-200 # high cutoff frequently, relative to Nyquist of 4000 (=3800) diff --git a/egs/material/s5/conf/online_cmvn.conf b/egs/material/s5/conf/online_cmvn.conf new file mode 100644 index 00000000000..7748a4a4dd3 --- /dev/null +++ b/egs/material/s5/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/egs/material/s5/conf/plp.conf b/egs/material/s5/conf/plp.conf new file mode 100644 index 00000000000..926bcfca92a --- /dev/null +++ b/egs/material/s5/conf/plp.conf @@ -0,0 +1 @@ +--sample-frequency=8000 diff --git a/egs/material/s5/local/audio2wav_scp.pl b/egs/material/s5/local/audio2wav_scp.pl new file mode 100755 index 00000000000..f051c2714d2 --- /dev/null +++ b/egs/material/s5/local/audio2wav_scp.pl @@ -0,0 +1,55 @@ +#!/usr/bin/env perl +#=============================================================================== +# Copyright 2017 (Author: Yenda Trmal ) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================== + +use strict; +use warnings; +use utf8; + + +my $sox = `which sox` or die "The sox binary does not exist"; +chomp $sox; +my $sph2pipe = `which sph2pipe` or die "The sph2pipe binary does not exist"; +chomp $sph2pipe; + +while() { + chomp; + my $full_path = $_; + (my $basename = $full_path) =~ s/.*\///g; + + die "The filename $basename does not match the expected naming pattern!" unless $basename =~ /.*\.(wav|sph)$/; + (my $ext = $basename) =~ s/.*\.(wav|sph)$/$1/g; + (my $name = $basename) =~ s/(.*)\.(wav|sph)$/$1/g; + + + # name looks like this: + # MATERIAL_BASE-1A-BUILD_10002_20131130_011225_inLine.sph + # Please note that the naming pattern must match + # the pattern in create_datafiles.pl + $name =~ s/inLine.*/0/g; + $name =~ s/outLine.*/1/g; + $name =~ s/_BASE//g; + $name =~ s/-BUILD//g; + + if ($ext eq "wav") { + print "$name $sox $full_path -r 8000 -c 1 -b 16 -t wav - downsample|\n"; + } else { + print "$name $sph2pipe -f wav -p -c 1 $full_path|\n"; + } +} + + diff --git a/egs/material/s5/local/chain/decode_test.sh b/egs/material/s5/local/chain/decode_test.sh new file mode 100755 index 00000000000..40115a04cf6 --- /dev/null +++ b/egs/material/s5/local/chain/decode_test.sh @@ -0,0 +1,171 @@ +#!/bin/bash + +# Copyright 2018 Johns Hopkins University (author: Daniel Povey) +# 2018 Mahsa Yarmohammadi +# 2018 Yiming Wang + + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +language=swahili +stage=0 +datadev="data/analysis1 data/analysis2 data/test_dev data/eval1 data/eval2 data/eval3" +dir=exp/chain/tdnn1b_sp +lang=data/lang_combined_chain +tree_dir=exp/chain/tree_sp +cmd=queue.pl +graph_affix=_combined + +# training options +chunk_width=140,100,160 +chunk_left_context=0 +chunk_right_context=0 + +# ivector options +max_count=75 # parameter for extract_ivectors.sh +sub_speaker_frames=600 +filter_ctm=true +weights_file= +silence_weight=0.00001 +nj=30 + +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +[ ! -f ./conf/lang/${language}.conf ] && \ + echo "Language configuration conf/lang/${language}.conf does not exist!" && exit 1 +ln -sf ./conf/lang/${language}.conf lang.conf +. ./lang.conf + +if ! cuda-compiled; then + cat </dev/null || true + +if [ $stage -le 3 ]; then + # do the 1st pass decoding + for datadir in $datadev; do + ( + data=$(basename $datadir) + nspk=$(wc -l ${dir}/decode_${data}_segmented/ali.JOB.gz" || exit 1; + + cp $lang/phones.txt ${dir}/decode_${data}_segmented || exit 1; + + steps/resegment_data.sh --segmentation-opts "$segmentation_opts" ${datadir}_segmented_hires $lang \ + ${dir}/decode_${data}_segmented ${datadir}_segmented_reseg_hires_tmp exp/resegment_${data}_segmented + + utils/data/subsegment_data_dir.sh ${datadir}_segmented_hires ${datadir}_segmented_reseg_hires_tmp/segments \ + ${datadir}_segmented_reseg_hires + + rm -rf ${datadir}_segmented_reseg_hires_tmp 2>/dev/null || true + + echo "Extracting i-vectors, stage 2" + # this does offline decoding, except we estimate the iVectors per + # speaker, excluding silence (based on alignments from a DNN decoding), with a + # different script. This is just to demonstrate that script. + # the --sub-speaker-frames is optional; if provided, it will divide each speaker + # up into "sub-speakers" of at least that many frames... can be useful if + # acoustic conditions drift over time within the speaker's data. 
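+  # (The corresponding defaults near the top of this script are --silence-weight 0.00001,
+  # --max-count 75 and --sub-speaker-frames 600.)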
+ steps/online/nnet2/extract_ivectors.sh --cmd "$train_cmd" --nj $nj \ + --silence-weight $silence_weight \ + --sub-speaker-frames $sub_speaker_frames --max-count $max_count \ + ${datadir}_segmented_reseg_hires $lang exp/nnet3/extractor \ + exp/nnet3/ivectors_${data}_segmented_reseg_hires; + done +fi + +if [ $stage -le 5 ]; then + # 2nd-pass decoding on the resegmented data + for datadir in $datadev; do + ( + data=$(basename $datadir) + nspk=$(wc -l 1792 xent:train/valid[65,98,final]=(-1.93,-1.66,-1.68/-2.05,-1.84,-1.83) logprob:train/valid[65,98,final]=(-0.199,-0.166,-0.167/-0.225,-0.208,-0.206) +# [for tagalog] +# exp/chain/tdnn1a_sp: num-iters=96 nj=2..12 num-params=12.3M dim=40+100->1952 combine=-0.165->-0.165 (over 2) xent:train/valid[63,95,final]=(-1.89,-1.66,-1.65/-2.06,-1.89,-1.89) logprob:train/valid[63,95,final]=(-0.186,-0.158,-0.157/-0.231,-0.219,-0.218) + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +train_set=train +test_sets="dev" +gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +tree_affix= +common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +get_egs_stage=-10 +xent_regularize=0.1 + +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +num_epochs=7 + +# training options +srand=0 +remove_egs=true + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang_combined/topo + fi +fi + +if [ $stage -le 8 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang_combined $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 9 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 6000 ${lores_train_data_dir} \ + $lang_combined $ali_dir $tree_dir +fi + + +if [ $stage -le 10 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.01 dropout-per-dim=true dropout-per-dim-continuous=true" + linear_opts="orthonormal-constraint=1.0" + output_opts="l2-regularize=0.005" + + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $opts dim=768 + linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn2 $opts input=Append(0,1) dim=768 + linear-component name=tdnn3l dim=256 $linear_opts + relu-batchnorm-dropout-layer name=tdnn3 $opts dim=768 + linear-component name=tdnn4l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn4 $opts input=Append(0,1) dim=768 + linear-component name=tdnn5l dim=256 $linear_opts + relu-batchnorm-dropout-layer name=tdnn5 $opts dim=768 input=Append(0, tdnn3l) + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn6 $opts input=Append(0,3) dim=1024 + linear-component name=tdnn7l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=768 + linear-component name=tdnn8l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn8 $opts input=Append(0,3) dim=1024 + linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn5l) dim=768 + linear-component name=tdnn10l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn10 $opts input=Append(0,3) dim=1024 + linear-component name=tdnn11l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn11 $opts input=Append(0,3,tdnn10l,tdnn9l,tdnn7l) dim=768 + linear-component name=prefinal-l dim=256 $linear_opts + + relu-batchnorm-layer name=prefinal-chain input=prefinal-l $opts dim=1024 + output-layer name=output include-log-softmax=false dim=$num_targets bottleneck-dim=256 $output_opts + + relu-batchnorm-layer name=prefinal-xent input=prefinal-l $opts dim=1024 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor bottleneck-dim=256 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 11 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/material-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=$num_epochs \ + --trainer.frames-per-iter=1500000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=12 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=0 \ + --egs.chunk-right-context=0 \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 12 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test \ + $tree_dir $tree_dir/graph || exit 1; + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_combined_test \ + $tree_dir ${tree_dir}/graph_combined || exit 1; +fi + +if [ $stage -le 13 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l 1816 +# combine=-0.127->-0.127 (over 2) xent:train/valid[65,98,final]=(-1.74,-1.44,-1.43/-1.80,-1.62,-1.61) +# logprob:train/valid[65,98,final]=(-0.175,-0.136,-0.135/-0.194,-0.182,-0.180) + +# [for tagalog] +# exp/chain/tdnn1b_sp/: num-iters=96 nj=2..12 num-params=17.2M dim=40+100->1928 combine=-0.124->-0.123 +# (over 2) xent:train/valid[63,95,final]=(-1.69,-1.43,-1.42/-1.75,-1.62,-1.60) +# logprob:train/valid[63,95,final]=(-0.168,-0.128,-0.127/-0.193,-0.187,-0.187) + +# [for somali] +# exp/chain/tdnn1b_sp/: num-iters=84 nj=2..12 num-params=17.9M dim=40+100->3240 combine=-0.162->-0.160 +# (over 2) xent:train/valid[55,83,final]=(-2.31,-2.02,-2.00/-2.27,-2.13,-2.10) +# logprob:train/valid[55,83,final]=(-0.218,-0.157,-0.154/-0.268,-0.263,-0.263) + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +train_set=train +test_sets="dev" +gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=1b #affix for TDNN directory e.g. "1a" or "1b", in case we change the configuration. 
+tree_affix= +common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +get_egs_stage=-10 +xent_regularize=0.1 + +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +num_epochs=7 + +# training options +srand=0 +remove_egs=true + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang_combined/topo + fi +fi + +if [ $stage -le 8 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang_combined $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 9 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 6000 ${lores_train_data_dir} \ + $lang_combined $ali_dir $tree_dir +fi + + +if [ $stage -le 10 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.004 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + linear_opts="orthonormal-constraint=-1.0 l2-regularize=0.004" + output_opts="l2-regularize=0.002" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $opts dim=1024 + linear-component name=tdnn2l0 dim=256 $linear_opts input=Append(-1,0) + linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn2 $opts input=Append(0,1) dim=1024 + linear-component name=tdnn3l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn3 $opts dim=1024 input=Append(0,1) + linear-component name=tdnn4l0 dim=256 $linear_opts input=Append(-1,0) + linear-component name=tdnn4l dim=256 $linear_opts input=Append(0,1) + relu-batchnorm-dropout-layer name=tdnn4 $opts input=Append(0,1) dim=1024 + linear-component name=tdnn5l dim=256 $linear_opts + relu-batchnorm-dropout-layer name=tdnn5 $opts dim=1024 input=Append(0, tdnn3l) + linear-component name=tdnn6l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn6 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn7l0 dim=256 
$linear_opts input=Append(-3,0) + linear-component name=tdnn7l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1024 + linear-component name=tdnn8l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn8l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn8 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn9l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn5l) dim=1024 + linear-component name=tdnn10l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn10l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn10 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn11l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn11l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn11 $opts input=Append(0,3,tdnn10l,tdnn9l,tdnn7l) dim=1024 + linear-component name=prefinal-l dim=256 $linear_opts + + relu-batchnorm-layer name=prefinal-chain input=prefinal-l $opts dim=1280 + linear-component name=prefinal-chain-l dim=256 $linear_opts + batchnorm-component name=prefinal-chain-batchnorm + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + relu-batchnorm-layer name=prefinal-xent input=prefinal-l $opts dim=1280 + linear-component name=prefinal-xent-l dim=256 $linear_opts + batchnorm-component name=prefinal-xent-batchnorm + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 11 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/material-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=$num_epochs \ + --trainer.frames-per-iter=1500000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=12 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=0 \ + --egs.chunk-right-context=0 \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 12 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_combined_test \ + $tree_dir ${tree_dir}/graph_combined || exit 1; +fi + +if [ $stage -le 13 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l 1792 combine=-0.176->-0.174 (over 6) xent:train/valid[45,69,final]=(-1.71,-1.52,-1.50/-1.81,-1.69,-1.67) logprob:train/valid[45,69,final]=(-0.185,-0.160,-0.159/-0.213,-0.208,-0.205) + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=30 +train_set=train +test_sets=dev +gmm=tri3 +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +tlstm_affix=1a # affix for the TDNN-LSTM directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training options +# training chunk-options +chunk_width=140,100,160 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true +reporting_email= + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang_combined/topo + fi +fi + +if [ $stage -le 8 ]; then + # Get the alignments as lattices (gives the chain training more freedom). 
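+  # (For reference only, a sketch that this recipe does not itself run: the
+  #  alignment scripts record their job count in $ali_dir/num_jobs, so the
+  #  --nj value below could be kept in sync with the existing alignments via
+  #    nj=$(cat $ali_dir/num_jobs) || exit 1;
+  #  The hard-coded value used here is assumed to match those alignments.)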
+ # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang_combined $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 9 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 6000 ${lores_train_data_dir} \ + $lang_combined $ali_dir $tree_dir +fi + +if [ $stage -le 10 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + tdnn_opts="l2-regularize=0.02" + lstm_opts="l2-regularize=0.005" + output_opts="l2-regularize=0.004" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $tdnn_opts dim=512 + relu-batchnorm-layer name=tdnn2 $tdnn_opts input=Append(-1,0,1) dim=512 + relu-batchnorm-layer name=tdnn3 $tdnn_opts input=Append(-1,0,1) dim=512 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstm_opts + relu-batchnorm-layer name=tdnn4 $tdnn_opts input=Append(-3,0,3) dim=512 + relu-batchnorm-layer name=tdnn5 $tdnn_opts input=Append(-3,0,3) dim=512 + relu-batchnorm-layer name=tdnn6 $tdnn_opts input=Append(-3,0,3) dim=512 + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstm_opts + relu-batchnorm-layer name=tdnn7 $tdnn_opts input=Append(-3,0,3) dim=512 + relu-batchnorm-layer name=tdnn8 $tdnn_opts input=Append(-3,0,3) dim=512 + relu-batchnorm-layer name=tdnn9 $tdnn_opts input=Append(-3,0,3) dim=512 + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 11 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/material-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=5 \ + --trainer.frames-per-iter=1500000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=12 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.0 \ + --trainer.deriv-truncate-margin=8 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 12 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test \ + $tree_dir $tree_dir/graph || exit 1; + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_combined_test \ + $tree_dir ${tree_dir}/graph_combined || exit 1; +fi + +if [ $stage -le 13 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l ) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================== + +use strict; +use warnings; +use utf8; + +binmode STDIN, "utf8"; +binmode STDOUT, "utf8"; +binmode STDERR, "utf8"; + +# replacement of the smart-match operator (apparently not supported anymore) +sub is_elem { + my $word = shift; + my $array = shift; + foreach my $other_word (@{$array}) { + return 1 if $word eq $other_word; + } + return 0; +} + +my $unk = ""; +my $noise = ""; +my $spnoise = ""; +my $sil = ""; + +my @ignore_events = ("", ""); +#as per the BABEL docs, ~ means truncation of the word/utterance +my @ignore_utt_events = ("", "", "", "~"); +my @sil_events = (""); +my @noise_events = ("", "", "" ); +my @spnoise_events = ("", "", "", "", "", ""); + + + +UTT: while(<>) { + chomp; + my @line = split " ", $_; + my $file = shift @line; + my $begin = shift @line; + my $end = shift @line; + + next if (@line == 1) and ($line[0] eq ""); + next if (@line == 1) and ($line[0] =~ "<.*>"); #skip the utterance if all + #it contains is a non-speech event + + my @out_line; + foreach my $word (@line) { + if ($word =~ /.*-$/) { + push @out_line, $unk; + } elsif ($word =~ /^-.*/) { + push @out_line, $unk; + } elsif ($word =~ /^\*.*\*$/) { + push @out_line, $unk; + } elsif ($word eq "(())") { + push @out_line, $unk; + } elsif (is_elem $word, \@ignore_events) { + next; + } elsif (is_elem $word, \@ignore_utt_events) { + next UTT; + } elsif (is_elem $word, \@sil_events) { + push @out_line, $sil; + } elsif (is_elem $word, \@noise_events) { + push @out_line, $noise; + } elsif (is_elem $word, \@spnoise_events) { + push @out_line, $spnoise; + } else { + push @out_line, $word; + } + } + print "$file\t$begin\t$end\t" . join(" ", @out_line) . "\n" if @out_line; + +} + + diff --git a/egs/material/s5/local/convert_lexicon.pl b/egs/material/s5/local/convert_lexicon.pl new file mode 100755 index 00000000000..1fe7e90ac1f --- /dev/null +++ b/egs/material/s5/local/convert_lexicon.pl @@ -0,0 +1,76 @@ +#!/usr/bin/env perl +#=============================================================================== +# Copyright 2017 (Author: Yenda Trmal ) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================== + +use strict; +use warnings; +use utf8; + +binmode STDIN, ":utf8"; +binmode STDOUT, ":utf8"; +binmode STDERR, ":utf8"; + +my $lexicon_name = $ARGV[0]; +open(my $lexicon_file, "<:encoding(UTF-8)", $lexicon_name) or + die "Cannot open $lexicon_name: $!\n"; + +my $wordlist_name = $ARGV[1]; +open(my $wordlist_file, "<:encoding(UTF-8)", $wordlist_name) or + die "Cannot open $wordlist_name: $!\n"; + + +my %lexicon; +while (<$lexicon_file>) { + chomp; + (my $word, my $prons) = split " ", $_, 2; + $lexicon{uc $word} = $prons; +} + +while (<$wordlist_file>) { + chomp; + my $word = $_; + print STDERR "Cannot find word $word in lexicon\n" unless defined($lexicon{uc $word}); + + #print "$word $lexicon{$word}\n"; + + my @prons = split "\t", $lexicon{uc $word}; + foreach my $pron (@prons) { + my @phones = split " ", $pron; + my $stress_mark = 0; + my @out_phones = (); + foreach my $phone (@phones) { + if ($phone eq "\"") { + $stress_mark = 1 + } elsif ( $phone eq "." ) { + $stress_mark = 0; + push @out_phones, '.'; + } elsif ( $phone eq "#" ) { + $stress_mark = 0; + push @out_phones, '.'; + } else { + $phone =~ s/_/+/g; + #let's just ignore stress for now + #$phone = "${phone}_\"" if $stress_mark; + push @out_phones, $phone; + } + } + my $out_pron = join(" ", @out_phones); + $out_pron =~ s/ *\. */\t/g; + print "$word\t$out_pron\n"; + } +} + diff --git a/egs/material/s5/local/count_oovs.pl b/egs/material/s5/local/count_oovs.pl new file mode 100755 index 00000000000..228399f99e3 --- /dev/null +++ b/egs/material/s5/local/count_oovs.pl @@ -0,0 +1,81 @@ +#!/usr/bin/perl -W + +# (c) 2014 Korbinian Riedhammer + +# Count the number of OOV per turn (or speaker, if utt2spk is provided). Use +# the --split-words option to split non-ascii words into characters (syllable +# based languages). + + +use strict; +use warnings; +use Getopt::Long; +use open qw(:std :utf8); + + +my $utt2spkf = ""; +my $split_words = 0; + +GetOptions( + 'utt2spk=s' => \$utt2spkf, + 'split-words' => \$split_words +); + +if (scalar @ARGV lt 1) { + print STDERR "usage: $0 [--utt2spk=utt2spk] words.txt [input]\n"; + exit 1; +} + +my $lexf = shift @ARGV; + +my %lex = map { my ($a, $b) = split /\s+/; $a => $b; } `cat $lexf`; + +my %utt2spk = (); +if (length $utt2spkf gt 0) { + %utt2spk = map { my ($a, $b) = split /\s+/; $a => $b; } `cat $utt2spkf`; #read_file($utt2spkf, binmode => ':utf8'); +} + +my %num_words = (); +my %num_oovs = (); +my %oov_string = (); + +while (<>) { + my ($id, @trl) = split /\s+/; + + if (length $utt2spkf gt 0) { + if (defined $utt2spk{$id}) { + $id = $utt2spk{$id}; + } else { + printf STDERR "Warning: $id not specified in $utt2spkf\n"; + } + } + + $num_words{$id} = 0 unless defined $num_words{$id}; + $num_oovs{$id} = 0 unless defined $num_oovs{$id}; + $oov_string{$id} = "" unless defined $oov_string{$id}; + + + if ($split_words) { + for (my $i = 0; $i < scalar @trl; $i++) { + my $w = $trl[$i]; + unless ($w =~ m/[a-zA-Z_\-]/) { + my @sw = split //, $w; + splice @trl, $i, 1, @sw; + $i += (scalar @sw) - 1; + } + } + } + + $num_words{$id} += scalar @trl; + for my $w (@trl) { + $num_oovs{$id} += 1 unless defined $lex{$w}; + $oov_string{$id} .= "$w " unless defined $lex{$w}; + } + +} + +for my $i (sort keys %num_words) { + printf "%s %d %d %s\n", $i, $num_words{$i}, $num_oovs{$i}, + ( defined $oov_string{$i} ? 
$oov_string{$i} : ""); +} + diff --git a/egs/material/s5/local/create_datafiles.pl b/egs/material/s5/local/create_datafiles.pl new file mode 100755 index 00000000000..d8e692524a1 --- /dev/null +++ b/egs/material/s5/local/create_datafiles.pl @@ -0,0 +1,69 @@ +#!/usr/bin/env perl +#=============================================================================== +# Copyright 2017 (Author: Yenda Trmal ) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. +#=============================================================================== + +use strict; +use warnings; +use utf8; + +binmode STDIN, "utf8"; +binmode STDOUT, "utf8"; +binmode STDERR, "utf8"; + +my $output = $ARGV[0]; +open(my $utt2spk, ">:utf8", "$output/utt2spk") or + die "Cannot open $output/utt2spk: $!\n"; +open(my $text, ">:utf8", "$output/text") or + die "Cannot open $output/text: $!\n"; +open(my $segments, ">:utf8", "$output/segments") or + die "Cannot open $output/segments: $!\n"; +open(my $wav, ">:utf8", "$output/wav2file") or + die "Cannot open $output/wav2file: $!\n"; + +my %text2id; +while() { + chomp; + my @line = split (" ", $_, 4); + my $name = shift @line; + my $begin = shift @line; + my $end = shift @line; + my $words = shift @line; + my $name_raw = $name; + + my $begin_text = sprintf("%07d", $begin * 1000); + my $end_text = sprintf("%07d", $end * 1000); + + # name looks like this: + # MATERIAL_BASE-1A-BUILD_10002_20131130_011225_inLine.txt + # Please note that the naming pattern must match + # the pattern in audio2wav_scp.pl + $name =~ s/inLine.*/0/g; + $name =~ s/outLine.*/1/g; + $name =~ s/_BASE//g; + $name =~ s/-BUILD//g; + + my $utt_name = join("_", $name, $begin_text, $end_text); + print $segments "$utt_name $name $begin $end\n"; + print $utt2spk "$utt_name $name\n"; + print $text "$utt_name $words\n"; + if (defined $text2id{$name}) { + die "" if $text2id{$name} ne $name_raw; + } else { + print $wav "$name $name_raw\n"; + $text2id{$name} = $name_raw; + } +} diff --git a/egs/material/s5/local/ctm_filter b/egs/material/s5/local/ctm_filter new file mode 100755 index 00000000000..fa0f749c92a --- /dev/null +++ b/egs/material/s5/local/ctm_filter @@ -0,0 +1,7 @@ +#!/usr/bin/perl + +while (<>) { + if ($_ !~ m/<(noise|unk|spnoise|sil)>/i) { + print $_; + } +} diff --git a/egs/material/s5/local/g2p/apply_g2p.sh b/egs/material/s5/local/g2p/apply_g2p.sh new file mode 100755 index 00000000000..704a1a906bb --- /dev/null +++ b/egs/material/s5/local/g2p/apply_g2p.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +# Copyright 2016 Allen Guo +# 2017 Xiaohui Zhang +# Apache License 2.0 + +# This script applies a trained Phonetisarus G2P model to +# synthesize pronunciations for missing words (i.e., words in +# transcripts but not the lexicon), and output the expanded lexicon. + +var_counts=1 + +. ./path.sh || exit 1 +. 
parse_options.sh || exit 1; + +if [ $# -ne "4" ]; then + echo "Usage: $0 " + exit 1 +fi + +model=$1 +workdir=$2 +lexicon=$3 +outlexicon=$4 + +mkdir -p $workdir + +echo 'Synthesizing pronunciations for missing words...' +phonetisaurus-apply --nbest $var_counts --model $model --thresh 5 --accumulate --word_list $workdir/missing_onlywords.txt > $workdir/missing_g2p_${var_counts}.txt + +echo "Adding new pronunciations to $lexicon" +cat "$lexicon" $workdir/missing_g2p_${var_counts}.txt | sort | uniq > $outlexicon diff --git a/egs/material/s5/local/g2p/train_g2p.sh b/egs/material/s5/local/g2p/train_g2p.sh new file mode 100755 index 00000000000..43e75f6608d --- /dev/null +++ b/egs/material/s5/local/g2p/train_g2p.sh @@ -0,0 +1,67 @@ +#!/bin/bash + +# Copyright 2017 Intellisist, Inc. (Author: Navneeth K) +# 2017 Xiaohui Zhang +# Apache License 2.0 + +# This script trains a g2p model using Phonetisaurus and SRILM. + +stage=0 +silence_phones= + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. utils/parse_options.sh || exit 1; + + +if [ $# -ne 2 ]; then + echo "Usage: $0 " + exit 1; +fi + +lexicondir=$1 +outdir=$2 + +[ ! -f $lexicondir/lexicon.txt ] && echo "Cannot find $lexicondir/lexicon.txt" && exit + +isuconv=`which uconv` +if [ -z $isuconv ]; then + echo "uconv was not found. You must install the icu4c package." + exit 1; +fi + +mkdir -p $outdir + + +# For input lexicon, remove pronunciations containing non-utf-8-encodable characters, +# and optionally remove words that are mapped to a single silence phone from the lexicon. +if [ $stage -le 0 ]; then + lexicon=$lexicondir/lexicon.txt + if [ ! -z "$silence_phones" ]; then + awk 'NR==FNR{a[$1] = 1; next} {s=$2;for(i=3;i<=NF;i++) s=s" "$i; if(!(s in a)) print $1" "s}' \ + $silence_phones $lexicon | \ + awk '{printf("%s\t",$1); for (i=2;i 0'> $outdir/lexicon_tab_separated.txt + else + awk '{printf("%s\t",$1); for (i=2;i 0'> $outdir/lexicon_tab_separated.txt + fi +fi + +if [ $stage -le 1 ]; then + # Align lexicon stage. Lexicon is assumed to have first column tab separated + phonetisaurus-align --input=$outdir/lexicon_tab_separated.txt --ofile=${outdir}/aligned_lexicon.corpus || exit 1; +fi + +if [ $stage -le 2 ]; then + # Convert aligned lexicon to arpa using srilm. + ngram-count -order 7 -kn-modify-counts-at-end -gt1min 0 -gt2min 0 \ + -gt3min 0 -gt4min 0 -gt5min 0 -gt6min 0 -gt7min 0 -ukndiscount \ + -text ${outdir}/aligned_lexicon.corpus -lm ${outdir}/aligned_lexicon.arpa +fi + +if [ $stage -le 3 ]; then + # Convert the arpa file to FST. + phonetisaurus-arpa2wfst --lm=${outdir}/aligned_lexicon.arpa --ofile=${outdir}/model.fst +fi diff --git a/egs/material/s5/local/nnet3/run_ivector_common.sh b/egs/material/s5/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..a56b3bf67d8 --- /dev/null +++ b/egs/material/s5/local/nnet3/run_ivector_common.sh @@ -0,0 +1,149 @@ +#!/bin/bash + +set -euo pipefail + +# This script is called from local/nnet3/run_tdnn.sh and +# local/chain/run_tdnn.sh (and may eventually be called by more +# scripts). It contains the common feature preparation and +# iVector-related parts of the script. See those scripts for examples +# of usage. + +stage=0 +train_set=train +test_sets="dev" +nj=30 +gmm=tri3 + +nnet3_affix= + +. ./cmd.sh +. ./path.sh +. utils/parse_options.sh + +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_ali_${train_set}_sp + +for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do + if [ ! 
-f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + +if [ $stage -le 1 ]; then + # Although the nnet will be trained by high resolution data, we still have to + # perturb the normal data to get the alignment _sp stands for speed-perturbed + echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp + echo "$0: making MFCC features for low-resolution speed-perturbed data" + steps/make_mfcc.sh --cmd "$train_cmd" --nj $nj data/${train_set}_sp || exit 1; + steps/compute_cmvn_stats.sh data/${train_set}_sp || exit 1; + utils/fix_data_dir.sh data/${train_set}_sp +fi + +if [ $stage -le 2 ]; then + echo "$0: aligning with the perturbed low-resolution data" + steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set}_sp data/lang_test $gmm_dir $ali_dir || exit 1 +fi + +if [ $stage -le 3 ]; then + # Create high-resolution MFCC features (with 40 cepstra instead of 13). + # this shows how you can split across multiple file-systems. + echo "$0: creating high-resolution MFCC features" + mfccdir=data/${train_set}_sp_hires/data + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b1{5,6,7,8}/$USER/kaldi-data/egs/material-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp ${test_sets}; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires || exit 1; + + for datadir in ${train_set}_sp ${test_sets}; do + steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires || exit 1; + utils/fix_data_dir.sh data/${datadir}_hires || exit 1; + done +fi + +if [ $stage -le 4 ]; then + echo "$0: computing a subset of data to train the diagonal UBM." + # We'll use about a quarter of the data. 
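+  # (A minimal sketch of how such a quarter subset is typically taken, using
+  #  the variable names defined in this stage; the commands actually run
+  #  follow below.)
+  #   num_utts_total=$(wc -l <data/${train_set}_sp_hires/utt2spk)
+  #   num_utts=$((num_utts_total/4))
+  #   utils/subset_data_dir.sh data/${train_set}_sp_hires $num_utts \
+  #     ${temp_data_root}/${train_set}_sp_hires_subset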
+ mkdir -p exp/nnet3${nnet3_affix}/diag_ubm + temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm + + num_utts_total=$(wc -l ", "--", ".", "?", "~"]: + parts[i] = "" + elif w == "%incomplete": + parts[i] = "" + elif w in ["", "", "", ""]: + parts[i] = "" + elif w in ["", ""]: + parts[i] = "" + elif w in ["", "(())", "", "", ""]: + parts[i] = "" + + # change *word* into word + parts[i] = re.sub(r"^[*](\S+)[*]$", r"\1", parts[i]) + + return re.sub(r"\s+", " ", " ".join(parts)) + + +def write_segment(start_time, end_time, text, reco_id, + segments_fh, utt2spk_fh, text_fh): + assert end_time > start_time + + text = normalize_text(text) + + utt_id = "{reco_id}-{st:06d}-{end:06d}".format( + reco_id=reco_id, + st=int(start_time * 100), end=int(end_time * 100)) + + print ("{utt_id} {reco_id} {st} {end}" + "".format(utt_id=utt_id, reco_id=reco_id, + st=start_time, end=end_time), + file=segments_fh) + print ("{utt_id} {reco_id}" + "".format(utt_id=utt_id, reco_id=reco_id), + file=utt2spk_fh) + print ("{utt_id} {text}" + "".format(utt_id=utt_id, text=text), + file=text_fh) + + +def parse_calls_transcript_file(transcript_file, segments_fh, + utt2spk_fh, text_fh): + base_name = os.path.basename(transcript_file) + file_id = re.sub(".transcription.txt", "", base_name) + + inline_start_time = -1 + outline_start_time = -1 + + i = 0 + + for line in open(transcript_file): + parts = line.strip().split() + + if i == 0 and not parts[0].startswith('0'): + raise Exception("Transcript file {0} does not start with 0.000" + "".format(transcript_file)) + i += 1 + + start_time = float(parts[0]) + if len(parts) == 1: + # Last line in the file + write_segment(inline_start_time, start_time, inline_text, file_id + "_inLine", + segments_fh, utt2spk_fh, text_fh) + write_segment(outline_start_time, start_time, outline_text, file_id + "_outLine", + segments_fh, utt2spk_fh, text_fh) + break + + assert parts[1] in ["inLine", "outLine"] + + if parts[1] == "inLine": + reco_id = file_id + "_inLine" + if inline_start_time >= 0: + write_segment(inline_start_time, start_time, inline_text, reco_id, + segments_fh, utt2spk_fh, text_fh) + inline_text = " ".join(parts[2:]) + inline_start_time = start_time + else: + reco_id = file_id + "_outLine" + if outline_start_time >= 0: + write_segment(outline_start_time, start_time, outline_text, reco_id, + segments_fh, utt2spk_fh, text_fh) + outline_text = " ".join(parts[2:]) + outline_start_time = start_time + + +def parse_non_calls_transcript_file(transcript_file, segments_fh, + utt2spk_fh, text_fh): + base_name = os.path.basename(transcript_file) + file_id = re.sub(".transcription.txt", "", base_name) + + start_time = -1 + i = 0 + + with open(transcript_file) as fh: + line = fh.readline().strip() + if not line.startswith('['): + raise Exception("Transcript file {0} does not start with [0.000" + "".format(transcript_file)) + try: + start_time = float(re.sub(r"\[([^\]]+)\]", r"\1", line)) + except Exception: + print("Could not parse line {0}".format(line), file=sys.stderr) + raise + + text = fh.readline() + while text != '': + text = text.strip() + line = fh.readline().strip() + if not line.startswith('['): + raise Exception("Time-stamp in transcript file {0} does not start with [; error parsing line {1} after text {2}" + "".format(transcript_file, line, text)) + try: + end_time = float(re.sub(r"\[([^\]]+)\]", r"\1", line)) + except Exception: + print("Could not parse line {0}".format(line), file=sys.stderr) + raise + + write_segment(start_time, end_time, text, file_id, + segments_fh, utt2spk_fh, 
text_fh) + start_time = end_time + text = fh.readline() + + +if __name__ == "__main__": + if len(sys.argv) != 5: + print ("Usage: {0} ", + file=sys.stderr) + raise SystemExit(1) + + root_path = sys.argv[1] + calls_list = open(sys.argv[2]).readlines() + non_calls_list = open(sys.argv[3]).readlines() + data_dir = sys.argv[4] + + wav_scp_fh = open("{0}/wav.scp".format(data_dir), 'w') + utt2spk_fh = open("{0}/utt2spk".format(data_dir), 'w') + reco2file_and_channel_fh = open( + "{0}/reco2file_and_channel".format(data_dir), 'w') + text_fh = open("{0}/text".format(data_dir), 'w') + segments_fh = open("{0}/segments".format(data_dir), 'w') + + for line in calls_list: + file_id = line.strip() + transcript_file = ( + "{root_path}/transcription/{file_id}.transcription.txt" + "".format(root_path=root_path, file_id=file_id)) + wav_file = "{root_path}/src/{file_id}.wav".format( + root_path=root_path, file_id=file_id) + + for channel in [1, 2]: + reco_id = file_id + ("_inLine" if channel == 1 else "_outLine") + print ("{reco_id} {file_id} {channel}" + "".format(reco_id=reco_id, file_id=file_id, + channel="A" if channel == 1 else "B"), + file=reco2file_and_channel_fh) + print ("{reco_id} sox {wav_file} -r 8000 -b 16 -c 1 -t wav - remix {channel} |" + "".format(reco_id=reco_id, wav_file=wav_file, channel=channel), + file=wav_scp_fh) + + parse_calls_transcript_file(transcript_file, segments_fh, + utt2spk_fh, text_fh) + + for line in non_calls_list: + file_id = line.strip() + transcript_file = ( + "{root_path}/transcription/{file_id}.transcription.txt" + "".format(root_path=root_path, file_id=file_id)) + wav_file = "{root_path}/src/{file_id}.wav".format( + root_path=root_path, file_id=file_id) + + print ("{file_id} {file_id} 1" + "".format(file_id=file_id), + file=reco2file_and_channel_fh) + print ("{reco_id} sox {wav_file} -r 8000 -b 16 -c 1 -t wav - |" + "".format(reco_id=file_id, wav_file=wav_file), + file=wav_scp_fh) + + parse_non_calls_transcript_file(transcript_file, segments_fh, + utt2spk_fh, text_fh) + + wav_scp_fh.close() + utt2spk_fh.close() + reco2file_and_channel_fh.close() + text_fh.close() + segments_fh.close() diff --git a/egs/material/s5/local/parse_transcripts.pl b/egs/material/s5/local/parse_transcripts.pl new file mode 100755 index 00000000000..06c18a30c6c --- /dev/null +++ b/egs/material/s5/local/parse_transcripts.pl @@ -0,0 +1,53 @@ +#!/usr/bin/env perl +#=============================================================================== +# Copyright 2017 (Author: Yenda Trmal ) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================== + +use strict; +use warnings; +use utf8; + +binmode STDIN, "utf8"; +binmode STDOUT, "utf8"; +binmode STDERR, "utf8"; + +my $file = $ARGV[0]; + +open(my $transcript, "<:utf8", $file) or + die "Cannot open file $file: $!\n"; + +(my $basename = $file) =~ s/(.*\/)?([^\/]+)/$2/g; + +my $sentence = undef; +my $begin_time = undef; +my $end_time = undef; +while(<$transcript>) { + chomp; + if (/^\[([0-9.]+)\]$/) { + $begin_time = $end_time; + $end_time = $1; + if ($sentence) { + print "$basename\t$begin_time\t$end_time\t$sentence\n"; + $sentence = undef; + } + } else { + die "Invalid format of the transcription in $basename\n" if defined($sentence); + $sentence = $_; + } +} + +die "Invalid format of the transcription in $basename\n" if defined($sentence); + diff --git a/egs/material/s5/local/postprocess_test.sh b/egs/material/s5/local/postprocess_test.sh new file mode 100755 index 00000000000..950c1191d4d --- /dev/null +++ b/egs/material/s5/local/postprocess_test.sh @@ -0,0 +1,56 @@ +#!/bin/sh +set -euo pipefail +echo "$0 $@" + +stage=0 + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if [ $# -ne 3 ]; then + echo "Usage: $0 " + echo " e.g.: $0 analysis1 exp/chain/tdnn/graph exp/chain/tdnn/decode_analysis1_segmented" + exit 1 +fi + +data=$1 +graph_dir=$2 +decode_dir=$3 + +# get recording-level CTMs from the lattice by resolving the overlapping +# regions + +if [ $stage -le 0 ]; then + steps/get_ctm_fast.sh --cmd "$decode_cmd" --frame-shift 0.03 \ + data/${data}_hires/ ${graph_dir} \ + ${decode_dir} ${decode_dir}/score_10_0.0 +fi + +if [ $stage -le 1 ]; then + utils/ctm/resolve_ctm_overlaps.py data/${data}_hires/segments \ + ${decode_dir}/score_10_0.0/ctm \ + - | utils/convert_ctm.pl data/${data}_hires/segments data/${data}_hires/reco2file_and_channel > \ + ${decode_dir}/score_10_0.0/${data}_hires.ctm +fi + +if [ $stage -le 2 ]; then + # extract n-best lists from archive.* files + if [[ ${decode_dir} == *_rescore_nbest ]]; then + hyp_filtering_cmd="cat" + [ -x local/wer_output_filter ] && hyp_filtering_cmd="local/wer_output_filter" + [ -x local/wer_hyp_filter ] && hyp_filtering_cmd="local/wer_hyp_filter" + mkdir -p ${decode_dir}/output_nbest + for f in ${decode_dir}/archives.*; do + docid=$(head -1 $f/words_text | awk '{print $1}' | cut -f1,2 -d'-') + $hyp_filtering_cmd $f/words_text > \ + ${decode_dir}/output_nbest/$docid".n.txt" || exit 1; + done + fi + + # compute WER + local/score_stm.sh --min-lmwt 10 --max-lmwt 10 --word-ins-penalty 0.0 \ + --cmd "$decode_cmd" data/${data}_hires $graph_dir ${decode_dir} + + grep -H Sum ${decode_dir}/score*/*.sys | utils/best_wer.sh +fi diff --git a/egs/material/s5/local/prepare_audio_data.sh b/egs/material/s5/local/prepare_audio_data.sh new file mode 100755 index 00000000000..2bf9283f435 --- /dev/null +++ b/egs/material/s5/local/prepare_audio_data.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# Copyright (c) 2017, Johns Hopkins University (Jan "Yenda" Trmal) +# License: Apache 2.0 + +# Begin configuration section. +# End configuration section +set -e -o pipefail +set -o nounset # Treat unset variables as an error +echo "$0 " "$@" + +if [ $# -ne 1 ] ; then + echo "Invalid number of script parameters. " + echo " $0 " + echo "e.g." + echo " $0 /export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1A-BUILD_v1.0/" + exit +fi +data=$1 + +conversational_train=$data/conversational/training/ +audio=$conversational_train/audio/ +[ ! -d $audio ] && \ + echo "The directory $audio does not exist!" 
&& exit 1 + +find $audio -type f \( -name "*.wav" -o -name "*.sph" \) | \ + local/audio2wav_scp.pl > data/train/wav.scp + + +conversational_dev=$data/conversational/dev +audio=$conversational_dev/audio/ +[ ! -d $audio ] && \ + echo "The directory $audio does not exist!" && exit 1 + +find $audio -type f \( -name "*.wav" -o -name "*.sph" \) | \ + local/audio2wav_scp.pl > data/dev/wav.scp + diff --git a/egs/material/s5/local/prepare_dict.sh b/egs/material/s5/local/prepare_dict.sh new file mode 100755 index 00000000000..710f1a66e2e --- /dev/null +++ b/egs/material/s5/local/prepare_dict.sh @@ -0,0 +1,67 @@ +#!/bin/bash +# Copyright (c) 2017, Johns Hopkins University (Jan "Yenda" Trmal) +# License: Apache 2.0 + +# Begin configuration section. +# End configuration section +set -e -o pipefail +set -o nounset # Treat unset variables as an error +echo "$0 " "$@" + +language=swahili + +. ./utils/parse_options.sh + +if [ $# -ne 1 ] ; then + echo "Invalid number of script parameters. " + echo " $0 [options] " + echo "e.g." + echo " $0 --language swahili /export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1A-BUILD_v1.0/" + exit +fi +data=$1 + +lexicon=$data/conversational/reference_materials/lexicon.txt + +mkdir -p data/local +cat $lexicon | awk '{print $1}' > data/local/lexicon_words +cat $lexicon | cut -f2- > data/local/lexicon_phns + +if [ "$language" == "swahili" ]; then + language_affix="sw" +elif [ "$language" == "tagalog" ]; then + language_affix="tl" +elif [ "$language" == "somali" ]; then + language_affix="so" +fi +MOSES=/home/pkoehn/moses +SOURCE_TC_MODEL=/home/pkoehn/experiment/material-asr-${language_affix}-en/truecaser/truecase-model.1.${language_affix} + $MOSES/scripts/recaser/truecase.perl -model $SOURCE_TC_MODEL \ + < data/local/lexicon_words > data/local/lexicon_words_tc + +paste data/local/lexicon_words_tc data/local/lexicon_phns | sort > data/local/lexicon_tc + +lexicon=data/local/lexicon_tc + +[ ! -f $lexicon ] && echo "Lexicon $lexicon does not exist!" && exit 1; +echo $0: using lexicon $lexicon +mkdir -p data/local/dict_nosp/ +cat data/train/text | cut -f 2- -d ' ' | \ + sed 's/ /\n/g' | grep . | sort -u > data/local/dict_nosp/wordlist + +local/convert_lexicon.pl <(echo -e "\t\n\t\n\t\n\t" | cat - $lexicon ) data/local/dict_nosp/wordlist | sort -u > data/local/dict_nosp/lexicon.txt +[ -f data/local/dict_nosp/lexiconp.txt ] && rm data/local/dict_nosp/lexiconp.txt + +cat data/local/dict_nosp/lexicon.txt | sed 's/\t/ /g' | \ + cut -f 2- -d ' ' | sed 's/ /\n/g' | grep . | sort -u > data/local/dict_nosp/phones.txt + + +grep "^<.*>$" data/local/dict_nosp/phones.txt > data/local/dict_nosp/silence_phones.txt +grep -v "^<.*>$" data/local/dict_nosp/phones.txt > data/local/dict_nosp/nonsilence_phones.txt +echo "" > data/local/dict_nosp/optional_silence.txt +echo "" > data/local/dict_nosp/oov.txt + + + +utils/validate_dict_dir.pl data/local/dict_nosp/ + diff --git a/egs/material/s5/local/prepare_text_data.sh b/egs/material/s5/local/prepare_text_data.sh new file mode 100755 index 00000000000..4200a55ed9d --- /dev/null +++ b/egs/material/s5/local/prepare_text_data.sh @@ -0,0 +1,66 @@ +#!/bin/bash +# Copyright (c) 2017, Johns Hopkins University (Jan "Yenda" Trmal) +# License: Apache 2.0 + +# Begin configuration section. +# End configuration section +set -e -o pipefail +set -o nounset # Treat unset variables as an error +echo "$0 " "$@" + +language=swahili + +. ./utils/parse_options.sh + +if [ $# -ne 1 ] ; then + echo "Invalid number of script parameters. " + echo " $0 [options] " + echo "e.g." 
+ echo " $0 --language swahili /export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1A-BUILD_v1.0/" + exit +fi +data=$1; +conversational_train=$data/conversational/training/ +mkdir -p data/train/ +for file in $conversational_train/transcription/*txt ; do + ./local/parse_transcripts.pl $file +done > data/train/transcripts.txt + + +conversational_dev=$data/conversational/dev/ +mkdir -p data/dev +for file in $conversational_dev/transcription/*txt ; do + ./local/parse_transcripts.pl $file +done > data/dev/transcripts.txt + + +cat data/train/transcripts.txt | \ + local/cleanup_transcripts.pl | \ + local/create_datafiles.pl data/train/ + +cat data/dev/transcripts.txt | \ + local/cleanup_transcripts.pl | \ + local/create_datafiles.pl data/dev/ + +if [ "$language" == "swahili" ]; then + language_affix="sw" +elif [ "$language" == "tagalog" ]; then + language_affix="tl" +elif [ "$language" == "somali" ]; then + language_affix="so" +fi +MOSES=/home/pkoehn/moses +SOURCE_TC_MODEL=/home/pkoehn/experiment/material-asr-${language_affix}-en/truecaser/truecase-model.1.${language_affix} + +for i in train dev; do + cat data/$i/text | cut -d " " -f2- > data/$i/text.notruecase + cat data/$i/text | cut -d " " -f1 > data/$i/uttids + # Truecase + $MOSES/scripts/recaser/truecase.perl -model $SOURCE_TC_MODEL \ + < data/$i/text.notruecase | sed "s=<= <=g" > data/$i/text.truecase +# cat data/$i/text.truecase | sed 's/' //g' | sed 's/&apos//g' | sed 's/[//g' | sed 's/]//g' | sed 's/" //g' | sed 's/" //g' | sed 's/& //g' | sed 's/@-@ //g' | sed 's/://g' | sed 's/\///g' | sed 's/%//g' | sed 's/+//g' | sed 's/( //g' | sed 's/) //g' | sed 's/\, //g' | sed 's/ \.//g' | sed 's/\?//g' | sed 's/\!//g' | sed 's/\;//g' > data/$i/text.nopunc + cat data/$i/text.truecase | tr 'A-Z' 'a-z' > data/$i/text.nopunc + paste -d " " data/$i/uttids data/$i/text.nopunc > data/$i/text +done + + diff --git a/egs/material/s5/local/preprocess_external_text.sh b/egs/material/s5/local/preprocess_external_text.sh new file mode 100755 index 00000000000..4cbc457310e --- /dev/null +++ b/egs/material/s5/local/preprocess_external_text.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +set -euo pipefail +set -e -o pipefail +set -o nounset # Treat unset variables as an error +echo "$0 $@" + +language=swahili +srctext_bitext=data/bitext/text + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + +output=$1 + +set -e -o pipefail +set -o nounset # Treat unset variables as an error + +if [ "$language" == "swahili" ]; then + language_affix="sw" +elif [ "$language" == "tagalog" ]; then + language_affix="tl" +elif [ "$language" == "somali" ]; then + language_affix="so" +fi +MOSES=/home/pkoehn/moses + +# Normalize punctuation and tokenize input +$MOSES/scripts/tokenizer/normalize-punctuation.perl ${language_affix} < ${srctext_bitext} \ + | $MOSES/scripts/tokenizer/tokenizer.perl -a -l ${language_affix} > ${srctext_bitext}.tok + +# convert to lower cases +cat ${srctext_bitext}.tok | tr 'A-Z' 'a-z' > ${srctext_bitext}.tc + +# Remove punctuation +cat ${srctext_bitext}.tc | sed 's/' //g' | sed 's/&apos//g' | sed 's/[//g' | sed 's/]//g' | sed 's/" //g' | sed 's/" //g' | sed 's/& //g' | sed 's/@-@ //g' | sed 's/-//g' | sed 's/://g' | sed 's/\///g' | sed 's/%//g' | sed 's/+//g' | sed 's/( //g' | sed 's/) //g' | sed 's/\, //g' | sed 's/ \.//g' | sed 's/\?//g' | sed 's/\!//g' | sed 's/\;//g' > $output + diff --git a/egs/material/s5/local/preprocess_test.sh b/egs/material/s5/local/preprocess_test.sh new file mode 100755 index 00000000000..fbc868d3f7c --- /dev/null +++ b/egs/material/s5/local/preprocess_test.sh @@ -0,0 +1,135 @@ +#!/bin/sh +set -euo pipefail +set -e -o pipefail +set -o nounset # Treat unset variables as an error +echo "$0 $@" + +stage=0 + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh +. ./lang.conf + +datadev=$1 + +mkdir -p $datadev + +# 1. create the reference transcript $datadev/reftext + +dataset=$(basename $datadev) + +audio_path= +if [ $dataset == "analysis1" ]; then + audio_path=${audio_path_analysis1} +elif [ $dataset == "analysis2" ]; then + audio_path=${audio_path_analysis2} +elif [ $(basename $datadev) == 'test_dev' ]; then + audio_path=${audio_path_dev} +elif [ $(basename $datadev) == 'eval1' ]; then + audio_path=${audio_path_eval1} +elif [ $(basename $datadev) == 'eval2' ]; then + audio_path=${audio_path_eval2} +elif [ $(basename $datadev) == 'eval3' ]; then + audio_path=${audio_path_eval3} +fi + +[ -z ${audio_path} ] && echo "$0: test data should be either analysis1, analysis2, test_dev, eval1 or eval2." 
&& exit 1 + +metadata_file=${audio_path}/metadata/metadata.tsv + +if [ $stage -le 0 ]; then + mkdir -p data/local/$dataset + + tail -n +2 $metadata_file | \ + perl -ane '$F[0] =~ s/.wav//; print "$F[0] $F[1]\n";' > \ + data/local/$dataset/all_list + + awk '{if ($2 == "CS") { print $1 } }' data/local/$dataset/all_list > data/local/$dataset/call_list + awk '{if ($2 != "CS") { print $1 } }' data/local/$dataset/all_list > data/local/$dataset/non_call_list +fi + +if [ $stage -le 2 ]; then + rm data/local/$dataset/{wav.scp,reco2file_and_channel} 2>/dev/null || true + + if [ $dataset == "analysis1" ] || [ $dataset == "analysis2" ]; then + local/parse_dev_transcripts.py $audio_path \ + data/local/$dataset/call_list \ + data/local/$dataset/non_call_list \ + data/local/$dataset + else + for f in $(cat data/local/$dataset/call_list); do + wav_file="$audio_path/src/$f.wav" + + echo "${f}_inLine sox $wav_file -r 8000 -b 16 -c 1 -t wav - remix 1 |" >> data/local/$dataset/wav.scp + echo "${f}_outLine sox $wav_file -r 8000 -b 16 -c 1 -t wav - remix 2 |" >> data/local/$dataset/wav.scp + echo "${f}_inLine ${f} A" >> data/local/$dataset/reco2file_and_channel + echo "${f}_outLine ${f} B" >> data/local/$dataset/reco2file_and_channel + done + + for f in $(cat data/local/$dataset/non_call_list); do + wav_file="$audio_path/src/$f.wav" + + echo "${f} sox $wav_file -r 8000 -b 16 -c 1 -t wav - |" >> data/local/$dataset/wav.scp + echo "${f} ${f} 1" >> data/local/$dataset/reco2file_and_channel + done + + awk '{print $1" "$1}' data/local/$dataset/wav.scp > data/local/$dataset/utt2spk + fi + utils/utt2spk_to_spk2utt.pl data/local/$dataset/utt2spk > data/local/$dataset/spk2utt + utils/fix_data_dir.sh data/local/$dataset + + utils/copy_data_dir.sh data/local/$dataset $datadev +fi + +if [ $stage -le 3 ]; then + if [ $dataset == "analysis1" ] || [ $dataset == "analysis2" ]; then + cat data/local/$dataset/all_list | awk '{print $1" <"$2",O>"}' > \ + data/local/$dataset/all_list_labels + + awk '{print $2" "$1" "$3" "$4" "$1}' $datadev/segments | \ + utils/apply_map.pl -f 1 $datadev/reco2file_and_channel | \ + utils/apply_map.pl -f 3 $datadev/utt2spk | \ + awk '{print $1" "$2" "$3" "$4" "$5" "$1" "$6}' | \ + utils/apply_map.pl -f 7 $datadev/text | \ + utils/apply_map.pl -f 6 data/local/$dataset/all_list_labels | \ + sort +0 -1 +1 -2 +3nb -4 > \ + $datadev/stm + + touch $datadev/glm + fi +fi + +# 3. segment .wav files + +# 3.1. create a trivial segments file: + +if [ $stage -le 4 ]; then + utils/data/get_utt2dur.sh --nj 4 --cmd "$train_cmd" ${datadev} + + if [ ! -f $datadev/segments ]; then + utils/data/get_segments_for_data.sh $datadev/ > $datadev/segments + fi + + # 3.2. 
create uniform segmented directory using: (The durations are in seconds) + + if [ $dataset == "analysis1" ] || [ $dataset == "analysis2" ]; then + utils/data/convert_data_dir_to_whole.sh $datadev ${datadev}_whole + utils/data/get_utt2dur.sh --nj 4 --cmd "$train_cmd" ${datadev}_whole + + utils/data/get_segments_for_data.sh ${datadev}_whole > ${datadev}_whole/segments + utils/data/get_uniform_subsegments.py --max-segment-duration=30 \ + --overlap-duration=5 --max-remaining-duration=15 ${datadev}_whole/segments > \ + ${datadev}_whole/uniform_sub_segments + + utils/data/subsegment_data_dir.sh ${datadev}_whole/ \ + ${datadev}_whole/uniform_sub_segments ${datadev}_segmented + else + utils/data/get_uniform_subsegments.py --max-segment-duration=30 \ + --overlap-duration=5 --max-remaining-duration=15 ${datadev}/segments > \ + ${datadev}/uniform_sub_segments + + utils/data/subsegment_data_dir.sh ${datadev}/ \ + ${datadev}/uniform_sub_segments ${datadev}_segmented + fi +fi diff --git a/egs/material/s5/local/rnnlm/run_tdnn_lstm.sh b/egs/material/s5/local/rnnlm/run_tdnn_lstm.sh new file mode 100755 index 00000000000..3f5c7e547b1 --- /dev/null +++ b/egs/material/s5/local/rnnlm/run_tdnn_lstm.sh @@ -0,0 +1,217 @@ +#!/bin/bash + +# Copyright 2017-2018 Johns Hopkins University (author: Daniel Povey) +# 2017 Hainan Xu +# 2018 Ke Li +# 2018 Yiming Wang + + +# [for swahili] +# rnnlm/train_rnnlm.sh: best iteration (out of 40) was 38, linking it to final iteration. +# rnnlm/train_rnnlm.sh: train/dev perplexity was 140.6 / 1019.4. +# Train objf: -6.28 -5.90 -5.70 -5.56 -5.47 -5.40 -5.34 -5.29 -5.25 -5.22 -5.17 -5.16 -5.13 -5.10 -5.07 -5.06 -5.04 -5.01 -4.99 -4.98 -4.97 -4.96 -4.93 -4.93 -4.91 -4.91 -4.89 -4.88 -4.87 -4.86 -4.84 -4.85 -4.81 -4.79 -4.78 -4.76 -4.75 -4.74 -4.73 +# Dev objf: -8.69 -7.76 -7.31 -7.03 -6.98 -7.00 -6.96 -6.96 -6.93 -6.94 + +# %WER 36.75 [ 22836 / 62144, 2758 ins, 6307 del, 13771 sub ] exp/chain/tdnn1b_sp/decode_dev_rnnlm_rescore/wer_10_0.0 +# %WER 38.91 [ 24181 / 62144, 2750 ins, 6579 del, 14852 sub ] exp/chain/tdnn1b_sp/decode_dev_rnnlm_rescore_nbest/wer_10_0.0 +# grep 'Sum' exp/chain/tdnn1b_sp/decode_analysis1_segmented_rnnlm_rescore/score_10_0.0/analysis1_segmented_hires.ctm.sys +# | Sum/Avg | 9906 59164 | 62.2 23.8 14.0 3.5 41.3 49.1 | +# grep 'Sum' exp/chain/tdnn1b_sp/decode_analysis1_segmented_rnnlm_rescore_nbest/score_10_0.0/analysis1_segmented_hires.ctm.sys +# | Sum/Avg | 9906 59164 | 61.9 23.6 14.6 3.2 41.4 49.5 | +# grep 'Sum' exp/chain/tdnn1b_sp/decode_analysis2_segmented_rnnlm_rescore/score_10_0.0/analysis2_segmented_hires.ctm.sys +# | Sum/Avg | 5322 37120 | 66.2 21.2 12.6 2.9 36.8 49.8 | +# grep 'Sum' exp/chain/tdnn1b_sp/decode_analysis2_segmented_rnnlm_rescore_nbest/score_10_0.0/analysis2_segmented_hires.ctm.sys +# | Sum/Avg | 5322 37120 | 65.8 21.1 13.1 2.7 36.9 49.9 | + +# [for tagalog] +# rnnlm/train_rnnlm.sh: best iteration (out of 320) was 125, linking it to final iteration. +# rnnlm/train_rnnlm.sh: train/dev perplexity was 141.2 / 259.6. 
+# Train objf: -6.08 -5.78 -5.62 -5.52 -5.45 -5.40 -5.36 -5.32 -5.28 -5.26 -5.23 -5.20 -5.18 -5.16 -5.14 -5.13 -5.11 -5.10 -5.09 -5.07 -5.06 -5.05 -5.03 -5.03 -5.02 -5.01 -5.00 -4.99 -4.99 -4.97 -4.97 -4.97 -4.96 -4.94 -4.94 -4.93 -4.93 -4.92 -4.91 -4.92 -4.91 -4.90 -4.89 -4.89 -4.89 -4.88 -4.87 -4.87 -4.87 -4.86 -4.86 -4.85 -4.85 -4.84 -4.84 -4.84 -4.83 -4.83 -4.83 -4.82 -4.82 -4.82 -4.81 -4.82 -4.81 -4.81 -4.80 -4.79 -4.79 -4.79 -4.79 -4.80 -4.79 -4.79 -4.79 -4.80 -4.79 -4.78 -4.78 -4.79 -4.77 -4.79 -4.79 -4.78 -4.78 -4.78 -4.78 -4.78 -4.77 -4.77 -4.79 -4.79 -4.78 -4.78 -4.78 -4.78 -4.78 -4.79 -4.78 -4.80 -4.79 -4.78 -4.79 -4.80 -4.80 -4.79 -4.79 -4.77 -4.78 -4.77 -4.77 -4.78 -4.75 -4.80 -4.78 -4.77 -4.76 -4.77 -4.76 -4.76 -4.75 -4.75 -4.76 -4.76 -4.77 -4.75 -4.75 -4.75 -4.76 -4.75 -4.76 -4.74 -4.75 -4.75 -4.76 -4.75 -4.75 -4.75 -4.74 -4.76 -4.75 -4.74 -4.78 -4.74 -4.73 -4.77 -4.76 -4.75 -4.74 -4.73 -4.73 -4.75 -4.75 -4.74 -4.76 -4.73 -4.72 -4.76 -4.72 -4.72 -4.73 -4.72 -4.73 -4.75 -4.72 -4.73 -4.76 -4.75 -4.72 -4.72 -4.74 -4.75 -4.73 -4.72 -4.74 -4.74 -4.73 -4.74 -4.74 -4.74 -4.72 -4.70 -4.72 -4.75 -4.74 -4.75 -4.74 -4.76 -4.72 -4.72 -4.74 -4.75 -4.71 -4.74 -4.73 -4.73 -4.73 -4.73 -4.74 -4.75 -4.73 -4.73 -4.72 -4.71 -4.72 -4.71 -4.72 -4.75 -4.72 -4.71 -4.74 -4.71 -4.70 -4.73 -4.73 -4.75 -4.75 -4.72 -4.72 -4.73 -4.75 -4.73 -4.72 -4.72 -4.72 -4.73 -4.76 -4.73 -4.76 -4.74 -4.73 -4.74 -4.74 -4.74 -4.73 -4.73 -4.73 -4.70 -4.73 -4.74 -4.72 -4.73 -4.73 -4.75 -4.72 -4.73 -4.73 -4.75 -4.73 -4.75 -4.75 -4.73 -4.75 -4.74 -4.75 -4.77 -4.74 -4.75 -4.74 -4.73 -4.77 -4.75 -4.74 -4.75 -4.74 -4.77 -4.76 -4.75 -4.79 -4.78 -4.76 -4.76 -4.77 -4.76 -4.75 -4.74 -4.74 -4.78 -4.77 -4.77 -4.78 -4.79 -4.79 -4.79 -4.76 -4.77 -4.76 -4.79 -4.76 -4.77 -4.76 -4.78 -4.80 -4.79 -4.78 -4.82 -4.82 -4.79 -4.80 -4.81 -4.79 -4.77 -4.79 -4.82 -4.81 -4.82 -4.83 -4.85 -4.84 -4.83 -4.85 -4.88 -4.85 -4.87 -4.86 -4.84 -4.87 -4.85 -4.84 +# Dev objf: -8.70 -7.03 -60340.00 -6.61 -6.45 -6.54 -60340.00 -6.34 -60340.00 -60340.00 -6.15 -6.12 -6.03 -6.03 -60340.00 -60340.00 -6.64 -60340.00 -6.01 -5.91 -5.93 -6.06 -5.92 -5.95 -6.00 -6.17 -6.06 -5.92 -5.92 -60340.00 -6.03 -5.93 -5.98 -60340.00 -6.00 -5.90 -5.84 -6.00 -60340.00 -5.95 -5.89 -60340.00 -5.90 -6.14 -5.84 -5.92 -5.83 -5.86 -5.89 -5.84 -60340.00 -5.90 -5.80 -5.87 -5.87 -60340.00 -5.79 -60340.00 -60340.00 -60340.00 -6.56 -5.88 -5.94 -60340.00 -5.84 -60340.00 -5.84 -5.81 -5.77 -60340.00 -60340.00 -60340.00 -5.81 -5.90 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -5.72 -5.79 -60340.00 -60340.00 -60340.00 -60340.00 -5.72 -5.80 -60340.00 -60340.00 -5.68 -5.73 -5.74 -60340.00 -5.67 -5.63 -60340.00 -5.75 -60340.00 -5.66 -5.71 -5.73 -5.73 -5.75 -60340.00 -5.77 -60340.00 -5.70 -5.70 -5.82 -60340.00 -60340.00 -5.77 -5.72 -5.75 -60340.00 -5.56 -60340.00 -5.73 -60340.00 -60340.00 -5.99 -5.77 -60340.00 -5.65 -5.80 -60340.00 -60340.00 -5.64 -5.67 -5.73 -5.59 -60340.00 -60340.00 -5.73 -60340.00 -60340.00 -5.83 -5.58 -5.64 -5.75 -60340.00 -5.77 -5.68 -60340.00 -60340.00 -5.70 -5.85 -60340.00 -60340.00 -5.82 -6.15 -5.74 -5.73 -5.75 -60340.00 -60340.00 -5.86 -60340.00 -5.80 -5.79 -5.81 -60340.00 -5.89 -60340.00 -5.81 -5.71 -60340.00 -60340.00 -5.65 -5.87 -60340.00 -60340.00 -60340.00 -5.83 -60340.00 -5.94 -5.74 -5.75 -5.75 -60340.00 -5.76 -5.73 -5.76 -60340.00 -60340.00 -5.85 -5.91 -5.98 -60340.00 -5.88 -5.86 -60340.00 -60340.00 -60340.00 -60340.00 -5.91 -5.81 -5.86 -60340.00 -6.10 -6.17 -60340.00 -60340.00 -5.82 -5.82 -60340.00 -60340.00 -6.78 -5.71 -5.87 
-60340.00 -60340.00 -5.98 -5.94 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -5.81 -60340.00 -60340.00 -60340.00 -5.74 -60340.00 -5.83 -60340.00 -5.96 -5.80 -60340.00 -60340.00 -60340.00 -5.82 -60340.00 -60340.00 -60340.00 -60340.00 -5.80 -60340.00 -60340.00 -60340.00 -60340.00 -5.79 -60340.00 -6.13 -5.97 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -5.97 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -5.98 -60340.00 -60340.00 -60340.00 -5.85 -5.92 -5.85 -5.82 -6.04 -60340.00 -60340.00 -60340.00 -60340.00 -5.93 -60340.00 -5.85 -5.87 -5.77 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -5.89 -60340.00 -60340.00 -60340.00 -60340.00 -6.18 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -5.92 -6.01 + +# %WER 46.07 [ 29664 / 64382, 3133 ins, 9896 del, 16635 sub ] exp/chain/tdnn1b_sp/decode_dev_rnnlm_rescore/wer_10_0.5 +# %WER 47.47 [ 30563 / 64382, 3568 ins, 8934 del, 18061 sub ] exp/chain/tdnn1b_sp/decode_dev_rnnlm_rescore_nbest/wer_10_0.5 +# grep 'Sum' exp/chain/tdnn1b_sp/decode_analysis1_segmented_rnnlm_rescore/score_10_0.0/analysis1_segmented_hires.ctm.sys +# | Sum/Avg | 10551 87329 | 53.7 25.3 21.0 4.6 51.0 65.6 | +# grep 'Sum' exp/chain/tdnn1b_sp/decode_analysis1_segmented_rnnlm_rescore_nbest/score_10_0.0/analysis1_segmented_hires.ctm.sys +# | Sum/Avg | 10551 87329 | 53.4 24.9 21.6 4.3 50.9 65.6 | +# grep 'Sum' exp/chain/tdnn1b_sp/decode_analysis2_segmented_rnnlm_rescore/score_10_0.0/analysis2_segmented_hires.ctm.sys +# | Sum/Avg | 5933 56887 | 52.6 25.0 22.4 4.9 52.3 73.8 | +# grep 'Sum' exp/chain/tdnn1b_sp/decode_analysis2_segmented_rnnlm_rescore_nbest/score_10_0.0/analysis2_segmented_hires.ctm.sys +# | Sum/Avg | 5933 56887 | 52.3 24.5 23.1 4.5 52.2 73.9 | + +# [for somali] +# rnnlm/train_rnnlm.sh: best iteration (out of 800) was 133, linking it to final iteration. +# rnnlm/train_rnnlm.sh: train/dev perplexity was 414.5 / 860.9. + +# %WER 56.54 [ 46160 / 81637, 4654 ins, 13070 del, 28436 sub ] exp/chain/tdnn1b_sp/decode_dev_rnnlm_rescore/wer_10_0.0 +# %WER 57.85 [ 47226 / 81637, 5002 ins, 12287 del, 29937 sub ] exp/chain/tdnn1b_sp/decode_dev_rnnlm_rescore_nbest/wer_10_0.0 +# grep 'Sum' exp/chain/tdnn1b_sp/decode_analysis1_segmented_rnnlm_rescore/score_10_0.0/analysis1_segmented_hires.ctm.sys +# | Sum/Avg | 9852 90609 | 50.4 33.3 16.3 8.2 57.8 74.8 | +# grep 'Sum' exp/chain/tdnn1b_sp/decode_analysis1_segmented_rnnlm_rescore_nbest/score_10_0.0/analysis1_segmented_hires.ctm.sys +# | Sum/Avg | 9852 90609 | 50.4 33.2 16.4 8.1 57.7 74.9 | +# grep 'Sum' exp/chain/tdnn1b_sp/decode_analysis2_segmented_rnnlm_rescore/score_10_0.0/analysis2_segmented_hires.ctm.sys +# | Sum/Avg | 8275 67640 | 53.0 32.8 14.2 8.5 55.5 69.3 | +# grep 'Sum' exp/chain/tdnn1b_sp/decode_analysis2_segmented_rnnlm_rescore_nbest/score_10_0.0/analysis2_segmented_hires.ctm.sys +# | Sum/Avg | 8275 67640 | 53.0 32.7 14.3 8.3 55.3 69.2 | + + +# Begin configuration section. 
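+# Any of the variables below can be overridden from the command line via
+# utils/parse_options.sh (sourced further down); option dashes map to
+# variable underscores, e.g. (an illustrative invocation only):
+#   local/rnnlm/run_tdnn_lstm.sh --stage 2 --epochs 20 --embedding-dim 800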
+ +embedding_dim=512 +lstm_rpd=128 +lstm_nrpd=128 +stage=0 +train_stage=-10 +epochs=40 + +# variables for lattice rescoring +run_rescore=true +decode_dir_suffix=rnnlm +ngram_order=4 # approximate the lattice-rescoring by limiting the max-ngram-order + # if it's set, it merges histories in the lattice if they share + # the same ngram history and this prevents the lattice from + # exploding exponentially +pruned_rescore=true + +ac_model_dir=exp/chain/tdnn1b_sp +decode_sets="dev analysis1_segmented analysis2_segmented test_dev_segmented eval1_segmented eval2_segmented eval3_segmented" + +dir=exp/rnnlm_lstm_1a +text_dir=data/rnnlm/text +train_text=data/lm/train.txt +dev_text=data/lm/dev.txt +bitext=data/bitext/text.txt +monotext=data/mono/text.txt + +lang=data/lang_combined_chain +tree_dir=exp/chain/tree_sp + +. ./cmd.sh +. ./utils/parse_options.sh + + +mkdir -p $dir/config +set -e + +for f in ${train_text} ${dev_text} $bitext $monotext; do + + [ ! -f $f ] && \ + echo "$0: expected file $f to exist; look at stage 12 in run.sh" && exit 1 +done + +if [ $stage -le 0 ]; then + mkdir -p $text_dir + cat $train_text > $text_dir/train.txt + cat $dev_text > $text_dir/dev.txt + cat $bitext > $text_dir/bitext.txt + cat $monotext > $text_dir/monotext.txt + +fi + +if [ $stage -le 1 ]; then + cp $lang/words.txt $dir/config/ + n=`cat $dir/config/words.txt | wc -l` + echo " $n" >> $dir/config/words.txt + + # words that are not present in words.txt but are in the training or dev data, will be + # mapped to during training. + echo "" >$dir/config/oov.txt + + cat > $dir/config/data_weights.txt <$dir/config/unigram_probs.txt + + # choose features + rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \ + --use-constant-feature=true \ + --special-words=',,,,,,' \ + $dir/config/words.txt > $dir/config/features.txt + + cat >$dir/config/xconfig </dev/null || true + for decode_set in ${decode_sets}; do + ( + decode_dir=${ac_model_dir}/decode_${decode_set} + skip_scoring=false + if [ ${decode_set} != "dev" ]; then skip_scoring=true; fi + + # Lattice rescoring + rnnlm/lmrescore$pruned.sh \ + --cmd "$decode_cmd" \ + --weight 0.5 --max-ngram-order $ngram_order --max-arcs 20000 \ + --skip-scoring ${skip_scoring} \ + data/lang_$LM $dir data/${decode_set}_hires \ + ${decode_dir} ${decode_dir}_${decode_dir_suffix}_rescore || exit 1 + + if [ ${decode_set} != "dev" ]; then + local/postprocess_test.sh ${decode_set} ${tree_dir}/graph_combined \ + ${decode_dir}_${decode_dir_suffix}_rescore + fi + ) || touch $dir/.error & + done +fi +wait +#[ -f $dir/.error ] && echo "$0: there was a problem while rescoring" && exit 1 + +if [ $stage -le 5 ]; then + echo "$0: Perform nbest-rescoring on $ac_model_dir" + + rm $dir/.error 2>/dev/null || true + for decode_set in ${decode_sets}; do + ( + decode_dir=${ac_model_dir}/decode_${decode_set} + skip_scoring=false + if [ ${decode_set} != "dev" ]; then skip_scoring=true; fi + + # Lattice rescoring + rnnlm/lmrescore_nbest.sh \ + --N 20 \ + --cmd "$decode_cmd" \ + --skip-scoring ${skip_scoring} \ + 0.5 data/lang_$LM $dir data/${decode_set}_hires \ + ${decode_dir}_${decode_dir_suffix}_rescore ${decode_dir}_${decode_dir_suffix}_rescore_nbest || exit 1 + + if [ ${decode_set} != "dev" ]; then + local/postprocess_test.sh ${decode_set} ${tree_dir}/graph_combined \ + ${decode_dir}_${decode_dir_suffix}_rescore_nbest + fi + ) || touch $dir/.error + done +fi + +exit 0 diff --git a/egs/material/s5/local/rnnlm/run_tdnn_lstm_2.sh b/egs/material/s5/local/rnnlm/run_tdnn_lstm_2.sh new file mode 
100755 index 00000000000..13cf0bde44c --- /dev/null +++ b/egs/material/s5/local/rnnlm/run_tdnn_lstm_2.sh @@ -0,0 +1,181 @@ +#!/bin/bash + +# Copyright 2017-2018 Johns Hopkins University (author: Daniel Povey) +# 2017 Hainan Xu +# 2018 Ke Li +# 2018 Yiming Wang + + +# [for swahili] +# rnnlm/train_rnnlm.sh: best iteration (out of 10) was 5, linking it to final iteration. +# rnnlm/train_rnnlm.sh: train/dev perplexity was 59.1 / 273.1. +# Train objf: -5.48 -4.75 -4.47 -4.30 -4.17 -4.06 -3.96 -3.87 -3.77 -3.68 +# Dev objf: -10.79 -6.00 -5.75 -5.69 -5.62 -5.61 -5.62 -5.66 -5.66 + +# %WER 35.84 [ 22270 / 62144, 2573 ins, 6961 del, 12736 sub ] exp/chain/tdnn1b_sp/decode_dev_rnnlm_rescore/wer_11_0.5 +# %WER 48.49 [ 28692 / 59166, 2310 ins, 9200 del, 17182 sub ] exp/chain/tdnn1b_sp/decode_analysis1_segmented_reseg_rnnlm_rescore + +# [for tagalog] +# rnnlm/train_rnnlm.sh: best iteration (out of 10) was 4, linking it to final iteration. +# rnnlm/train_rnnlm.sh: train/dev perplexity was 73.6 / 106.2. +# Train objf: -5.55 -4.83 -4.58 -4.41 -4.28 -4.17 -4.06 -3.96 -3.86 +# Dev objf: -10.54 -4.87 -4.72 -4.67 -4.67 -4.69 -4.71 -4.74 -4.78 + +# %WER 42.91 [ 27628 / 64382, 3624 ins, 8301 del, 15703 sub ] exp/chain/tdnn1b_sp/decode_dev_rnnlm_rescore/wer_10_0.0 +# %WER 55.55 [ 48530 / 87362, 4030 ins, 19326 del, 25174 sub ] exp/chain/tdnn1b_sp/decode_analysis1_segmented_reseg_rnnlm_rescore + +# Begin configuration section. + +embedding_dim=512 +lstm_rpd=128 +lstm_nrpd=128 +stage=0 +train_stage=-10 +epochs=40 + +# variables for lattice rescoring +run_rescore=true +decode_dir_suffix=rnnlm +ngram_order=4 # approximate the lattice-rescoring by limiting the max-ngram-order + # if it's set, it merges histories in the lattice if they share + # the same ngram history and this prevents the lattice from + # exploding exponentially +pruned_rescore=true + +ac_model_dir=exp/chain/tdnn1b_sp +#decode_sets="dev analysis1_segmented_reseg test_dev_segmented_reseg eval1_segmented_reseg eval2_segmented_reseg" +decode_sets="dev analysis1_segmented test_dev_segmented eval1_segmented eval2_segmented eval3_segmented" +decode_sets="analysis2_segmented" +#decode_sets="dev eval1_segmented eval2_segmented" +dir=exp/rnnlm_lstm_1a +text_dir=data/rnnlm/text +train_text=data/lm/train.txt +dev_text=data/lm/dev.txt +bitext=data/bitext/text.txt +lang=data/lang_combined_chain +tree_dir=exp/chain/tree_sp + +. ./cmd.sh +. ./utils/parse_options.sh + + +mkdir -p $dir/config +set -e + +for f in ${train_text} ${dev_text} $bitext; do + [ ! -f $f ] && \ + echo "$0: expected file $f to exist; look at stage 12 in run.sh" && exit 1 +done + +if [ $stage -le 0 ]; then + mkdir -p $text_dir + cat $train_text > $text_dir/train.txt + cat $dev_text > $text_dir/dev.txt + cat $bitext > $text_dir/bitext.txt +fi + +if [ $stage -le 1 ]; then + cp $lang/words.txt $dir/config/ + n=`cat $dir/config/words.txt | wc -l` + echo " $n" >> $dir/config/words.txt + + # words that are not present in words.txt but are in the training or dev data, will be + # mapped to during training. 
+ echo "" >$dir/config/oov.txt + + cat > $dir/config/data_weights.txt <$dir/config/unigram_probs.txt + + # choose features + rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \ + --use-constant-feature=true \ + --special-words=',,,,,,' \ + $dir/config/words.txt > $dir/config/features.txt + + cat >$dir/config/xconfig </dev/null || true + for decode_set in ${decode_sets}; do + ( + decode_dir=${ac_model_dir}/decode_${decode_set} + skip_scoring=false + if [ ${decode_set} != "dev" ]; then skip_scoring=true; fi + + # Lattice rescoring + rnnlm/lmrescore$pruned.sh \ + --cmd "$decode_cmd" \ + --weight 0.5 --max-ngram-order $ngram_order --max-arcs 20000 \ + --skip-scoring ${skip_scoring} \ + data/lang_$LM $dir data/${decode_set}_hires \ + ${decode_dir} ${decode_dir}_${decode_dir_suffix}_rescore || exit 1 + + if [ ${decode_set} != "dev" ]; then + local/postprocess_test.sh ${decode_set} ${tree_dir}/graph_combined \ + ${decode_dir}_${decode_dir_suffix}_rescore + fi + ) || touch $dir/.error & + done +fi +wait +#[ -f $dir/.error ] && echo "$0: there was a problem while rescoring" && exit 1 + +if [ $stage -le 5 ]; then + echo "$0: Perform nbest-rescoring on $ac_model_dir" + + rm $dir/.error 2>/dev/null || true + for decode_set in ${decode_sets}; do + ( + decode_dir=${ac_model_dir}/decode_${decode_set} + skip_scoring=false + if [ ${decode_set} != "dev" ]; then skip_scoring=true; fi + + # Lattice rescoring + rnnlm/lmrescore_nbest.sh \ + --N 20 \ + --cmd "$decode_cmd" \ + --skip-scoring ${skip_scoring} \ + 0.5 data/lang_$LM $dir data/${decode_set}_hires \ + ${decode_dir}_${decode_dir_suffix}_rescore ${decode_dir}_${decode_dir_suffix}_rescore_nbest || exit 1 + + if [ ${decode_set} != "dev" ]; then + local/postprocess_test.sh ${decode_set} ${tree_dir}/graph_combined \ + ${decode_dir}_${decode_dir_suffix}_rescore_nbest + fi + ) || touch $dir/.error + done +fi + +exit 0 diff --git a/egs/material/s5/local/score.sh b/egs/material/s5/local/score.sh new file mode 100755 index 00000000000..c7da00fba32 --- /dev/null +++ b/egs/material/s5/local/score.sh @@ -0,0 +1,13 @@ +#!/bin/bash +# Copyright (c) 2017, Johns Hopkins University (Jan "Yenda" Trmal) +# License: Apache 2.0 + +# Begin configuration section. +# End configuration section +set -e -o pipefail +set -o nounset # Treat unset variables as an error + +echo "$0" "$@" +steps/scoring/score_kaldi_wer.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" + diff --git a/egs/material/s5/local/score_segments.sh b/egs/material/s5/local/score_segments.sh new file mode 100755 index 00000000000..064e15ae40d --- /dev/null +++ b/egs/material/s5/local/score_segments.sh @@ -0,0 +1,8 @@ +#!/bin/bash +set -e -o pipefail +set -o nounset # Treat unset variables as an error + +echo "$0" "$@" +local/score_wer_segments.sh "$@" +#local/score_cer_segment.sh --stage 2 "$@" + diff --git a/egs/material/s5/local/score_stm.sh b/egs/material/s5/local/score_stm.sh new file mode 100755 index 00000000000..7e1236ce92e --- /dev/null +++ b/egs/material/s5/local/score_stm.sh @@ -0,0 +1,110 @@ +#!/bin/bash +# Copyright 2013 Johns Hopkins University (authors: Yenda Trmal) +# 2018 Vimal Manohar + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +# This scoring script is copied from Babel and modified. +# This is a scoring script for the CTMS in /score_/${name}.ctm +# it tries to mimic the NIST scoring setup as much as possible (and usually does a good job) + +# begin configuration section. +cmd=run.pl +cer=0 +min_lmwt=7 +max_lmwt=17 +model= +stage=0 +ctm_name= +word_ins_penalty=0.0,0.5,1.0 +case_insensitive=true +use_icu=true +icu_transform='Any-Lower' +#end configuration section. + +echo $0 $@ + +[ -f ./path.sh ] && . ./path.sh +[ -f ./cmd.sh ] && . ./cmd.sh +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: $0 [options] " && exit; + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + echo " --cer (0|1) # compute CER in addition to WER" + exit 1; +fi + +data=$1 +lang=$2 # This parameter is not used -- kept only for backwards compatibility +dir=$3 + +set -e +set -o pipefail +set -u + +ScoringProgram=`which sclite` || ScoringProgram=$KALDI_ROOT/tools/sctk/bin/sclite +[ ! -x $ScoringProgram ] && echo "Cannot find scoring program at $ScoringProgram" && exit 1; +SortingProgram=`which hubscr.pl` || SortingProgram=$KALDI_ROOT/tools/sctk/bin/hubscr.pl +[ ! -x $ScoringProgram ] && echo "Cannot find scoring program at $ScoringProgram" && exit 1; + +stm_filter_cmd=cat +[ -x local/stm_filter ] && stm_filter_cmd=local/stm_filter +ctm_filter_cmd=cat +[ -x local/ctm_filter ] && ctm_filter_cmd=local/ctm_filter + +for f in $data/stm ; do + [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1; +done + +if [ -z $ctm_name ] ; then + name=`basename $data`; # e.g. 
eval2000 +else + name=$ctm_name +fi + +if [ $stage -le 0 ] ; then + for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + mkdir -p $dir/scoring/penalty_$wip/log + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/penalty_$wip/log/score.LMWT.log \ + set -e';' set -o pipefail';' \ + cat $dir/score_LMWT_${wip}/${name}.ctm \| $ctm_filter_cmd '>' $dir/score_LMWT_${wip}/${name}.ctm.unsorted '&&' \ + cat $data/stm \| $stm_filter_cmd '>' $dir/score_LMWT_${wip}/stm.unsorted '&&' \ + $SortingProgram sortSTM \<$dir/score_LMWT_${wip}/stm.unsorted \>$dir/score_LMWT_${wip}/stm.sorted '&&' \ + $SortingProgram sortCTM \<$dir/score_LMWT_${wip}/${name}.ctm.unsorted \>$dir/score_LMWT_${wip}/${name}.ctm.sorted '&&' \ + paste -d ' ' \<\(cut -f 1-5 -d ' ' $dir/score_LMWT_${wip}/stm.sorted \) \ + \<\(cut -f 6- -d ' ' $dir/score_LMWT_${wip}/stm.sorted \| uconv -f utf8 -t utf8 -x "$icu_transform" \) \ + \> $dir/score_LMWT_${wip}/stm '&&' \ + paste -d ' ' \<\(cut -f 1-4 -d ' ' $dir/score_LMWT_${wip}/${name}.ctm.sorted \) \ + \<\(cut -f 5- -d ' ' $dir/score_LMWT_${wip}/${name}.ctm.sorted \| uconv -f utf8 -t utf8 -x "$icu_transform" \) \ + \> $dir/score_LMWT_${wip}/${name}.ctm.sorted2 '&&' \ + utils/fix_ctm.sh $dir/score_LMWT_${wip}/stm $dir/score_LMWT_${wip}/${name}.ctm.sorted2 '&&' \ + $SortingProgram sortCTM \<$dir/score_LMWT_${wip}/${name}.ctm.sorted2 \>$dir/score_LMWT_${wip}/${name}.ctm '&&' \ + $ScoringProgram -s -r $dir/score_LMWT_${wip}/stm stm -h $dir/score_LMWT_${wip}/${name}.ctm ctm \ + -n "$name.ctm" -f 0 -D -F -o sum rsum prf dtl sgml -e utf-8 || exit 1 + done +fi + +if [ $stage -le 1 ]; then + if [ $cer -eq 1 ]; then + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/penalty_$wip/log/score.LMWT.char.log \ + $ScoringProgram -s -r $dir/score_LMWT_${wip}/stm stm -h $dir/score_LMWT_${wip}/${name}.ctm ctm \ + -n "$name.char.ctm" -o sum rsum prf dtl sgml -f 0 -D -F -c NOASCII DH -e utf-8 || exit 1 + fi +fi + + +echo "Finished scoring on" `date` +exit 0 diff --git a/egs/material/s5/local/score_wer_segments.sh b/egs/material/s5/local/score_wer_segments.sh new file mode 100755 index 00000000000..555ec5056d9 --- /dev/null +++ b/egs/material/s5/local/score_wer_segments.sh @@ -0,0 +1,100 @@ +#!/bin/bash + + +[ -f ./path.sh ] && . ./path.sh + +# begin configuration section. +cmd=run.pl +stage=0 +stats=true +#end configuration section. + +echo "$0 $@" # Print the command line for logging +[ -f ./path.sh ] && . ./path.sh +. 
parse_options.sh || exit 1; + +data=$1 +dir=$2 + +ref_filtering_cmd="cat" +[ -x local/wer_output_filter ] && ref_filtering_cmd="local/wer_output_filter" +[ -x local/wer_ref_filter ] && ref_filtering_cmd="local/wer_ref_filter" +hyp_filtering_cmd="cat" +[ -x local/wer_output_filter ] && hyp_filtering_cmd="local/wer_output_filter" +[ -x local/wer_hyp_filter ] && hyp_filtering_cmd="local/wer_hyp_filter" + +mkdir -p $dir/scoring_kaldi +if [ -f $data/reftext ]; then + cat $data/reftext | $ref_filtering_cmd > $dir/scoring_kaldi/test_filt.txt || exit 1; +else + echo "$0: No reference text to compute WER" +fi + +if [ $stage -le 0 ]; then + + mkdir -p $dir/scoring_kaldi/log + # begin building hypothesis hyp.txt + # in the same format as $data/reftext + awk '{a[$1]=a[$1]" "$5;}END{for(i in a)print i""a[i];}' \ + $dir/score_10/ctm_out > tmpconcat + if [ -f $data/reftext ]; then + awk -F" " '{print $1}' $data/reftext > tmporder + awk 'FNR==NR {x2[$1] = $0; next} $1 in x2 {print x2[$1]}' \ + tmpconcat tmporder > "$dir/score_10/ctm_out.concat" + $hyp_filtering_cmd $dir/score_10/ctm_out.concat > \ + $dir/scoring_kaldi/hyp.txt || exit 1; + # end building hypothesis hyp.txt + + $cmd $dir/scoring_kaldi/log/score.hyp.log \ + cat $dir/scoring_kaldi/hyp.txt \| \ + compute-wer --text --mode=present \ + ark:$dir/scoring_kaldi/test_filt.txt ark:- ">&" $dir/wer || exit 1; + + cat $dir/wer + else + cat tmpconcat > "$dir/score_10/ctm_out.concat" + awk -F" " '{print $1}' $dir/score_10/ctm_out.concat > tmporder + $hyp_filtering_cmd $dir/score_10/ctm_out.concat > \ + $dir/scoring_kaldi/hyp.txt || exit 1; + #exit 0; + #end building hypothesis hyp.txt + + fi + + # building hyp.segmentedXms.txt + for dur in {700,800,900,1000}; do + dursec=`echo $dur' / 1000' | bc -l` + awk '{if ($4 < '$dursec') a[$1]=a[$1]" "$5; else a[$1]=a[$1]" "$5"\n"$1"";}END\ + {for(i in a)print i""a[i];}' $dir/score_10/ctm_out > tmpconcat + rm -rf $dir/score_10/ctm_out.concat.$dur + while read LINE; do + grep "$LINE" "tmpconcat" >> "$dir/score_10/ctm_out.concat."$dur + done < "tmporder" + + $hyp_filtering_cmd $dir/score_10/ctm_out.concat.$dur > $dir/scoring_kaldi/hyp.segmented${dur}ms.txt || exit 1; + done + rm -rf tmpconcat + rm -rf tmporder +fi + +if [ $stage -le 1 ]; then + if $stats; then + mkdir -p $dir/scoring_kaldi/wer_details + + $cmd $dir/scoring_kaldi/log/stats1.log \ + cat $dir/scoring_kaldi/hyp.txt \| \ + align-text --special-symbol="'***'" ark:$dir/scoring_kaldi/test_filt.txt ark:- ark,t:- \| \ + utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \| tee $dir/scoring_kaldi/wer_details/per_utt \| \ + utils/scoring/wer_per_spk_details.pl $data/utt2spk \> $dir/scoring_kaldi/wer_details/per_spk || exit 1; + + $cmd $dir/scoring_kaldi/log/stats2.log \ + cat $dir/scoring_kaldi/wer_details/per_utt \| \ + utils/scoring/wer_ops_details.pl --special-symbol "'***'" \| \ + sort -b -i -k 1,1 -k 4,4rn -k 2,2 -k 3,3 \> $dir/scoring_kaldi/wer_details/ops || exit 1; + + $cmd $dir/scoring_kaldi/log/wer_bootci.log \ + compute-wer-bootci --mode=present \ + ark:$dir/scoring_kaldi/test_filt.txt ark:$dir/scoring_kaldi/hyp.txt \ + '>' $dir/scoring_kaldi/wer_details/wer_bootci || exit 1; + fi +fi diff --git a/egs/material/s5/local/semisup/chain/decode_test.sh b/egs/material/s5/local/semisup/chain/decode_test.sh new file mode 100755 index 00000000000..3d9a1eda1f5 --- /dev/null +++ b/egs/material/s5/local/semisup/chain/decode_test.sh @@ -0,0 +1,108 @@ +#!/bin/bash + +# Copyright 2018 Johns Hopkins University (author: Daniel Povey) +# 2018 Mahsa 
Yarmohammadi +# 2018 Yiming Wang + + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +language=swahili +stage=0 +datadev="data/analysis1 data/analysis2 data/test_dev data/eval1 data/eval2 data/eval3" +dir=exp/semisup/chain/tdnn_semisup_1a +lang=data/lang_combined_chain +tree_dir=exp/semisup/chain/tree_sp +cmd=queue.pl +graph_affix=_combined + +# training options +chunk_width=140,100,160 +chunk_left_context=0 +chunk_right_context=0 + +# ivector options +max_count=75 # parameter for extract_ivectors.sh +sub_speaker_frames=600 +filter_ctm=true +weights_file= +silence_weight=0.00001 +nj=30 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +[ ! -f ./conf/lang/${language}.conf ] && \ + echo "Language configuration conf/lang/${language}.conf does not exist!" && exit 1 +ln -sf ./conf/lang/${language}.conf lang.conf +. ./lang.conf + +if ! cuda-compiled; then + cat </dev/null || true + +if [ $stage -le 3 ]; then + # do the 1st pass decoding + for datadir in $datadev; do + ( + data=$(basename $datadir) + nspk=$(wc -l /dev/null || true + cp -r data/lang_combined_test $lang_combined + silphonelist=$(cat $lang_combined/phones/silence.csl) || exit 1; + nonsilphonelist=$(cat $lang_combined/phones/nonsilence.csl) || exit 1; + # Use our special topology... note that later on may have to tune this + # topology. + steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang_combined/topo + fi +fi + +if [ $stage -le 8 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" \ + --generate-ali-from-lats true ${lores_train_data_dir} \ + data/lang_combined $gmm_dir $lat_dir || exit 1; + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 9 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 6000 ${lores_train_data_dir} \ + $lang_combined $lat_dir $tree_dir || exit 1 +fi + + +if [ $stage -le 10 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.004 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + linear_opts="orthonormal-constraint=-1.0 l2-regularize=0.004" + output_opts="l2-regularize=0.002" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $opts dim=1024 + linear-component name=tdnn2l0 dim=256 $linear_opts input=Append(-1,0) + linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn2 $opts input=Append(0,1) dim=1024 + linear-component name=tdnn3l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn3 $opts dim=1024 input=Append(0,1) + linear-component name=tdnn4l0 dim=256 $linear_opts input=Append(-1,0) + linear-component name=tdnn4l dim=256 $linear_opts input=Append(0,1) + relu-batchnorm-dropout-layer name=tdnn4 $opts input=Append(0,1) dim=1024 + linear-component name=tdnn5l dim=256 $linear_opts + relu-batchnorm-dropout-layer name=tdnn5 $opts dim=1024 input=Append(0, tdnn3l) + linear-component name=tdnn6l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn6 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn7l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn7l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1024 + linear-component name=tdnn8l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn8l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn8 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn9l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn5l) dim=1024 + linear-component name=tdnn10l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn10l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn10 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn11l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn11l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn11 $opts input=Append(0,3,tdnn10l,tdnn9l,tdnn7l) dim=1024 + linear-component name=prefinal-l dim=256 $linear_opts + + relu-batchnorm-layer name=prefinal-chain input=prefinal-l $opts dim=1280 + linear-component name=prefinal-chain-l 
dim=256 $linear_opts + batchnorm-component name=prefinal-chain-batchnorm + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + relu-batchnorm-layer name=prefinal-xent input=prefinal-l $opts dim=1280 + linear-component name=prefinal-xent-l dim=256 $linear_opts + batchnorm-component name=prefinal-xent-batchnorm + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 11 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/material-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=$num_epochs \ + --trainer.frames-per-iter=1500000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=12 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=0 \ + --egs.chunk-right-context=0 \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.stage=$get_egs_stage \ + --egs.opts="--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 12 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_combined_test \ + $tree_dir ${tree_dir}/graph_combined || exit 1; +fi + +if [ $stage -le 13 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l \ +# $sup_chain_dir/best_path_${unsupervised_set_perturbed}/frame_subsampling_factor +# +# # This should be 1 if using a different source for supervised data alignments. +# # However alignments in seed tree directory have already been sub-sampled. 
+# echo $frame_subsampling_factor > \ +# $sup_tree_dir/frame_subsampling_factor +# +# # Build a new tree using stats from both supervised and unsupervised data +# steps/nnet3/chain/build_tree_multiple_sources.sh \ +# --use-fmllr false --context-opts "--context-width=2 --central-position=1" \ +# --frame-subsampling-factor $frame_subsampling_factor \ +# 7000 $lang \ +# data/${supervised_set_perturbed} \ +# ${sup_tree_dir} \ +# data/${unsupervised_set_perturbed} \ +# $chaindir/best_path_${unsupervised_set_perturbed} \ +# $treedir || exit 1 +# fi +# +# sup_tree_dir=$treedir # Use the new tree dir for further steps + +# Train denominator FST using phone alignments from +# supervised and unsupervised data +if [ $stage -le 7 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${sup_tree_dir} ${sup_chain_dir}/best_path_${unsupervised_set_perturbed} \ + $dir +fi + +if [ $stage -le 8 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $sup_tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.004 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + linear_opts="orthonormal-constraint=-1.0 l2-regularize=0.004" + output_opts="l2-regularize=0.002" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $opts dim=1024 + linear-component name=tdnn2l0 dim=256 $linear_opts input=Append(-1,0) + linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn2 $opts input=Append(0,1) dim=1024 + linear-component name=tdnn3l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn3 $opts dim=1024 input=Append(0,1) + linear-component name=tdnn4l0 dim=256 $linear_opts input=Append(-1,0) + linear-component name=tdnn4l dim=256 $linear_opts input=Append(0,1) + relu-batchnorm-dropout-layer name=tdnn4 $opts input=Append(0,1) dim=1024 + linear-component name=tdnn5l dim=256 $linear_opts + relu-batchnorm-dropout-layer name=tdnn5 $opts dim=1024 input=Append(0, tdnn3l) + linear-component name=tdnn6l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn6 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn7l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn7l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1024 + linear-component name=tdnn8l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn8l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn8 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn9l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn9 $opts 
input=Append(0,3,tdnn8l,tdnn6l,tdnn5l) dim=1024 + linear-component name=tdnn10l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn10l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn10 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn11l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn11l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn11 $opts input=Append(0,3,tdnn10l,tdnn9l,tdnn7l) dim=1024 + linear-component name=prefinal-l dim=256 $linear_opts + + relu-batchnorm-layer name=prefinal-chain input=prefinal-l $opts dim=1280 + linear-component name=prefinal-chain-l dim=256 $linear_opts + batchnorm-component name=prefinal-chain-batchnorm + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + relu-batchnorm-layer name=prefinal-xent input=prefinal-l $opts dim=1280 + linear-component name=prefinal-xent-l dim=256 $linear_opts + batchnorm-component name=prefinal-xent-batchnorm + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts + + # We use separate outputs for supervised and unsupervised data + # so we can properly track the train and valid objectives. + + output name=output-0 input=output.affine + output name=output-1 input=output.affine + + output name=output-0-xent input=output-xent.log-softmax + output name=output-1-xent input=output-xent.log-softmax +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +# Get values for $model_left_context, $model_right_context +. $dir/configs/vars + +left_context=$model_left_context +right_context=$model_right_context + +egs_left_context=$(perl -e "print int($left_context + $frame_subsampling_factor / 2)") +egs_right_context=$(perl -e "print int($right_context + $frame_subsampling_factor / 2)") + +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set_perturbed} + frames_per_eg=$(cat $sup_chain_dir/egs/info/frames_per_eg) + + if [ $stage -le 9 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/material-$(date +'%m_%d_%H_%M')/s5/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. 
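+    # Note: egs_left_context / egs_right_context (computed above) pad the
+    # model's required context by half the frame-subsampling factor, and
+    # --generate-egs-scp true is used so that these supervised egs can later
+    # be combined with the unsupervised egs by
+    # steps/nnet3/chain/multilingual/combine_egs.sh.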
+ + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context --right-context-final $egs_right_context \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor $frame_subsampling_factor \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $sup_ivector_dir \ + --generate-egs-scp true \ + data/${supervised_set_perturbed}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsup_frames_per_eg=150 # Using a frames-per-eg of 150 for unsupervised data + # was found to be better than allowing smaller chunks + # (160,140,110,80) like for supervised system +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices when + # creating numerator supervision +lattice_prune_beam=4.0 # beam for pruning the lattices prior to getting egs + # for unsupervised data +tolerance=1 # frame-tolerance for chain training + +unsup_lat_dir=${sup_chain_dir}/decode_${unsupervised_set_perturbed} +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set_perturbed} + + if [ $stage -le 10 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/material-$(date +'%m_%d_%H_%M')/s5/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + cp $sup_chain_dir/final.mdl $unsup_lat_dir || exit 1; + + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh \ + --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context --right-context-final $egs_right_context \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --deriv-weights-scp $sup_chain_dir/best_path_${unsupervised_set_perturbed}/weights.scp \ + --online-ivector-dir $ivector_root_dir/ivectors_${unsupervised_set_perturbed}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set_perturbed}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/comb_egs +if [ $stage -le 11 ]; then + steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --block-size 128 \ + --lang2weight $supervision_weights 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + # This is to skip stages of den-fst creation, which was already done. 
+ train_stage=-4 +fi + +if [ $stage -le 12 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir="$comb_egs_dir" \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$sup_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --egs.chunk-width=$frames_per_eg \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --trainer.frames-per-iter=1500000 \ + --trainer.num-epochs=$num_epochs \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=12 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.momentum=0.0 \ + --trainer.max-param-change=2.0 \ + --cleanup.remove-egs=false \ + --feat-dir=data/${supervised_set_perturbed}_hires \ + --tree-dir=$sup_tree_dir \ + --lat-dir=$sup_lat_dir \ + --dir=$dir || exit 1; +fi + +test_graph_dir=$dir/graph_combined +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${test_lang} $dir $test_graph_dir +fi + +if [ $stage -le 14 ]; then + frames_per_chunk=150 + rm -f $dir/.error 2>/dev/null || true + for data in $test_sets; do + ( + nspk=$(wc -l $text_dir/train.txt + cat $dev_text > $text_dir/dev.txt + cat $bitext > $text_dir/bitext.txt + cat $monotext > $text_dir/monotext.txt + +fi + +if [ $stage -le 1 ]; then + cp $lang/words.txt $dir/config/ + n=`cat $dir/config/words.txt | wc -l` + echo " $n" >> $dir/config/words.txt + + # words that are not present in words.txt but are in the training or dev data, will be + # mapped to during training. 
+ echo "" >$dir/config/oov.txt + + cat > $dir/config/data_weights.txt <$dir/config/unigram_probs.txt + + # choose features + rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \ + --use-constant-feature=true \ + --special-words=',,,,,,' \ + $dir/config/words.txt > $dir/config/features.txt + + cat >$dir/config/xconfig </dev/null || true + for decode_set in ${decode_sets}; do + ( + decode_dir=${ac_model_dir}/decode_${decode_set} + skip_scoring=false + if [ ${decode_set} != "dev" ]; then skip_scoring=true; fi + + # Lattice rescoring + rnnlm/lmrescore$pruned.sh \ + --cmd "$decode_cmd" \ + --weight 0.5 --max-ngram-order $ngram_order --max-arcs 20000 \ + --skip-scoring ${skip_scoring} \ + data/lang_$LM $dir data/${decode_set}_hires \ + ${decode_dir} ${decode_dir}_${decode_dir_suffix}_rescore || exit 1 + + if [ ${decode_set} != "dev" ]; then + local/postprocess_test.sh ${decode_set} ${tree_dir}/graph_combined \ + ${decode_dir}_${decode_dir_suffix}_rescore + fi + ) || touch $dir/.error & + done +fi +wait +#[ -f $dir/.error ] && echo "$0: there was a problem while rescoring" && exit 1 + +if [ $stage -le 5 ]; then + echo "$0: Perform nbest-rescoring on $ac_model_dir" + + rm $dir/.error 2>/dev/null || true + for decode_set in ${decode_sets}; do + ( + decode_dir=${ac_model_dir}/decode_${decode_set} + skip_scoring=false + if [ ${decode_set} != "dev" ]; then skip_scoring=true; fi + + # Lattice rescoring + rnnlm/lmrescore_nbest.sh \ + --N 20 \ + --cmd "$decode_cmd" \ + --skip-scoring ${skip_scoring} \ + 0.5 data/lang_$LM $dir data/${decode_set}_hires \ + ${decode_dir}_${decode_dir_suffix}_rescore ${decode_dir}_${decode_dir_suffix}_rescore_nbest || exit 1 + + if [ ${decode_set} != "dev" ]; then + local/postprocess_test.sh ${decode_set} ${tree_dir}/graph_combined \ + ${decode_dir}_${decode_dir_suffix}_rescore_nbest + fi + ) || touch $dir/.error + done +fi + +exit 0 diff --git a/egs/material/s5/local/semisup/run.sh b/egs/material/s5/local/semisup/run.sh new file mode 100755 index 00000000000..6b22cb1ad36 --- /dev/null +++ b/egs/material/s5/local/semisup/run.sh @@ -0,0 +1,62 @@ +#!/bin/bash + +# Copyright 2017 Vimal Manohar +# 2019 Yiming Wang +# Apache 2.0 + +# This script demonstrates semi-supervised training using ~40 hours of +# supervised data and ~320 hours of unsupervised data. + +. ./cmd.sh +. ./path.sh + +set -o pipefail +exp_root=exp/semisup + +stage=0 + +. ./utils/parse_options.sh + +############################################################################### +# Train seed chain system using ~40 hours supervised data. +# Here we train i-vector extractor on only the supervised set. +############################################################################### + +if [ $stage -le 1 ]; then + local/semisup/chain/run_tdnn.sh \ + --train-set train \ + --nnet3-affix "" \ + --affix 1a --tree-affix "" \ + --gmm tri3 --exp-root $exp_root || exit 1 +fi + +if [ $stage -le 2 ]; then + utils/combine_data.sh data/eval1_2_3_segmented data/eval1_segmented data/eval2_segmented data/eval3_segmented || exit 1 +fi + +############################################################################### +# Semi-supervised training using ~40 hours supervised data and +# 320 hours unsupervised data. We use i-vector extractor, tree, lattices +# and seed chain system from the previous stage. 
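+# The unsupervised set is the eval1/2/3 combination created in the stage
+# above (data/eval1_2_3_segmented); its decoding lattices from the seed
+# system are turned into egs with per-frame deriv weights and combined with
+# the supervised egs (see local/semisup/chain/run_tdnn_semisupervised.sh).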
+############################################################################### + +if [ $stage -le 3 ]; then + local/semisup/chain/run_tdnn_semisupervised.sh \ + --supervised-set train \ + --unsupervised-set eval1_2_3_segmented \ + --sup-chain-dir $exp_root/chain/tdnn_1a_sp \ + --sup-lat-dir $exp_root/chain/tri3_train_sp_lats \ + --sup-tree-dir $exp_root/chain/tree_sp \ + --ivector-root-dir exp/nnet3 \ + --affix 1a \ + --exp-root $exp_root || exit 1 + + # [for swahili] + # %WER 35.2 | 9906 59164 | 67.8 18.4 13.8 3.0 35.2 47.1 | exp/semisup/chain/tdnn_semisup_1a/decode_analysis1_segmented/score_10_0.0/analysis1_segmented_hires.ctm.sys + # %WER 30.8 | 5322 37120 | 71.9 16.4 11.8 2.7 30.8 47.8 | exp/semisup/chain/tdnn_semisup_1a/decode_analysis2_segmented/score_10_0.0/analysis2_segmented_hires.ctm.sys + + # [for tagalog] + # %WER 40.8 | 10551 87329 | 64.0 21.4 14.6 4.8 40.8 63.9 | exp/semisup/chain/tdnn_semisup_1a/decode_analysis1_segmented/score_10_0.0/analysis1_segmented_hires.ctm.sys + # %WER 41.1 | 5933 56887 | 63.8 20.4 15.9 4.9 41.1 71.9 | exp/semisup/chain/tdnn_semisup_1a/decode_analysis2_segmented/score_10_0.0/analysis2_segmented_hires.ctm.sys +fi + diff --git a/egs/material/s5/local/stm_filter b/egs/material/s5/local/stm_filter new file mode 100755 index 00000000000..9409119a54f --- /dev/null +++ b/egs/material/s5/local/stm_filter @@ -0,0 +1,22 @@ +#!/usr/bin/perl + +while (<>) { + chomp; + my @F = split; + my @A = @F[6..$#F]; + for (my $i = 0; $i <= $#A; $i++) { + my $w = $A[$i]; + + # Make partial words optionally detectable + if ($w =~ m/^(\S+-)$/ || $w =~ m/^(-\S+)$/) { + $A[$i] = "(" . $w . ")"; + } + + # Remove filler words + if ($w =~ m/<(unk|noise|spnoise|sil)>/) { + $A[$i] = ""; + } + } + + print join(" ", @F[0..5]) . " " . join(" ", @A) . "\n"; +} diff --git a/egs/material/s5/local/train_lms_srilm.sh b/egs/material/s5/local/train_lms_srilm.sh new file mode 100755 index 00000000000..8160b060dc7 --- /dev/null +++ b/egs/material/s5/local/train_lms_srilm.sh @@ -0,0 +1,224 @@ +#!/bin/bash +export LC_ALL=C + +words_file= +train_text= +dev_text= +oov_symbol="" + +echo "$0 $@" + +[ -f path.sh ] && . ./path.sh +. ./utils/parse_options.sh || exit 1 + +echo "-------------------------------------" +echo "Building an SRILM language model " +echo "-------------------------------------" + +if [ $# -ne 2 ] ; then + echo "Incorrect number of parameters. " + echo "Script has to be called like this:" + echo " $0 [switches] " + echo "For example: " + echo " $0 data data/srilm" + echo "The allowed switches are: " + echo " words_file= word list file -- data/lang/words.txt by default" + echo " train_text= data/train/text is used in case when not specified" + echo " dev_text= last 10 % of the train text is used by default" + echo " oov_symbol=> symbol to use for oov modeling -- by default" + exit 1 +fi + +datadir=$1 +tgtdir=$2 +outlm=lm.gz + + +##End of configuration +loc=`which ngram-count`; +if [ -z $loc ]; then + if uname -a | grep 64 >/dev/null; then # some kind of 64 bit... + sdir=`pwd`/../../../tools/srilm/bin/i686-m64 + else + sdir=`pwd`/../../../tools/srilm/bin/i686 + fi + if [ -f $sdir/ngram-count ]; then + echo Using SRILM tools from $sdir + export PATH=$PATH:$sdir + else + echo You appear to not have SRILM tools installed, either on your path, + echo or installed in $sdir. See tools/install_srilm.sh for installation + echo instructions. + exit 1 + fi +fi + +# Prepare the destination directory +mkdir -p $tgtdir + +for f in $words_file $train_text $dev_text; do + [ ! 
-s $f ] && echo "No such file $f" && exit 1; +done + +[ -z $words_file ] && words_file=$datadir/lang/words.txt +if [ ! -z "$train_text" ] && [ ! -z "$dev_text" ] ; then + echo "Using words file: $words_file" + echo "Using train text: $train_text" + echo "Using dev text : $dev_text" + train_text=$train_text + dev_text=$dev_text +else + [ -z "$train_text" ] && train_text=$datadir/train/text + nr=`cat $train_text | wc -l` + nr_dev=$(($nr / 10 )) + nr_train=$(( $nr - $nr_dev )) + orig_train_text=$train_text + head -n $nr_train $train_text > $tgtdir/train_text + tail -n $nr_dev $train_text > $tgtdir/dev_text + + train_text=$tgtdir/train_text + dev_text=$tgtdir/dev_text + echo "Using words file: $words_file" + echo "Using train text: 9/10 of $orig_train_text" + echo "Using dev text : 1/10 of $orig_train_text" +fi + + + +# Extract the word list from the training dictionary; exclude special symbols +sort $words_file | awk '{print $1}' | grep -v '\#0' | grep -v '' | grep -v -F "$oov_symbol" > $tgtdir/vocab +if (($?)); then + echo "Failed to create vocab from $words_file" + exit 1 +else + # wc vocab # doesn't work due to some encoding issues + echo vocab contains `cat $tgtdir/vocab | perl -ne 'BEGIN{$l=$w=0;}{split; $w+=$#_; $w++; $l++;}END{print "$l lines, $w words\n";}'` +fi + +# Kaldi transcript files contain Utterance_ID as the first word; remove it +cat $train_text | cut -f2- -d' ' > $tgtdir/train.txt +if (($?)); then + echo "Failed to create $tgtdir/train.txt from $train_text" + exit 1 +else + echo "Removed first word (uid) from every line of $train_text" + # wc text.train train.txt # doesn't work due to some encoding issues + echo $train_text contains `cat $train_text | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $w--; $s++;}END{print "$w words, $s sentences\n";}'` + echo train.txt contains `cat $tgtdir/train.txt | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $s++;}END{print "$w words, $s sentences\n";}'` +fi + +# Kaldi transcript files contain Utterance_ID as the first word; remove it +cat $dev_text | cut -f2- -d' ' > $tgtdir/dev.txt +if (($?)); then + echo "Failed to create $tgtdir/dev.txt from $dev_text" + exit 1 +else + echo "Removed first word (uid) from every line of $dev_text" + # wc text.train train.txt # doesn't work due to some encoding issues + echo $dev_text contains `cat $dev_text | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $w--; $s++;}END{print "$w words, $s sentences\n";}'` + echo $tgtdir/dev.txt contains `cat $tgtdir/dev.txt | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $s++;}END{print "$w words, $s sentences\n";}'` +fi + +echo "-------------------" +echo "Good-Turing 2grams" +echo "-------------------" +ngram-count -lm $tgtdir/2gram.gt01.gz -gt1min 0 -gt2min 1 -order 2 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/2gram.gt02.gz -gt1min 0 -gt2min 2 -order 2 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +echo "-------------------" +echo "Kneser-Ney 2grams" +echo "-------------------" +ngram-count -lm $tgtdir/2gram.kn01.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -order 2 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/2gram.kn02.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -order 2 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +echo "-------------------" +echo "Good-Turing 3grams" +echo "-------------------" +ngram-count -lm $tgtdir/3gram.gt011.gz -gt1min 0 -gt2min 1 -gt3min 1 -order 3 -text $tgtdir/train.txt -vocab 
$tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.gt012.gz -gt1min 0 -gt2min 1 -gt3min 2 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.gt022.gz -gt1min 0 -gt2min 2 -gt3min 2 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.gt023.gz -gt1min 0 -gt2min 2 -gt3min 3 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +echo "-------------------" +echo "Kneser-Ney 3grams" +echo "-------------------" +ngram-count -lm $tgtdir/3gram.kn011.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn012.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn022.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn023.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 3 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + + +echo "-------------------" +echo "Good-Turing 4grams" +echo "-------------------" +ngram-count -lm $tgtdir/4gram.gt0111.gz -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 1 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0112.gz -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 2 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0122.gz -gt1min 0 -gt2min 1 -gt3min 2 -gt4min 2 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0123.gz -gt1min 0 -gt2min 1 -gt3min 2 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0113.gz -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0222.gz -gt1min 0 -gt2min 2 -gt3min 2 -gt4min 2 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0223.gz -gt1min 0 -gt2min 2 -gt3min 2 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +echo "-------------------" +echo "Kneser-Ney 4grams" +echo "-------------------" +ngram-count -lm $tgtdir/4gram.kn0111.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 1 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0112.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 2 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0113.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0122.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 2 -order 4 
-text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0123.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0222.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 2 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0223.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +if [ ! -z ${LIBLBFGS} ]; then + #please note that if the switch -map-unk "$oov_symbol" is used with -maxent-convert-to-arpa, ngram-count will segfault + #instead of that, we simply output the model in the maxent format and convert it using the "ngram" + echo "-------------------" + echo "Maxent 2grams" + echo "-------------------" + sed 's/'${oov_symbol}'//g' $tgtdir/train.txt | \ + ngram-count -lm - -order 2 -text - -vocab $tgtdir/vocab -unk -sort -maxent -maxent-convert-to-arpa|\ + sed 's//'${oov_symbol}'/g' | gzip -c > $tgtdir/2gram.me.gz || exit 1 + + echo "-------------------" + echo "Maxent 3grams" + echo "-------------------" + sed 's/'${oov_symbol}'//g' $tgtdir/train.txt | \ + ngram-count -lm - -order 3 -text - -vocab $tgtdir/vocab -unk -sort -maxent -maxent-convert-to-arpa|\ + sed 's//'${oov_symbol}'/g' | gzip -c > $tgtdir/3gram.me.gz || exit 1 + + echo "-------------------" + echo "Maxent 4grams" + echo "-------------------" + sed 's/'${oov_symbol}'//g' $tgtdir/train.txt | \ + ngram-count -lm - -order 4 -text - -vocab $tgtdir/vocab -unk -sort -maxent -maxent-convert-to-arpa|\ + sed 's//'${oov_symbol}'/g' | gzip -c > $tgtdir/4gram.me.gz || exit 1 + +fi + + +echo "--------------------" +echo "Computing perplexity" +echo "--------------------" +( + for f in $tgtdir/3gram* ; do ( echo $f; ngram -order 3 -lm $f -unk -map-unk "$oov_symbol" -ppl $tgtdir/dev.txt ) | paste -s -d ' ' ; done + for f in $tgtdir/4gram* ; do ( echo $f; ngram -order 4 -lm $f -unk -map-unk "$oov_symbol" -ppl $tgtdir/dev.txt ) | paste -s -d ' ' ; done +) | sort -r -n -k 15,15g | column -t | tee $tgtdir/perplexities.txt + +echo "The perplexity scores report is stored in $tgtdir/perplexities.txt " + +#This will link the lowest perplexity LM as the output LM.
+#ln -sf $tgtdir/`head -n 1 $tgtdir/perplexities.txt | cut -f 1 -d ' '` $outlm + +#A slight modification of the previous approach: +#We look at the two lowest perplexity LMs and use a 3gram LM if one of the two, even if the 4gram is of lower ppl +nof_trigram_lm=`head -n 2 $tgtdir/perplexities.txt | grep 3gram | wc -l` +if [[ $nof_trigram_lm -eq 0 ]] ; then + lmfilename=`head -n 1 $tgtdir/perplexities.txt | cut -f 1 -d ' '` +elif [[ $nof_trigram_lm -eq 2 ]] ; then + lmfilename=`head -n 1 $tgtdir/perplexities.txt | cut -f 1 -d ' '` +else #exactly one 3gram LM + lmfilename=`head -n 2 $tgtdir/perplexities.txt | grep 3gram | cut -f 1 -d ' '` +fi +(cd $tgtdir; ln -sf `basename $lmfilename` $outlm ) + diff --git a/egs/material/s5/local/wer_output_filter b/egs/material/s5/local/wer_output_filter new file mode 100755 index 00000000000..5195bb9150d --- /dev/null +++ b/egs/material/s5/local/wer_output_filter @@ -0,0 +1,25 @@ +#!/usr/bin/env perl +# Copyright 2012-2014 Johns Hopkins University (Author: Yenda Trmal) +# Apache 2.0 +use utf8; + +use open qw(:encoding(utf8)); +binmode STDIN, ":utf8"; +binmode STDOUT, ":utf8"; +binmode STDERR, ":utf8"; + +while (<>) { + @F = split " "; + print $F[0] . " "; + foreach $s (@F[1..$#F]) { + if (($s =~ /\[.*\]/) || ($s =~ /\<.*\>/) || ($s =~ "!SIL") || ($s =~ /--|\.|\?|\(\(\)\)|%incomplete/)) { + print ""; + } else { + print "$s" + } + print " "; + } + print "\n"; +} + + diff --git a/egs/material/s5/path.sh b/egs/material/s5/path.sh new file mode 100644 index 00000000000..ffa108b6737 --- /dev/null +++ b/egs/material/s5/path.sh @@ -0,0 +1,7 @@ +export KALDI_ROOT=`pwd`/../../.. +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5/:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +[ ! -f $KALDI_ROOT/tools/env.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/env.sh is not present (this is uncommon but might be OK)" +. $KALDI_ROOT/tools/env.sh +export LC_ALL=C diff --git a/egs/material/s5/rnnlm b/egs/material/s5/rnnlm new file mode 120000 index 00000000000..72302c5e570 --- /dev/null +++ b/egs/material/s5/rnnlm @@ -0,0 +1 @@ +../../../scripts/rnnlm \ No newline at end of file diff --git a/egs/material/s5/run.sh b/egs/material/s5/run.sh new file mode 100755 index 00000000000..4ba518f53e0 --- /dev/null +++ b/egs/material/s5/run.sh @@ -0,0 +1,322 @@ +#!/bin/bash + +# Copyright 2017-2018 Johns Hopkins University (Jan "Yenda" Trmal) +# 2017-2018 Johns Hopkins University (author: Daniel Povey) +# 2018 Yiming Wang +# 2019 Mahsa Yarmohammadi +# License: Apache 2.0 + +. ./path.sh +. ./cmd.sh + +nj=30 # number of parallel jobs +stage=1 +language=swahili +. utils/parse_options.sh + +set -e -o pipefail +set -o nounset # Treat unset variables as an error + +[ ! -f ./conf/lang/${language}.conf ] && \ + echo "Language configuration conf/lang/${language}.conf does not exist!" && exit 1 +ln -sf ./conf/lang/${language}.conf lang.conf +. 
./lang.conf + +if [ $stage -le 1 ]; then + local/prepare_text_data.sh $corpus + local/prepare_audio_data.sh $corpus +fi + +if [ $stage -le 2 ]; then + local/prepare_dict.sh $corpus + utils/validate_dict_dir.pl data/local/dict_nosp + utils/prepare_lang.sh data/local/dict_nosp \ + "" data/local/lang_nosp data/lang_nosp + utils/validate_lang.pl data/lang_nosp +fi + +if [ $stage -le 3 ]; then + local/train_lms_srilm.sh --oov-symbol "" --words-file \ + data/lang_nosp/words.txt data data/lm + utils/format_lm.sh data/lang_nosp data/lm/lm.gz \ + data/local/dict_nosp/lexiconp.txt data/lang_nosp_test + utils/validate_lang.pl data/lang_nosp_test +fi + +if [ $stage -le 4 ]; then + for set in train dev; do + dir=data/$set + utils/fix_data_dir.sh $dir + steps/make_mfcc.sh --cmd "$train_cmd" --nj 16 $dir + steps/compute_cmvn_stats.sh $dir + utils/fix_data_dir.sh $dir + utils/validate_data_dir.sh $dir + done +fi + +# Create a subset with 40k short segments to make flat-start training easier +if [ $stage -le 5 ]; then + utils/subset_data_dir.sh --shortest data/train $numShorestUtts data/train_short +fi + +# monophone training +if [ $stage -le 6 ]; then + steps/train_mono.sh --nj $nj --cmd "$train_cmd" \ + data/train_short data/lang_nosp_test exp/mono + ( + utils/mkgraph.sh data/lang_nosp_test \ + exp/mono exp/mono/graph_nosp + for test in dev; do + steps/decode.sh --nj $nj --cmd "$decode_cmd" exp/mono/graph_nosp \ + data/$test exp/mono/decode_nosp_$test + done + )& + + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/train data/lang_nosp_test exp/mono exp/mono_ali +fi + +# train a first delta + delta-delta triphone system on all utterances +if [ $stage -le 7 ]; then + steps/train_deltas.sh --cmd "$train_cmd" \ + $numLeavesTri1 $numGaussTri1 data/train data/lang_nosp_test exp/mono_ali exp/tri1 + + # decode using the tri1 model + ( + utils/mkgraph.sh data/lang_nosp_test exp/tri1 exp/tri1/graph_nosp + for test in dev; do + steps/decode.sh --nj $nj --cmd "$decode_cmd" exp/tri1/graph_nosp \ + data/$test exp/tri1/decode_nosp_$test + done + )& + + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/train data/lang_nosp_test exp/tri1 exp/tri1_ali +fi + +# train an LDA+MLLT system. +if [ $stage -le 8 ]; then + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + --splice-opts "--left-context=3 --right-context=3" $numLeavesTri2 $numGaussTri2 \ + data/train data/lang_nosp_test exp/tri1_ali exp/tri2 + + # decode using the LDA+MLLT model + ( + utils/mkgraph.sh data/lang_nosp_test exp/tri2 exp/tri2/graph_nosp + for test in dev; do + steps/decode.sh --nj $nj --cmd "$decode_cmd" exp/tri2/graph_nosp \ + data/$test exp/tri2/decode_nosp_$test + done + )& + + steps/align_si.sh --nj $nj --cmd "$train_cmd" --use-graphs true \ + data/train data/lang_nosp_test exp/tri2 exp/tri2_ali +fi + +# Train tri3, which is LDA+MLLT+SAT +if [ $stage -le 9 ]; then + steps/train_sat.sh --cmd "$train_cmd" $numLeavesTri3 $numGaussTri3 \ + data/train data/lang_nosp_test exp/tri2_ali exp/tri3 + + # decode using the tri3 model + ( + utils/mkgraph.sh data/lang_nosp_test exp/tri3 exp/tri3/graph_nosp + for test in dev; do + steps/decode_fmllr.sh --nj $nj --cmd "$decode_cmd" exp/tri3/graph_nosp \ + data/$test exp/tri3/decode_nosp_$test + done + )& +fi + +# Now we compute the pronunciation and silence probabilities from training data, +# and re-create the lang directory. 
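+# The resulting dictionary (data/local/dict) is used to build data/lang and
+# data/lang_test, which replace the _nosp versions in the stages below.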
+if [ $stage -le 10 ]; then + steps/get_prons.sh --cmd "$train_cmd" data/train data/lang_nosp_test exp/tri3 + utils/dict_dir_add_pronprobs.sh --max-normalize true \ + data/local/dict_nosp \ + exp/tri3/pron_counts_nowb.txt exp/tri3/sil_counts_nowb.txt \ + exp/tri3/pron_bigram_counts_nowb.txt data/local/dict + + utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang + + utils/format_lm.sh data/lang data/lm/lm.gz \ + data/local/dict/lexiconp.txt data/lang_test + + steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ + data/train data/lang_test exp/tri3 exp/tri3_ali +fi + +if [ $stage -le 11 ]; then + # Test the tri3 system with the silprobs and pron-probs. + + # decode using the tri3 model + utils/mkgraph.sh data/lang_test exp/tri3 exp/tri3/graph + for test in dev; do + steps/decode_fmllr.sh --nj $nj --cmd "$decode_cmd" \ + exp/tri3/graph data/$test exp/tri3/decode_$test + done +fi + +mkdir -p data/bitext +mkdir -p data/mono + +srctext_bitext=data/bitext/text +srctext_mono=data/mono/text + +if [ $stage -le 12 ]; then + # Read the foreign part of the bitext as $srctext_bitext and preprocess the text + if [ "$number_mapping" != "" ]; then + echo Number mapping file Found. Converting numbers... + cat $bitext | awk -F"\t" '{print $2;}' | local/normalize_numbers.py $number_mapping > $srctext_bitext + if [[ $mono == *.gz ]]; then + gzip -cd $mono | local/normalize_numbers.py $number_mapping > $srctext_mono + else + cat $mono | local/normalize_numbers.py $number_mapping > $srctext_mono + fi + if [ "$mono2" != "" ]; then + if [[ $mono2 == *.gz ]]; then + gzip -cd $mono2 | local/normalize_numbers.py $number_mapping >> $srctext_mono + else + cat $mono2 | local/normalize_numbers.py $number_mapping >> $srctext_mono + fi + fi + else + cat $bitext | awk -F"\t" '{print $2;}' > $srctext_bitext + if [[ $mono == *.gz ]]; then + gzip -cd $mono > $srctext_mono + else + cat $mono > $srctext_mono + fi + if [ "$mono2" != "" ]; then + if [[ $mono2 == *.gz ]]; then + gzip -cd $mono2 >> $srctext_mono + else + cat $mono2 >> $srctext_mono + fi + fi + fi + + local/preprocess_external_text.sh --language $language \ + --srctext-bitext ${srctext_bitext} ${srctext_bitext}.txt + + local/preprocess_external_text.sh --language $language \ + --srctext-bitext ${srctext_mono} ${srctext_mono}.txt + + # Combine two sources of text + cat $bitext | awk '{print $1}' > ${srctext_bitext}.header + paste ${srctext_bitext}.header ${srctext_bitext}.txt > ${srctext_bitext}.processed + + if [[ $mono == *.gz ]]; then + gzip -cd $mono | awk '{printf("mono-%d\n",NR)}' > ${srctext_mono}.header + else + cat $mono | awk '{printf("mono-%d\n",NR)}' > ${srctext_mono}.header + fi + if [ "$mono2" != "" ]; then + if [[ $mono2 == *.gz ]]; then + gzip -cd $mono2 | awk '{printf("mono-%d\n",NR)}' >> ${srctext_mono}.header + else + cat $mono2 | awk '{printf("mono-%d\n",NR)}' >> ${srctext_mono}.header + fi + fi + paste ${srctext_mono}.header ${srctext_mono}.txt > ${srctext_mono}.processed +fi + +# The next 3 stages are to train g2p from the existing lexicon, +# apply g2p to expand the lexicon using oov words from bitext data +# as in ${dict_root}_nosp. +g2p_workdir=data/local/g2p_phonetisarus +if [ $stage -le 13 ]; then + echo 'Gathering missing words...' 
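+  # (The pipeline below lists words from the processed bitext/monolingual text
+  # that are missing from data/local/dict_nosp/lexicon.txt, then keeps only
+  # purely lowercase alphabetic words as candidates for G2P.)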
+ mkdir -p ${g2p_workdir} + cat ${srctext_bitext}.txt ${srctext_mono}.txt | \ + local/count_oovs.pl data/local/dict_nosp/lexicon.txt | \ + awk '{for(i=4; i ${g2p_workdir}/missing.txt + cat ${g2p_workdir}/missing.txt | \ + grep "^[a-z]*$" > ${g2p_workdir}/missing_onlywords.txt +fi + +if [ $stage -le 14 ]; then + local/g2p/train_g2p.sh --stage 0 --silence-phones \ + "data/local/dict/silence_phones.txt" data/local/dict_nosp exp/g2p || touch exp/g2p/.error +fi + +dict_root=data/local/dict_combined +if [ $stage -le 15 ]; then + if [ -f exp/g2p/.error ]; then + rm exp/g2p/.error || true + echo "Fail to train the G2P model." && exit 1; + fi + mkdir -p ${dict_root}_nosp + rm ${dict_root}_nosp/lexiconp.txt 2>/dev/null || true + cp data/local/dict_nosp/{phones,oov,nonsilence_phones,silence_phones,optional_silence}.txt ${dict_root}_nosp + local/g2p/apply_g2p.sh --var-counts 1 exp/g2p/model.fst ${g2p_workdir} \ + data/local/dict_nosp/lexicon.txt ${dict_root}_nosp/lexicon.txt || exit 1; + + utils/validate_dict_dir.pl ${dict_root}_nosp +fi + +lang_root=data/lang_combined +lmdir=data/lm_combined +if [ $stage -le 16 ]; then + utils/prepare_lang.sh ${dict_root}_nosp "" data/local/lang_combined_nosp ${lang_root}_nosp + utils/validate_lang.pl ${lang_root}_nosp +fi + +# prepare the new LM with bitext data and the new lexicon, +# as in the new test lang directory ${lang_root}_nosp_test + +datadev="data/analysis1 data/analysis2 data/test_dev data/eval1 data/eval2 data/eval3" + +if [ $stage -le 17 ]; then + for datadir in $datadev; do + local/preprocess_test.sh $datadir & + done + wait + + mkdir -p $lmdir + mkdir -p $lmdir/mono + mkdir -p $lmdir/bitext + + cat data/analysis1/text | awk '{for(i=2;i<=NF;i++) printf("%s ", $i); print""}' \ + | grep . | shuf | head -n 2000 > $lmdir/dev_text || echo done + + local/train_lms_srilm.sh --oov-symbol "" --words-file ${lang_root}_nosp/words.txt \ + --train-text ${srctext_bitext}.processed --dev-text $lmdir/dev_text \ + data $lmdir/bitext + + local/train_lms_srilm.sh --oov-symbol "" --words-file ${lang_root}_nosp/words.txt \ + --train-text ${srctext_mono}.processed --dev-text $lmdir/dev_text \ + data $lmdir/mono +fi + +if [ $stage -le 18 ]; then + ngram -order 4 -lm data/lm/lm.gz -mix-lm $lmdir/bitext/lm.gz \ + -mix-lm2 $lmdir/mono/lm.gz -lambda 0.3 -mix-lambda2 0.4 \ + -write-lm $lmdir/lm.gz + + utils/format_lm.sh ${lang_root}_nosp $lmdir/lm.gz \ + ${dict_root}_nosp/lexiconp.txt ${lang_root}_nosp_test + utils/validate_lang.pl ${lang_root}_nosp_test +fi + +# Now we compute the pronunciation and silence probabilities from training data, +# and re-create the lang directory ${lang_root}_test. 
+if [ $stage -le 19 ]; then + steps/get_prons.sh --cmd "$train_cmd" data/train ${lang_root}_nosp_test exp/tri3 + utils/dict_dir_add_pronprobs.sh --max-normalize true \ + ${dict_root}_nosp \ + exp/tri3/pron_counts_nowb.txt exp/tri3/sil_counts_nowb.txt \ + exp/tri3/pron_bigram_counts_nowb.txt ${dict_root} + utils/prepare_lang.sh ${dict_root} "" data/local/lang_combined ${lang_root} + + utils/format_lm.sh ${lang_root} $lmdir/lm.gz \ + ${dict_root}/lexiconp.txt ${lang_root}_test +fi + +# After run.sh is finished, run the followings: +# ./local/chain/run_tdnn.sh +# ./local/chain/decode_test.sh --language +# ./local/rnnlm/run_tdnn_lstm.sh +exit 0; diff --git a/egs/material/s5/steps b/egs/material/s5/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/material/s5/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/material/s5/utils b/egs/material/s5/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/material/s5/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file diff --git a/egs/mgb5/README b/egs/mgb5/README new file mode 100644 index 00000000000..5114b278b71 --- /dev/null +++ b/egs/mgb5/README @@ -0,0 +1,18 @@ +### +# MGB-5 corpus: Moroccan Arabic Automatic Speech Recognition +# Created in collaboration between QCRI and ELRA +# More details can be found here: https://arabicspeech.org/mgb5 +### + + +## INTRODUCTION ## +Training data: 10.2 hours from 69 programs +Development data: 1.8 hours from 10 programs +Testing data: 2.0 hours from 14 programs + +## KNOWN ISSUES ## +1- The dev data does not have the same alignment across the four annotators +2- Once alignment is consistent, we can include multi-refence word error rate +3- Use MGB-2 as background model + + diff --git a/egs/mgb5/s5/RESULTS b/egs/mgb5/s5/RESULTS new file mode 100644 index 00000000000..5ac2daf0d49 --- /dev/null +++ b/egs/mgb5/s5/RESULTS @@ -0,0 +1,14 @@ +%WER 75.59 [ 51973 / 68755, 1993 ins, 19098 del, 30882 sub ] exp/chain/tdnn_1a/decode_dev/wer_10_0.0 +%WER 84.47 [ 58079 / 68755, 1679 ins, 18637 del, 37763 sub ] exp/sgmm2_5b2/decode_dev.rescored/wer_9_0.0 +%WER 84.48 [ 58087 / 68755, 1720 ins, 18518 del, 37849 sub ] exp/sgmm2_5b2/decode_dev.big/wer_9_0.0 +%WER 84.62 [ 58180 / 68755, 1746 ins, 18289 del, 38145 sub ] exp/sgmm2_5b2/decode_dev/wer_9_0.0 +%WER 86.93 [ 59766 / 68755, 1634 ins, 19636 del, 38496 sub ] exp/tri3b/decode_dev.rescored/wer_10_0.0 +%WER 87.01 [ 59822 / 68755, 1508 ins, 20885 del, 37429 sub ] exp/tri3b/decode_dev/wer_11_0.0 +%WER 87.23 [ 59974 / 68755, 1686 ins, 18873 del, 39415 sub ] exp/tri3b/decode_dev.si/wer_10_0.0 +%WER 87.57 [ 60209 / 68755, 1325 ins, 21282 del, 37602 sub ] exp/tri2b/decode_dev.rescored/wer_11_0.0 +%WER 87.59 [ 60225 / 68755, 1133 ins, 21631 del, 37461 sub ] exp/tri2b/decode_dev/wer_10_0.5 +%WER 88.35 [ 60745 / 68755, 1359 ins, 20030 del, 39356 sub ] exp/tri2a/decode_dev.rescored/wer_11_0.0 +%WER 88.50 [ 60849 / 68755, 1469 ins, 18597 del, 40783 sub ] exp/tri1/decode_dev.rescored/wer_10_0.0 +%WER 88.53 [ 60866 / 68755, 1229 ins, 20752 del, 38885 sub ] exp/tri2a/decode_dev/wer_10_0.5 +%WER 88.59 [ 60909 / 68755, 1567 ins, 17986 del, 41356 sub ] exp/tri1/decode_dev/wer_10_0.0 +%WER 94.78 [ 65167 / 68755, 664 ins, 23336 del, 41167 sub ] exp/mono/decode_dev/wer_7_0.0 diff --git a/egs/mgb5/s5/cmd.sh b/egs/mgb5/s5/cmd.sh new file mode 100644 index 00000000000..86240967f67 --- /dev/null +++ b/egs/mgb5/s5/cmd.sh @@ -0,0 +1,10 @@ +# "queue.pl" uses qsub. 
The options to it are
+# options to qsub.  If you have GridEngine installed,
+# change this to a queue you have access to.
+# Otherwise, use "run.pl", which will run jobs locally
+# (make sure your --num-jobs options are no more than
+# the number of CPUs on your machine).
+
+export train_cmd="slurm.pl --mem 6G --config conf/slurm.conf"
+export decode_cmd="slurm.pl --config conf/slurm.conf"
+export cuda_cmd="slurm.pl gpu --mem 6G --gpu 2 --config conf/slurm.conf"
diff --git a/egs/mgb5/s5/conf/decode.config b/egs/mgb5/s5/conf/decode.config
new file mode 100644
index 00000000000..10b0eee900b
--- /dev/null
+++ b/egs/mgb5/s5/conf/decode.config
@@ -0,0 +1,4 @@
+# Use wider-than-normal decoding beams.
+first_beam=16.0
+beam=20.0
+lattice_beam=10.0
diff --git a/egs/mgb5/s5/conf/decode_dnn.config b/egs/mgb5/s5/conf/decode_dnn.config
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/egs/mgb5/s5/conf/mfcc.conf b/egs/mgb5/s5/conf/mfcc.conf
new file mode 100644
index 00000000000..7361509099f
--- /dev/null
+++ b/egs/mgb5/s5/conf/mfcc.conf
@@ -0,0 +1 @@
+--use-energy=false # only non-default option.
diff --git a/egs/mgb5/s5/conf/mfcc_hires.conf b/egs/mgb5/s5/conf/mfcc_hires.conf
new file mode 100644
index 00000000000..434834a6725
--- /dev/null
+++ b/egs/mgb5/s5/conf/mfcc_hires.conf
@@ -0,0 +1,10 @@
+# config for high-resolution MFCC features, intended for neural network training
+# Note: we keep all cepstra, so it has the same info as filterbank features,
+# but MFCC is more easily compressible (because less correlated), which is why
+# we prefer this method.
+--use-energy=false # use average of log energy, not energy.
+--num-mel-bins=40 # similar to Google's setup.
+--num-ceps=40 # there is no dimensionality reduction.
+--low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so
+  # there might be some information at the low end. 
+--high-freq=-400 # high cutoff frequently, relative to Nyquist of 8000 (=7600) diff --git a/egs/mgb5/s5/conf/online_cmvn.conf b/egs/mgb5/s5/conf/online_cmvn.conf new file mode 100644 index 00000000000..7748a4a4dd3 --- /dev/null +++ b/egs/mgb5/s5/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/egs/mgb5/s5/conf/slurm.conf b/egs/mgb5/s5/conf/slurm.conf new file mode 100644 index 00000000000..2cc4052a0a9 --- /dev/null +++ b/egs/mgb5/s5/conf/slurm.conf @@ -0,0 +1,10 @@ +command sbatch --export=PATH --ntasks-per-node=1 --partition=cpu +option mem=* --mem-per-cpu=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* --cpus-per-task=$0 --ntasks-per-node=1 +option num_threads=1 --cpus-per-task=1 --ntasks-per-node=1 # Do not add anything to qsub_opts +option max_jobs_run=* # Do nothing +option gpu=* -N1 -n1 -p gpu --mem=4GB --gres=gpu:$0 --cpus-per-task=6 --time=72:0:0 # in reality, we probably should have --cpus-per-task=$((6*$0)) +option gpu=0 + + diff --git a/egs/mgb5/s5/local/chain/run_tdnn.sh b/egs/mgb5/s5/local/chain/run_tdnn.sh new file mode 120000 index 00000000000..34499362831 --- /dev/null +++ b/egs/mgb5/s5/local/chain/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1a.sh \ No newline at end of file diff --git a/egs/mgb5/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/mgb5/s5/local/chain/tuning/run_tdnn_1a.sh new file mode 100644 index 00000000000..6300511e817 --- /dev/null +++ b/egs/mgb5/s5/local/chain/tuning/run_tdnn_1a.sh @@ -0,0 +1,264 @@ +#!/bin/bash + + +# Copyright 2017-2018 Johns Hopkins University (author: Daniel Povey) +# 2017-2018 Yiming Wang + +# The script is copied from egs/iban +# 1a is trying an architecture with factored parameter matrices with dropout. + +# grep WER exp/chain/tdnn_1a/decode_dev/wer_10_0.0 +# %WER 75.59 [ 51973 / 68755, 1993 ins, 19098 del, 30882 sub ] + + +# steps/info/chain_dir_info.pl exp/chain/tdnn_1a +# exp/chain/tdnn_1a: num-iters=38 nj=2..5 num-params=12.6M dim=40+50->1592 combine=-0.069->-0.067 (over 2) xent:train/valid[24,37,final]=(-1.41,-1.18,-1.12/-1.68,-1.54,-1.47) logprob:train/valid[24,37,final]=(-0.071,-0.057,-0.053/-0.124,-0.122,-0.121) + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +train_set=train +test_sets="dev" +gmm=tri3b + +# Options which are not passed through to run_ivector_common.sh +affix=1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +get_egs_stage=-10 +xent_regularize=0.1 + +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +num_epochs=15 + +# training options +srand=0 +remove_egs=true + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 10 ]; then + # Get the alignments as lattices (gives the chain training more freedom). 
+ # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 50 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 12 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + opts="l2-regularize=0.08 dropout-per-dim-continuous=true" + output_opts="l2-regularize=0.02 bottleneck-dim=256" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=50 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $opts dim=768 + relu-batchnorm-dropout-layer name=tdnn2 $opts dim=768 input=Append(-1,0,1) + relu-batchnorm-dropout-layer name=tdnn3 $opts dim=768 + relu-batchnorm-dropout-layer name=tdnn4 $opts dim=768 input=Append(-1,0,1) + relu-batchnorm-dropout-layer name=tdnn5 $opts dim=768 + relu-batchnorm-dropout-layer name=tdnn6 $opts dim=768 input=Append(-3,0,3) + relu-batchnorm-dropout-layer name=tdnn7 $opts dim=768 input=Append(-3,0,3) + relu-batchnorm-dropout-layer name=tdnn8 $opts dim=768 input=Append(-6,-3,0) + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain $opts dim=768 + output-layer name=output include-log-softmax=false $output_opts dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
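+  # (For example, with the default xent_regularize=0.1 set above, the
+  # learning-rate-factor used here works out to 0.5 / 0.1 = 5.0.)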
+ relu-batchnorm-layer name=prefinal-xent input=tdnn8 $opts dim=768 + output-layer name=output-xent $output_opts dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 13 ]; then + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=$num_epochs \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.num-chunk-per-minibatch=256,128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=0 \ + --egs.chunk-right-context=0 \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test \ + $tree_dir $tree_dir/graph || exit 1; +fi + +if [ $stage -le 15 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l data/$x/text #removing words that annotators couldn't understand + cp DB/$x/$x.segments.bw data/$x/segments + awk '{print $1 " " $1}' DB/$x/$x.segments.bw > data/$x/spk2utt + cp data/$x/spk2utt data/$x/utt2spk + find $PWD/DB/$x/ -name \*.wav | while read wav; do + id=$(basename $wav | sed 's:.wav::') + echo $id $wav + done | sort -u > data/$x/wav.scp + utils/fix_data_dir.sh data/$x +done + + +echo "Data preparation completed." + diff --git a/egs/mgb5/s5/local/prepare_dict.sh b/egs/mgb5/s5/local/prepare_dict.sh new file mode 100755 index 00000000000..5ea0938af90 --- /dev/null +++ b/egs/mgb5/s5/local/prepare_dict.sh @@ -0,0 +1,61 @@ +#!/usr/bin/env bash + +# Copyright 2019 QCRI (author: Ahmed Ali) +# Apache 2.0 +# This script prepares the grapaheme dictionary + +set -e +dir=data/local/dict +lexicon_url1="https://arabicspeech.org/static/data_resources/ar-ar_grapheme_lexicon_20160209.bz2"; +lexicon_url2="https://arabicspeech.org/static/data_resources/ar-ar_phoneme_lexicon_20140317.bz2"; +stage=0 +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; +mkdir -p $dir data/local/lexicon_data + +if [ $stage -le 0 ]; then + echo "$0: Downloading text for lexicon... $(date)." + if [ ! 
-f data/local/lexicon_data/ar-ar_grapheme_lexicon_20160209.bz2 ]; then + wget -P data/local/lexicon_data $lexicon_url1 + else + echo "data/local/lexicon_data/ar-ar_grapheme_lexicon_20160209.bz2 already exist on disk" + fi + + if [ ! -f data/local/lexicon_data/ar-ar_phoneme_lexicon_20140317.bz2 ]; then + wget -P data/local/lexicon_data $lexicon_url2 + else + echo "data/local/lexicon_data/ar-ar_phoneme_lexicon_20140317.bz2 already exist on disk" + fi + + rm -fr data/local/lexicon_data/grapheme_lexicon + for dict in ar-ar_grapheme_lexicon_20160209.bz2 ar-ar_phoneme_lexicon_20140317.bz2; do + bzcat data/local/lexicon_data/$dict | sed '1,3d' | \ + awk '{print $1}' >> data/local/lexicon_data/grapheme_lexicon + done + cat data/train/text | cut -d ' ' -f 2- | tr -s " " "\n" | grep -v UNK | sort -u >> data/local/lexicon_data/grapheme_lexicon +fi + + +if [ $stage -le 0 ]; then + echo "$0: processing lexicon text and creating lexicon... $(date)." + # remove vowels and rare alef wasla + grep -v [0-9] data/local/lexicon_data/grapheme_lexicon | sed -e 's:[FNKaui\~o\`]::g' -e 's:{:}:g' | sort -u > data/local/lexicon_data/processed_lexicon + local/prepare_lexicon.py +fi + +cut -d' ' -f2- $dir/lexicon.txt | sed 's/SIL//g' | tr ' ' '\n' | sort -u | sed '/^$/d' >$dir/nonsilence_phones.txt || exit 1; + +sed -i '1i UNK' $dir/lexicon.txt + +echo UNK >> $dir/nonsilence_phones.txt + +echo ' SIL' >> $dir/lexicon.txt + +echo SIL > $dir/silence_phones.txt + +echo SIL >$dir/optional_silence.txt + +echo -n "" >$dir/extra_questions.txt + +echo "$0: Dictionary preparation succeeded" diff --git a/egs/mgb5/s5/local/prepare_lexicon.py b/egs/mgb5/s5/local/prepare_lexicon.py new file mode 100755 index 00000000000..215541585eb --- /dev/null +++ b/egs/mgb5/s5/local/prepare_lexicon.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python3 + +# Copyright 2018 Ashish Arora +# Apache 2.0 + +# This script prepares lexicon. + +import argparse +import os + +parser = argparse.ArgumentParser(description="""Creates the list of characters and words in lexicon""") +args = parser.parse_args() + +### main ### +lex = {} +text_path = os.path.join('data','local', 'lexicon_data', 'processed_lexicon') +with open(text_path, 'r', encoding='utf-8') as f: + for line in f: + line = line.strip() + characters = list(line) + characters = " ".join(['V' if char == '*' else char for char in characters]) + lex[line] = characters + +with open(os.path.join('data','local','dict', 'lexicon.txt'), 'w', encoding='utf-8') as fp: + for key in sorted(lex): + fp.write(key + " " + lex[key] + "\n") diff --git a/egs/mgb5/s5/local/prepare_lm.sh b/egs/mgb5/s5/local/prepare_lm.sh new file mode 100755 index 00000000000..02fb59aba87 --- /dev/null +++ b/egs/mgb5/s5/local/prepare_lm.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# Copyright 2019 QCRI (Author: Ahmed Ali) +# Apache 2.0 + +set -e -o pipefail + +# To create G.fst from ARPA language model +. 
./path.sh || die "path.sh expected";
+
+local/train_lms_srilm.sh --train-text data/train/text data/ data/srilm
+
+# for basic decoding, let's use only a trigram LM
+[ -d data/lang_test/ ] && rm -rf data/lang_test
+cp -R data/lang data/lang_test
+lm=data/srilm/3gram.me.gz
+utils/format_lm.sh data/lang_test $lm data/local/dict/lexicon.txt data/lang_test
+
+# for decoding with a bigger LM, we build a 4-gram LM from the same transcription text
+[ -d data/lang_big ] && rm -rf data/lang_big
+cp -R data/lang data/lang_big
+lm=data/srilm/4gram.me.gz
+utils/format_lm.sh data/lang_big $lm data/local/dict/lexicon.txt data/lang_big
+
+utils/build_const_arpa_lm.sh $lm data/lang_big data/lang_big
+exit 0;
diff --git a/egs/mgb5/s5/local/score.sh b/egs/mgb5/s5/local/score.sh
new file mode 100755
index 00000000000..9988c941441
--- /dev/null
+++ b/egs/mgb5/s5/local/score.sh
@@ -0,0 +1,149 @@
+#!/bin/bash
+# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey, Yenda Trmal)
+# Apache 2.0
+
+# See the script steps/scoring/score_kaldi_cer.sh in case you need to evaluate CER
+
+[ -f ./path.sh ] && . ./path.sh
+
+# begin configuration section.
+cmd=run.pl
+stage=0
+decode_mbr=false
+stats=true
+beam=6
+word_ins_penalty=0.0,0.5,1.0
+min_lmwt=7
+max_lmwt=17
+iter=final
+#end configuration section.
+
+echo "$0 $@" # Print the command line for logging
+[ -f ./path.sh ] && . ./path.sh
+. parse_options.sh || exit 1;
+
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 [--cmd (run.pl|queue.pl...)] <data-dir> <lang-or-graph-dir> <decode-dir>"
+  echo " Options:"
+  echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes."
+  echo " --stage (0|1|2) # start scoring script from part-way through."
+  echo " --decode_mbr (true/false) # minimum Bayes risk decoding (confusion network)."
+  echo " --min_lmwt # minimum LM-weight for lattice rescoring "
+  echo " --max_lmwt # maximum LM-weight for lattice rescoring "
+  exit 1;
+fi
+
+data=$1
+lang_or_graph=$2
+dir=$3
+
+symtab=$lang_or_graph/words.txt
+
+for f in $symtab $dir/lat.1.gz $data/text; do
+  [ !
-f $f ] && echo "score.sh: no such file $f" && exit 1; +done + + +ref_filtering_cmd="cat" +[ -x local/wer_output_filter ] && ref_filtering_cmd="local/wer_output_filter" +[ -x local/wer_ref_filter ] && ref_filtering_cmd="local/wer_ref_filter" +hyp_filtering_cmd="cat" +[ -x local/wer_output_filter ] && hyp_filtering_cmd="local/wer_output_filter" +[ -x local/wer_hyp_filter ] && hyp_filtering_cmd="local/wer_hyp_filter" + + +if $decode_mbr ; then + echo "$0: scoring with MBR, word insertion penalty=$word_ins_penalty" +else + echo "$0: scoring with word insertion penalty=$word_ins_penalty" +fi + + +mkdir -p $dir/scoring_kaldi +cat $data/text | $ref_filtering_cmd > $dir/scoring_kaldi/test_filt.txt || exit 1; +if [ $stage -le 0 ]; then + + for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + mkdir -p $dir/scoring_kaldi/penalty_$wip/log + + if $decode_mbr ; then + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \ + acwt=\`perl -e \"print 1.0/LMWT\"\`\; \ + lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ + lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ + lattice-prune --beam=$beam ark:- ark:- \| \ + lattice-mbr-decode --word-symbol-table=$symtab \ + ark:- ark,t:- \| \ + utils/int2sym.pl -f 2- $symtab \| \ + $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1; + + else + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \ + lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ + lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ + lattice-best-path --word-symbol-table=$symtab ark:- ark,t:- \| \ + utils/int2sym.pl -f 2- $symtab \| \ + $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1; + fi + + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/score.LMWT.log \ + cat $dir/scoring_kaldi/penalty_$wip/LMWT.txt \| \ + compute-wer --text --mode=present \ + ark:$dir/scoring_kaldi/test_filt.txt ark,p:- ">&" $dir/wer_LMWT_$wip || exit 1; + + done +fi + + + +if [ $stage -le 1 ]; then + + for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + for lmwt in $(seq $min_lmwt $max_lmwt); do + # adding /dev/null to the command list below forces grep to output the filename + grep WER $dir/wer_${lmwt}_${wip} /dev/null + done + done | utils/best_wer.sh >& $dir/scoring_kaldi/best_wer || exit 1 + + best_wer_file=$(awk '{print $NF}' $dir/scoring_kaldi/best_wer) + best_wip=$(echo $best_wer_file | awk -F_ '{print $NF}') + best_lmwt=$(echo $best_wer_file | awk -F_ '{N=NF-1; print $N}') + + if [ -z "$best_lmwt" ]; then + echo "$0: we could not get the details of the best WER from the file $dir/wer_*. Probably something went wrong." 
+ exit 1; + fi + + if $stats; then + mkdir -p $dir/scoring_kaldi/wer_details + echo $best_lmwt > $dir/scoring_kaldi/wer_details/lmwt # record best language model weight + echo $best_wip > $dir/scoring_kaldi/wer_details/wip # record best word insertion penalty + + $cmd $dir/scoring_kaldi/log/stats1.log \ + cat $dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \| \ + align-text --special-symbol="'***'" ark:$dir/scoring_kaldi/test_filt.txt ark:- ark,t:- \| \ + utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \| tee $dir/scoring_kaldi/wer_details/per_utt \|\ + utils/scoring/wer_per_spk_details.pl $data/utt2spk \> $dir/scoring_kaldi/wer_details/per_spk || exit 1; + + $cmd $dir/scoring_kaldi/log/stats2.log \ + cat $dir/scoring_kaldi/wer_details/per_utt \| \ + utils/scoring/wer_ops_details.pl --special-symbol "'***'" \| \ + sort -b -i -k 1,1 -k 4,4rn -k 2,2 -k 3,3 \> $dir/scoring_kaldi/wer_details/ops || exit 1; + + $cmd $dir/scoring_kaldi/log/wer_bootci.log \ + compute-wer-bootci --mode=present \ + ark:$dir/scoring_kaldi/test_filt.txt ark:$dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \ + '>' $dir/scoring_kaldi/wer_details/wer_bootci || exit 1; + + fi +fi + +# If we got here, the scoring was successful. +# As a small aid to prevent confusion, we remove all wer_{?,??} files; +# these originate from the previous version of the scoring files +# i keep both statement here because it could lead to confusion about +# the capabilities of the script (we don't do cer in the script) +rm $dir/wer_{?,??} 2>/dev/null +rm $dir/cer_{?,??} 2>/dev/null + +exit 0; diff --git a/egs/mgb5/s5/local/train_lms_srilm.sh b/egs/mgb5/s5/local/train_lms_srilm.sh new file mode 100755 index 00000000000..6af13921511 --- /dev/null +++ b/egs/mgb5/s5/local/train_lms_srilm.sh @@ -0,0 +1,233 @@ +#!/bin/bash + +# 2019 QCRI (Ahmed Ali) + +export LC_ALL=C + +words_file= +train_text= +dev_text= +oov_symbol="" + +echo "$0 $@" + +[ -f path.sh ] && . ./path.sh +. ./utils/parse_options.sh || exit 1 + +echo "-------------------------------------" +echo "Building an SRILM language model " +echo "-------------------------------------" + +if [ $# -ne 2 ] ; then + echo "Incorrect number of parameters. " + echo "Script has to be called like this:" + echo " $0 [switches] " + echo "For example: " + echo " $0 data data/srilm" + echo "The allowed switches are: " + echo " words_file= word list file -- data/lang/words.txt by default" + echo " train_text= data/train/text is used in case when not specified" + echo " dev_text= last 10 % of the train text is used by default" + echo " oov_symbol=> symbol to use for oov modeling -- by default" + exit 1 +fi + +datadir=$1 +tgtdir=$2 +outlm=lm.gz + + +##End of configuration +loc=`which ngram-count`; +if [ -z $loc ]; then + if uname -a | grep 64 >/dev/null; then # some kind of 64 bit... + sdir=`pwd`/../../../tools/srilm/bin/i686-m64 + else + sdir=`pwd`/../../../tools/srilm/bin/i686 + fi + if [ -f $sdir/ngram-count ]; then + echo Using SRILM tools from $sdir + export PATH=$PATH:$sdir + else + echo You appear to not have SRILM tools installed, either on your path, + echo or installed in $sdir. See tools/install_srilm.sh for installation + echo instructions. + exit 1 + fi +fi + +# Prepare the destination directory +mkdir -p $tgtdir + +for f in $words_file $train_text $dev_text; do + [ ! -s $f ] && echo "No such file $f" && exit 1; +done + +[ -z $words_file ] && words_file=$datadir/lang/words.txt +if [ ! 
-z "$train_text" ] && [ -z "$dev_text" ] ; then + nr=`cat $train_text | wc -l` + nr_dev=$(($nr / 10 )) + nr_train=$(( $nr - $nr_dev )) + orig_train_text=$train_text + head -n $nr_train $train_text > $tgtdir/train_text + tail -n $nr_dev $train_text > $tgtdir/dev_text + + train_text=$tgtdir/train_text + dev_text=$tgtdir/dev_text + echo "Using words file: $words_file" + echo "Using train text: 9/10 of $orig_train_text" + echo "Using dev text : 1/10 of $orig_train_text" +elif [ ! -z "$train_text" ] && [ ! -z "$dev_text" ] ; then + echo "Using words file: $words_file" + echo "Using train text: $train_text" + echo "Using dev text : $dev_text" + train_text=$train_text + dev_text=$dev_text +else + train_text=$datadir/train/text + dev_text=$datadir/dev2h/text + echo "Using words file: $words_file" + echo "Using train text: $train_text" + echo "Using dev text : $dev_text" +fi + + + +# Extract the word list from the training dictionary; exclude special symbols +sort $words_file | awk '{print $1}' | grep -v '\#0' | grep -v '' | grep -v -F "$oov_symbol" > $tgtdir/vocab +if (($?)); then + echo "Failed to create vocab from $words_file" + exit 1 +else + # wc vocab # doesn't work due to some encoding issues + echo vocab contains `cat $tgtdir/vocab | perl -ne 'BEGIN{$l=$w=0;}{split; $w+=$#_; $w++; $l++;}END{print "$l lines, $w words\n";}'` +fi + +# Kaldi transcript files contain Utterance_ID as the first word; remove it +cat $train_text | cut -f2- -d' ' > $tgtdir/train.txt +if (($?)); then + echo "Failed to create $tgtdir/train.txt from $train_text" + exit 1 +else + echo "Removed first word (uid) from every line of $train_text" + # wc text.train train.txt # doesn't work due to some encoding issues + echo $train_text contains `cat $train_text | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $w--; $s++;}END{print "$w words, $s sentences\n";}'` + echo train.txt contains `cat $tgtdir/train.txt | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $s++;}END{print "$w words, $s sentences\n";}'` +fi + +# Kaldi transcript files contain Utterance_ID as the first word; remove it +cat $dev_text | cut -f2- -d' ' > $tgtdir/dev.txt +if (($?)); then + echo "Failed to create $tgtdir/dev.txt from $dev_text" + exit 1 +else + echo "Removed first word (uid) from every line of $dev_text" + # wc text.train train.txt # doesn't work due to some encoding issues + echo $dev_text contains `cat $dev_text | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $w--; $s++;}END{print "$w words, $s sentences\n";}'` + echo $tgtdir/dev.txt contains `cat $tgtdir/dev.txt | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $s++;}END{print "$w words, $s sentences\n";}'` +fi + +echo "-------------------" +echo "Good-Turing 2grams" +echo "-------------------" +ngram-count -lm $tgtdir/2gram.gt01.gz -gt1min 0 -gt2min 1 -order 2 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/2gram.gt02.gz -gt1min 0 -gt2min 2 -order 2 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +echo "-------------------" +echo "Kneser-Ney 2grams" +echo "-------------------" +ngram-count -lm $tgtdir/2gram.kn01.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -order 2 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/2gram.kn02.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -order 2 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +echo "-------------------" +echo "Good-Turing 3grams" +echo "-------------------" +ngram-count -lm $tgtdir/3gram.gt011.gz -gt1min 0 
-gt2min 1 -gt3min 1 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.gt012.gz -gt1min 0 -gt2min 1 -gt3min 2 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.gt022.gz -gt1min 0 -gt2min 2 -gt3min 2 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.gt023.gz -gt1min 0 -gt2min 2 -gt3min 3 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +echo "-------------------" +echo "Kneser-Ney 3grams" +echo "-------------------" +ngram-count -lm $tgtdir/3gram.kn011.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn012.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn022.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn023.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 3 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + + +echo "-------------------" +echo "Good-Turing 4grams" +echo "-------------------" +ngram-count -lm $tgtdir/4gram.gt0111.gz -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 1 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0112.gz -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 2 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0122.gz -gt1min 0 -gt2min 1 -gt3min 2 -gt4min 2 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0123.gz -gt1min 0 -gt2min 1 -gt3min 2 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0113.gz -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0222.gz -gt1min 0 -gt2min 2 -gt3min 2 -gt4min 2 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0223.gz -gt1min 0 -gt2min 2 -gt3min 2 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +echo "-------------------" +echo "Kneser-Ney 4grams" +echo "-------------------" +ngram-count -lm $tgtdir/4gram.kn0111.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 1 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0112.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 2 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0113.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0122.gz -kndiscount1 -gt1min 0 -kndiscount2 
-gt2min 1 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 2 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0123.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0222.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 2 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0223.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +if [ ! -z ${LIBLBFGS} ]; then + #please not that if the switch -map-unk "$oov_symbol" is used with -maxent-convert-to-arpa, ngram-count will segfault + #instead of that, we simply output the model in the maxent format and convert it using the "ngram" + echo "-------------------" + echo "Maxent 2grams" + echo "-------------------" + sed 's/'${oov_symbol}'//g' $tgtdir/train.txt | \ + ngram-count -lm - -order 2 -text - -vocab $tgtdir/vocab -unk -sort -maxent -maxent-convert-to-arpa|\ + sed 's//'${oov_symbol}'/g' | gzip -c > $tgtdir/2gram.me.gz || exit 1 + + echo "-------------------" + echo "Maxent 3grams" + echo "-------------------" + sed 's/'${oov_symbol}'//g' $tgtdir/train.txt | \ + ngram-count -lm - -order 3 -text - -vocab $tgtdir/vocab -unk -sort -maxent -maxent-convert-to-arpa|\ + sed 's//'${oov_symbol}'/g' | gzip -c > $tgtdir/3gram.me.gz || exit 1 + + echo "-------------------" + echo "Maxent 4grams" + echo "-------------------" + sed 's/'${oov_symbol}'//g' $tgtdir/train.txt | \ + ngram-count -lm - -order 4 -text - -vocab $tgtdir/vocab -unk -sort -maxent -maxent-convert-to-arpa|\ + sed 's//'${oov_symbol}'/g' | gzip -c > $tgtdir/4gram.me.gz || exit 1 + +fi + + +echo "--------------------" +echo "Computing perplexity" +echo "--------------------" +( + for f in $tgtdir/2gram* ; do ( echo $f; ngram -order 2 -lm $f -unk -map-unk "$oov_symbol" -ppl $tgtdir/dev.txt ) | paste -s -d ' ' - ; done + for f in $tgtdir/3gram* ; do ( echo $f; ngram -order 3 -lm $f -unk -map-unk "$oov_symbol" -ppl $tgtdir/dev.txt ) | paste -s -d ' ' - ; done + for f in $tgtdir/4gram* ; do ( echo $f; ngram -order 4 -lm $f -unk -map-unk "$oov_symbol" -ppl $tgtdir/dev.txt ) | paste -s -d ' ' - ; done +) | sort -r -n -k 15,15g | column -t | tee $tgtdir/perplexities.txt + +echo "The perlexity scores report is stored in $tgtdir/perplexities.txt " + +#This will link the lowest perplexity LM as the output LM. 
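+# (Selection logic: the commented-out line below would simply link the first
+# entry of perplexities.txt; the active code after it does the same unless
+# exactly one of the two best entries is a 3-gram, in which case that 3-gram is
+# preferred even over a slightly better-scoring 4-gram.)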
+#ln -sf $tgtdir/`head -n 1 $tgtdir/perplexities.txt | cut -f 1 -d ' '` $outlm + +#A slight modification of the previous approach: +#We look at the two lowest perplexity LMs and use a 3gram LM if one of the two, even if the 4gram is of lower ppl +nof_trigram_lm=`head -n 2 $tgtdir/perplexities.txt | grep 3gram | wc -l` +if [[ $nof_trigram_lm -eq 0 ]] ; then + lmfilename=`head -n 1 $tgtdir/perplexities.txt | cut -f 1 -d ' '` +elif [[ $nof_trigram_lm -eq 2 ]] ; then + lmfilename=`head -n 1 $tgtdir/perplexities.txt | cut -f 1 -d ' '` +else #exactly one 3gram LM + lmfilename=`head -n 2 $tgtdir/perplexities.txt | grep 3gram | cut -f 1 -d ' '` +fi +(cd $tgtdir; ln -sf `basename $lmfilename` $outlm ) + diff --git a/egs/mgb5/s5/path.sh b/egs/mgb5/s5/path.sh new file mode 100644 index 00000000000..ebc3e1f4ee0 --- /dev/null +++ b/egs/mgb5/s5/path.sh @@ -0,0 +1,8 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C +export CUDA_CACHE_DISABLE=1 + diff --git a/egs/mgb5/s5/run.sh b/egs/mgb5/s5/run.sh new file mode 100755 index 00000000000..6fc21629f0f --- /dev/null +++ b/egs/mgb5/s5/run.sh @@ -0,0 +1,190 @@ +#!/bin/bash + +# Copyright 2019 QCRI (Author:Ahmed Ali) +# Apache 2.0 + + +stage=0 + +# initialization PATH +. ./path.sh || die "path.sh expected"; +# initialization commands +. ./cmd.sh +. ./utils/parse_options.sh + +set -e -o pipefail + + +nj=16 +dev_nj=16 + +if [ $stage -le 1 ]; then + echo "Preparing data and training language models" + local/prepare_data.sh + local/prepare_dict.sh + utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang + local/prepare_lm.sh +fi + + +if [ $stage -le 2 ]; then + # Feature extraction + for x in train dev; do + steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" data/$x exp/make_mfcc/$x mfcc + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x mfcc + done +fi + +if [ $stage -le 3 ]; then + ### Monophone + echo "Starting monophone training." + utils/subset_data_dir.sh data/train 1000 data/train.1k + steps/train_mono.sh --nj $nj --cmd "$train_cmd" data/train.1k data/lang exp/mono + echo "Mono training done." + + ( + echo "Decoding the dev set using monophone models." + utils/mkgraph.sh data/lang_test exp/mono exp/mono/graph + + steps/decode.sh --config conf/decode.config --nj $dev_nj --cmd "$decode_cmd" \ + exp/mono/graph data/dev exp/mono/decode_dev + echo "Monophone decoding done." + ) & +fi + + +if [ $stage -le 4 ]; then + ### Triphone + echo "Starting triphone training." + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/train data/lang exp/mono exp/mono_ali + steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \ + 3200 30000 data/train data/lang exp/mono_ali exp/tri1 + echo "Triphone training done." + + ( + echo "Decoding the dev set using triphone models." + utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph + steps/decode.sh --nj $dev_nj --cmd "$decode_cmd" \ + exp/tri1/graph data/dev exp/tri1/decode_dev + + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_test/ data/lang_big/ data/dev \ + exp/tri1/decode_dev exp/tri1/decode_dev.rescored + echo "Triphone decoding done." 
+ ) & +fi + +if [ $stage -le 5 ]; then + ## Triphones + delta delta + # Training + echo "Starting (larger) triphone training." + steps/align_si.sh --nj $nj --cmd "$train_cmd" --use-graphs true \ + data/train data/lang exp/tri1 exp/tri1_ali + steps/train_deltas.sh --cmd "$train_cmd" \ + 4200 40000 data/train data/lang exp/tri1_ali exp/tri2a + echo "Triphone (large) training done." + + ( + echo "Decoding the dev set using triphone(large) models." + utils/mkgraph.sh data/lang_test exp/tri2a exp/tri2a/graph + steps/decode.sh --nj $dev_nj --cmd "$decode_cmd" \ + exp/tri2a/graph data/dev exp/tri2a/decode_dev + + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_test/ data/lang_big/ data/dev \ + exp/tri2a/decode_dev exp/tri2a/decode_dev.rescored + echo "Triphone(large) decoding done." + ) & +fi + +if [ $stage -le 6 ]; then + ### Triphone + LDA and MLLT + # Training + echo "Starting LDA+MLLT training." + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/train data/lang exp/tri2a exp/tri2a_ali + + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + --splice-opts "--left-context=3 --right-context=3" \ + 4200 40000 data/train data/lang exp/tri2a_ali exp/tri2b + echo "LDA+MLLT training done." + + ( + echo "Decoding the dev set using LDA+MLLT models." + utils/mkgraph.sh data/lang_test exp/tri2b exp/tri2b/graph + steps/decode.sh --nj $dev_nj --cmd "$decode_cmd" \ + exp/tri2b/graph data/dev exp/tri2b/decode_dev + + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_test/ data/lang_big/ data/dev \ + exp/tri2b/decode_dev exp/tri2b/decode_dev.rescored + echo "LDA+MLLT decoding done." + ) & +fi + + +if [ $stage -le 7 ]; then + ### Triphone + LDA and MLLT + SAT and FMLLR + # Training + echo "Starting SAT+FMLLR training." + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + --use-graphs true data/train data/lang exp/tri2b exp/tri2b_ali + steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \ + data/train data/lang exp/tri2b_ali exp/tri3b + echo "SAT+FMLLR training done." + + ( + echo "Decoding the dev set using SAT+FMLLR models." + utils/mkgraph.sh data/lang_test exp/tri3b exp/tri3b/graph + steps/decode_fmllr.sh --nj $dev_nj --cmd "$decode_cmd" \ + exp/tri3b/graph data/dev exp/tri3b/decode_dev + + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_test/ data/lang_big/ data/dev \ + exp/tri3b/decode_dev exp/tri3b/decode_dev.rescored + echo "SAT+FMLLR decoding done." + ) & +fi + + +if [ $stage -le 8 ]; then + echo "Starting SGMM training." + steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ + data/train data/lang exp/tri3b exp/tri3b_ali + + steps/train_ubm.sh --cmd "$train_cmd" \ + 600 data/train data/lang exp/tri3b_ali exp/ubm5b2 + + steps/train_sgmm2.sh --cmd "$train_cmd" \ + 5200 12000 data/train data/lang exp/tri3b_ali exp/ubm5b2/final.ubm exp/sgmm2_5b2 + echo "SGMM training done." 
+ + ( + echo "Decoding the dev set using SGMM models" + # Graph compilation + utils/mkgraph.sh data/lang_test exp/sgmm2_5b2 exp/sgmm2_5b2/graph + utils/mkgraph.sh data/lang_big/ exp/sgmm2_5b2 exp/sgmm2_5b2/graph_big + + steps/decode_sgmm2.sh --nj $dev_nj --cmd "$decode_cmd" \ + --transform-dir exp/tri3b/decode_dev \ + exp/sgmm2_5b2/graph data/dev exp/sgmm2_5b2/decode_dev + + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_test/ data/lang_big/ data/dev \ + exp/sgmm2_5b2/decode_dev exp/sgmm2_5b2/decode_dev.rescored + + steps/decode_sgmm2.sh --nj $dev_nj --cmd "$decode_cmd" \ + --transform-dir exp/tri3b/decode_dev \ + exp/sgmm2_5b2/graph_big data/dev exp/sgmm2_5b2/decode_dev.big + echo "SGMM decoding done." + ) & +fi + +wait; + +time bash -x ./local/chain/run_tdnn.sh &> chain_run_tdnn.log +#score +for x in exp/chain/*/decode* exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done | sort -k2 -n > RESULTS + + diff --git a/egs/mgb5/s5/steps b/egs/mgb5/s5/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/mgb5/s5/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/mgb5/s5/utils b/egs/mgb5/s5/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/mgb5/s5/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file diff --git a/egs/mini_librispeech/s5/local/chain/tuning/run_cnn_tdnn_1a.sh b/egs/mini_librispeech/s5/local/chain/tuning/run_cnn_tdnn_1a.sh index 0b86ace2de1..c8f2503b578 100755 --- a/egs/mini_librispeech/s5/local/chain/tuning/run_cnn_tdnn_1a.sh +++ b/egs/mini_librispeech/s5/local/chain/tuning/run_cnn_tdnn_1a.sh @@ -144,7 +144,7 @@ if [ $stage -le 13 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) cnn_opts="l2-regularize=0.03" ivector_affine_opts="l2-regularize=0.03" diff --git a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1a.sh index 642c20ec191..da16297c9dd 100755 --- a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1a.sh @@ -147,7 +147,7 @@ if [ $stage -le 13 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1b.sh index 110b7b87415..3d0c2d63902 100755 --- a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1b.sh +++ b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1b.sh @@ -154,7 +154,7 @@ if [ $stage -le 13 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1c.sh 
b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1c.sh index fe6f1b50f9e..081af8fe2f8 100755 --- a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1c.sh +++ b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1c.sh @@ -150,7 +150,7 @@ if [ $stage -le 13 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1d.sh b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1d.sh index 225b36f909c..04df38d4da3 100755 --- a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1d.sh +++ b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1d.sh @@ -150,7 +150,7 @@ if [ $stage -le 13 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1e.sh b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1e.sh index 565387003ff..cdf9bb584f4 100755 --- a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1e.sh +++ b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1e.sh @@ -148,7 +148,7 @@ if [ $stage -le 13 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.05" output_opts="l2-regularize=0.01" diff --git a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1f.sh b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1f.sh index 9cc6d93022a..d1385ff2be5 100755 --- a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1f.sh +++ b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1f.sh @@ -156,7 +156,7 @@ if [ $stage -le 13 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.05" output_opts="l2-regularize=0.02 bottleneck-dim=192" diff --git a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1g.sh b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1g.sh index e234b847aa7..ad51780e191 100755 --- a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1g.sh +++ b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1g.sh @@ -155,7 +155,7 @@ if [ $stage -le 13 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.05 dropout-per-dim-continuous=true" output_opts="l2-regularize=0.02 bottleneck-dim=192" diff --git a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1g20.sh b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1g20.sh index 
18540806028..dbfe5c5a07a 100755 --- a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1g20.sh +++ b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1g20.sh @@ -168,7 +168,7 @@ if [ $stage -le 13 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.05 dropout-per-dim-continuous=true" output_opts="l2-regularize=0.02 bottleneck-dim=192" diff --git a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1h.sh b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1h.sh index 776247f5ea3..cc4123e2755 100755 --- a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1h.sh +++ b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1h.sh @@ -151,7 +151,7 @@ if [ $stage -le 13 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" diff --git a/egs/mini_librispeech/s5/local/nnet3/tuning/run_tdnn_lstm_1a.sh b/egs/mini_librispeech/s5/local/nnet3/tuning/run_tdnn_lstm_1a.sh index de858973c98..c2f90df4b5c 100755 --- a/egs/mini_librispeech/s5/local/nnet3/tuning/run_tdnn_lstm_1a.sh +++ b/egs/mini_librispeech/s5/local/nnet3/tuning/run_tdnn_lstm_1a.sh @@ -99,7 +99,7 @@ if [ $stage -le 10 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $ali_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/mini_librispeech/s5/local/nnet3/tuning/run_tdnn_lstm_1b.sh b/egs/mini_librispeech/s5/local/nnet3/tuning/run_tdnn_lstm_1b.sh index ba4ecc268df..2b3c2844972 100755 --- a/egs/mini_librispeech/s5/local/nnet3/tuning/run_tdnn_lstm_1b.sh +++ b/egs/mini_librispeech/s5/local/nnet3/tuning/run_tdnn_lstm_1b.sh @@ -102,7 +102,7 @@ if [ $stage -le 10 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $ali_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20 delay=-3 dropout-proportion=0.0" mkdir -p $dir/configs diff --git a/egs/mini_librispeech/s5/local/nnet3/tuning/run_tdnn_lstm_1c.sh b/egs/mini_librispeech/s5/local/nnet3/tuning/run_tdnn_lstm_1c.sh index 74df56b0537..5118cb0f8bd 100755 --- a/egs/mini_librispeech/s5/local/nnet3/tuning/run_tdnn_lstm_1c.sh +++ b/egs/mini_librispeech/s5/local/nnet3/tuning/run_tdnn_lstm_1c.sh @@ -100,7 +100,7 @@ if [ $stage -le 10 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $ali_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) tdnn_opts="l2-regularize=0.05" lstm_opts="l2-regularize=0.01 decay-time=20 delay=-3 dropout-proportion=0.0" 
output_opts="l2-regularize=0.01" diff --git a/egs/multi_en/s5/local/chain/tuning/run_tdnn_5b.sh b/egs/multi_en/s5/local/chain/tuning/run_tdnn_5b.sh index 9f8c49387b1..96f5fdac8f3 100755 --- a/egs/multi_en/s5/local/chain/tuning/run_tdnn_5b.sh +++ b/egs/multi_en/s5/local/chain/tuning/run_tdnn_5b.sh @@ -132,7 +132,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.0015 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" linear_opts="l2-regularize=0.0015 orthonormal-constraint=-1.0" output_opts="l2-regularize=0.001" diff --git a/egs/multi_en/s5/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/multi_en/s5/local/chain/tuning/run_tdnn_lstm_1a.sh index 5793fef0fc2..62266334962 100755 --- a/egs/multi_en/s5/local/chain/tuning/run_tdnn_lstm_1a.sh +++ b/egs/multi_en/s5/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -155,7 +155,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="dropout-proportion=0.0 decay-time=40" relu_dim=1024 diff --git a/egs/multi_en/s5/local/chain/tuning/run_tdnn_opgru_1a.sh b/egs/multi_en/s5/local/chain/tuning/run_tdnn_opgru_1a.sh index 98e7c2ed6c1..79cd3eb3014 100755 --- a/egs/multi_en/s5/local/chain/tuning/run_tdnn_opgru_1a.sh +++ b/egs/multi_en/s5/local/chain/tuning/run_tdnn_opgru_1a.sh @@ -150,7 +150,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) gru_opts="dropout-per-frame=true dropout-proportion=0.0 " mkdir -p $dir/configs diff --git a/egs/multi_en/s5/local/chain/tuning/run_tdnn_opgru_1b.sh b/egs/multi_en/s5/local/chain/tuning/run_tdnn_opgru_1b.sh index 8b1f34b15a6..a7170af9431 100755 --- a/egs/multi_en/s5/local/chain/tuning/run_tdnn_opgru_1b.sh +++ b/egs/multi_en/s5/local/chain/tuning/run_tdnn_opgru_1b.sh @@ -146,7 +146,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) gru_opts="dropout-per-frame=true dropout-proportion=0.0 " mkdir -p $dir/configs diff --git a/egs/reverb/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/reverb/s5/local/chain/tuning/run_tdnn_1a.sh index 61cc8b97d41..c8b4997161e 100755 --- a/egs/reverb/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/reverb/s5/local/chain/tuning/run_tdnn_1a.sh @@ -133,7 +133,7 @@ if [ $stage -le 13 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.05" output_opts="l2-regularize=0.01 bottleneck-dim=320" diff --git 
a/egs/reverb/s5/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/reverb/s5/local/chain/tuning/run_tdnn_lstm_1a.sh index 9369e00a7ba..4723400c76b 100755 --- a/egs/reverb/s5/local/chain/tuning/run_tdnn_lstm_1a.sh +++ b/egs/reverb/s5/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -141,7 +141,7 @@ if [ $stage -le 13 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=40" diff --git a/egs/reverb/s5/local/download_se_eval_tool.sh b/egs/reverb/s5/local/download_se_eval_tool.sh index c7b272907b6..0d7bb8305ea 100755 --- a/egs/reverb/s5/local/download_se_eval_tool.sh +++ b/egs/reverb/s5/local/download_se_eval_tool.sh @@ -18,14 +18,14 @@ unzip REVERB_scores.zip -d local/REVERB_scores_source rm REVERB_scores.zip pushd local/REVERB_scores_source/REVERB-SPEENHA.Release04Oct/evaltools -sed -i 's/wavread/audioread/g' prog/score_sim.m +perl -i -pe 's/wavread/audioread/g' prog/score_sim.m git clone https://github.com/MuSAELab/SRMRToolbox.git -sed -i 's/wavread/audioread/g' SRMRToolbox/libs/preprocess.m -sed -i 's/SRMR_main/SRMR/g' prog/score_real.m -sed -i 's/SRMR_main/SRMR/g' prog/score_sim.m -sed -i 's/+wb\ //g' prog/calcpesq.m -sed -i 's/pesq_/_pesq_/g' prog/calcpesq.m -sed -ie '30d;31d' prog/calcpesq.m +perl -i -pe 's/wavread/audioread/g' SRMRToolbox/libs/preprocess.m +perl -i -pe 's/SRMR_main/SRMR/g' prog/score_real.m +perl -i -pe 's/SRMR_main/SRMR/g' prog/score_sim.m +perl -i -pe 's/\+wb //g' prog/calcpesq.m +perl -i -pe 's/pesq_/_pesq_/g' prog/calcpesq.m +perl -n -i -e 'print unless /remove target file name/' prog/calcpesq.m patch score_RealData.m -i ../../../score_RealData.patch -o score_RealData_new.m mv score_RealData_new.m score_RealData.m patch score_SimData.m -i ../../../score_SimData.patch -o score_SimData_new.m diff --git a/egs/rimes/v1/RESULTS b/egs/rimes/v1/RESULTS new file mode 100644 index 00000000000..4a9d7225e33 --- /dev/null +++ b/egs/rimes/v1/RESULTS @@ -0,0 +1,45 @@ +Run_end2end.sh Word-based system WER using lang_unk and lang. WER at line-level and paragraph level +flat_start: +Line-level: + • %WER 13.97 [ 788 / 5639, 136 ins, 62 del, 590 sub ] exp/chain/e2e_cnn_1a/decode_test/wer_10_1.0 + • %WER 16.56 [ 934 / 5639, 158 ins, 75 del, 701 sub ] exp/chain/e2e_cnn_1a/decode_test/wer_13_1.0 + +Paragraph-level: + • %WER 12.89 [ 727 / 5639, 116 ins, 42 del, 569 sub ] exp/chain/e2e_cnn_1a/decode_test/para/wer_10_1.0 + • %WER 15.50 [ 874 / 5639, 133 ins, 50 del, 691 sub ] exp/chain/e2e_cnn_1a/decode_test/para/wer_13_1.0 + +cnn_e2eali_1a: +Line-level: + • %WER 10.43 [ 588 / 5639, 115 ins, 57 del, 416 sub ] exp/chain/cnn_e2eali_1a/decode_test/wer_7_1.0 + • %WER 13.78 [ 777 / 5639, 153 ins, 58 del, 566 sub ] exp/chain/cnn_e2eali_1a/decode_test/wer_10_1.0 + +Paragraph-level: + • %WER 9.35 [ 527 / 5639, 89 ins, 31 del, 407 sub ] exp/chain/cnn_e2eali_1a/decode_test//para/wer_7_1.0 + • %WER 12.70 [ 716 / 5639, 134 ins, 39 del, 543 sub ] exp/chain/cnn_e2eali_1a/decode_test/para/wer_10_1.0 + + +Run_end2end.sh BPE-based system WER using lang. 
WER at line-level and paragraph level +flat_start: +Line-level: + • %WER 11.58 [ 653 / 5639, 72 ins, 67 del, 514 sub ] exp/chain/e2e_cnn_1a/decode_test/wer_8_1.0 + +Paragraph-level: + • %WER 10.50 [ 592 / 5639, 54 ins, 49 del, 489 sub ] exp/chain/e2e_cnn_1a/decode_test/para/wer_8_1.0 + +cnn_e2eali_1a: +Line-level: + • %WER 8.48 [ 478 / 5639, 56 ins, 54 del, 368 sub ] exp/chain/cnn_e2eali_1a/decode_test/wer_9_0.5 + +Paragraph-level: + • %WER 7.41 [ 418 / 5639, 38 ins, 36 del, 344 sub ] exp/chain/cnn_e2eali_1a/decode_test/para/wer_9_0.5 + + +Run_end2end.sh BPE-based system WER using lang with optional open-source extra corpus text. +WER at line-level and paragraph level. + +cnn_e2eali_1a: +Line-level: + • %WER 7.66 [ 432 / 5639, 50 ins, 38 del, 344 sub ] exp/chain/cnn_e2eali_1a/decode_test/para/wer_7_0.5 + +Paragraph-level: + • %WER 6.85 [ 386 / 5639, 35 ins, 36 del, 315 sub ] exp/chain/cnn_e2eali_1a/decode_test/para/wer_8_1.0 diff --git a/egs/rimes/v1/local/chain/tuning/run_cnn_e2eali_1a.sh b/egs/rimes/v1/local/chain/tuning/run_cnn_e2eali_1a.sh index 4eb3e5e1e76..33eb9dcb98c 100755 --- a/egs/rimes/v1/local/chain/tuning/run_cnn_e2eali_1a.sh +++ b/egs/rimes/v1/local/chain/tuning/run_cnn_e2eali_1a.sh @@ -151,7 +151,7 @@ if [ $stage -le 5 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) cnn_opts="l2-regularize=0.03 dropout-proportion=0.0" tdnn_opts="l2-regularize=0.03" output_opts="l2-regularize=0.04" diff --git a/egs/rm/README.txt b/egs/rm/README.txt index ed588e481c6..4fa3d7c87e8 100644 --- a/egs/rm/README.txt +++ b/egs/rm/README.txt @@ -9,7 +9,7 @@ About the Resource Management corpus: Each subdirectory of this directory contains the scripts for a sequence of experiments. -s5 is the currently recommmended setup. +s5 is the currently recommended setup. s5: This is the "new-new-style" recipe. It is now finished. All further work will be on top of this style of recipe. Note: diff --git a/egs/rm/s5/RESULTS b/egs/rm/s5/RESULTS index a8156e10e14..2f1262510fb 100644 --- a/egs/rm/s5/RESULTS +++ b/egs/rm/s5/RESULTS @@ -233,10 +233,12 @@ for x in exp/nnet2_online_wsj/nnet_ms_a_smbr_0.00005/1/decode_*; do grep WER $x/ # current best chain result with TDNN (check local/chain/run_tdnn_5g.sh) %WER 2.86 [ 358 / 12533, 46 ins, 61 del, 251 sub ] exp/chain/tdnn_5g/decode/wer_5_0.0 %WER 2.71 [ 340 / 12533, 58 ins, 59 del, 223 sub ] exp/chain/tdnn_5n/decode/wer_4_0.0 - +# Its topology of chain model is from mini_librispeech's. +# It uses a new configs convention for chain model after kaldi 5.2. +%WER 1.32 [ 166 / 12533, 19 ins, 31 del, 116 sub ] exp/chain/tdnn_5o/decode/wer_4_0.0 ### WSJ->RM Transfer learning using chain model ### %WER 1.68 [ 210 / 12533, 25 ins, 33 del, 152 sub ] exp/chain/tdnn_wsj_rm_1a/decode/wer_2_0.0 - + ### nnet1 results ### # dnn4b, MFCC,LDA,fMLLR feaures, (Karel - 30.7.2015) diff --git a/egs/rm/s5/conf/mfcc_hires.conf b/egs/rm/s5/conf/mfcc_hires.conf new file mode 100644 index 00000000000..19f218f982e --- /dev/null +++ b/egs/rm/s5/conf/mfcc_hires.conf @@ -0,0 +1,8 @@ +# config for high-resolution MFCC features, intended for neural network training +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. 
+--use-energy=false # use average of log energy, not energy. +--low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so + # there might be some information at the low end. +--high-freq=-400 # high cutoff frequently, relative to Nyquist of 8000 (=7600) diff --git a/egs/rm/s5/local/chain/run_tdnn.sh b/egs/rm/s5/local/chain/run_tdnn.sh new file mode 120000 index 00000000000..317ebb106b9 --- /dev/null +++ b/egs/rm/s5/local/chain/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_5o.sh \ No newline at end of file diff --git a/egs/rm/s5/local/chain/run_tdnn_5g.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_5g.sh similarity index 100% rename from egs/rm/s5/local/chain/run_tdnn_5g.sh rename to egs/rm/s5/local/chain/tuning/run_tdnn_5g.sh diff --git a/egs/rm/s5/local/chain/run_tdnn_5n.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_5n.sh similarity index 100% rename from egs/rm/s5/local/chain/run_tdnn_5n.sh rename to egs/rm/s5/local/chain/tuning/run_tdnn_5n.sh diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_5o.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_5o.sh new file mode 100755 index 00000000000..db5944fdbea --- /dev/null +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_5o.sh @@ -0,0 +1,185 @@ +#!/bin/bash + +# this script is a modified version of run_tdnn_5n.sh. It uses +# a new configs convention for chain model after kaldi 5.2. + + + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +xent_regularize=0.1 +dir=exp/chain/tdnn_5o + +# training options +num_epochs=13 +initial_effective_lrate=0.005 +final_effective_lrate=0.0005 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=2 +num_jobs_final=4 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false +#common_egs_dir=exp/chain/tdnn_5g/egs/ +common_egs_dir= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 6 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --cmd "$train_cmd" 1200 data/train $lang $ali_dir $treedir +fi + +if [ $stage -le 7 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print(0.5/$xent_regularize)" | python) + tdnn_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.005" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=50 name=ivector + input dim=13 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + linear-component name=prefinal-l dim=192 $linear_opts + + ## adding the layers for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + # adding the layers for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 8 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet2_online/ivectors \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=200" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1000000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + 
--trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/train_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri3b_lats \ + --dir $dir +fi + +if [ $stage -le 9 ]; then + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 4 \ + data/test_hires exp/nnet2_online/extractor exp/nnet2_online/ivectors_test || exit 1; +fi + +if [ $stage -le 10 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --scoring-opts "--min-lmwt 1" \ + --nj 20 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet2_online/ivectors_test \ + $dir/graph data/test_hires $dir/decode || exit 1; +fi + +if [ $stage -le 11 ]; then + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_ug $dir $dir/graph_ug + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 20 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet2_online/ivectors_test \ + $dir/graph_ug data/test_hires $dir/decode_ug || exit 1; +fi +wait; +exit 0; diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh index 6b6c08e779a..2fd2556c19b 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh @@ -130,7 +130,7 @@ if [ $stage -le 7 ]; then echo " generating new layers, that are specific to rm. These layers "; echo " are added to the transferred part of the wsj network."; num_targets=$(tree-info --print-args=false $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/rm/s5/local/online/run_nnet2_common.sh b/egs/rm/s5/local/online/run_nnet2_common.sh index e0034ddd7d2..fb516375543 100755 --- a/egs/rm/s5/local/online/run_nnet2_common.sh +++ b/egs/rm/s5/local/online/run_nnet2_common.sh @@ -36,6 +36,7 @@ else fi train_set=train +test_set=test if [ $stage -le 0 ]; then echo "$0: creating high-resolution MFCC features." mfccdir=data/${train_set}_hires/data @@ -48,9 +49,10 @@ if [ $stage -le 0 ]; then steps/compute_cmvn_stats.sh data/${datadir}_hires utils/fix_data_dir.sh data/${datadir}_hires done + train_set=${train_set}_hires + test_set=${test_set}_hires fi -train_set=${train_set}_hires if [ ! -f $extractor/final.ie ] && [ $ivector_dim -gt 0 ]; then if [ $stage -le 1 ]; then mkdir -p exp/nnet2${nnet_affix} @@ -61,7 +63,7 @@ if [ ! -f $extractor/final.ie ] && [ $ivector_dim -gt 0 ]; then if [ $stage -le 2 ]; then # use a smaller iVector dim (50) than the default (100) because RM has a very # small amount of data. 
- steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 40 \ + steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ --ivector-dim $ivector_dim \ data/${train_set} exp/nnet2${nnet_affix}/diag_ubm $extractor || exit 1; fi @@ -76,5 +78,5 @@ if [ $stage -le 3 ] && [ $ivector_dim -gt 0 ]; then data/${train_set}_max2 $extractor exp/nnet2${nnet_affix}/ivectors || exit 1; steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 10 \ - data/test_hires $extractor exp/nnet2${nnet_affix}/ivectors_test || exit 1; + data/${test_set} $extractor exp/nnet2${nnet_affix}/ivectors_test || exit 1; fi diff --git a/egs/rm/s5/local/run_vtln2.sh b/egs/rm/s5/local/run_vtln2.sh index 6437032ca61..b87030d2e3d 100755 --- a/egs/rm/s5/local/run_vtln2.sh +++ b/egs/rm/s5/local/run_vtln2.sh @@ -59,4 +59,4 @@ steps/compute_cmvn_stats.sh data/test_vtln exp/make_mfcc/test_vtln $featdir # %WER 3.13 [ 392 / 12533, 59 ins, 64 del, 269 sub ] exp/tri3b/decode.si/wer_3 # %WER 10.36 [ 1298 / 12533, 147 ins, 192 del, 959 sub ] exp/tri3b/decode_ug/wer_12 # %WER 13.48 [ 1689 / 12533, 159 ins, 277 del, 1253 sub ] exp/tri3b/decode_ug.si/wer_13 -# a04:s5: \ No newline at end of file +# a04:s5: diff --git a/egs/rm/s5/run.sh b/egs/rm/s5/run.sh index 7d27b2c6d91..f167e590735 100755 --- a/egs/rm/s5/run.sh +++ b/egs/rm/s5/run.sh @@ -251,4 +251,4 @@ local/run_sgmm2.sh # local/nnet/run_cnn2d.sh # chain recipe -# local/chain/run_tdnn_5f.sh +# local/chain/run_tdnn.sh diff --git a/egs/sitw/v1/local/make_musan.py b/egs/sitw/v1/local/make_musan.py deleted file mode 100755 index 833da0619c9..00000000000 --- a/egs/sitw/v1/local/make_musan.py +++ /dev/null @@ -1,123 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2015 David Snyder -# 2018 Ewald Enzinger -# Apache 2.0. -# -# Modified version of egs/sre16/v1/local/make_musan.py (commit e3fb7c4a0da4167f8c94b80f4d3cc5ab4d0e22e8). -# This version uses the raw MUSAN audio files (16 kHz) and does not use sox to resample at 8 kHz. -# -# This file is meant to be invoked by make_musan.sh. 
- -import os, sys - -def process_music_annotations(path): - utt2spk = {} - utt2vocals = {} - lines = open(path, 'r').readlines() - for line in lines: - utt, genres, vocals, musician = line.rstrip().split()[:4] - # For this application, the musican ID isn't important - utt2spk[utt] = utt - utt2vocals[utt] = vocals == "Y" - return utt2spk, utt2vocals - -def prepare_music(root_dir, use_vocals): - utt2vocals = {} - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - music_dir = os.path.join(root_dir, "music") - for root, dirs, files in os.walk(music_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - elif str(file) == "ANNOTATIONS": - utt2spk_part, utt2vocals_part = process_music_annotations(file_path) - utt2spk.update(utt2spk_part) - utt2vocals.update(utt2vocals_part) - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2vocals: - if utt in utt2wav: - if use_vocals or not utt2vocals[utt]: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" - num_good_files += 1 - else: - print("Missing file {}".format(utt)) - num_bad_files += 1 - print("In music directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files)) - return utt2spk_str, utt2wav_str - -def prepare_speech(root_dir): - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - speech_dir = os.path.join(root_dir, "speech") - for root, dirs, files in os.walk(speech_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - utt2spk[utt] = utt - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2spk: - if utt in utt2wav: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" - num_good_files += 1 - else: - print("Missing file {}".format(utt)) - num_bad_files += 1 - print("In speech directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files)) - return utt2spk_str, utt2wav_str - -def prepare_noise(root_dir): - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - noise_dir = os.path.join(root_dir, "noise") - for root, dirs, files in os.walk(noise_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - utt2spk[utt] = utt - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2spk: - if utt in utt2wav: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" - num_good_files += 1 - else: - print("Missing file {}".format(utt)) - num_bad_files += 1 - print("In noise directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files)) - return utt2spk_str, utt2wav_str - -def main(): - in_dir = sys.argv[1] - out_dir = sys.argv[2] - use_vocals = sys.argv[3] == "Y" - utt2spk_music, utt2wav_music = prepare_music(in_dir, use_vocals) - utt2spk_speech, utt2wav_speech = prepare_speech(in_dir) - utt2spk_noise, utt2wav_noise = prepare_noise(in_dir) - utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise - utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise - wav_fi = open(os.path.join(out_dir, "wav.scp"), 'w') - wav_fi.write(utt2wav) - utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), 'w') - 
utt2spk_fi.write(utt2spk) - - -if __name__=="__main__": - main() diff --git a/egs/sitw/v1/local/make_musan.sh b/egs/sitw/v1/local/make_musan.sh deleted file mode 100755 index 1565ef0d85c..00000000000 --- a/egs/sitw/v1/local/make_musan.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -# Copyright 2015 David Snyder -# Apache 2.0. -# -# Copy of egs/sre16/v1/local/make_musan.sh (commit e3fb7c4a0da4167f8c94b80f4d3cc5ab4d0e22e8). -# -# This script, called by ../run.sh, creates the MUSAN -# data directory. The required dataset is freely available at -# http://www.openslr.org/17/ - -set -e -in_dir=$1 -data_dir=$2 -use_vocals='Y' - -mkdir -p local/musan.tmp - -echo "Preparing ${data_dir}/musan..." -mkdir -p ${data_dir}/musan -local/make_musan.py ${in_dir} ${data_dir}/musan ${use_vocals} - -utils/fix_data_dir.sh ${data_dir}/musan - -grep "music" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_music -grep "speech" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_speech -grep "noise" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_noise -utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_music \ - ${data_dir}/musan ${data_dir}/musan_music -utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_speech \ - ${data_dir}/musan ${data_dir}/musan_speech -utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_noise \ - ${data_dir}/musan ${data_dir}/musan_noise - -utils/fix_data_dir.sh ${data_dir}/musan_music -utils/fix_data_dir.sh ${data_dir}/musan_speech -utils/fix_data_dir.sh ${data_dir}/musan_noise - -rm -rf local/musan.tmp - diff --git a/egs/sitw/v1/run.sh b/egs/sitw/v1/run.sh index e016f8a4752..797451df263 100755 --- a/egs/sitw/v1/run.sh +++ b/egs/sitw/v1/run.sh @@ -137,7 +137,7 @@ if [ $stage -le 4 ]; then # Prepare the MUSAN corpus, which consists of music, speech, and noise # suitable for augmentation. - local/make_musan.sh $musan_root data + steps/data/make_musan.sh --sampling-rate 16000 $musan_root data # Get the duration of the MUSAN recordings. This will be used by the # script augment_data_dir.py. diff --git a/egs/sitw/v2/run.sh b/egs/sitw/v2/run.sh index 8aeecc18b3f..aad58e4a853 100755 --- a/egs/sitw/v2/run.sh +++ b/egs/sitw/v2/run.sh @@ -103,7 +103,7 @@ if [ $stage -le 2 ]; then # Prepare the MUSAN corpus, which consists of music, speech, and noise # suitable for augmentation. - local/make_musan.sh $musan_root data + steps/data/make_musan.sh --sampling-rate 16000 $musan_root data # Get the duration of the MUSAN recordings. This will be used by the # script augment_data_dir.py. diff --git a/egs/spanish_dimex100/README.txt b/egs/spanish_dimex100/README.txt new file mode 100644 index 00000000000..19406641f56 --- /dev/null +++ b/egs/spanish_dimex100/README.txt @@ -0,0 +1,22 @@ +About the DIMEx100 corpus: + Mexican Spanish clean speech corpus introduced in Pineda, et al. (2001). + "DIMEx100: A New Phonetic and Speech Corpus for Mexican Spanish". + + > Studio recorded audio with a total of 6000 phrases by 100 speakers. + > Mono/16 bit/44.1 khz + > Three different levels of transcription + > For additional information about the corpus design and + characteristics refer to (Pineda, 2001) + + + Created by the computer science department of the "Investigaciones en + Matemáticas Aplicadas y en Sistemas (IIMAS)" institute at the "National + Autonomous University of Mexico (UNAM)". + + DIMEx100 corpus is available free of charge for academic purposes + exclusively. For commercial use a formal agreement with UNAM is required. 
+ For more information refer to + http://turing.iimas.unam.mx/~luis/DIME/CORPUS-DIMEX.html + +Example author: + Daniel A. Campoverde diff --git a/egs/spanish_dimex100/s5/.gitignore b/egs/spanish_dimex100/s5/.gitignore new file mode 100644 index 00000000000..5936e451c95 --- /dev/null +++ b/egs/spanish_dimex100/s5/.gitignore @@ -0,0 +1,5 @@ +DVDCorpusDimex100.zip +CorpusDimex100 + +data +*.wav diff --git a/egs/spanish_dimex100/s5/RESULTS b/egs/spanish_dimex100/s5/RESULTS new file mode 100755 index 00000000000..dcab09973d6 --- /dev/null +++ b/egs/spanish_dimex100/s5/RESULTS @@ -0,0 +1,7 @@ +#!/bin/bash + +for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done +exit 0 + +# Result on decode_test (tri2b_mmi_b0.05) +%WER 7.58 [ 72 / 950, 50 ins, 0 del, 22 sub ] exp/tri2b_mmi_b0.05/decode_test/wer_15:2 diff --git a/egs/spanish_dimex100/s5/cmd.sh b/egs/spanish_dimex100/s5/cmd.sh new file mode 100644 index 00000000000..71dd849a93b --- /dev/null +++ b/egs/spanish_dimex100/s5/cmd.sh @@ -0,0 +1,15 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/spanish_dimex100/s5/conf/decode.config b/egs/spanish_dimex100/s5/conf/decode.config new file mode 100644 index 00000000000..81c6a7b2745 --- /dev/null +++ b/egs/spanish_dimex100/s5/conf/decode.config @@ -0,0 +1,3 @@ +first_beam=10.0 +beam=13.0 +lattice_beam=6.0 diff --git a/egs/spanish_dimex100/s5/conf/mfcc.conf b/egs/spanish_dimex100/s5/conf/mfcc.conf new file mode 100644 index 00000000000..45d284ad05c --- /dev/null +++ b/egs/spanish_dimex100/s5/conf/mfcc.conf @@ -0,0 +1 @@ +--use-energy=false diff --git a/egs/spanish_dimex100/s5/local/data_prep.sh b/egs/spanish_dimex100/s5/local/data_prep.sh new file mode 100755 index 00000000000..50cb3de4f9c --- /dev/null +++ b/egs/spanish_dimex100/s5/local/data_prep.sh @@ -0,0 +1,286 @@ +#!/bin/bash + +## Only run this file from the example root directory +## $ ./local/data_prep.sh + +mkdir -p "data/train" "data/test" "data/local" + +source ./path.sh + +# Dimex100 unziped corpus root directory +CORPUS_DIR="$1" + +# Corpus data +# +# Number of Different speakers: 100 +# Speakers common utterances: 10 +# Speakers individual utterances: 50 +# +# Training/testing split +# +# Common utterances for training: 10 (100%) +# Individual utterances for training: 40 (80%) +# Individual utterances for testing: 10 (20%) +N_SPEAKERS=100 +N_COMMON_UTTERANCES=10 +N_INDIVIDUAL_UTTERANCES=50 +N_INDIVIDUAL_UTTERANCES_TRAINING=40 +N_INDIVIDUAL_UTTERANCES_TESTING=10 + +# speakerId-utteranceId-[c|i] +# c = speaker common utterances (10) +# i = speaker individual utterances (50) +# +# e.g.: +# s001-01-c +# ... +# s001-10-c +# ... +# s001-01-i +# ... 
+# s001-50-i + +## 80-20 train-test split +## Only individual utterances are used in testing +# 10/10 common utterances go into training +# 40/50 individual utterances go into training +# 10/50 individual utterances go into testing + +function make_speaker_id +{ + printf "s%03d" "$1" +} + +function make_sentence_id +{ + printf "%02d" "$1" +} + +##################################### +# Convert wave audio to 16-bit, 16kHz +##################################### + +function convert_to_16khz +{ + for i in $(seq 1 $N_SPEAKERS); do + speaker_id=$(make_speaker_id $i) + + mkdir -p "$CORPUS_DIR/$speaker_id/audio_16k/comunes" + mkdir -p "$CORPUS_DIR/$speaker_id/audio_16k/individuales" + + # Common utterances + for j in $(seq 1 $N_COMMON_UTTERANCES); do + sentence_id=$(make_sentence_id $j) + old_wav_file="$CORPUS_DIR/$speaker_id/audio_editado/comunes/$speaker_id$sentence_id.wav" + new_wav_file="$CORPUS_DIR/$speaker_id/audio_16k/comunes/$speaker_id$sentence_id.wav" + sox "$old_wav_file" -r 16k "$new_wav_file" + done + + # Individual utterances + for k in $(seq 1 $N_INDIVIDUAL_UTTERANCES_TRAINING); do + sentence_id=$(make_sentence_id $k) + old_wav_file="$CORPUS_DIR/$speaker_id/audio_editado/individuales/$speaker_id$sentence_id.wav" + new_wav_file="$CORPUS_DIR/$speaker_id/audio_16k/individuales/$speaker_id$sentence_id.wav" + sox "$old_wav_file" -r 16k "$new_wav_file" + done + done +} + +if [[ ! -d "$CORPUS_DIR/s001/audio_16k" ]]; then + echo + echo Converting audio from 44.1kHz to 16kHz + echo + convert_to_16khz +fi + + + +################# +# data/train/text +# data/test/text +################# + +# speakerId-utteranceId-[c|i] +# c = speaker common utterances (10) +# i = speaker individual utterances (50) +# +# e.g.: +# s001-01-c +# ... +# s001-10-c +# ... +# s001-01-i +# ... 
+# s001-50-i + +## 80-20 train-test split +## Only individual utterances are used in testing +# 10/10 common utterances go into training +# 40/50 individual utterances go into training +# 10/50 individual utterances go into testing + + + +function clean +{ + echo "$1" \ + | tr -d '\r' \ + | tr '[:upper:]' '[:lower:]' \ + | sed \ + -e 's/á/a/g' -e 's/é/e/g' -e 's/í/i/g' -e 's/ó/o/g' -e 's/ú/u/g' \ + -e 's/Á/a/g' -e 's/É/e/g' -e 's/Í/i/g' -e 's/Ó/o/g' -e 's/Ú/u/g' \ + -e 's/ñ/n/g' -e 's/Ñ/n/g' -e 's/ü/u/g' -e 's/Ü/u/g' \ + | tr -d -c "a-zA-Z0-9 \r\n" + # | tr -d -c "_,.;:\-?¿!'\"()" \ +} + +### Generate data/train/text +for i in $(seq 1 $N_SPEAKERS); do + speaker_id=$(make_speaker_id $i) + + # Common utterances + for j in $(seq 1 $N_COMMON_UTTERANCES); do + sentence_id=$(make_sentence_id $j) + utterance_id="$speaker_id-$sentence_id-c" + trans_file="$CORPUS_DIR/$speaker_id/texto/comunes/$speaker_id$sentence_id.txt" + iconv -f iso-8859-1 -t utf-8 "$trans_file" > "$trans_file.utf8" + if [ -f "$trans_file.utf8" ]; then + transcription=$(cat "$trans_file.utf8") + transcription=$(clean "$transcription") + echo "$utterance_id $transcription" >> "data/train/text" + fi + done + + # Individual utterances + for k in $(seq 1 $N_INDIVIDUAL_UTTERANCES_TRAINING); do + sentence_id=$(make_sentence_id $k) + utterance_id="$speaker_id-$sentence_id-i" + trans_file="$CORPUS_DIR/$speaker_id/texto/individuales/$speaker_id$sentence_id.txt" + iconv -f iso-8859-1 -t utf-8 "$trans_file" > "$trans_file.utf8" + if [ -f "$trans_file.utf8" ]; then + transcription=$(cat "$trans_file.utf8") + transcription=$(clean "$transcription") + echo "$utterance_id $transcription" >> "data/train/text" + fi + done + +done + + +### Generate data/test/text +for i in $(seq 1 $N_SPEAKERS); do + speaker_id=$(make_speaker_id $i) + + # Individual utterances + for k in $(seq $N_INDIVIDUAL_UTTERANCES_TRAINING $N_INDIVIDUAL_UTTERANCES); do + sentence_id=$(make_sentence_id $k) + utterance_id="$speaker_id-$sentence_id-i" + trans_file="$CORPUS_DIR/$speaker_id/texto/individuales/$speaker_id$sentence_id.txt" + iconv -f iso-8859-1 -t utf-8 "$trans_file" > "$trans_file.utf8" + if [ -f "$trans_file.utf8" ]; then + transcription=$(cat "$trans_file.utf8") + transcription=$(clean "$transcription") + echo "$utterance_id $transcription" >> "data/test/text" + fi + done + +done + + + + +#################### +# data/train/wav.scp +# data/test/wav.scp +#################### + + +### Generate data/train/wav.scp +for i in $(seq 1 $N_SPEAKERS); do + speaker_id=$(make_speaker_id $i) + + # Common utterances + for j in $(seq 1 $N_COMMON_UTTERANCES); do + sentence_id=$(make_sentence_id $j) + utterance_id="$speaker_id-$sentence_id-c" + wav_file="$CORPUS_DIR/$speaker_id/audio_16k/comunes/$speaker_id$sentence_id.wav" + if [ -f "$wav_file" ]; then + echo "$utterance_id $wav_file" >> "data/train/wav.scp" + fi + done + + # Individual utterances + for k in $(seq 1 $N_INDIVIDUAL_UTTERANCES_TRAINING); do + sentence_id=$(make_sentence_id $k) + utterance_id="$speaker_id-$sentence_id-i" + wav_file="$CORPUS_DIR/$speaker_id/audio_16k/individuales/$speaker_id$sentence_id.wav" + if [ -f "$wav_file" ]; then + echo "$utterance_id $wav_file" >> "data/train/wav.scp" + fi + done + +done + + +### Generate data/test/wav.scp +for i in $(seq 1 $N_SPEAKERS); do + speaker_id=$(make_speaker_id $i) + + # Individual utterances + for k in $(seq $N_INDIVIDUAL_UTTERANCES_TRAINING $N_INDIVIDUAL_UTTERANCES); do + sentence_id=$(make_sentence_id $k) + utterance_id="$speaker_id-$sentence_id-i" + 
wav_file="$CORPUS_DIR/$speaker_id/audio_16k/individuales/$speaker_id$sentence_id.wav" + if [ -f "$wav_file" ]; then + echo "$utterance_id $wav_file" >> "data/test/wav.scp" + fi + done + +done + + + + +#################### +# data/train/utt2spk +# data/test/utt2spk +#################### + +# Take IDs from 'text' file to avoid including missing data's IDs + +### Generate data/train/utt2spk +utterance_ids=$(cat "data/train/text" | cut -d' ' -f1) + +while read -r utterance_id; do + speaker_id=$(echo "$utterance_id" | cut -d'-' -f1) + echo "$utterance_id $speaker_id" >> "data/train/utt2spk" +done <<< "$utterance_ids" + + +### Generate data/test/utt2spk +utterance_ids=$(cat "data/test/text" | cut -d' ' -f1) + +while read -r utterance_id; do + speaker_id=$(echo "$utterance_id" | cut -d'-' -f1) + echo "$utterance_id $speaker_id" >> "data/test/utt2spk" +done <<< "$utterance_ids" + + +############ +# Sort files +############ + +LC_ALL=C sort -o "data/train/text" "data/train/text" +LC_ALL=C sort -o "data/test/text" "data/test/text" +LC_ALL=C sort -o "data/train/wav.scp" "data/train/wav.scp" +LC_ALL=C sort -o "data/test/wav.scp" "data/test/wav.scp" +LC_ALL=C sort -o "data/train/utt2spk" "data/train/utt2spk" +LC_ALL=C sort -o "data/test/utt2spk" "data/test/utt2spk" + + +#################### +# data/train/spk2utt +# data/test/spk2utt +#################### +utils/utt2spk_to_spk2utt.pl "data/train/utt2spk" > "data/train/spk2utt" +utils/utt2spk_to_spk2utt.pl "data/test/utt2spk" > "data/test/spk2utt" diff --git a/egs/spanish_dimex100/s5/local/lang_prep.sh b/egs/spanish_dimex100/s5/local/lang_prep.sh new file mode 100755 index 00000000000..1ba49bac6d6 --- /dev/null +++ b/egs/spanish_dimex100/s5/local/lang_prep.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +## Only run this file from the example root directory +## $ ./local/data_prep.sh + +CORPUS_DIR="$1" + +mkdir -p "data/local/dict" + +source ./path.sh + +############################# +# data/local/dict/lexicon.txt +############################# + +export LC_ALL=C + +echo -e '!SIL sil\n spn' > data/local/dict/lexicon.txt +cat "$CORPUS_DIR/diccionarios/T22.full.dic" \ + | tr '[:upper:]' '[:lower:]' \ + | sed -e 's/([0123456789]*)//g' \ + -e 's/\([^ ]\)n\~/\1n/g' \ + -e 's/a_7/a/g' -e 's/e_7/e/g' -e 's/i_7/i/g' -e 's/o_7/o/g' -e 's/u_7/u/g' \ + -e 's/a-7/a/g' -e 's/e-7/e/g' -e 's/i-7/i/g' -e 's/o-7/o/g' -e 's/u-7/u/g' \ + -e 's/a_/a/g' -e 's/e_/e/g' -e 's/i_/i/g' -e 's/o_/o/g' -e 's/u_/u/g' \ + | sed -e 's/_7n.*$//' \ + -e 's/atl_7tica/atletica/' \ + -e 's/biol_7gicas/biologicas/' \ + -e 's/elec_7ctrico/electrico/' \ + -e 's/gr_7afico/grafico/' \ + -e 's/s_7lo/solo/' \ + | sed -e 's/n~/ni/g' -e 's/r(/rh/g' \ + | sed -e 's/\t/ /g' -e '/^$/d' \ + | sort | uniq \ + >> data/local/dict/lexicon.txt + + +####################################### +# data/local/dict/silence_phones.txt +# data/local/dict/optional_silence.txt +# data/local/dict/nonsilence_phones.txt +# data/local/dict/extra_questions.txt +####################################### + +echo -e 'sil\nspn' > data/local/dict/silence_phones.txt +echo -e 'sil' > data/local/dict/optional_silence.txt +cat data/local/dict/lexicon.txt \ + | grep -v '' \ + | grep -v '!SIL' \ + | cut -d' ' -f1 --complement \ + | sed 's/ /\n/g' \ + | sort -u \ + > data/local/dict/nonsilence_phones.txt diff --git a/egs/spanish_dimex100/s5/local/lm_prep.sh b/egs/spanish_dimex100/s5/local/lm_prep.sh new file mode 100755 index 00000000000..82c3c22cddd --- /dev/null +++ b/egs/spanish_dimex100/s5/local/lm_prep.sh @@ -0,0 +1,57 @@ +#!/bin/bash + +## 
Install SRILM in the `tools` directory (install_srilm.sh) + +## Only run this file from the example root directory +## $ ./local/data_prep.sh + +mkdir -p "data/local/tmp" "data/lang/tmp" + +source ./path.sh + +if [ -d "../../../tools/srilm/bin/i686-m64" ]; then + ngram_count_exe="../../../tools/srilm/bin/i686-m64/ngram-count" +elif [ -d "../../../tools/srilm/bin/i686" ]; then + ngram_count_exe="../../../tools/srilm/bin/i686/ngram-count" +else + echo + echo "[!] Install SRILM in the 'tools' directory (install_srilm.sh)" + echo + exit 1 +fi + + +######################## +# data/local/tmp/lm_text +######################## + +# Text sentences input for language model generation +# taken from data/[train|test]/text but with utterance IDs removed + +cat data/train/text data/test/text | cut -d' ' -f1 --complement > data/local/tmp/lm_text + + +################################# +# data/local/tmp/3gram_arpa_lm.gz +################################## + +$ngram_count_exe -lm data/local/tmp/3gram_lm.arpa.kn.gz \ + -order 3 \ + -write-vocab data/local/tmp/vocab-full.txt \ + -sort \ + -wbdiscount \ + -unk \ + -map-unk "" \ + -text data/local/tmp/lm_text + # -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 \ + # -kndiscount3 -gt3min 3 -order 3 \ + + +################# +# data/lang/G.fst +################# + +utils/format_lm.sh data/lang \ + data/local/tmp/3gram_lm.arpa.kn.gz \ + data/local/dict/lexicon.txt \ + data/lang diff --git a/egs/spanish_dimex100/s5/local/score.sh b/egs/spanish_dimex100/s5/local/score.sh new file mode 100755 index 00000000000..0be7d192282 --- /dev/null +++ b/egs/spanish_dimex100/s5/local/score.sh @@ -0,0 +1,53 @@ +#!/bin/bash +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 + +[ -f ./path.sh ] && . ./path.sh + +# begin configuration section. +cmd=run.pl +min_lmwt=7 +max_lmwt=17 +#end configuration section. + +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] " + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + echo " --min_lmwt # minumum LM-weight for lattice rescoring " + echo " --max_lmwt # maximum LM-weight for lattice rescoring " + exit 1; +fi + +data=$1 +lang_or_graph=$2 +dir=$3 + +symtab=$lang_or_graph/words.txt + +for f in $symtab $dir/lat.1.gz $data/text; do + [ ! -f $f ] && echo "score.sh: no such file $f" && exit 1; +done + +mkdir -p $dir/scoring/log + +cat $data/text | sed 's:::g' | sed 's:::g' > $dir/scoring/test_filt.txt + +$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \ + lattice-best-path --lm-scale=LMWT --word-symbol-table=$symtab \ + "ark:gunzip -c $dir/lat.*.gz|" ark,t:$dir/scoring/LMWT.tra || exit 1; + +# Note: the double level of quoting for the sed command +$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \ + cat $dir/scoring/LMWT.tra \| \ + utils/int2sym.pl -f 2- $symtab \| sed 's:\::g' \| \ + compute-wer --text --mode=present \ + ark:$dir/scoring/test_filt.txt ark,p:- ">&" $dir/wer_LMWT || exit 1; + +# Show results +for f in $dir/wer_*; do echo $f; egrep '(WER)|(SER)' < $f; done + +exit 0; diff --git a/egs/spanish_dimex100/s5/path.sh b/egs/spanish_dimex100/s5/path.sh new file mode 100755 index 00000000000..2d17b17a84a --- /dev/null +++ b/egs/spanish_dimex100/s5/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! 
-f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/spanish_dimex100/s5/run.sh b/egs/spanish_dimex100/s5/run.sh new file mode 100755 index 00000000000..30f1ad0397f --- /dev/null +++ b/egs/spanish_dimex100/s5/run.sh @@ -0,0 +1,118 @@ +#!/bin/bash + +. ./path.sh || exit 1 +. ./cmd.sh || exit 1 + +######## +# Config +######## + +train_cmd="utils/run.pl" +decode_cmd="utils/run.pl" + +CORPUS_DIR="CorpusDimex100" + +N_HMM=2000 # leaves +N_GAUSSIANS=11000 + + +################# +# Download corpus +################# + +echo +echo Downloading corpus +echo +if [ ! -d "$CORPUS_DIR" ]; then + wget http://turing.iimas.unam.mx/~luis/DIME/DIMEx100/DVD/DVDCorpusDimex100.zip || exit 1; + unzip DVDCorpusDimex100.zip || exit 1; +fi + + +################## +# Data preparation +################## + +echo +echo Data preparation +echo +rm -rf data exp mfcc +local/data_prep.sh "$CORPUS_DIR" +utils/fix_data_dir.sh "data/train" +utils/fix_data_dir.sh "data/test" + + +##################### +# Features generation +##################### + +echo +echo Features generation +echo +steps/make_mfcc.sh --cmd "$train_cmd" "data/train" "exp/make_mfcc/train" mfcc +steps/make_mfcc.sh --cmd "$train_cmd" "data/test" "exp/make_mfcc/test" mfcc + +steps/compute_cmvn_stats.sh "data/train" "exp/make_mfcc/train" mfcc +steps/compute_cmvn_stats.sh "data/test" "exp/make_mfcc/test" mfcc + +utils/validate_data_dir.sh "data/train" +utils/validate_data_dir.sh "data/test" + + +####################### +# Lang data preparation +####################### + +echo +echo Language data preparation +echo +rm -rf data/local/dict +local/lang_prep.sh "$CORPUS_DIR" +utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang +utils/fix_data_dir.sh "data/train" +utils/fix_data_dir.sh "data/test" + + +############################ +# Language model preparation +############################ + +echo +echo Language model preparation +echo +local/lm_prep.sh + + +####################### +# Training and Decoding +####################### + +echo +echo Training +echo +# utils/subset_data_dir.sh --first data/train 500 data/train_500 + +# Training and aligning +steps/train_mono.sh --cmd "$train_cmd" data/train data/lang exp/mono || exit 1 +steps/align_si.sh --cmd "$train_cmd" data/train data/lang exp/mono exp/mono_aligned || exit 1 +steps/train_deltas.sh "$N_HMM" "$N_GAUSSIANS" data/train data/lang exp/mono_aligned exp/tri1 || exit 1 +steps/align_si.sh --cmd "$train_cmd" data/train data/lang exp/tri1 exp/tri1_aligned || exit 1 + +# train tri2b [LDA+MLLT] +steps/train_lda_mllt.sh --cmd "$train_cmd" "$N_HMM" "$N_GAUSSIANS" data/train data/lang exp/tri1_aligned exp/tri2b || exit 1; +utils/mkgraph.sh data/lang exp/tri2b exp/tri2b/graph +steps/align_si.sh --cmd "$train_cmd" data/train data/lang exp/tri2b exp/tri2b_aligned || exit 1 + +# Do MMI on top of LDA+MLLT. 
+steps/make_denlats.sh --cmd "$train_cmd" data/train data/lang exp/tri2b exp/tri2b_denlats || exit 1; +steps/train_mmi.sh --boost 0.05 data/train data/lang exp/tri2b_aligned exp/tri2b_denlats exp/tri2b_mmi_b0.05 || exit 1; + + + +# Decoding +echo +echo Decoding +echo +steps/decode.sh --config conf/decode.config --cmd "$decode_cmd" exp/tri2b/graph data/test exp/tri2b_mmi_b0.05/decode_test + +for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done diff --git a/egs/spanish_dimex100/s5/steps b/egs/spanish_dimex100/s5/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/spanish_dimex100/s5/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/spanish_dimex100/s5/utils b/egs/spanish_dimex100/s5/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/spanish_dimex100/s5/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1a.sh b/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1a.sh index ec6b8941955..47557f93696 100755 --- a/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1a.sh +++ b/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1a.sh @@ -152,7 +152,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1b.sh b/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1b.sh index 53aa92710e8..7afa1b7f902 100755 --- a/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1b.sh +++ b/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1b.sh @@ -153,7 +153,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1c.sh b/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1c.sh index 83c2f3607f0..e69e499e152 100755 --- a/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1c.sh +++ b/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1c.sh @@ -151,7 +151,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1d.sh b/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1d.sh index 2665ea91ff8..86e0352828c 100755 --- a/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1d.sh +++ b/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1d.sh @@ -164,7 +164,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs 
cat < $dir/configs/network.xconfig diff --git a/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1e.sh b/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1e.sh index 80f67d34ba9..313f899a471 100755 --- a/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1e.sh +++ b/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1e.sh @@ -152,7 +152,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/sprakbanken/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/sprakbanken/s5/local/chain/tuning/run_tdnn_1b.sh index e242660a10e..600f27ddf86 100755 --- a/egs/sprakbanken/s5/local/chain/tuning/run_tdnn_1b.sh +++ b/egs/sprakbanken/s5/local/chain/tuning/run_tdnn_1b.sh @@ -135,7 +135,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/sprakbanken/s5/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/sprakbanken/s5/local/chain/tuning/run_tdnn_lstm_1a.sh index 86dc4b75a24..cedc448464a 100755 --- a/egs/sprakbanken/s5/local/chain/tuning/run_tdnn_lstm_1a.sh +++ b/egs/sprakbanken/s5/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -145,7 +145,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/sprakbanken/s5/local/norm_dk/write_punct.sh b/egs/sprakbanken/s5/local/norm_dk/write_punct.sh index 57726bd44cb..3b8decaf376 100755 --- a/egs/sprakbanken/s5/local/norm_dk/write_punct.sh +++ b/egs/sprakbanken/s5/local/norm_dk/write_punct.sh @@ -22,4 +22,4 @@ perl -pe 's/([\n ])\;([ \n])/\1SEMIKOLON\2/g' | \ perl -pe 's/([\n ])_NL_([ \n])/\1NY LINJE\2/g' | \ perl -pe 's/([\n ])_NS_([ \n])/\1NYT AFSNIT\2/g' | \ -tr -s ' ' \ No newline at end of file +tr -s ' ' diff --git a/egs/sre08/v1/local/score_sre08.sh b/egs/sre08/v1/local/score_sre08.sh index 92831502f45..c1584946735 100755 --- a/egs/sre08/v1/local/score_sre08.sh +++ b/egs/sre08/v1/local/score_sre08.sh @@ -35,11 +35,11 @@ tot_eer=0.0 printf '% 12s' 'EER:' for condition in $(seq 8); do eer=$(awk '{print $3}' $scores | paste - $trials | awk -v c=$condition '{n=4+c; if ($n == "Y") print $1, $4}' | compute-eer - 2>/dev/null) - tot_eer=$(echo "$tot_eer+$eer" | bc) + tot_eer=$(perl -e "print ($tot_eer+$eer);") eers[$condition]=$eer done -eers[0]=$(echo "$tot_eer/8" | bc -l) +eers[0]=$(perl -e "print ($tot_eer/8.0);") for i in $(seq 0 8); do printf '% 7.2f' ${eers[$i]} diff --git a/egs/sre16/v1/local/make_musan.py b/egs/sre16/v1/local/make_musan.py deleted file mode 100755 index 7735bd28818..00000000000 --- a/egs/sre16/v1/local/make_musan.py +++ /dev/null @@ -1,119 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2015 David Snyder -# Apache 2.0. -# -# This file is meant to be invoked by make_musan.sh. 
- -import os, sys - -def process_music_annotations(path): - utt2spk = {} - utt2vocals = {} - lines = open(path, 'r').readlines() - for line in lines: - utt, genres, vocals, musician = line.rstrip().split()[:4] - # For this application, the musican ID isn't important - utt2spk[utt] = utt - utt2vocals[utt] = vocals == "Y" - return utt2spk, utt2vocals - -def prepare_music(root_dir, use_vocals): - utt2vocals = {} - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - music_dir = os.path.join(root_dir, "music") - for root, dirs, files in os.walk(music_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - elif str(file) == "ANNOTATIONS": - utt2spk_part, utt2vocals_part = process_music_annotations(file_path) - utt2spk.update(utt2spk_part) - utt2vocals.update(utt2vocals_part) - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2vocals: - if utt in utt2wav: - if use_vocals or not utt2vocals[utt]: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n" - num_good_files += 1 - else: - print("Missing file {}".format(utt)) - num_bad_files += 1 - print("In music directory, processed {} files; {} had missing wav data".format(num_good_files, num_bad_files)) - return utt2spk_str, utt2wav_str - -def prepare_speech(root_dir): - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - speech_dir = os.path.join(root_dir, "speech") - for root, dirs, files in os.walk(speech_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - utt2spk[utt] = utt - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2spk: - if utt in utt2wav: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n" - num_good_files += 1 - else: - print("Missing file {}".format(utt)) - num_bad_files += 1 - print("In speech directory, processed {} files; {} had missing wav data".format(num_good_files, num_bad_files)) - return utt2spk_str, utt2wav_str - -def prepare_noise(root_dir): - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - noise_dir = os.path.join(root_dir, "noise") - for root, dirs, files in os.walk(noise_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - utt2spk[utt] = utt - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2spk: - if utt in utt2wav: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n" - num_good_files += 1 - else: - print("Missing file {}".format(utt)) - num_bad_files += 1 - print("In noise directory, processed {} files; {} had missing wav data".format(num_good_files, num_bad_files)) - return utt2spk_str, utt2wav_str - -def main(): - in_dir = sys.argv[1] - out_dir = sys.argv[2] - use_vocals = sys.argv[3] == "Y" - utt2spk_music, utt2wav_music = prepare_music(in_dir, use_vocals) - utt2spk_speech, utt2wav_speech = prepare_speech(in_dir) - utt2spk_noise, utt2wav_noise = prepare_noise(in_dir) - utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise - utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise - wav_fi = open(os.path.join(out_dir, "wav.scp"), 'w') - 
wav_fi.write(utt2wav) - utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), 'w') - utt2spk_fi.write(utt2spk) - - -if __name__=="__main__": - main() diff --git a/egs/sre16/v1/local/make_musan.sh b/egs/sre16/v1/local/make_musan.sh deleted file mode 100755 index 694940ad70f..00000000000 --- a/egs/sre16/v1/local/make_musan.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash -# Copyright 2015 David Snyder -# Apache 2.0. -# -# This script, called by ../run.sh, creates the MUSAN -# data directory. The required dataset is freely available at -# http://www.openslr.org/17/ - -set -e -in_dir=$1 -data_dir=$2 -use_vocals='Y' - -mkdir -p local/musan.tmp - -echo "Preparing ${data_dir}/musan..." -mkdir -p ${data_dir}/musan -local/make_musan.py ${in_dir} ${data_dir}/musan ${use_vocals} - -utils/fix_data_dir.sh ${data_dir}/musan - -grep "music" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_music -grep "speech" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_speech -grep "noise" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_noise -utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_music \ - ${data_dir}/musan ${data_dir}/musan_music -utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_speech \ - ${data_dir}/musan ${data_dir}/musan_speech -utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_noise \ - ${data_dir}/musan ${data_dir}/musan_noise - -utils/fix_data_dir.sh ${data_dir}/musan_music -utils/fix_data_dir.sh ${data_dir}/musan_speech -utils/fix_data_dir.sh ${data_dir}/musan_noise - -rm -rf local/musan.tmp - diff --git a/egs/sre16/v1/run.sh b/egs/sre16/v1/run.sh index 28481e27c3a..2315d7ac78a 100755 --- a/egs/sre16/v1/run.sh +++ b/egs/sre16/v1/run.sh @@ -145,7 +145,7 @@ if [ $stage -le 4 ]; then # Prepare the MUSAN corpus, which consists of music, speech, and noise # suitable for augmentation. - local/make_musan.sh /export/corpora/JHU/musan data + steps/data/make_musan.sh --sampling-rate 8000 /export/corpora/JHU/musan data # Get the duration of the MUSAN recordings. This will be used by the # script augment_data_dir.py. diff --git a/egs/sre16/v2/run.sh b/egs/sre16/v2/run.sh index b2072dfd69d..7780c30560b 100755 --- a/egs/sre16/v2/run.sh +++ b/egs/sre16/v2/run.sh @@ -135,7 +135,7 @@ if [ $stage -le 2 ]; then # Prepare the MUSAN corpus, which consists of music, speech, and noise # suitable for augmentation. - local/make_musan.sh /export/corpora/JHU/musan data + steps/data/make_musan.sh --sampling-rate 8000 /export/corpora/JHU/musan data # Get the duration of the MUSAN recordings. This will be used by the # script augment_data_dir.py. @@ -174,6 +174,7 @@ if [ $stage -le 2 ]; then utils/copy_data_dir.sh data/swbd_sre_combined data/sre_combined utils/filter_scp.pl data/sre/spk2utt data/swbd_sre_combined/spk2utt | utils/spk2utt_to_utt2spk.pl > data/sre_combined/utt2spk utils/fix_data_dir.sh data/sre_combined + fi # Now we prepare the features to generate examples for xvector training. 
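The recurring one-line change above, wrapping the expression piped to `python` in parentheses, is what lets these recipes work whether `python` resolves to Python 2 or Python 3: `print x` is a statement that only Python 2 accepts, while `print (x)` parses under both. A minimal sketch of the idea, assuming only that some `python` executable is on PATH and using the `xent_regularize=0.1` default these recipes set:

```bash
# Sketch only; assumes a 'python' executable on PATH (either major version).
xent_regularize=0.1   # default used throughout these chain recipes

# Old form: a Python 2 print *statement*, which is a SyntaxError under Python 3.
#   learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python)

# New form: parses as a parenthesised expression in Python 2 and as a
# print() call in Python 3, so either interpreter yields 5.0 here.
learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)
echo "learning_rate_factor=$learning_rate_factor"
```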
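The `score_sre08.sh` hunk makes a similar portability move on the shell side: per-condition EERs are now summed and averaged with `perl` one-liners rather than piped through `bc`, so scoring no longer assumes `bc` is installed. A rough illustration with invented EER values (the real ones come from `compute-eer`):

```bash
# Invented per-condition EERs, purely to exercise the arithmetic.
cond_eers=(4.2 3.8 5.1 2.9 6.0 3.3 4.7 5.5)

tot_eer=0.0
for eer in "${cond_eers[@]}"; do
  tot_eer=$(perl -e "print ($tot_eer+$eer);")   # was: echo "$tot_eer+$eer" | bc
done
eer_avg=$(perl -e "print ($tot_eer/8.0);")      # was: echo "$tot_eer/8" | bc -l
printf 'average EER over 8 conditions: %.2f\n' "$eer_avg"
```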
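The MUSAN preparation is also de-duplicated: the deleted `local/make_musan.{py,sh}` pair is replaced by the shared `steps/data/make_musan.sh`, called with an explicit sampling rate. A usage sketch, assuming a Kaldi egs working directory with `path.sh` sourced and MUSAN downloaded to the CLSP path shown in the recipes (adjust `musan_root` for other setups):

```bash
# Assumed local copy of MUSAN (http://www.openslr.org/17/); adjust as needed.
musan_root=/export/corpora/JHU/musan

# Resamples MUSAN to 8 kHz and writes data/musan plus the
# data/musan_{music,speech,noise} subsets that the rest of these recipes consume
# (the deleted local scripts used to carve those subsets out by hand).
steps/data/make_musan.sh --sampling-rate 8000 $musan_root data

# The subsets then feed straight into augmentation, as in the new
# multi_condition/run_aug_common.sh recipe, e.g. for additive noise:
steps/data/augment_data_dir.py --utt-prefix "noise" --modify-spk-id "true" \
  --fg-interval 1 --fg-snrs "15:10:5:0" --fg-noise-dir "data/musan_noise" \
  data/train_nodup data/train_nodup_noise
```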
diff --git a/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_7k.sh b/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_7k.sh index 6792332da56..20dcab8eb50 100755 --- a/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_7k.sh +++ b/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_7k.sh @@ -152,7 +152,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_aug_1a.sh b/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_aug_1a.sh new file mode 100755 index 00000000000..8762430ee7f --- /dev/null +++ b/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_aug_1a.sh @@ -0,0 +1,281 @@ +#!/bin/bash + +# This recipe does multi-style training of TDNN model + +# local/chain/compare_wer_general.sh --rt03 tdnn7q_sp tdnn1a_aug +# System tdnn7q_sp tdnn1a_aug +# WER on train_dev(tg) 11.91 12.06 +# WER on train_dev(fg) 10.99 10.92 +# WER on eval2000(tg) 14.3 14.4 +# WER on eval2000(fg) 12.8 12.9 +# WER on rt03(tg) 17.2 17.1 +# WER on rt03(fg) 15.1 14.8 +# Final train prob -0.062 -0.087 +# Final valid prob -0.074 -0.105 +# Final train prob (xent) -0.933 -1.164 +# Final valid prob (xent) -0.9027 -1.2246 +# Num-parameters 18693376 18483664 + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +num_epochs=3 + +# Augmentation options +aug_list="reverb babble music noise clean" # Original train dir is referred to as `clean` +num_reverb_copies=1 +use_ivectors=true + +affix=1a +suffix="_aug" +if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi + +decode_iter= +decode_nj=50 + +# training options +frames_per_eg=150,110,100 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +dir=exp/chain/tdnn${affix}${suffix} + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 13 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang exp/$ali_dir $treedir +fi + +if [ $stage -le 14 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + affine_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.002" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + linear-component name=prefinal-l dim=256 $linear_opts + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 15 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$train_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --constrained false" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.00025 \ + --trainer.optimization.final-effective-lrate 0.000025 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 16 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +iter_opts= +if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi +if [ $stage -le 17 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + +if $test_online_decoding && [ $stage -le 16 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. 
+ + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_aug.sh b/egs/swbd/s5c/local/chain/run_tdnn_aug.sh new file mode 120000 index 00000000000..390ed99f5cc --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_aug.sh @@ -0,0 +1 @@ +multi_condition/run_tdnn_aug_1a.sh \ No newline at end of file diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6j.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6j.sh index ae7c97e7d08..acdae844b65 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_blstm_6j.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6j.sh @@ -120,7 +120,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6k.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6k.sh index 90d672b9ae9..bbd8cb63697 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_blstm_6k.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6k.sh @@ -116,7 +116,7 @@ if [ $stage -le 12 ]; then num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6l.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6l.sh index 68daf81ab01..16f2ea211d0 100644 --- a/egs/swbd/s5c/local/chain/tuning/run_blstm_6l.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6l.sh @@ -125,7 +125,7 @@ if [ $stage -le 12 ]; then num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20 dropout-proportion=0.0" diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6m.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6m.sh index 4668aac9ebc..09f7d72434c 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_blstm_6m.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6m.sh @@ -124,7 +124,7 @@ if [ $stage -le 12 ]; then num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6n.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6n.sh index 22316d56ed2..8e44d0bc114 100755 --- 
a/egs/swbd/s5c/local/chain/tuning/run_blstm_6n.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6n.sh @@ -123,7 +123,7 @@ if [ $stage -le 12 ]; then num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6o.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6o.sh index ad2ac4bf043..6a836e81b09 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_blstm_6o.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6o.sh @@ -125,7 +125,7 @@ if [ $stage -le 12 ]; then num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" diff --git a/egs/swbd/s5c/local/chain/tuning/run_cnn_tdnn_1a.sh b/egs/swbd/s5c/local/chain/tuning/run_cnn_tdnn_1a.sh index 174925315a0..d1a61360f85 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_cnn_tdnn_1a.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_cnn_tdnn_1a.sh @@ -112,7 +112,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) cnn_opts="l2-regularize=0.01" ivector_affine_opts="l2-regularize=0.01" diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_6j.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_6j.sh index e432435a551..48db81f586f 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_lstm_6j.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_6j.sh @@ -119,7 +119,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_6k.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_6k.sh index b9b7152dcbe..021eab09506 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_lstm_6k.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_6k.sh @@ -121,7 +121,7 @@ if [ $stage -le 12 ]; then num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_6l.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_6l.sh index 12564c4faae..f219167f9ec 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_lstm_6l.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_6l.sh @@ -131,7 +131,7 @@ if [ $stage -le 12 ]; then num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" 
| python) lstm_opts="decay-time=20" diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7g.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7g.sh index fa6518a9ad9..0623d26a9e4 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7g.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7g.sh @@ -117,7 +117,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7h.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7h.sh index 9dfaa1d4509..dbbe3c1e6fd 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7h.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7h.sh @@ -120,7 +120,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7i.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7i.sh index c5b5633d94c..2a8a658bf6b 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7i.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7i.sh @@ -113,7 +113,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7j.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7j.sh index 793b40f7fe3..a9eba36ddaa 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7j.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7j.sh @@ -112,7 +112,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7k.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7k.sh index bd47ed61f23..8e0b290cf87 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7k.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7k.sh @@ -114,7 +114,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7l.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7l.sh index f7681a743e1..bb9ddf209d6 100644 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7l.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7l.sh @@ -112,7 +112,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the 
xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m.sh index 03b1ee3c97f..97f92c14f1f 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m.sh @@ -122,7 +122,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m25l.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m25l.sh index 0fa7353edb2..d9fe106e5d7 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m25l.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m25l.sh @@ -452,7 +452,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.002 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" linear_opts="orthonormal-constraint=1.0" output_opts="l2-regularize=0.0005" diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7n.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7n.sh index cf4855db611..99e43443f99 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7n.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7n.sh @@ -119,7 +119,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.002" linear_opts="orthonormal-constraint=1.0" output_opts="l2-regularize=0.0005 bottleneck-dim=256" diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh index fb47b1e88ad..44ca3b3d279 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh @@ -126,7 +126,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.004 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" linear_opts="orthonormal-constraint=-1.0 l2-regularize=0.004" output_opts="l2-regularize=0.002" diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7p.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7p.sh index 096ed9c54fd..d19a4ef4c0b 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7p.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7p.sh @@ -114,7 +114,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info 
$treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.004 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" linear_opts="orthonormal-constraint=-1.0 l2-regularize=0.004" output_opts="l2-regularize=0.002" diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7q.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7q.sh index 8eab54a9dc2..cea0891d5d7 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7q.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7q.sh @@ -118,7 +118,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) affine_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_attention_1a.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_attention_1a.sh index 3ce4fa68397..d4febd61e94 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_attention_1a.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_attention_1a.sh @@ -122,7 +122,7 @@ fi if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1a.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1a.sh index 7854bac44c5..4414147bf0e 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1a.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1a.sh @@ -120,7 +120,7 @@ if [ $stage -le 12 ]; then num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1b.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1b.sh index 3929cdc432e..cd9d4dc6f2b 100644 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1b.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1b.sh @@ -122,7 +122,7 @@ if [ $stage -le 12 ]; then num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20 dropout-proportion=0.0" diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1c.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1c.sh index 311fe15d895..18b660b4080 100644 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1c.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1c.sh @@ -119,7 +119,7 @@ if [ $stage -le 12 ]; then num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk 
'{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1d.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1d.sh index 4894e492542..be615e0e361 100644 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1d.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1d.sh @@ -112,7 +112,7 @@ if [ $stage -le 12 ]; then num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20 dropout-proportion=0.0" diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh index 32234ff009c..43855e6f7ce 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -118,7 +118,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1b.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1b.sh index 1d305186fc2..5c82ed0eb11 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1b.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1b.sh @@ -114,7 +114,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1c.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1c.sh index e2492ee277b..c3df0bf2b2c 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1c.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1c.sh @@ -121,7 +121,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1d.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1d.sh index 2028e20ff00..3d353387239 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1d.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1d.sh @@ -140,7 +140,7 @@ if [ $stage -le 12 ]; then num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" diff --git 
a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1e.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1e.sh index bf3eddb90ae..2a2d508ecdd 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1e.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1e.sh @@ -135,7 +135,7 @@ if [ $stage -le 12 ]; then num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1f.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1f.sh index e500ee0a9a8..5af5463b372 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1f.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1f.sh @@ -152,7 +152,7 @@ if [ $stage -le 12 ]; then num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1g.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1g.sh index 9b3a5d29957..28105a587ec 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1g.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1g.sh @@ -134,7 +134,7 @@ if [ $stage -le 12 ]; then num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=15" diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1h.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1h.sh index ca578195323..d6e81f2d8eb 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1h.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1h.sh @@ -131,7 +131,7 @@ if [ $stage -le 12 ]; then num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1i.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1i.sh index a0848cc8894..060d98c9d05 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1i.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1i.sh @@ -152,7 +152,7 @@ if [ $stage -le 12 ]; then num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1j.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1j.sh index 84258624447..9bd39a262c5 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1j.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1j.sh @@ -119,7 +119,7 @@ if [ $stage -le 12 ]; then num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { 
echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1k.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1k.sh index 258f067cf2b..ccd6138da6e 100644 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1k.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1k.sh @@ -129,7 +129,7 @@ if [ $stage -le 12 ]; then num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1l.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1l.sh index 0a518572201..f702033377a 100644 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1l.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1l.sh @@ -120,7 +120,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1m.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1m.sh index 3a2b34792f3..b43577bd76c 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1m.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1m.sh @@ -128,7 +128,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=40" diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1n.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1n.sh index 34fcf731639..5bb6e7da152 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1n.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1n.sh @@ -125,7 +125,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.002" linear_opts="orthonormal-constraint=1.0" diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_opgru_1a.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_opgru_1a.sh index 18d3f81ffde..4db38d74508 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_opgru_1a.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_opgru_1a.sh @@ -134,7 +134,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) gru_opts="dropout-per-frame=true dropout-proportion=0.0" mkdir -p $dir/configs diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_opgru_1b.sh 
b/egs/swbd/s5c/local/chain/tuning/run_tdnn_opgru_1b.sh index 579008b5658..7e9dec67068 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_opgru_1b.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_opgru_1b.sh @@ -132,7 +132,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) gru_opts="dropout-per-frame=true dropout-proportion=0.0 gru-nonlinearity-options=\"max-change=0.75\"" mkdir -p $dir/configs diff --git a/egs/swbd/s5c/local/nnet3/multi_condition/run_aug_common.sh b/egs/swbd/s5c/local/nnet3/multi_condition/run_aug_common.sh new file mode 100755 index 00000000000..7d36cdfaac9 --- /dev/null +++ b/egs/swbd/s5c/local/nnet3/multi_condition/run_aug_common.sh @@ -0,0 +1,242 @@ +#!/bin/bash +# Copyright 2019 Phani Sankar Nidadavolu +# Apache 2.0. + +. ./cmd.sh + +set -e +stage=0 +aug_list="reverb music noise babble clean" #clean refers to the original train dir +use_ivectors=true +num_reverb_copies=1 + +# Alignment directories +lda_mllt_ali=tri2_ali_100k_nodup +clean_ali=tri4_ali_nodup + +# train directories for ivectors and TDNNs +ivector_trainset=train_100k_nodup +train_set=train_nodup + +. ./path.sh +. ./utils/parse_options.sh + +if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi + +if [ $stage -le 0 ]; then + # Adding simulated RIRs to the original data directory + echo "$0: Preparing data/${train_set}_reverb directory" + + if [ ! -d "RIRS_NOISES" ]; then + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + + if [ ! -f data/$train_set/reco2dur ]; then + utils/data/get_reco2dur.sh --nj 6 --cmd "$train_cmd" data/$train_set || exit 1; + fi + + # Make a version with reverberated speech + rvb_opts=() + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list") + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list") + + # Make a reverberated version of the SWBD train_nodup. + # Note that we don't add any additive noise here. + steps/data/reverberate_data_dir.py \ + "${rvb_opts[@]}" \ + --speech-rvb-probability 1 \ + --prefix "reverb" \ + --pointsource-noise-addition-probability 0 \ + --isotropic-noise-addition-probability 0 \ + --num-replications $num_reverb_copies \ + --source-sampling-rate 8000 \ + data/$train_set data/${train_set}_reverb +fi + +if [ $stage -le 1 ]; then + # Prepare the MUSAN corpus, which consists of music, speech, and noise + # We will use them as additive noises for data augmentation. 
+ steps/data/make_musan.sh --sampling-rate 8000 --use-vocals "true" \ + /export/corpora/JHU/musan data + + # Augment with musan_noise + steps/data/augment_data_dir.py --utt-prefix "noise" --modify-spk-id "true" \ + --fg-interval 1 --fg-snrs "15:10:5:0" --fg-noise-dir "data/musan_noise" \ + data/${train_set} data/${train_set}_noise + + # Augment with musan_music + steps/data/augment_data_dir.py --utt-prefix "music" --modify-spk-id "true" \ + --bg-snrs "15:10:8:5" --num-bg-noises "1" --bg-noise-dir "data/musan_music" \ + data/${train_set} data/${train_set}_music + + # Augment with musan_speech + steps/data/augment_data_dir.py --utt-prefix "babble" --modify-spk-id "true" \ + --bg-snrs "20:17:15:13" --num-bg-noises "3:4:5:6:7" \ + --bg-noise-dir "data/musan_speech" \ + data/${train_set} data/${train_set}_babble + + # Combine all the augmentation dirs + # This part can be simplified once we know what noise types we will add + combine_str="" + for n in $aug_list; do + if [ "$n" == "clean" ]; then + # clean refers to original of training directory + combine_str+="data/$train_set " + else + combine_str+="data/${train_set}_${n} " + fi + done + utils/combine_data.sh data/${train_set}_aug $combine_str +fi + +if [ $stage -le 2 ]; then + # Extract low-resolution MFCCs for the augmented data + # To be used later to generate alignments for augmented data + echo "$0: Extracting low-resolution MFCCs for the augmented data. Useful for generating alignments" + mfccdir=mfcc_aug + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + date=$(date +'%m_%d_%H_%M') + utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/mfcc/swbd-$date/s5c/$mfccdir/storage $mfccdir/storage + fi + steps/make_mfcc.sh --cmd "$train_cmd" --nj 50 \ + data/${train_set}_aug exp/make_mfcc/${train_set}_aug $mfccdir + steps/compute_cmvn_stats.sh data/${train_set}_aug exp/make_mfcc/${train_set}_aug $mfccdir + utils/fix_data_dir.sh data/${train_set}_aug || exit 1; +fi + +if [ $stage -le 3 ] && $generate_alignments; then + # obtain the alignment of augmented data from clean data + include_original=false + prefixes="" + for n in $aug_list; do + if [ "$n" == "reverb" ]; then + for i in `seq 1 $num_reverb_copies`; do + prefixes="$prefixes "reverb$i + done + elif [ "$n" != "clean" ]; then + prefixes="$prefixes "$n + else + # The original train directory will not have any prefix + # include_original flag will take care of copying the original alignments + include_original=true + fi + done + echo "$0: Creating alignments of aug data by copying alignments of clean data" + steps/copy_ali_dir.sh --nj 40 --cmd "$train_cmd" \ + --include-original "$include_original" --prefixes "$prefixes" \ + data/${train_set}_aug exp/${clean_ali} exp/${clean_ali}_aug +fi + +if [ $stage -le 4 ]; then + mfccdir=mfcc_hires + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $mfccdir/storage ]; then + date=$(date +'%m_%d_%H_%M') + utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/mfcc/swbd-$date/s5c/$mfccdir/storage $mfccdir/storage + fi + + for dataset in ${train_set}_aug; do + echo "$0: Creating hi resolution MFCCs for dir data/$dataset" + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires + utils/data/perturb_data_dir_volume.sh data/${dataset}_hires + + steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/${dataset} $mfccdir; + + # Remove the small number of utterances that couldn't be extracted for some + # reason (e.g. too short; no such file). + utils/fix_data_dir.sh data/${dataset}_hires; + done +fi + +if [ $stage -le 5 ]; then + mfccdir=mfcc_hires + for dataset in eval2000 train_dev $maybe_rt03; do + echo "$0: Creating hi resolution MFCCs for data/$dataset" + # Create MFCCs for the eval set + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires + steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 --mfcc-config conf/mfcc_hires.conf \ + data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires # remove segments with problems + done +fi + +if [ "$use_ivectors" == "true" ]; then + if [ $stage -le 6 ]; then + # Take 30k utterances from MS data this will be used for the diagubm training. + utils/subset_data_dir.sh data/${train_set}_aug_hires 30000 data/${train_set}_aug_30k_hires + utils/data/remove_dup_utts.sh 200 data/${train_set}_aug_30k_hires data/${train_set}_aug_30k_nodup_hires # 33hr + + # Make a 140 hr subset of augmented data to train i-vector extractor + # we don't extract hi res features again for ivector training data + # we take it from the ms features extracted on the entire training set + # First augment the train_100k_nodup directory which is used to train the i-vector extractor in baseline + utils/copy_data_dir.sh data/${train_set}_aug_hires data/${ivector_trainset}_aug_hires + utils/filter_scp.pl -f 2 data/${ivector_trainset}/utt2spk data/${train_set}_aug_hires/utt2uniq | \ + utils/filter_scp.pl - data/${train_set}_aug_hires/utt2spk > data/${ivector_trainset}_aug_hires/utt2spk + utils/fix_data_dir.sh data/${ivector_trainset}_aug_hires + + # Since the data size is now increased make a subset of it to bring the duration back to required size (140hr) + utils/subset_data_dir.sh data/${ivector_trainset}_aug_hires 100000 data/${ivector_trainset}_aug_hires_subset + utils/data/remove_dup_utts.sh 200 data/${ivector_trainset}_aug_hires_subset data/${ivector_trainset}_aug_hires + steps/compute_cmvn_stats.sh data/${ivector_trainset}_aug_hires exp/make_hires/${ivector_trainset} $mfccdir; + utils/fix_data_dir.sh data/${ivector_trainset}_aug_hires + fi + + # ivector extractor training + if [ $stage -le 7 ]; then + # First copy the clean alignments to augmented alignments to train LDA+MLLT transform + # Since the alignments are created using low-res mfcc features make a copy of ivector training directory + utils/copy_data_dir.sh data/${ivector_trainset}_aug_hires data/${ivector_trainset}_aug + utils/filter_scp.pl data/${ivector_trainset}_aug/utt2spk data/${train_set}_aug/feats.scp > data/${ivector_trainset}_aug/feats.scp + utils/fix_data_dir.sh data/${ivector_trainset}_aug + echo "$0: Creating alignments of aug data by copying alignments of clean data" 
+ steps/copy_ali_dir.sh --nj 40 --cmd "$train_cmd" \ + data/${ivector_trainset}_aug exp/${lda_mllt_ali} exp/${lda_mllt_ali}_aug + + # We need to build a small system just because we need the LDA+MLLT transform + # to train the diag-UBM on top of. We use --num-iters 13 because after we get + # the transform (12th iter is the last), any further training is pointless. + # this decision is based on fisher_english + steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \ + --splice-opts "--left-context=3 --right-context=3" \ + 5500 90000 data/${ivector_trainset}_aug_hires \ + data/lang exp/${lda_mllt_ali}_aug exp/nnet3/tri3b + fi + + if [ $stage -le 8 ]; then + # To train a diagonal UBM we don't need very much data, so use the smallest subset. + echo "$0: Training diagonal UBM for i-vector extractor" + steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 200000 \ + data/${train_set}_aug_30k_nodup_hires 512 exp/nnet3/tri3b exp/nnet3/diag_ubm + fi + + if [ $stage -le 9 ]; then + # iVector extractors can be sensitive to the amount of data, but this one has a + # fairly small dim (defaults to 100) so we don't use all of it, we use just the + # 100k subset (just under half the data). + echo "$0: Training i-vector extractor for speaker adaptation" + steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ + data/${ivector_trainset}_aug_hires exp/nnet3/diag_ubm exp/nnet3/extractor || exit 1; + fi + + if [ $stage -le 10 ]; then + # We extract iVectors on all the train_nodup data, which will be what we + # train the system on. + # having a larger number of speakers is helpful for generalization, and to + # handle per-utterance decoding well (iVector starts at zero). + echo "$0: Extracting ivectors for train and eval directories" + utils/data/modify_speaker_info.sh --utts-per-spk-max 2 data/${train_set}_aug_hires data/${train_set}_aug_max2_hires + + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${train_set}_aug_max2_hires exp/nnet3/extractor exp/nnet3/ivectors_${train_set}_aug || exit 1; + + for dataset in eval2000 train_dev $maybe_rt03; do + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${dataset}_hires exp/nnet3/extractor exp/nnet3/ivectors_$dataset || exit 1; + done + fi +fi diff --git a/egs/swbd/s5c/local/score_sclite_conf.sh b/egs/swbd/s5c/local/score_sclite_conf.sh index 9a1fa5083bf..21da4520a4d 100755 --- a/egs/swbd/s5c/local/score_sclite_conf.sh +++ b/egs/swbd/s5c/local/score_sclite_conf.sh @@ -39,6 +39,12 @@ for f in $data/stm $data/glm $lang/words.txt $lang/phones/word_boundary.int \ [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1; done +if [ -f $dir/../frame_subsampling_factor ]; then + factor=$(cat $dir/../frame_subsampling_factor) || exit 1 + frame_shift_opt="--frame-shift=0.0$factor" + echo "$0: $dir/../frame_subsampling_factor exists, using $frame_shift_opt" +fi + name=`basename $data`; # e.g. 
eval2000 mkdir -p $dir/scoring/log @@ -51,7 +57,7 @@ if [ $stage -le 0 ]; then ACWT=\`perl -e \"print 1.0/LMWT\;\"\` '&&' \ lattice-add-penalty --word-ins-penalty=$wip "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \ - lattice-to-ctm-conf --decode-mbr=$decode_mbr --acoustic-scale=\$ACWT ark:- - \| \ + lattice-to-ctm-conf $frame_shift_opt --decode-mbr=$decode_mbr --acoustic-scale=\$ACWT ark:- - \| \ utils/int2sym.pl -f 5 $lang/words.txt \| \ utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \ '>' $dir/score_LMWT_${wip}/$name.ctm || exit 1; diff --git a/egs/swbd/s5c/local/swbd1_map_words.pl b/egs/swbd/s5c/local/swbd1_map_words.pl index 39f90d72816..125e4de0d61 100755 --- a/egs/swbd/s5c/local/swbd1_map_words.pl +++ b/egs/swbd/s5c/local/swbd1_map_words.pl @@ -44,7 +44,7 @@ # which is a mistake in the input. $a =~ s:^\{(.+)\}$:$1:; # e.g. {YUPPIEDOM} -> YUPPIEDOM $a =~ s:[A-Z]\[([^][])+\][A-Z]:$1-$3:i; # e.g. AMMU[N]IT- -> AMMU-IT- - $a =~ s:_\d$::; # e.g. THEM_1 -> THEM + $a =~ s:_\d::; # e.g. THEM_1 -> THEM, THEM_1's -> THEM's } $A[$n] = $a; } diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_blstm_1a.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_blstm_1a.sh index 5e60ee1178c..2ac8c09dad1 100644 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_blstm_1a.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_blstm_1a.sh @@ -139,7 +139,7 @@ if [ $stage -le 17 ]; then lstm_opts="decay-time=20" num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1a.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1a.sh index ec6b8941955..47557f93696 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1a.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1a.sh @@ -152,7 +152,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1b.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1b.sh index 53aa92710e8..7afa1b7f902 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1b.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1b.sh @@ -153,7 +153,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1c.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1c.sh index 83c2f3607f0..e69e499e152 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1c.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1c.sh @@ -151,7 +151,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo 
"print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1d.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1d.sh index 2665ea91ff8..86e0352828c 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1d.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1d.sh @@ -164,7 +164,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1e.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1e.sh index f768c7659d7..0fdb2b3b63e 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1e.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1e.sh @@ -154,7 +154,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1b.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1b.sh index 3384b085114..492d3efb804 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1b.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1b.sh @@ -143,7 +143,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1c.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1c.sh index 5dd838a15e3..01768c3875f 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1c.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1c.sh @@ -160,7 +160,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1d.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1d.sh index 4f86691b752..bb5007f4c9f 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1d.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1d.sh @@ -151,7 +151,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1e.sh 
b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1e.sh index e32c08562c6..1476ed1fd40 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1e.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1e.sh @@ -143,7 +143,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1f.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1f.sh index 2eab0285828..47f939fea1c 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1f.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1f.sh @@ -141,7 +141,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1g.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1g.sh index 64ce1f02fdd..f02025674e8 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1g.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1g.sh @@ -142,7 +142,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) affine_opts="l2-regularize=0.008 dropout-proportion=0.0 dropout-per-dim-continuous=true" tdnnf_opts="l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.66" linear_opts="l2-regularize=0.008 orthonormal-constraint=-1.0" diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1a.sh index 8f0be130e27..b03da27e760 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1a.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -156,7 +156,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1b.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1b.sh index fef021c6482..e896a7867b3 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1b.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1b.sh @@ -169,7 +169,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1c.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1c.sh index 
d05ae15dfec..00f72fab796 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1c.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1c.sh @@ -160,7 +160,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1d.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1d.sh index 29d8e69b04c..80a9ed1c4d0 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1d.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1d.sh @@ -165,7 +165,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh index db3fde91656..031978f878a 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh @@ -213,7 +213,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1f.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1f.sh index f6a1d49890d..c60b8f7fefc 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1f.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1f.sh @@ -167,7 +167,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1g.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1g.sh index ff2c302fdf6..2d2048a6869 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1g.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1g.sh @@ -170,7 +170,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1h.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1h.sh index d4cb5e85657..a074e128270 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1h.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1h.sh @@ -168,7 +168,7 @@ if [ 
$stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1i.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1i.sh index 40b1bf7f54a..3bfe175806f 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1i.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1i.sh @@ -189,7 +189,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1j.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1j.sh index 838f49f977f..acbef783823 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1j.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1j.sh @@ -186,7 +186,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1k.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1k.sh index b1abfdcf525..173be863608 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1k.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1k.sh @@ -184,7 +184,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) # note: the value of the dropout-proportion is not important, as it's # controlled by the dropout schedule; what's important is that we set it. diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1l.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1l.sh index ef151d72875..94955d0472c 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1l.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1l.sh @@ -174,7 +174,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) # note: the value of the dropout-proportion is not important, as it's # controlled by the dropout schedule; what's important is that we set it. 
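Note on the recurring `learning_rate_factor` change in the hunks above: the un-parenthesized form is a Python 2 `print` statement and raises a `SyntaxError` under Python 3, whereas the parenthesized form evaluates under both interpreters. A minimal stand-alone sketch, using the `xent_regularize=0.1` default that these recipes set (shown here only for illustration, not as part of the patch):

```bash
# Illustrative only -- not part of the patch.
xent_regularize=0.1
# Python 2 parses this as a "print (expr)" statement; Python 3 as a print() call.
learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)
echo "learning_rate_factor=$learning_rate_factor"   # 5.0 with either interpreter
```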
diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1m.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1m.sh index c2aac3f6e20..efd3bc98725 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1m.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1m.sh @@ -174,7 +174,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) # note: the value of the dropout-proportion is not important, as it's # controlled by the dropout schedule; what's important is that we set it. diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1n.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1n.sh index ed6cb66957d..c0559e8d389 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1n.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1n.sh @@ -185,7 +185,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) # note: the value of the dropout-proportion is not important, as it's # controlled by the dropout schedule; what's important is that we set it. diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1o.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1o.sh index 8a4b7468058..5a6dbaef8af 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1o.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1o.sh @@ -189,7 +189,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) # note: the value of the dropout-proportion is not important, as it's # controlled by the dropout schedule; what's important is that we set it. 
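The "the value of the dropout-proportion is not important" comments in the hunks above refer to Kaldi's piecewise-linear dropout schedule: `--trainer.dropout-schedule` lists dropout proportions at fractions of training, interpolated linearly in between, and overrides the per-layer value from the xconfig. A hedged reading of the schedule string these recipes pass (names and value taken from the scripts in this patch):

```bash
# Illustrative only -- how the schedule string used by these recipes reads.
# '0,0@0.20,0.5@0.50,0': dropout proportion 0 from the start until 20% of
# training, rising linearly to 0.5 at 50% of training, then falling linearly
# back to 0 at the end; the dropout-proportion set in the xconfig layers is
# just the initial placeholder that this schedule overrides.
dropout_schedule='0,0@0.20,0.5@0.50,0'
echo "passed to train.py as: --trainer.dropout-schedule $dropout_schedule"
```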
diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1r.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1r.sh index 8f80a6885ca..dd38d56759f 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1r.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1r.sh @@ -187,7 +187,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) tdnn_opts='ng-affine-options="update-period=1"' lstmp_opts='ng-affine-options="update-period=1" decay-time=20' diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1s.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1s.sh index ef1c7fc196f..1378d2d176d 100644 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1s.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1s.sh @@ -151,7 +151,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1t.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1t.sh index 19479de41aa..3c4882ec2c6 100644 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1t.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1t.sh @@ -152,7 +152,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u.sh index 85c0e4a0661..23ea14ae151 100644 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u.sh @@ -145,7 +145,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1v.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1v.sh index e0431a83ceb..7c44d963504 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1v.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1v.sh @@ -149,7 +149,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_attention_1a.sh 
b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_attention_1a.sh index e1543c0120f..042ef346578 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_attention_1a.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_attention_1a.sh @@ -159,7 +159,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_attention_bs_1a.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_attention_bs_1a.sh index d08a7ad5e86..905e1845183 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_attention_bs_1a.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_attention_bs_1a.sh @@ -163,7 +163,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_attention_bs_1b.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_attention_bs_1b.sh index d256150484b..7bd96e7d82c 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_attention_bs_1b.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_attention_bs_1b.sh @@ -150,7 +150,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/tedlium/s5_r3/RESULTS b/egs/tedlium/s5_r3/RESULTS new file mode 100644 index 00000000000..b2f9526a8fd --- /dev/null +++ b/egs/tedlium/s5_r3/RESULTS @@ -0,0 +1,32 @@ +# This RESULTS file was obtained by running ./run.sh and then ./result.sh + +%WER 28.32 [ 5037 / 17783, 615 ins, 1171 del, 3251 sub ] exp/tri1/decode_nosp_dev/wer_10 +%WER 26.99 [ 4799 / 17783, 603 ins, 1169 del, 3027 sub ] exp/tri1/decode_nosp_dev_rescore/wer_10 +%WER 27.76 [ 7634 / 27500, 776 ins, 1689 del, 5169 sub ] exp/tri1/decode_nosp_test/wer_11 +%WER 26.52 [ 7292 / 27500, 766 ins, 1611 del, 4915 sub ] exp/tri1/decode_nosp_test_rescore/wer_11 +%WER 23.38 [ 4158 / 17783, 603 ins, 953 del, 2602 sub ] exp/tri2/decode_dev/wer_14 +%WER 21.98 [ 3909 / 17783, 597 ins, 910 del, 2402 sub ] exp/tri2/decode_dev_rescore/wer_14 +%WER 24.12 [ 4289 / 17783, 600 ins, 1014 del, 2675 sub ] exp/tri2/decode_nosp_dev/wer_12 +%WER 22.96 [ 4083 / 17783, 631 ins, 931 del, 2521 sub ] exp/tri2/decode_nosp_dev_rescore/wer_11 +%WER 23.30 [ 6408 / 27500, 727 ins, 1375 del, 4306 sub ] exp/tri2/decode_nosp_test/wer_13 +%WER 22.10 [ 6078 / 27500, 746 ins, 1281 del, 4051 sub ] exp/tri2/decode_nosp_test_rescore/wer_12 +%WER 22.31 [ 6134 / 27500, 794 ins, 1148 del, 4192 sub ] exp/tri2/decode_test/wer_13 +%WER 21.06 [ 5791 / 27500, 737 ins, 1147 del, 3907 sub ] exp/tri2/decode_test_rescore/wer_14 +%WER 19.99 [ 3554 / 17783, 570 ins, 816 del, 2168 sub ] exp/tri3_cleaned/decode_dev/wer_16 +%WER 
18.92 [ 3364 / 17783, 588 ins, 791 del, 1985 sub ] exp/tri3_cleaned/decode_dev_rescore/wer_15 +%WER 23.85 [ 4241 / 17783, 686 ins, 874 del, 2681 sub ] exp/tri3_cleaned/decode_dev.si/wer_13 +%WER 17.73 [ 4876 / 27500, 700 ins, 935 del, 3241 sub ] exp/tri3_cleaned/decode_test/wer_16 +%WER 16.72 [ 4599 / 27500, 686 ins, 906 del, 3007 sub ] exp/tri3_cleaned/decode_test_rescore/wer_16 +%WER 22.10 [ 6077 / 27500, 864 ins, 1075 del, 4138 sub ] exp/tri3_cleaned/decode_test.si/wer_13 +%WER 19.63 [ 3490 / 17783, 585 ins, 809 del, 2096 sub ] exp/tri3/decode_dev/wer_15 +%WER 18.56 [ 3300 / 17783, 558 ins, 817 del, 1925 sub ] exp/tri3/decode_dev_rescore/wer_16 +%WER 23.75 [ 4224 / 17783, 661 ins, 917 del, 2646 sub ] exp/tri3/decode_dev.si/wer_14 +%WER 17.92 [ 4928 / 27500, 730 ins, 921 del, 3277 sub ] exp/tri3/decode_test/wer_14 +%WER 16.80 [ 4621 / 27500, 650 ins, 973 del, 2998 sub ] exp/tri3/decode_test_rescore/wer_17 +%WER 22.16 [ 6095 / 27500, 849 ins, 1070 del, 4176 sub ] exp/tri3/decode_test.si/wer_13 +%WER 8.17 [ 1453 / 17783, 242 ins, 310 del, 901 sub ] exp/chain_cleaned/tdnnf_1a/decode_dev/wer_9 +%WER 7.61 [ 1354 / 17783, 236 ins, 300 del, 818 sub ] exp/chain_cleaned/tdnnf_1a/decode_dev_rescore/wer_9 +%WER 6.17 [ 1097 / 17783, 207 ins, 292 del, 598 sub ] exp/chain_cleaned/tdnnf_1a/decode_dev_rnnlm_lstm_tdnn_a_averaged/wer_10 +%WER 8.16 [ 2245 / 27500, 288 ins, 605 del, 1352 sub ] exp/chain_cleaned/tdnnf_1a/decode_test/wer_9 +%WER 7.75 [ 2131 / 27500, 264 ins, 643 del, 1224 sub ] exp/chain_cleaned/tdnnf_1a/decode_test_rescore/wer_10 +%WER 6.84 [ 1880 / 27500, 283 ins, 533 del, 1064 sub ] exp/chain_cleaned/tdnnf_1a/decode_test_rnnlm_lstm_tdnn_a_averaged/wer_8 diff --git a/egs/tedlium/s5_r3/local/chain/compare_wer_general.sh b/egs/tedlium/s5_r3/local/chain/compare_wer_general.sh index 88dde1ff0e2..c709e351e1e 100755 --- a/egs/tedlium/s5_r3/local/chain/compare_wer_general.sh +++ b/egs/tedlium/s5_r3/local/chain/compare_wer_general.sh @@ -55,7 +55,7 @@ for n in 0 1 2 3; do for x in $*; do set_names $x # sets $dirname and $epoch_infix decode_names=(dev${epoch_infix} dev${epoch_infix}_rescore test${epoch_infix} test${epoch_infix}_rescore) - wer=$(grep Sum $dirname/decode_${decode_names[$n]}/score*/*ys | utils/best_wer.sh | awk '{print $2}') + wer=$(grep WER $dirname/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') printf "% 10s" $wer done echo @@ -64,7 +64,7 @@ for n in 0 1 2 3; do for x in $*; do set_names $x # sets $dirname and $epoch_infix decode_names=(dev${epoch_infix} dev${epoch_infix}_rescore test${epoch_infix} test${epoch_infix}_rescore) - wer=$(grep Sum $dirname/decode_looped_${decode_names[$n]}/score*/*ys | utils/best_wer.sh | awk '{print $2}') + wer=$(grep WER $dirname/decode_looped_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') printf "% 10s" $wer done echo diff --git a/egs/tedlium/s5_r3/local/chain/run_tdnn.sh b/egs/tedlium/s5_r3/local/chain/run_tdnn.sh new file mode 120000 index 00000000000..d48449e28bd --- /dev/null +++ b/egs/tedlium/s5_r3/local/chain/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1c.sh \ No newline at end of file diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh index 40cdcb5b5ff..1204ff6ce4c 100755 --- a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh @@ -143,7 +143,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree 
|grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1b.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1b.sh index 9144508e62b..744c964db2f 100755 --- a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1b.sh +++ b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1b.sh @@ -55,7 +55,7 @@ nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned # are just hardcoded at this level, in the commands below. train_stage=-10 tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. -tdnnf_affix=_1a #affix for TDNNF directory, e.g. "a" or "b", in case we change the configuration. +tdnnf_affix=_1b #affix for TDNNF directory, e.g. "a" or "b", in case we change the configuration. common_egs_dir= # you can set this to use previously dumped egs. # End configuration section. @@ -148,7 +148,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1c.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1c.sh new file mode 100755 index 00000000000..faac365af54 --- /dev/null +++ b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1c.sh @@ -0,0 +1,249 @@ +#!/bin/bash + +# This is copied from tedlium/s5_r2/local/chain/tuning/run_tdnn_1g.sh setup, and it replaces the current run_tdnn_1b.sh script. + +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnnf_1b exp/chain_cleaned/tdnnf_1c +# System tdnnf_1b tdnnf_1c +# WER on dev(orig) 8.15 8.03 +# WER on dev(rescored) 7.69 7.44 +# WER on test(orig) 8.19 8.30 +# WER on test(rescored) 7.77 7.85 +# Final train prob -0.0692 -0.0669 +# Final valid prob -0.0954 -0.0838 +# Final train prob (xent) -0.9369 -0.9596 +# Final valid prob (xent) -1.0730 -1.0780 +# Num-params 25741728 9463968 + + +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnnf_1b/ +# exp/chain_cleaned/tdnnf_1b/: num-iters=945 nj=2..6 num-params=25.7M dim=40+100->3664 combine=-0.074->-0.071 (over 6) xent:train/valid[628,944,final]=(-1.07,-0.959,-0.937/-1.20,-1.10,-1.07) logprob:train/valid[628,944,final]=(-0.088,-0.070,-0.069/-0.111,-0.098,-0.095) +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnnf_1c +# exp/chain_cleaned/tdnn1c/: num-iters=228 nj=3..12 num-params=9.5M dim=40+100->3664 combine=-0.068->-0.068 (over 4) xent:train/valid[151,227,final]=(-1.15,-0.967,-0.960/-1.25,-1.09,-1.08) logprob:train/valid[151,227,final]=(-0.090,-0.068,-0.067/-0.102,-0.05,-0.084) + +## how you run this (note: this assumes that the run_tdnn.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn.sh + +# without cleanup: +# local/chain/run_tdnn.sh --train-set train --gmm tri3 --nnet3-affix "" & + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). 
+stage=0 +nj=15 +decode_nj=15 +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=1 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_affix=1c #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. +remove_egs=true + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +if [ $stage -le 17 ]; then + mkdir -p $dir + + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + affine_opts="l2-regularize=0.008 dropout-proportion=0.0 dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.008 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.008" + output_opts="l2-regularize=0.002" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1024 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf10 
$tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + linear-component name=prefinal-l dim=256 $linear_opts + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1024 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1024 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + +fi + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0 --constrained false" \ + --egs.chunk-width 150,110,100 \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 5000000 \ + --trainer.num-epochs 6 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.00025 \ + --trainer.optimization.final-effective-lrate 0.000025 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/tedlium/s5_r3/local/download_data.sh b/egs/tedlium/s5_r3/local/download_data.sh index c51effdd6fa..0b31a258613 100755 --- a/egs/tedlium/s5_r3/local/download_data.sh +++ b/egs/tedlium/s5_r3/local/download_data.sh @@ -25,7 +25,7 @@ else echo "$0: extracting TEDLIUM_release-3 data" tar xf "TEDLIUM_release-3.tgz" else - echo "$0: not downloading or un-tarring TEDLIUM_release2 because it already exists." + echo "$0: not downloading or un-tarring TEDLIUM_release3 because it already exists." fi fi diff --git a/egs/tedlium/s5_r3/local/rnnlm/tuning/run_lstm_tdnn_a.sh b/egs/tedlium/s5_r3/local/rnnlm/tuning/run_lstm_tdnn_a.sh index 32252db937d..73a684b6379 100755 --- a/egs/tedlium/s5_r3/local/rnnlm/tuning/run_lstm_tdnn_a.sh +++ b/egs/tedlium/s5_r3/local/rnnlm/tuning/run_lstm_tdnn_a.sh @@ -30,7 +30,6 @@ epochs=20 [ -z "$cmd" ] && cmd=$train_cmd text_from_audio=data/train/text -text=data/LM/train.txt wordlist=data/lang_chain/words.txt dev_sents=10000 text_dir=data/rnnlm/text @@ -44,8 +43,9 @@ done if [ $stage -le 0 ]; then mkdir -p $text_dir + gunzip -c db/TEDLIUM_release-3/LM/*.en.gz | sed 's/ <\/s>//g' > $text_dir/train.txt # shuffle text from audio and lm - cat $text_from_audio | cut -d ' ' -f2- | cat $text |\ + cat $text_from_audio | cut -d ' ' -f2- | cat $text_dir/train.txt |\ shuf > data/rnnlm/full_lm_data.shuffled # create dev and train sets based on audio and LM data cat data/rnnlm/full_lm_data.shuffled | head -n $dev_sents> $text_dir/dev.txt diff --git a/egs/tedlium/s5_r3/local/ted_download_lm.sh b/egs/tedlium/s5_r3/local/ted_download_lm.sh index ad833555b5f..6118876a0ab 100755 --- a/egs/tedlium/s5_r3/local/ted_download_lm.sh +++ b/egs/tedlium/s5_r3/local/ted_download_lm.sh @@ -13,4 +13,4 @@ echo "$0: downloading Tedlium 4 gram language models (it won't re-download if it wget --continue http://kaldi-asr.org/models/5/4gram_small.arpa.gz -P data/local/local_lm/data/arpa || exit 1 wget --continue http://kaldi-asr.org/models/5/4gram_big.arpa.gz -P data/local/local_lm/data/arpa || exit 1 -exit 0 \ No newline at end of file +exit 0 diff --git a/egs/tedlium/s5_r3/local/ted_download_rnnlm.sh b/egs/tedlium/s5_r3/local/ted_download_rnnlm.sh index 431d44c6ff6..6cbcaaa85ee 100755 --- a/egs/tedlium/s5_r3/local/ted_download_rnnlm.sh +++ b/egs/tedlium/s5_r3/local/ted_download_rnnlm.sh @@ -14,7 +14,7 @@ wget --continue http://kaldi-asr.org/models/5/tedlium_rnnlm.tgz -P exp/rnnlm_lst cd exp/rnnlm_lstm_tdnn_a_averaged tar -xvzf tedlium_rnnlm.tgz || exit 1 rm tedlium_rnnlm.tgz -mkdir config +mkdir -p config cd ../.. 
cp data/lang/words.txt exp/rnnlm_lstm_tdnn_a_averaged/config/words.txt echo " 152217" >> exp/rnnlm_lstm_tdnn_a_averaged/config/words.txt diff --git a/egs/tedlium/s5_r3/results.sh b/egs/tedlium/s5_r3/results.sh index 98bcab94ec5..3e318cb4bc7 100755 --- a/egs/tedlium/s5_r3/results.sh +++ b/egs/tedlium/s5_r3/results.sh @@ -1,10 +1,25 @@ #!/bin/bash +# The output of this script (after successfully running ./run.sh) can be found in the RESULTS file. + filter_regexp=. [ $# -ge 1 ] && filter_regexp=$1 -for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done 2>/dev/null - for x in exp/{mono,tri,sgmm,nnet,dnn,lstm,chain}*/decode*; do [ -d $x ] && grep Sum $x/score_*/*.sys | utils/best_wer.sh; done 2>/dev/null | grep $filter_regexp - for x in exp/{mono,tri,sgmm,nnet,dnn,lstm,chain}*/*/decode*; do [ -d $x ] && grep Sum $x/score_*/*.sys | utils/best_wer.sh; done 2>/dev/null | grep $filter_regexp +for x in exp/*/decode*; do + [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; +done 2>/dev/null + +for x in exp/{mono,tri,sgmm,nnet,dnn,lstm,chain}*/decode*; do + [ -d $x ] && grep Sum $x/score_*/*.sys | utils/best_wer.sh; +done 2>/dev/null | grep $filter_regexp + +for x in exp/{mono,tri,sgmm,nnet,dnn,lstm,chain}*/*/decode*; do + [ -d $x ] && grep Sum $x/score_*/*.sys | utils/best_wer.sh; +done 2>/dev/null | grep $filter_regexp + +for x in exp/{mono,tri,sgmm,nnet,dnn,lstm,chain}*/*/decode*; do + [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; +done 2>/dev/null | grep $filter_regexp + exit 0 diff --git a/egs/tunisian_msa/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/tunisian_msa/s5/local/chain/tuning/run_tdnn_1a.sh index a2662584549..ab68ba6fb68 100755 --- a/egs/tunisian_msa/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/tunisian_msa/s5/local/chain/tuning/run_tdnn_1a.sh @@ -142,7 +142,7 @@ if [ $stage -le 13 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) affine_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" diff --git a/egs/uw3/v1/local/chain/run_cnn_1a.sh b/egs/uw3/v1/local/chain/run_cnn_1a.sh index 582bfc90105..e3548609da7 100755 --- a/egs/uw3/v1/local/chain/run_cnn_1a.sh +++ b/egs/uw3/v1/local/chain/run_cnn_1a.sh @@ -130,7 +130,7 @@ if [ $stage -le 4 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) common1="required-time-offsets=0 height-offsets=-2,-1,0,1,2 num-filters-out=12" mkdir -p $dir/configs diff --git a/egs/uw3/v1/local/process_data.py b/egs/uw3/v1/local/process_data.py index 3643c0aca89..23b8e5402cf 100755 --- a/egs/uw3/v1/local/process_data.py +++ b/egs/uw3/v1/local/process_data.py @@ -52,10 +52,10 @@ # The dataset is randomly split train 95% and test 5% coin = random.randint(0, 20) if coin >= 1: - train_text_fh.write(utt_id + ' ' + text + '\n') + train_text_fh.write("{} {}\n".format(utt_id, text)) train_utt2spk_fh.write("{} {}\n".format(utt_id, page_count)) - train_image_fh.write("{} {}\n".format(utt_id, image_path) + 
train_image_fh.write("{} {}\n".format(utt_id, image_path)) elif coin < 1: test_text_fh.write("{} {}\n".format(utt_id, text)) test_utt2spk_fh.write("{} {}\n".format(utt_id, page_count)) - train_image_fh.write("{} {}\n".format(utt_id, image_path) + train_image_fh.write("{} {}\n".format(utt_id, image_path)) diff --git a/egs/uw3/v1/local/unk_arc_post_to_transcription.py b/egs/uw3/v1/local/unk_arc_post_to_transcription.py index f8b69820601..1f1404b5165 100755 --- a/egs/uw3/v1/local/unk_arc_post_to_transcription.py +++ b/egs/uw3/v1/local/unk_arc_post_to_transcription.py @@ -25,6 +25,7 @@ data/lang/oov.int """ import argparse +import io import os import sys parser = argparse.ArgumentParser(description="""uses phones to convert unk to word""") @@ -42,17 +43,17 @@ args = parser.parse_args() ### main ### -phone_handle = open(args.phones, 'r', encoding='latin-1') # Create file handles -word_handle = open(args.words, 'r', encoding='latin-1') -unk_handle = open(args.unk,'r', encoding='latin-1') +phone_handle = open(args.phones, 'r', encoding='utf8') # Create file handles +word_handle = open(args.words, 'r', encoding='utf8') +unk_handle = open(args.unk,'r', encoding='utf8') if args.one_best_arc_post == '-': - arc_post_handle = sys.stdin + arc_post_handle = io.TextIOWrapper(sys.stdin.buffer, encoding='utf8') else: - arc_post_handle = open(args.one_best_arc_post, 'r', encoding='latin-1') + arc_post_handle = open(args.one_best_arc_post, 'r', encoding='utf8') if args.output_text == '-': - output_text_handle = sys.stdout + output_text_handle = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8') else: - output_text_handle = open(args.output_text, 'w', encoding='latin-1') + output_text_handle = open(args.output_text, 'w', encoding='utf8') id2phone = dict() # Stores the mapping from phone_id (int) to phone (char) phones_data = phone_handle.read().strip().split("\n") diff --git a/egs/voxceleb/v1/local/make_musan.py b/egs/voxceleb/v1/local/make_musan.py deleted file mode 100755 index 565bfce0cc9..00000000000 --- a/egs/voxceleb/v1/local/make_musan.py +++ /dev/null @@ -1,123 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2015 David Snyder -# 2018 Ewald Enzinger -# Apache 2.0. -# -# Modified version of egs/sre16/v1/local/make_musan.py (commit e3fb7c4a0da4167f8c94b80f4d3cc5ab4d0e22e8). -# This version uses the raw MUSAN audio files (16 kHz) and does not use sox to resample at 8 kHz. -# -# This file is meant to be invoked by make_musan.sh. 
- -import os, sys - -def process_music_annotations(path): - utt2spk = {} - utt2vocals = {} - lines = open(path, 'r').readlines() - for line in lines: - utt, genres, vocals, musician = line.rstrip().split()[:4] - # For this application, the musican ID isn't important - utt2spk[utt] = utt - utt2vocals[utt] = vocals == "Y" - return utt2spk, utt2vocals - -def prepare_music(root_dir, use_vocals): - utt2vocals = {} - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - music_dir = os.path.join(root_dir, "music") - for root, dirs, files in os.walk(music_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - elif str(file) == "ANNOTATIONS": - utt2spk_part, utt2vocals_part = process_music_annotations(file_path) - utt2spk.update(utt2spk_part) - utt2vocals.update(utt2vocals_part) - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2vocals: - if utt in utt2wav: - if use_vocals or not utt2vocals[utt]: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" - num_good_files += 1 - else: - print("Missing file {}".format(utt)) - num_bad_files += 1 - print("In music directory, processed {} files; {} had missing wav data".format(num_good_files, num_bad_files)) - return utt2spk_str, utt2wav_str - -def prepare_speech(root_dir): - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - speech_dir = os.path.join(root_dir, "speech") - for root, dirs, files in os.walk(speech_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - utt2spk[utt] = utt - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2spk: - if utt in utt2wav: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" - num_good_files += 1 - else: - print("Missing file {}".format(utt)) - num_bad_files += 1 - print("In speech directory, processed {} files; {} had missing wav data".format(num_good_files, num_bad_files)) - return utt2spk_str, utt2wav_str - -def prepare_noise(root_dir): - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - noise_dir = os.path.join(root_dir, "noise") - for root, dirs, files in os.walk(noise_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - utt2spk[utt] = utt - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2spk: - if utt in utt2wav: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" - num_good_files += 1 - else: - print("Missing file {}".format(utt)) - num_bad_files += 1 - print("In noise directory, processed {} files; {} had missing wav data".format(num_good_files, num_bad_files)) - return utt2spk_str, utt2wav_str - -def main(): - in_dir = sys.argv[1] - out_dir = sys.argv[2] - use_vocals = sys.argv[3] == "Y" - utt2spk_music, utt2wav_music = prepare_music(in_dir, use_vocals) - utt2spk_speech, utt2wav_speech = prepare_speech(in_dir) - utt2spk_noise, utt2wav_noise = prepare_noise(in_dir) - utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise - utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise - wav_fi = open(os.path.join(out_dir, "wav.scp"), 'w') - wav_fi.write(utt2wav) - utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), 'w') - 
utt2spk_fi.write(utt2spk) - - -if __name__=="__main__": - main() diff --git a/egs/voxceleb/v1/local/make_musan.sh b/egs/voxceleb/v1/local/make_musan.sh deleted file mode 100755 index 1565ef0d85c..00000000000 --- a/egs/voxceleb/v1/local/make_musan.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -# Copyright 2015 David Snyder -# Apache 2.0. -# -# Copy of egs/sre16/v1/local/make_musan.sh (commit e3fb7c4a0da4167f8c94b80f4d3cc5ab4d0e22e8). -# -# This script, called by ../run.sh, creates the MUSAN -# data directory. The required dataset is freely available at -# http://www.openslr.org/17/ - -set -e -in_dir=$1 -data_dir=$2 -use_vocals='Y' - -mkdir -p local/musan.tmp - -echo "Preparing ${data_dir}/musan..." -mkdir -p ${data_dir}/musan -local/make_musan.py ${in_dir} ${data_dir}/musan ${use_vocals} - -utils/fix_data_dir.sh ${data_dir}/musan - -grep "music" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_music -grep "speech" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_speech -grep "noise" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_noise -utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_music \ - ${data_dir}/musan ${data_dir}/musan_music -utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_speech \ - ${data_dir}/musan ${data_dir}/musan_speech -utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_noise \ - ${data_dir}/musan ${data_dir}/musan_noise - -utils/fix_data_dir.sh ${data_dir}/musan_music -utils/fix_data_dir.sh ${data_dir}/musan_speech -utils/fix_data_dir.sh ${data_dir}/musan_noise - -rm -rf local/musan.tmp - diff --git a/egs/voxceleb/v1/local/make_voxceleb1_v2.pl b/egs/voxceleb/v1/local/make_voxceleb1_v2.pl new file mode 100755 index 00000000000..905b43d31a6 --- /dev/null +++ b/egs/voxceleb/v1/local/make_voxceleb1_v2.pl @@ -0,0 +1,123 @@ +#!/usr/bin/perl +# +# Copyright 2018 Ewald Enzinger +# 2018 David Snyder +# 2019 Soonshin Seo +# +# Usage: make_voxceleb1_v2.pl /export/voxceleb1 dev data/dev +# +# The VoxCeleb1 corpus underwent several updates that changed the directory and speaker ID format. +# The script 'make_voxceleb1.pl' works for the oldest version of the corpus. +# This script should be used if you've downloaded the corpus recently. + +if (@ARGV != 3) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. $0 /export/voxceleb1 dev data/dev\n"; + exit(1); +} + +($data_base, $dataset, $out_dir) = @ARGV; + +if ("$dataset" ne "dev" && "$dataset" ne "test") { + die "dataset parameter must be 'dev' or 'test'!"; +} + +if (system("mkdir -p $out_dir") != 0) { + die "Error making directory $out_dir"; +} + +opendir my $dh, "$data_base/$dataset/wav" or die "Cannot open directory: $!"; +my @spkr_dirs = grep {-d "$data_base/$dataset/wav/$_" && ! /^\.{1,2}$/} readdir($dh); +closedir $dh; + +if ($dataset eq "dev"){ + open(SPKR_TRAIN, ">", "$out_dir/utt2spk") or die "could not open the output file $out_dir/utt2spk"; + open(WAV_TRAIN, ">", "$out_dir/wav.scp") or die "could not open the output file $out_dir/wav.scp"; + + foreach (@spkr_dirs) { + my $spkr_id = $_; + opendir my $dh, "$data_base/$dataset/wav/$spkr_id/" or die "Cannot open directory: $!"; + my @rec_dirs = grep {-d "$data_base/$dataset/wav/$spkr_id/$_" && ! 
/^\.{1,2}$/} readdir($dh); + closedir $dh; + foreach (@rec_dirs) { + my $rec_id = $_; + opendir my $dh, "$data_base/$dataset/wav/$spkr_id/$rec_id/" or die "Cannot open directory: $!"; + my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh); + closedir $dh; + foreach (@files) { + my $name = $_; + my $wav = "$data_base/$dataset/wav/$spkr_id/$rec_id/$name.wav"; + my $utt_id = "$spkr_id-$rec_id-$name"; + print WAV_TRAIN "$utt_id", " $wav", "\n"; + print SPKR_TRAIN "$utt_id", " $spkr_id", "\n"; + } + } + } + close(SPKR_TRAIN) or die; + close(WAV_TRAIN) or die; +} + +if ($dataset eq "test"){ + if (! -e "$data_base/voxceleb1_test_v2.txt") { + system("wget -O $data_base/voxceleb1_test_v2.txt http://www.openslr.org/resources/49/voxceleb1_test_v2.txt"); + } + + open(TRIAL_IN, "<", "$data_base/voxceleb1_test_v2.txt") or die "could not open the verification trials file $data_base/voxceleb1_test_v2.txt"; + open(TRIAL_OUT, ">", "$out_dir/trials") or die "Could not open the output file $out_test_dir/trials"; + open(SPKR_TEST, ">", "$out_dir/utt2spk") or die "could not open the output file $out_dir/utt2spk"; + open(WAV_TEST, ">", "$out_dir/wav.scp") or die "could not open the output file $out_dir/wav.scp"; + + my $test_spkrs = (); + while () { + chomp; + my ($tar_or_non, $path1, $path2) = split; + # Create entry for left-hand side of trial + my ($spkr_id, $rec_id, $name) = split('/', $path1); + my $utt_id1 = "$spkr_id-$rec_id-$name"; + $test_spkrs{$spkr_id} = (); + + # Create entry for right-hand side of trial + my ($spkr_id, $rec_id, $name) = split('/', $path2); + my $utt_id2 = "$spkr_id-$rec_id-$name"; + $test_spkrs{$spkr_id} = (); + + my $target = "nontarget"; + if ($tar_or_non eq "1") { + $target = "target"; + } + print TRIAL_OUT "$utt_id1 $utt_id2 $target\n"; + } + + foreach (@spkr_dirs) { + my $spkr_id = $_; + opendir my $dh, "$data_base/$dataset/wav/$spkr_id/" or die "Cannot open directory: $!"; + my @rec_dirs = grep {-d "$data_base/$dataset/wav/$spkr_id/$_" && ! /^\.{1,2}$/} readdir($dh); + closedir $dh; + foreach (@rec_dirs) { + my $rec_id = $_; + opendir my $dh, "$data_base/$dataset/wav/$spkr_id/$rec_id/" or die "Cannot open directory: $!"; + my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh); + closedir $dh; + foreach (@files) { + my $name = $_; + my $wav = "$data_base/$dataset/wav/$spkr_id/$rec_id/$name.wav"; + my $utt_id = "$spkr_id-$rec_id-$name"; + print WAV_TEST "$utt_id", " $wav", "\n"; + print SPKR_TEST "$utt_id", " $spkr_id", "\n"; + } + } + } + close(SPKR_TEST) or die; + close(WAV_TEST) or die; + close(TRIAL_OUT) or die; + close(TRIAL_IN) or die; +} + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} +system("env LC_COLLATE=C utils/fix_data_dir.sh $out_dir"); +if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} diff --git a/egs/voxceleb/v1/run.sh b/egs/voxceleb/v1/run.sh index 8af2226423d..500c05c5db6 100755 --- a/egs/voxceleb/v1/run.sh +++ b/egs/voxceleb/v1/run.sh @@ -14,7 +14,7 @@ set -e mfccdir=`pwd`/mfcc vaddir=`pwd`/mfcc -# The trials file is downloaded by local/make_voxceleb1.pl. +# The trials file is downloaded by local/make_voxceleb1_v2.pl. 
voxceleb1_trials=data/voxceleb1_test/trials voxceleb1_root=/export/corpora/VoxCeleb1 voxceleb2_root=/export/corpora/VoxCeleb2 @@ -24,11 +24,14 @@ stage=0 if [ $stage -le 0 ]; then local/make_voxceleb2.pl $voxceleb2_root dev data/voxceleb2_train local/make_voxceleb2.pl $voxceleb2_root test data/voxceleb2_test - # This script reates data/voxceleb1_test and data/voxceleb1_train. + # This script creates data/voxceleb1_test and data/voxceleb1_train for latest version of VoxCeleb1. # Our evaluation set is the test portion of VoxCeleb1. - local/make_voxceleb1.pl $voxceleb1_root data + local/make_voxceleb1_v2.pl $voxceleb1_root dev data/voxceleb1_train + local/make_voxceleb1_v2.pl $voxceleb1_root test data/voxceleb1_test + # if you downloaded the dataset soon after it was released, you will want to use the make_voxceleb1.pl script instead. + # local/make_voxceleb1.pl $voxceleb1_root data # We'll train on all of VoxCeleb2, plus the training portion of VoxCeleb1. - # This should give 7,351 speakers and 1,277,503 utterances. + # This should give 7,323 speakers and 1,276,888 utterances. utils/combine_data.sh data/train data/voxceleb2_train data/voxceleb2_test data/voxceleb1_train fi diff --git a/egs/voxceleb/v2/run.sh b/egs/voxceleb/v2/run.sh index 37bb60fe35c..7c70e4a42c1 100755 --- a/egs/voxceleb/v2/run.sh +++ b/egs/voxceleb/v2/run.sh @@ -15,7 +15,7 @@ mfccdir=`pwd`/mfcc vaddir=`pwd`/mfcc -# The trials file is downloaded by local/make_voxceleb1.pl. +# The trials file is downloaded by local/make_voxceleb1_v2.pl. voxceleb1_trials=data/voxceleb1_test/trials voxceleb1_root=/export/corpora/VoxCeleb1 voxceleb2_root=/export/corpora/VoxCeleb2 @@ -27,11 +27,14 @@ stage=0 if [ $stage -le 0 ]; then local/make_voxceleb2.pl $voxceleb2_root dev data/voxceleb2_train local/make_voxceleb2.pl $voxceleb2_root test data/voxceleb2_test - # This script creates data/voxceleb1_test and data/voxceleb1_train. + # This script creates data/voxceleb1_test and data/voxceleb1_train for latest version of VoxCeleb1. # Our evaluation set is the test portion of VoxCeleb1. - local/make_voxceleb1.pl $voxceleb1_root data + local/make_voxceleb1_v2.pl $voxceleb1_root dev data/voxceleb1_train + local/make_voxceleb1_v2.pl $voxceleb1_root test data/voxceleb1_test + # if you downloaded the dataset soon after it was released, you will want to use the make_voxceleb1.pl script instead. + # local/make_voxceleb1.pl $voxceleb1_root data # We'll train on all of VoxCeleb2, plus the training portion of VoxCeleb1. - # This should give 7,351 speakers and 1,277,503 utterances. + # This should give 7,323 speakers and 1,276,888 utterances. utils/combine_data.sh data/train data/voxceleb2_train data/voxceleb2_test data/voxceleb1_train fi @@ -81,7 +84,7 @@ if [ $stage -le 2 ]; then # Prepare the MUSAN corpus, which consists of music, speech, and noise # suitable for augmentation. - local/make_musan.sh $musan_root data + steps/data/make_musan.sh --sampling-rate 16000 $musan_root data # Get the duration of the MUSAN recordings. This will be used by the # script augment_data_dir.py. 
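The updated totals in the comments above (7,323 speakers and 1,276,888 utterances) correspond to the current VoxCeleb1 download that `make_voxceleb1_v2.pl` expects. A quick hedged sanity check after `utils/combine_data.sh`, assuming the standard Kaldi data-directory layout where `spk2utt` has one line per speaker and `utt2spk` one line per utterance:

```bash
# Illustrative only -- verify the combined training set matches the counts
# quoted in the comments above.
wc -l < data/train/spk2utt   # expected ~7323 speakers
wc -l < data/train/utt2spk   # expected ~1276888 utterances
```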
diff --git a/egs/voxforge/s5/local/voxforge_prepare_dict.sh b/egs/voxforge/s5/local/voxforge_prepare_dict.sh index 4242af29d25..daf4e2326e5 100755 --- a/egs/voxforge/s5/local/voxforge_prepare_dict.sh +++ b/egs/voxforge/s5/local/voxforge_prepare_dict.sh @@ -49,7 +49,7 @@ if [[ "$(uname)" == "Darwin" ]]; then alias readlink=greadlink fi -sequitur=$KALDI_ROOT/tools/sequitur +sequitur=$KALDI_ROOT/tools/sequitur-g2p export PATH=$PATH:$sequitur/bin export PYTHONPATH=$PYTHONPATH:`utils/make_absolute.sh $sequitur/lib/python*/site-packages` diff --git a/egs/vystadial_cz/s5b/local/chain/tuning/run_tdnn_1a.sh b/egs/vystadial_cz/s5b/local/chain/tuning/run_tdnn_1a.sh index 496ee5e84ca..844ccf80677 100755 --- a/egs/vystadial_cz/s5b/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/vystadial_cz/s5b/local/chain/tuning/run_tdnn_1a.sh @@ -148,7 +148,7 @@ if [ $stage -le 13 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.05 dropout-per-dim-continuous=true" output_opts="l2-regularize=0.02 bottleneck-dim=192" diff --git a/egs/wsj/s5/local/chain/e2e/run_tdnnf_flatstart_char.sh b/egs/wsj/s5/local/chain/e2e/run_tdnnf_flatstart_char.sh deleted file mode 100755 index 4ab0cf58d53..00000000000 --- a/egs/wsj/s5/local/chain/e2e/run_tdnnf_flatstart_char.sh +++ /dev/null @@ -1,225 +0,0 @@ -#!/bin/bash -# Copyright 2017 Hossein Hadian - -# This script performs chain training in a flat-start manner -# and without building or using any context-dependency tree. -# It does not use ivecors or other forms of speaker adaptation -# It is called from run_e2e_char.sh - -# Note: this script is configured as grapheme-based, if you want -# to run it in phoneme mode, you'll need to change _char -# to _nosp everywhere. - -# This is the same as run_tdnn_lstm_flatstart.sh except it uses -# TDNN-F (and CMVN is disabled). - - -# local/chain/compare_wer.sh exp/chain/e2e_tdnn_lstm_bichar_1a exp/chain/e2e_tdnnf_bichar1a -# System e2e_tdnn_lstm_bichar_1a e2e_tdnnf_bichar1a -# WER dev93 (tgpr) 9.42 8.89 -# WER dev93 (tg) 8.85 8.20 -# WER dev93 (big-dict,tgpr) 7.70 6.96 -# WER dev93 (big-dict,fg) 6.79 6.01 -# WER eval92 (tgpr) 6.42 6.08 -# WER eval92 (tg) 6.11 5.79 -# WER eval92 (big-dict,tgpr) 4.50 4.39 -# WER eval92 (big-dict,fg) 4.09 3.88 -# Final train prob -0.0610 -0.0598 -# Final valid prob -0.0836 -0.0854 -# Final train prob (xent) -# Final valid prob (xent) -# Num-params 9219188 7421044 - -# steps/info/chain_dir_info.pl exp/chain/e2e_tdnnf_bichar1a -# exp/chain/e2e_tdnnf_bichar1a: num-iters=180 nj=2..8 num-params=7.4M dim=40->3444 combine=-0.064->-0.064 (over 3) logprob:train/valid[119,179,final]=(-0.093,-0.060,-0.060/-0.107,-0.086,-0.085) - - -set -e - -# configs for 'chain' -stage=0 -train_stage=-10 -get_egs_stage=-10 -affix=1a - -# training options -dropout_schedule='0,0@0.20,0.5@0.50,0' -num_epochs=10 -num_jobs_initial=2 -num_jobs_final=8 -minibatch_size=150=128,64/300=64,32/600=32,16/1200=8 -common_egs_dir= -l2_regularize=0.00005 -frames_per_iter=3000000 -cmvn_opts="--norm-means=false --norm-vars=false" -train_set=train_si284_spe2e_hires -test_sets="test_dev93 test_eval92" - -# End configuration section. -echo "$0 $@" # Print the command line for logging - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh - -if ! 
cuda-compiled; then - cat <$lang/topo -fi - -if [ $stage -le 1 ]; then - echo "$0: Estimating a phone language model for the denominator graph..." - mkdir -p $treedir/log - $train_cmd $treedir/log/make_phone_lm.log \ - cat data/$train_set/text \| \ - steps/nnet3/chain/e2e/text_to_phones.py --between-silprob 0.1 \ - data/lang_char \| \ - utils/sym2int.pl -f 2- data/lang_char/phones.txt \| \ - chain-est-phone-lm --num-extra-lm-states=2000 \ - ark:- $treedir/phone_lm.fst - steps/nnet3/chain/e2e/prepare_e2e.sh --nj 30 --cmd "$train_cmd" \ - --type biphone \ - --shared-phones true \ - data/$train_set $lang $treedir -fi - -if [ $stage -le 2 ]; then - echo "$0: creating neural net configs using the xconfig parser"; - num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') - tdnn_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim-continuous=true" - tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" - linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" - prefinal_opts="l2-regularize=0.01" - output_opts="l2-regularize=0.005" - - mkdir -p $dir/configs - cat < $dir/configs/network.xconfig - - input dim=40 name=input - - relu-batchnorm-dropout-layer name=tdnn1 input=Append(-1,0,1) $tdnn_opts dim=1024 - tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 - tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 - tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 - tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=0 - tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 - tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 - tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 - tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 - tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 - tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 - tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 - tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 - linear-component name=prefinal-l dim=192 $linear_opts - - - prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192 - output-layer name=output include-log-softmax=false dim=$num_targets $output_opts - -EOF - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs -fi - -if [ $stage -le 3 ]; then - # no need to store the egs in a shared storage because we always - # remove them. Anyway, it takes only 5 minutes to generate them. 
- - steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ - --cmd "$decode_cmd" \ - --feat.cmvn-opts "$cmvn_opts" \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize $l2_regularize \ - --chain.apply-deriv-weights false \ - --egs.dir "$common_egs_dir" \ - --egs.stage $get_egs_stage \ - --egs.opts "" \ - --trainer.dropout-schedule $dropout_schedule \ - --trainer.num-chunk-per-minibatch $minibatch_size \ - --trainer.frames-per-iter $frames_per_iter \ - --trainer.num-epochs $num_epochs \ - --trainer.optimization.momentum 0 \ - --trainer.optimization.num-jobs-initial $num_jobs_initial \ - --trainer.optimization.num-jobs-final $num_jobs_final \ - --trainer.optimization.initial-effective-lrate 0.0005 \ - --trainer.optimization.final-effective-lrate 0.00005 \ - --trainer.optimization.shrink-value 1.0 \ - --trainer.max-param-change 2.0 \ - --cleanup.remove-egs true \ - --feat-dir data/${train_set} \ - --tree-dir $treedir \ - --dir $dir || exit 1; -fi - -if [ $stage -le 4 ]; then - # The reason we are using data/lang here, instead of $lang, is just to - # emphasize that it's not actually important to give mkgraph.sh the - # lang directory with the matched topology (since it gets the - # topology file from the model). So you could give it a different - # lang directory, one that contained a wordlist and LM of your choice, - # as long as phones.txt was compatible. - - utils/lang/check_phones_compatible.sh \ - data/lang_char_test_tgpr/phones.txt $lang/phones.txt - utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_char_test_tgpr \ - $dir $treedir/graph_tgpr || exit 1; - - utils/lang/check_phones_compatible.sh \ - data/lang_char_test_bd_tgpr/phones.txt $lang/phones.txt - utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_char_test_bd_tgpr \ - $dir $treedir/graph_bd_tgpr || exit 1; -fi - -if [ $stage -le 5 ]; then - frames_per_chunk=150 - rm $dir/.error 2>/dev/null || true - - for data in $test_sets; do - ( - data_affix=$(echo $data | sed s/test_//) - nspk=$(wc -l 3444 combine=-0.064->-0.064 (over 3) logprob:train/valid[119,179,final]=(-0.093,-0.060,-0.060/-0.107,-0.086,-0.085) + + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +affix=1a + +# training options +dropout_schedule='0,0@0.20,0.5@0.50,0' +num_epochs=10 +num_jobs_initial=2 +num_jobs_final=8 +minibatch_size=150=128,64/300=64,32/600=32,16/1200=8 +common_egs_dir= +l2_regularize=0.00005 +frames_per_iter=3000000 +cmvn_opts="--norm-means=false --norm-vars=false" +train_set=train_si284_spe2e_hires +test_sets="test_dev93 test_eval92" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 1 ]; then + echo "$0: Estimating a phone language model for the denominator graph..." 
+ mkdir -p $treedir/log + $train_cmd $treedir/log/make_phone_lm.log \ + cat data/$train_set/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py --between-silprob 0.1 \ + data/lang_char \| \ + utils/sym2int.pl -f 2- data/lang_char/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=2000 \ + ark:- $treedir/phone_lm.fst + steps/nnet3/chain/e2e/prepare_e2e.sh --nj 30 --cmd "$train_cmd" \ + --type biphone \ + --shared-phones true \ + data/$train_set $lang $treedir +fi + +if [ $stage -le 2 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + tdnn_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.005" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + + input dim=40 name=input + + relu-batchnorm-dropout-layer name=tdnn1 input=Append(-1,0,1) $tdnn_opts dim=1024 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + linear-component name=prefinal-l dim=192 $linear_opts + + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs +fi + +if [ $stage -le 3 ]; then + # no need to store the egs in a shared storage because we always + # remove them. Anyway, it takes only 5 minutes to generate them. 
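Both e2e TDNN-F scripts set `dropout_schedule='0,0@0.20,0.5@0.50,0'`. As I understand the nnet3 convention, each entry is a dropout proportion, optionally tagged with `@fraction` giving the point in training (fraction of data processed) where it applies; untagged first and last entries are pinned to 0.0 and 1.0, with linear interpolation in between. The sketch below only illustrates that reading of the string; it is not the Kaldi implementation.

```python
# Illustrative only: interpret '0,0@0.20,0.5@0.50,0' as a piecewise-linear
# dropout proportion over training progress (0 until 20%, ramp to 0.5 at
# 50%, back to 0 at the end).
def parse_schedule(schedule):
    points = []
    for i, entry in enumerate(schedule.split(',')):
        if '@' in entry:
            value, frac = entry.split('@')
            points.append((float(frac), float(value)))
        else:
            # Untagged first/last entries anchor at progress 0.0 / 1.0.
            points.append((0.0 if i == 0 else 1.0, float(entry)))
    return sorted(points)

def dropout_at(points, progress):
    for (x0, y0), (x1, y1) in zip(points, points[1:]):
        if x0 <= progress <= x1:
            if x1 == x0:
                return y1
            return y0 + (y1 - y0) * (progress - x0) / (x1 - x0)
    return points[-1][1]

points = parse_schedule('0,0@0.20,0.5@0.50,0')
for p in (0.0, 0.2, 0.35, 0.5, 0.75, 1.0):
    print(p, round(dropout_at(points, p), 3))
```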
+ + steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.cmvn-opts "$cmvn_opts" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize $l2_regularize \ + --chain.apply-deriv-weights false \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter $frames_per_iter \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.momentum 0 \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate 0.0005 \ + --trainer.optimization.final-effective-lrate 0.00005 \ + --trainer.optimization.shrink-value 1.0 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir data/${train_set} \ + --tree-dir $treedir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 4 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/lang/check_phones_compatible.sh \ + data/lang_char_test_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_char_test_tgpr \ + $dir $treedir/graph_tgpr || exit 1; + + utils/lang/check_phones_compatible.sh \ + data/lang_char_test_bd_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_char_test_bd_tgpr \ + $dir $treedir/graph_bd_tgpr || exit 1; +fi + +if [ $stage -le 5 ]; then + frames_per_chunk=150 + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l 1397 combine=-0.064->-0.064 (over 2) logprob:train/valid[119,179,final]=(-0.086,-0.060,-0.060/-0.099,-0.087,-0.087) + + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +affix=1b + +# training options +dropout_schedule='0,0@0.20,0.5@0.50,0' +num_epochs=10 +num_jobs_initial=2 +num_jobs_final=8 +minibatch_size=150=128,64/300=64,32/600=32,16/1200=8 +common_egs_dir= +l2_regularize=0.00005 +frames_per_iter=3000000 +cmvn_opts="--norm-means=false --norm-vars=false" +train_set=train_si284_spe2e_hires +test_sets="test_dev93 test_eval92" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 1 ]; then + echo "$0: Estimating a phone language model for the denominator graph..." 
+ mkdir -p $treedir/log + $train_cmd $treedir/log/make_phone_lm.log \ + cat data/$train_set/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py --between-silprob 0.1 \ + data/lang_char \| \ + utils/sym2int.pl -f 2- data/lang_char/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=2000 \ + ark:- $treedir/phone_lm.fst + steps/nnet3/chain/e2e/prepare_e2e.sh --nj 30 --cmd "$train_cmd" \ + --type biphone \ + --shared-phones true \ + --tie true \ + --min-biphone-count 100 \ + --min-monophone-count 20 \ + data/$train_set $lang $treedir +fi + +if [ $stage -le 2 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + tdnn_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.005" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + + input dim=40 name=input + + relu-batchnorm-dropout-layer name=tdnn1 input=Append(-1,0,1) $tdnn_opts dim=1024 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + linear-component name=prefinal-l dim=192 $linear_opts + + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs +fi + +if [ $stage -le 3 ]; then + # no need to store the egs in a shared storage because we always + # remove them. Anyway, it takes only 5 minutes to generate them. 
+ + steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.cmvn-opts "$cmvn_opts" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize $l2_regularize \ + --chain.apply-deriv-weights false \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter $frames_per_iter \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.momentum 0 \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate 0.0005 \ + --trainer.optimization.final-effective-lrate 0.00005 \ + --trainer.optimization.shrink-value 1.0 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir data/${train_set} \ + --tree-dir $treedir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 4 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/lang/check_phones_compatible.sh \ + data/lang_char_test_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_char_test_tgpr \ + $dir $treedir/graph_tgpr || exit 1; + + utils/lang/check_phones_compatible.sh \ + data/lang_char_test_bd_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_char_test_bd_tgpr \ + $dir $treedir/graph_bd_tgpr || exit 1; +fi + +if [ $stage -le 5 ]; then + frames_per_chunk=150 + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l $dir/configs/network.xconfig diff --git a/egs/wsj/s5/local/chain/tuning/run_cnn_tdnn_1b.sh b/egs/wsj/s5/local/chain/tuning/run_cnn_tdnn_1b.sh index a3a747ed743..9db76e94430 100755 --- a/egs/wsj/s5/local/chain/tuning/run_cnn_tdnn_1b.sh +++ b/egs/wsj/s5/local/chain/tuning/run_cnn_tdnn_1b.sh @@ -170,7 +170,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/wsj/s5/local/chain/tuning/run_cnn_tdnn_1c.sh b/egs/wsj/s5/local/chain/tuning/run_cnn_tdnn_1c.sh index dc47681593f..36ec5bb61af 100755 --- a/egs/wsj/s5/local/chain/tuning/run_cnn_tdnn_1c.sh +++ b/egs/wsj/s5/local/chain/tuning/run_cnn_tdnn_1c.sh @@ -155,7 +155,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) cnn_opts="l2-regularize=0.01" ivector_affine_opts="l2-regularize=0.01" tdnn_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim-continuous=true" diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_1a.sh index 10a9c608811..8d44db6f917 
100755 --- a/egs/wsj/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_1a.sh @@ -183,7 +183,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_1b.sh index a2bb7e93388..544b9b04a0a 100755 --- a/egs/wsj/s5/local/chain/tuning/run_tdnn_1b.sh +++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_1b.sh @@ -158,7 +158,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_1c.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_1c.sh index 7dc30ecf8fe..b268ed7feda 100755 --- a/egs/wsj/s5/local/chain/tuning/run_tdnn_1c.sh +++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_1c.sh @@ -159,7 +159,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_1d.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_1d.sh index 603e0f064b9..d1a7f9d0663 100755 --- a/egs/wsj/s5/local/chain/tuning/run_tdnn_1d.sh +++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_1d.sh @@ -159,7 +159,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_1e.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_1e.sh index 9808e274d83..e20069fbfa1 100755 --- a/egs/wsj/s5/local/chain/tuning/run_tdnn_1e.sh +++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_1e.sh @@ -167,7 +167,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.01" output_opts="l2-regularize=0.0025" diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh index e3d13ac1f65..86df0779841 100755 --- a/egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh +++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh @@ -161,7 +161,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print 
(0.5/$xent_regularize)" | python) opts="l2-regularize=0.01" output_opts="l2-regularize=0.005 bottleneck-dim=320" diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh index 526059b7b90..ba90afbb213 100755 --- a/egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh +++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh @@ -32,7 +32,14 @@ train_set=train_si284 test_sets="test_dev93 test_eval92" gmm=tri4b # this is the source gmm-dir that we'll use for alignments; it # should have alignments for the specified training data. + num_threads_ubm=32 + +nj_extractor=10 +# It runs a JOB with '-pe smp N', where N=$[threads*processes] +num_threads_extractor=4 +num_processes_extractor=4 + nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. # Options which are not passed through to run_ivector_common.sh @@ -79,6 +86,9 @@ local/nnet3/run_ivector_common.sh \ --stage $stage --nj $nj \ --train-set $train_set --gmm $gmm \ --num-threads-ubm $num_threads_ubm \ + --nj-extractor $nj_extractor \ + --num-processes-extractor $num_processes_extractor \ + --num-threads-extractor $num_threads_extractor \ --nnet3-affix "$nnet3_affix" @@ -160,7 +170,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print(0.5/$xent_regularize)" | python) tdnn_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim-continuous=true" tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_lstm_1a.sh index 4b752a55a4b..6e4f220c1f2 100755 --- a/egs/wsj/s5/local/chain/tuning/run_tdnn_lstm_1a.sh +++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -181,7 +181,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_lstm_1b.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_lstm_1b.sh index 51fefb9ca88..2d113e58a93 100755 --- a/egs/wsj/s5/local/chain/tuning/run_tdnn_lstm_1b.sh +++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_lstm_1b.sh @@ -473,7 +473,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) tdnn_opts="l2-regularize=0.01" output_opts="l2-regularize=0.005 bottleneck-dim=256" lstm_opts="l2-regularize=0.005 self-scale=2.0" diff --git a/egs/wsj/s5/local/nnet3/run_ivector_common.sh b/egs/wsj/s5/local/nnet3/run_ivector_common.sh index 813c6e14aed..7d4c9ef3c48 100755 --- a/egs/wsj/s5/local/nnet3/run_ivector_common.sh +++ b/egs/wsj/s5/local/nnet3/run_ivector_common.sh @@ -16,6 +16,12 @@ gmm=tri4b # This specifies a GMM-dir from the features of the typ # it should contain alignments for 'train_set'. 
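The repeated `learning_rate_factor` change across these tuning scripts is a Python 2/3 compatibility fix: wrapping the expression in parentheses makes the one-liner work with whichever `python` is on PATH. Under Python 2, `print (expr)` is still the print statement applied to a parenthesized expression; under Python 3 it is a call to the `print()` function; the old form `print 0.5/$xent_regularize` is a SyntaxError on Python 3. (The `print(...)` spelling without a space, as in run_tdnn_1g.sh, behaves the same way.) A minimal illustration, with an example value substituted for the shell variable:

```python
# Works under both Python 2 and Python 3; 'print 0.5/0.1' would be a
# SyntaxError under Python 3. The shell substitutes $xent_regularize before
# python sees the line, e.g.:
#   learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)
print (0.5/0.1)
```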
num_threads_ubm=32 + +nj_extractor=10 +# It runs a JOB with '-pe smp N', where N=$[threads*processes] +num_processes_extractor=4 +num_threads_extractor=4 + nnet3_affix= # affix for exp/nnet3 directory to put iVector stuff in (e.g. # in the tedlium recip it's _cleaned). @@ -110,7 +116,8 @@ if [ $stage -le 4 ]; then # can be sensitive to the amount of data. The script defaults to an iVector dimension of # 100. echo "$0: training the iVector extractor" - steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ + steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" \ + --nj $nj_extractor --num-threads $num_threads_extractor --num-processes $num_processes_extractor \ data/${train_set}_sp_hires exp/nnet3${nnet3_affix}/diag_ubm exp/nnet3${nnet3_affix}/extractor || exit 1; fi diff --git a/egs/wsj/s5/steps/align_fmllr_lats.sh b/egs/wsj/s5/steps/align_fmllr_lats.sh index b47b97ef994..b331b40d73c 100755 --- a/egs/wsj/s5/steps/align_fmllr_lats.sh +++ b/egs/wsj/s5/steps/align_fmllr_lats.sh @@ -24,6 +24,7 @@ final_beam=20 # For the lattice-generation phase there is no retry-beam. This boost_silence=1.0 # factor by which to boost silence during alignment. fmllr_update_type=full generate_ali_from_lats=false # If true, alingments generated from lattices. +max_active=7000 # End configuration options. echo "$0 $@" # Print the command line for logging @@ -149,7 +150,7 @@ if [ $stage -le 3 ]; then # will be small anyway). echo "$0: generating lattices containing alternate pronunciations." $cmd JOB=1:$nj $dir/log/generate_lattices.JOB.log \ - gmm-latgen-faster --acoustic-scale=$acoustic_scale --beam=$final_beam \ + gmm-latgen-faster --max-active=$max_active --acoustic-scale=$acoustic_scale --beam=$final_beam \ --lattice-beam=$final_beam --allow-partial=false --word-determinize=false \ "$mdl_cmd" "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \ "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1; diff --git a/egs/wsj/s5/steps/cleanup/clean_and_segment_data_nnet3.sh b/egs/wsj/s5/steps/cleanup/clean_and_segment_data_nnet3.sh index 35b07d184f4..cc8da298d2f 100755 --- a/egs/wsj/s5/steps/cleanup/clean_and_segment_data_nnet3.sh +++ b/egs/wsj/s5/steps/cleanup/clean_and_segment_data_nnet3.sh @@ -23,12 +23,12 @@ cleanup=true # remove temporary directories and files nj=4 # Decode options graph_opts= +scale_opts= beam=15.0 lattice_beam=1.0 acwt=0.1 # Just a default value, used for adaptation and beam-pruning.. -post_decode_acwt=1.0 # can be used in 'chain' systems to scale acoustics by 10 so the - # regular scoring script works. +lmwt=10 # Contexts must ideally match training extra_left_context=0 # Set to some large value, typically 40 for LSTM (must match training) @@ -109,6 +109,22 @@ cp $srcdir/cmvn_opts $dir cp $srcdir/{splice_opts,delta_opts,final.mat,final.alimdl} $dir 2>/dev/null || true cp $srcdir/frame_subsampling_factor $dir 2>/dev/null || true +if [ -f $srcdir/frame_subsampling_factor ]; then + echo "$0: guessing that this is a chain system, checking parameters." + if [ -z $scale_opts ]; then + echo "$0: setting scale_opts" + scale_opts="--self-loop-scale=1.0 --transition-scale=1.0" + fi + if [ $acwt == 0.1 ]; then + echo "$0: setting acwt=1.0" + acwt=1.0 + fi + if [ $lmwt == 10 ]; then + echo "$0: setting lmwt=1.0" + lmwt=1 + fi +fi + utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt cp $lang/phones.txt $dir @@ -142,7 +158,7 @@ if [ $stage -le 3 ]; then echo "$0: Decoding with biased language models..." 
steps/cleanup/decode_segmentation_nnet3.sh \ - --acwt $acwt --post-decode-acwt $post_decode_acwt \ + --acwt $acwt \ --beam $beam --lattice-beam $lattice_beam --nj $nj --cmd "$cmd --mem 4G" \ --skip-scoring true --allow-partial false \ --extra-left-context $extra_left_context \ diff --git a/egs/wsj/s5/steps/cleanup/internal/align_ctm_ref.py b/egs/wsj/s5/steps/cleanup/internal/align_ctm_ref.py index 848ca61ebe4..d3e012da13c 100755 --- a/egs/wsj/s5/steps/cleanup/internal/align_ctm_ref.py +++ b/egs/wsj/s5/steps/cleanup/internal/align_ctm_ref.py @@ -127,7 +127,7 @@ def read_text(text_file): "Did not get enough columns; line {0} in {1}" "".format(line, text_file.name)) elif len(parts) == 1: - logger.warn("Empty transcript for utterance %s in %s", + logger.warn("Empty transcript for utterance %s in %s", parts[0], text_file.name) yield parts[0], [] else: diff --git a/egs/wsj/s5/steps/cleanup/internal/get_ctm_edits.py b/egs/wsj/s5/steps/cleanup/internal/get_ctm_edits.py index a19c5344572..3032a4b434a 100755 --- a/egs/wsj/s5/steps/cleanup/internal/get_ctm_edits.py +++ b/egs/wsj/s5/steps/cleanup/internal/get_ctm_edits.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # Copyright 2016 Vimal Manohar # 2016 Johns Hopkins University (author: Daniel Povey) @@ -116,17 +116,17 @@ def OpenFiles(): global ctm_edits_out, edits_in, ctm_in, symbol_table, oov_word try: - ctm_edits_out = open(args.ctm_edits_out, 'w') + ctm_edits_out = open(args.ctm_edits_out, 'w', encoding='utf-8') except: sys.exit("get_ctm_edits.py: error opening ctm-edits file {0} for output".format( args.ctm_edits_out)) try: - edits_in = open(args.edits_in) + edits_in = open(args.edits_in, encoding='utf-8') except: sys.exit("get_ctm_edits.py: error opening edits file {0} for input".format( args.edits_in)) try: - ctm_in = open(args.ctm_in) + ctm_in = open(args.ctm_in, encoding='utf-8') except: sys.exit("get_ctm_edits.py: error opening ctm file {0} for input".format( args.ctm_in)) @@ -138,7 +138,7 @@ def OpenFiles(): print("get_ctm_edits.py: error: if you set the the --symbol-table option " "you must also set the --oov option", file = sys.stderr) try: - f = open(args.symbol_table, 'r') + f = open(args.symbol_table, 'r', encoding='utf-8') for line in f.readlines(): [ word, integer ] = line.split() if int(integer) == args.oov: diff --git a/egs/wsj/s5/steps/cleanup/internal/get_non_scored_words.py b/egs/wsj/s5/steps/cleanup/internal/get_non_scored_words.py index aa71fa47d84..50ee8e2333f 100755 --- a/egs/wsj/s5/steps/cleanup/internal/get_non_scored_words.py +++ b/egs/wsj/s5/steps/cleanup/internal/get_non_scored_words.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # Copyright 2016 Vimal Manohar # 2016 Johns Hopkins University (author: Daniel Povey) @@ -12,6 +12,10 @@ import sys from collections import defaultdict +import io +sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding="utf8") +sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf8") + logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -90,7 +94,7 @@ def read_lang(lang_dir): raise try: - for line in open(lang_dir + '/words.txt').readlines(): + for line in open(lang_dir + '/words.txt', encoding='utf-8').readlines(): [ word, integer ] = line.split() if int(integer) in silence_word_ints: non_scored_words.add(word) diff --git a/egs/wsj/s5/steps/cleanup/internal/get_pron_stats.py b/egs/wsj/s5/steps/cleanup/internal/get_pron_stats.py index a33ba85d9fa..3ea217b6589 100755 --- a/egs/wsj/s5/steps/cleanup/internal/get_pron_stats.py +++ 
b/egs/wsj/s5/steps/cleanup/internal/get_pron_stats.py @@ -75,14 +75,14 @@ def ReadEntries(file_handle): # Each entry in the list represents the pronounciation candidate(s) of a word. # For each non- word, the entry is a list: [utt_id, word, set(pronunciation_candidates)]. e.g: # [911Mothers_2010W-0010916-0012901-1, other, set('AH DH ER', 'AH DH ER K AH N')] -# For each , we split the phones it aligns to into two parts: "nonsil_left", +# For each , we split the phones it aligns to into two parts: "nonsil_left", # which includes phones before the first silphone, and "nonsil_right", which includes -# phones after the last silphone. For example, for : 'V SIL B AH SIL', +# phones after the last silphone. For example, for : 'V SIL B AH SIL', # nonsil_left is 'V' and nonsil_right is empty ''. After processing an entry # in ctm_prons, we put it in "info" as an entry: [utt_id, word, nonsil_right] # only if it's nonsil_right segment is not empty, which may be used when processing # the next word. -# +# # Normally, one non- word is only aligned to one pronounciation candidate. However # when there is a preceding/following , like in the following example, we # assume the phones aligned to should be statistically distributed @@ -90,7 +90,7 @@ def ReadEntries(file_handle): # Thus we append the "nonsil_left" segment of these phones to the pronounciation # of the preceding word, if the last phone of this pronounciation is not a silence phone, # Similarly we can add a pron candidate to the following word. -# +# # For example, for the following part of a ctm_prons file: # 911Mothers_2010W-0010916-0012901-1 other AH DH ER # 911Mothers_2010W-0010916-0012901-1 K AH N SIL B @@ -99,11 +99,11 @@ def ReadEntries(file_handle): # 911Mothers_2010W-0010916-0012901-1 when W EH N # 911Mothers_2010W-0010916-0012901-1 people P IY P AH L # 911Mothers_2010W-0010916-0012901-1 SIL -# 911Mothers_2010W-0010916-0012901-1 heard HH ER +# 911Mothers_2010W-0010916-0012901-1 heard HH ER # 911Mothers_2010W-0010916-0012901-1 D # 911Mothers_2010W-0010916-0012901-1 that SIL DH AH T # 911Mothers_2010W-0010916-0012901-1 my M AY -# +# # The corresponding segment in the "info" list is: # [911Mothers_2010W-0010916-0012901-1, other, set('AH DH ER', 'AH DH ER K AH N')] # [911Mothers_2010W-0010916-0012901-1, , 'B' @@ -113,7 +113,7 @@ def ReadEntries(file_handle): # [911Mothers_2010W-0010916-0012901-1, , 'D'] # [911Mothers_2010W-0010916-0012901-1, that, set('SIL DH AH T')] # [911Mothers_2010W-0010916-0012901-1, my, set('M AY')] -# +# # Then we accumulate pronouciation stats from "info". Basically, for each occurence # of a word, each pronounciation candidate gets equal soft counts. e.g. In the above # example, each pron candidate of "because" gets a count of 1/4. The stats is stored @@ -139,20 +139,20 @@ def GetStatsFromCtmProns(silphones, optional_silence, non_scored_words, ctm_pron # So we apply the same merging method in these cases. if word == '' or (word in non_scored_words and word != '' and word != ''): nonsil_left = [] - nonsil_right = [] + nonsil_right = [] for phone in phones: if phone in silphones: break nonsil_left.append(phone) - + for phone in reversed(phones): if phone in silphones: break nonsil_right.insert(0, phone) - + # info[-1][0] is the utt_id of the last entry - if len(nonsil_left) > 0 and len(info) > 0 and utt == info[-1][0]: - # pron_ext is a set of extended pron candidates. + if len(nonsil_left) > 0 and len(info) > 0 and utt == info[-1][0]: + # pron_ext is a set of extended pron candidates. 
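The whitespace-only cleanups above sit inside get_pron_stats.py's handling of tokens that align partly to silence: the phones aligned to an `<eps>`/non-scored token are split into the part before the first silence phone ("nonsil_left") and the part after the last one ("nonsil_right"), which then extend the pronunciation candidates of the neighbouring words. A compact restatement of just that split, for readability; the real script carries more state around it.

```python
# Split a phone sequence at silence, mirroring the nonsil_left/nonsil_right
# logic described in get_pron_stats.py. Purely illustrative.
def split_at_silence(phones, silphones):
    nonsil_left = []
    for phone in phones:
        if phone in silphones:
            break
        nonsil_left.append(phone)
    nonsil_right = []
    for phone in reversed(phones):
        if phone in silphones:
            break
        nonsil_right.insert(0, phone)
    return nonsil_left, nonsil_right

# Example from the comments above: 'V SIL B AH SIL' gives
# nonsil_left=['V'] and nonsil_right=[] (the sequence ends in silence).
print(split_at_silence('V SIL B AH SIL'.split(), {'SIL'}))
```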
pron_ext = set() # info[-1][2] is the set of pron candidates of the last entry. for pron in info[-1][2]: @@ -211,7 +211,7 @@ def GetStatsFromCtmProns(silphones, optional_silence, non_scored_words, ctm_pron stats[(word, phones)] = stats.get((word, phones), 0) + count return stats -def WriteStats(stats, file_handle): +def WriteStats(stats, file_handle): for word_pron, count in stats.items(): print('{0} {1} {2}'.format(count, word_pron[0], word_pron[1]), file=file_handle) file_handle.close() @@ -222,7 +222,7 @@ def Main(): non_scored_words = ReadEntries(args.non_scored_words_file_handle) optional_silence = ReadEntries(args.optional_silence_file_handle) stats = GetStatsFromCtmProns(silphones, optional_silence.pop(), non_scored_words, args.ctm_prons_file_handle) - WriteStats(stats, args.stats_file_handle) + WriteStats(stats, args.stats_file_handle) if __name__ == "__main__": Main() diff --git a/egs/wsj/s5/steps/cleanup/internal/make_one_biased_lm.py b/egs/wsj/s5/steps/cleanup/internal/make_one_biased_lm.py index e41a67705e9..e5f4a8d1996 100755 --- a/egs/wsj/s5/steps/cleanup/internal/make_one_biased_lm.py +++ b/egs/wsj/s5/steps/cleanup/internal/make_one_biased_lm.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # Copyright 2016 Johns Hopkins University (Author: Daniel Povey) # Apache 2.0. @@ -10,6 +10,11 @@ import math from collections import defaultdict +import io +sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding="utf8") +sys.stderr = io.TextIOWrapper(sys.stderr.buffer,encoding="utf8") +sys.stdin = io.TextIOWrapper(sys.stdin.buffer,encoding="utf8") + parser = argparse.ArgumentParser(description=""" This script creates a biased language model suitable for alignment and data-cleanup purposes. It reads (possibly multiple) lines of integerized text @@ -142,16 +147,18 @@ def CompletelyDiscountLowCountStates(self, min_count): hist_to_total_count = self.GetHistToTotalCount() for n in reversed(list(range(2, self.ngram_order))): this_order_counts = self.counts[n] + to_delete = [] for hist in this_order_counts.keys(): if hist_to_total_count[hist] < min_count: # we need to completely back off this count. word_to_count = this_order_counts[hist] - del this_order_counts[hist] # delete the key from the dict. + # mark this key for deleting + to_delete.append(hist) backoff_hist = hist[1:] # this will be a tuple not a list. for word, count in word_to_count.items(): self.AddCount(backoff_hist, word, count) - - + for hist in to_delete: + del this_order_counts[hist] # This backs off the counts according to Kneser-Ney (unmodified, # with interpolation). 
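The `to_delete` change in make_one_biased_lm.py's `CompletelyDiscountLowCountStates` is another Python 3 fix: `dict.keys()` is a live view in Python 3, and deleting entries while iterating over it raises `RuntimeError: dictionary changed size during iteration` (Python 2 returned a list copy, so the old code happened to work). Collecting the keys first and deleting after the loop, as the patch does, is the standard pattern; a minimal standalone illustration with toy counts:

```python
# Deleting from a dict while iterating over it fails on Python 3;
# collect the keys first, then delete. Toy data, not the LM code.
counts = {('a',): 1, ('b',): 5, ('c',): 2}
min_count = 3

to_delete = [hist for hist, c in counts.items() if c < min_count]
for hist in to_delete:
    del counts[hist]

print(counts)  # {('b',): 5}
```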
@@ -200,7 +207,7 @@ def AddTopWords(self, top_words_file): word_to_count = self.counts[0][empty_history] total = sum(word_to_count.values()) try: - f = open(top_words_file) + f = open(top_words_file, mode='r', encoding='utf-8') except: sys.exit("make_one_biased_lm.py: error opening top-words file: " "--top-words=" + top_words_file) diff --git a/egs/wsj/s5/steps/cleanup/internal/modify_ctm_edits.py b/egs/wsj/s5/steps/cleanup/internal/modify_ctm_edits.py index d6f0d0f6b23..af63ca27d2b 100755 --- a/egs/wsj/s5/steps/cleanup/internal/modify_ctm_edits.py +++ b/egs/wsj/s5/steps/cleanup/internal/modify_ctm_edits.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # Copyright 2016 Vimal Manohar # 2016 Johns Hopkins University (author: Daniel Povey) @@ -105,7 +105,7 @@ def ReadNonScoredWords(non_scored_words_file): global non_scored_words try: - f = open(non_scored_words_file) + f = open(non_scored_words_file, encoding='utf-8') except: sys.exit("modify_ctm_edits.py: error opening file: " "--non-scored-words=" + non_scored_words_file) @@ -317,12 +317,12 @@ def ProcessUtterance(split_lines_of_utt): def ProcessData(): try: - f_in = open(args.ctm_edits_in) + f_in = open(args.ctm_edits_in, encoding='utf-8') except: sys.exit("modify_ctm_edits.py: error opening ctm-edits input " "file {0}".format(args.ctm_edits_in)) try: - f_out = open(args.ctm_edits_out, 'w') + f_out = open(args.ctm_edits_out, 'w', encoding='utf-8') except: sys.exit("modify_ctm_edits.py: error opening ctm-edits output " "file {0}".format(args.ctm_edits_out)) diff --git a/egs/wsj/s5/steps/cleanup/internal/resolve_ctm_edits_overlaps.py b/egs/wsj/s5/steps/cleanup/internal/resolve_ctm_edits_overlaps.py index a123b13f532..2801639274b 100755 --- a/egs/wsj/s5/steps/cleanup/internal/resolve_ctm_edits_overlaps.py +++ b/egs/wsj/s5/steps/cleanup/internal/resolve_ctm_edits_overlaps.py @@ -136,7 +136,7 @@ def wer(ctm_edit_lines): return float('inf') if num_words == 0 and num_incorrect_words == 0: return 0 - return (float(num_incorrect_words) / num_words, -num_words) + return float(num_incorrect_words) / num_words def choose_best_ctm_lines(first_lines, second_lines, @@ -144,9 +144,9 @@ def choose_best_ctm_lines(first_lines, second_lines, """Returns ctm lines that have lower WER. If the WER is the lines with the higher number of words is returned. 
""" - i, best_lines = min((0, first_lines), (1, second_lines), + i, best_lines = min((0, first_lines), + (1, second_lines), key=lambda x: wer(x[1])) - return i @@ -308,7 +308,7 @@ def run(args): try: if len(ctm_edits_for_reco) == 0: logger.warn('CTMs for recording %s is empty.', - reco) + reco) continue # Go to the next recording # Process CTMs in the recordings diff --git a/egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits.py b/egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits.py index 39d6cb6ed80..2ea8f5f6070 100755 --- a/egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits.py +++ b/egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits.py @@ -1,4 +1,5 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 + # Copyright 2016 Vimal Manohar # 2016 Johns Hopkins University (author: Daniel Povey) @@ -69,7 +70,7 @@ help="""Minimum duration of silence or non-scored word to be considered a viable split point when truncating based on junk proportion.""") -parser.add_argument("--max-deleted-words-kept-when-merging", type = str, default = 1, +parser.add_argument("--max-deleted-words-kept-when-merging", type = int, default = 1, help = "When merging segments that are found to be overlapping or " "adjacent after all other processing, keep in the transcript the " "reference words that were deleted between the segments [if any] " @@ -894,7 +895,7 @@ def AccWordStatsForUtterance(split_lines_of_utt, def PrintWordStats(word_stats_out): try: - f = open(word_stats_out, 'w') + f = open(word_stats_out, 'w', encoding='utf-8') except: sys.exit("segment_ctm_edits.py: error opening word-stats file --word-stats-out={0} " "for writing".format(word_stats_out)) @@ -924,23 +925,23 @@ def PrintWordStats(word_stats_out): def ProcessData(): try: - f_in = open(args.ctm_edits_in) + f_in = open(args.ctm_edits_in, encoding='utf-8') except: sys.exit("segment_ctm_edits.py: error opening ctm-edits input " "file {0}".format(args.ctm_edits_in)) try: - text_output_handle = open(args.text_out, 'w') + text_output_handle = open(args.text_out, 'w', encoding='utf-8') except: sys.exit("segment_ctm_edits.py: error opening text output " "file {0}".format(args.text_out)) try: - segments_output_handle = open(args.segments_out, 'w') + segments_output_handle = open(args.segments_out, 'w', encoding='utf-8') except: sys.exit("segment_ctm_edits.py: error opening segments output " "file {0}".format(args.text_out)) if args.ctm_edits_out != None: try: - ctm_edits_output_handle = open(args.ctm_edits_out, 'w') + ctm_edits_output_handle = open(args.ctm_edits_out, 'w', encoding='utf-8') except: sys.exit("segment_ctm_edits.py: error opening ctm-edits output " "file {0}".format(args.ctm_edits_out)) @@ -994,7 +995,7 @@ def ProcessData(): def ReadNonScoredWords(non_scored_words_file): global non_scored_words try: - f = open(non_scored_words_file) + f = open(non_scored_words_file, encoding='utf-8') except: sys.exit("segment_ctm_edits.py: error opening file: " "--non-scored-words=" + non_scored_words_file) @@ -1015,7 +1016,7 @@ def ReadNonScoredWords(non_scored_words_file): oov_symbol = None if args.oov_symbol_file != None: try: - with open(args.oov_symbol_file) as f: + with open(args.oov_symbol_file, encoding='utf-8') as f: line = f.readline() assert len(line.split()) == 1 oov_symbol = line.split()[0] diff --git a/egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits_mild.py b/egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits_mild.py index 9fcc2e89360..1ebfdaf7465 100755 --- a/egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits_mild.py +++ 
b/egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits_mild.py @@ -1331,8 +1331,7 @@ def merge_clusters(self, scoring_function, if reject: rejected_clusters.add(tuple(new_cluster)) continue - - heapq.heappush(heap, (-scoring_function(merged_segment), + heapq.heappush(heap, ((-scoring_function(merged_segment), i), (merged_segment, i, new_cluster))) candidate_index = -1 @@ -1527,7 +1526,7 @@ def get_segments_for_utterance(split_lines_of_utt, args, utterance_stats): _global_logger.debug("stage 0: segment %d = %s", i, x) if args.verbose > 4: - print ("Stage 0 [segment cores]:", file=sys.stderr) + print("Stage 0 [segment cores]:", file=sys.stderr) segments_copy = [x.copy() for x in segments] print_debug_info_for_utterance(sys.stderr, copy.deepcopy(split_lines_of_utt), @@ -1542,7 +1541,7 @@ def get_segments_for_utterance(split_lines_of_utt, args, utterance_stats): _global_logger.debug("stage 1: segment %d = %s", i, x) if args.verbose > 4: - print ("Stage 1 [add tainted lines]:", file=sys.stderr) + print("Stage 1 [add tainted lines]:", file=sys.stderr) segments_copy = [x.copy() for x in segments] print_debug_info_for_utterance(sys.stderr, copy.deepcopy(split_lines_of_utt), @@ -1556,7 +1555,7 @@ def get_segments_for_utterance(split_lines_of_utt, args, utterance_stats): _global_logger.debug("stage 2: segment %d = %s", i, x) if args.verbose > 4: - print ("Stage 2 [merge segments]:", file=sys.stderr) + print("Stage 2 [merge segments]:", file=sys.stderr) segments_copy = [x.copy() for x in segments] print_debug_info_for_utterance(sys.stderr, copy.deepcopy(split_lines_of_utt), @@ -1576,7 +1575,7 @@ def get_segments_for_utterance(split_lines_of_utt, args, utterance_stats): "stage 3: segment %d, %s", i, x.debug_info(False)) if args.verbose > 4: - print ("Stage 3 [split segments]:", file=sys.stderr) + print("Stage 3 [split segments]:", file=sys.stderr) segments_copy = [x.copy() for x in segments] print_debug_info_for_utterance(sys.stderr, copy.deepcopy(split_lines_of_utt), @@ -1598,7 +1597,7 @@ def get_segments_for_utterance(split_lines_of_utt, args, utterance_stats): "stage 4: segment %d, %s", i, x.debug_info(False)) if args.verbose > 4: - print ("Stage 4 [split long segments]:", file=sys.stderr) + print("Stage 4 [split long segments]:", file=sys.stderr) segments_copy = [x.copy() for x in segments] print_debug_info_for_utterance(sys.stderr, copy.deepcopy(split_lines_of_utt), @@ -1615,7 +1614,7 @@ def get_segments_for_utterance(split_lines_of_utt, args, utterance_stats): "stage 5: segment %d = %s", i, x.debug_info(False)) if args.verbose > 4: - print ("Stage 5 [truncate boundaries]:", file=sys.stderr) + print("Stage 5 [truncate boundaries]:", file=sys.stderr) segments_copy = [x.copy() for x in segments] print_debug_info_for_utterance(sys.stderr, copy.deepcopy(split_lines_of_utt), @@ -1632,7 +1631,7 @@ def get_segments_for_utterance(split_lines_of_utt, args, utterance_stats): "stage 6: segment %d = %s", i, x.debug_info(False)) if args.verbose > 4: - print ("Stage 6 [relax boundary truncation]:", file=sys.stderr) + print("Stage 6 [relax boundary truncation]:", file=sys.stderr) segments_copy = [x.copy() for x in segments] print_debug_info_for_utterance(sys.stderr, copy.deepcopy(split_lines_of_utt), @@ -1648,7 +1647,7 @@ def get_segments_for_utterance(split_lines_of_utt, args, utterance_stats): "stage 7: segment %d = %s", i, x.debug_info(False)) if args.verbose > 4: - print ("Stage 7 [unk-padding]:", file=sys.stderr) + print("Stage 7 [unk-padding]:", file=sys.stderr) segments_copy = [x.copy() for x in segments] 
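The heap change in segment_ctm_edits_mild.py appears to guard against ties: `heapq` orders entries by comparing tuples element-wise, so with the old `(-score, (segment, i, cluster))` entries, two candidates with equal scores fell through to comparing segment objects, which is a TypeError on Python 3. Pushing `((-score, i), payload)` makes the index the tiebreaker, so comparison never reaches the payload. A self-contained sketch of the pattern; class and variable names here are hypothetical stand-ins.

```python
import heapq

class Segment:  # stand-in for an object with no ordering defined
    def __init__(self, name):
        self.name = name

heap = []
for i, seg in enumerate([Segment("a"), Segment("b"), Segment("c")]):
    score = 1.0  # identical scores force a tie
    # (priority, tiebreaker) first, payload second: comparison never
    # reaches the Segment objects.
    heapq.heappush(heap, ((-score, i), (seg, i)))

while heap:
    (_, _), (seg, i) = heapq.heappop(heap)
    print(i, seg.name)
```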
print_debug_info_for_utterance(sys.stderr, copy.deepcopy(split_lines_of_utt), @@ -1674,8 +1673,8 @@ def get_segments_for_utterance(split_lines_of_utt, args, utterance_stats): "stage 8: segment %d = %s", i, x.debug_info(False)) if args.verbose > 4: - print ("Stage 8 [remove new segments under " - "--min-new-segment-length]:", file=sys.stderr) + print("Stage 8 [remove new segments under " + "--min-new-segment-length]:", file=sys.stderr) segments_copy = [x.copy() for x in segments] print_debug_info_for_utterance(sys.stderr, copy.deepcopy(split_lines_of_utt), @@ -1698,8 +1697,8 @@ def get_segments_for_utterance(split_lines_of_utt, args, utterance_stats): "stage 9: segment %d = %s", i, x.debug_info(False)) if args.verbose > 4: - print ("Stage 9 [remove segments under " - "--min-segment-length]:", file=sys.stderr) + print("Stage 9 [remove segments under " + "--min-segment-length]:", file=sys.stderr) segments_copy = [x.copy() for x in segments] print_debug_info_for_utterance(sys.stderr, copy.deepcopy(split_lines_of_utt), @@ -1721,8 +1720,8 @@ def get_segments_for_utterance(split_lines_of_utt, args, utterance_stats): "stage 10: segment %d = %s", i, x.debug_info(False)) if args.verbose > 4: - print ("Stage 10 [remove segments without scored, non-OOV words " - "", file=sys.stderr) + print("Stage 10 [remove segments without scored, non-OOV words " + "", file=sys.stderr) segments_copy = [x.copy() for x in segments] print_debug_info_for_utterance(sys.stderr, copy.deepcopy(split_lines_of_utt), @@ -1756,7 +1755,7 @@ def time_to_string(time, frame_length): """ Gives time in string form as an exact multiple of the frame-length, e.g. 0.01 (after rounding). """ - n = round(time /frame_length) + n = round(time / frame_length) assert n >= 0 # The next function call will remove trailing zeros while printing it, so # that e.g. 0.01 will be printed as 0.01 and not 0.0099999999999999. It @@ -1929,6 +1928,7 @@ def process_data(args, oov_symbol, utterance_stats, word_stats): args.ctm_edits_out, split_lines_of_cur_utterance, segments_for_utterance, deleted_segments_for_utterance, frame_length=args.frame_length) + split_lines_of_cur_utterance = [] if len(split_pending_line) == 0: break diff --git a/egs/wsj/s5/steps/cleanup/internal/taint_ctm_edits.py b/egs/wsj/s5/steps/cleanup/internal/taint_ctm_edits.py index 4e0e1ae2283..908be55ec0d 100755 --- a/egs/wsj/s5/steps/cleanup/internal/taint_ctm_edits.py +++ b/egs/wsj/s5/steps/cleanup/internal/taint_ctm_edits.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # Copyright 2016 Vimal Manohar # 2016 Johns Hopkins University (author: Daniel Povey) @@ -8,6 +8,10 @@ import sys, operator, argparse, os from collections import defaultdict +import io +sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding="utf8") + + # This script reads and writes the 'ctm-edits' file that is # produced by get_ctm_edits.py. 
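The recurring pattern across these cleanup scripts (switching the shebang to python3, passing `encoding='utf-8'` to every `open()`, and rewrapping `sys.stdin`/`sys.stdout`/`sys.stderr` with `io.TextIOWrapper(..., encoding="utf8")`) makes them independent of the locale Kaldi happens to run under; without it, Python 3 falls back to the locale's preferred encoding and non-ASCII words in lexicons or transcripts can fail with `UnicodeDecodeError`. The essentials gathered in one place, as a sketch rather than any one of the scripts above:

```python
#!/usr/bin/env python3
# Force UTF-8 regardless of locale: wrap the standard streams and pass an
# explicit encoding to open(). Mirrors the pattern added across the
# steps/cleanup/internal scripts.
import io
import sys

sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding="utf8")
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf8")
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding="utf8")

def read_words(path):
    # Explicit encoding so the file is read the same way on every machine.
    with open(path, encoding="utf-8") as f:
        return [line.split()[0] for line in f if line.strip()]

if __name__ == "__main__":
    for word in read_words(sys.argv[1]):
        print(word)
```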
# @@ -136,12 +140,12 @@ def ProcessUtterance(split_lines_of_utt, remove_deletions=True): def ProcessData(): try: - f_in = open(args.ctm_edits_in) + f_in = open(args.ctm_edits_in, encoding="utf8") except: sys.exit("taint_ctm_edits.py: error opening ctm-edits input " "file {0}".format(args.ctm_edits_in)) try: - f_out = open(args.ctm_edits_out, 'w') + f_out = open(args.ctm_edits_out, 'w', encoding="utf8") except: sys.exit("taint_ctm_edits.py: error opening ctm-edits output " "file {0}".format(args.ctm_edits_out)) @@ -246,4 +250,3 @@ def PrintStats(): ProcessData() PrintStats() - diff --git a/egs/wsj/s5/steps/cleanup/internal/tf_idf.py b/egs/wsj/s5/steps/cleanup/internal/tf_idf.py index a098d9f2a44..15773d0977e 100644 --- a/egs/wsj/s5/steps/cleanup/internal/tf_idf.py +++ b/egs/wsj/s5/steps/cleanup/internal/tf_idf.py @@ -401,7 +401,6 @@ def read_key(fd): str += char str = str.strip() if str == '': return None # end of file, - assert(re.match('^[\.a-zA-Z0-9_:-]+$',str) != None) # check format, return str diff --git a/egs/wsj/s5/steps/cleanup/make_biased_lms.py b/egs/wsj/s5/steps/cleanup/make_biased_lms.py index ab508eedc9c..7c6fce990d4 100755 --- a/egs/wsj/s5/steps/cleanup/make_biased_lms.py +++ b/egs/wsj/s5/steps/cleanup/make_biased_lms.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 from __future__ import print_function import sys @@ -7,6 +7,11 @@ import subprocess from collections import defaultdict +import io +sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding="utf8") +sys.stderr = io.TextIOWrapper(sys.stderr.buffer,encoding="utf8") +sys.stdin = io.TextIOWrapper(sys.stdin.buffer,encoding="utf8") + parser = argparse.ArgumentParser(description=""" This script is a wrapper for make_one_biased_lm.py that reads a Kaldi archive of (integerized) text data from the standard input and writes a Kaldi archive of @@ -55,7 +60,7 @@ def ProcessGroupOfLines(group_of_lines): try: command = "steps/cleanup/internal/make_one_biased_lm.py " + args.lm_opts p = subprocess.Popen(command, shell = True, stdin = subprocess.PIPE, - stdout = sys.stdout, stderr = sys.stderr) + stdout = sys.stdout, stderr = sys.stderr) for line in group_of_lines: a = line.split() if len(a) == 0: @@ -63,13 +68,15 @@ def ProcessGroupOfLines(group_of_lines): utterance_id = a[0] # print to utterance-map file print(utterance_id, group_utterance_id, file = utterance_map_file) - rest_of_line = ' '.join(a[1:]) # get rid of utterance id. - print(rest_of_line, file=p.stdin) + rest_of_line = ' '.join(a[1:]) + '\n' # get rid of utterance id. + p.stdin.write(rest_of_line.encode('utf-8')) p.stdin.close() assert p.wait() == 0 - except Exception as e: - sys.exit("make_biased_lms.py: error calling subprocess, command was: " + - command + ", error was : " + str(e)) + except Exception: + sys.stderr.write( + "make_biased_lms.py: error calling subprocess, command was: " + + command) + raise # Print a blank line; this terminates the FST in the Kaldi fst-archive # format. print("") diff --git a/egs/wsj/s5/steps/cleanup/segment_long_utterances.sh b/egs/wsj/s5/steps/cleanup/segment_long_utterances.sh index c7e50ea165e..7a16bdcdb12 100755 --- a/egs/wsj/s5/steps/cleanup/segment_long_utterances.sh +++ b/egs/wsj/s5/steps/cleanup/segment_long_utterances.sh @@ -174,10 +174,17 @@ if [ $stage -le 3 ]; then cp $srcdir/phones.txt $dir 2>/dev/null || true mkdir -p $graph_dir + + n_reco=$(cat $text | wc -l) || exit 1 + nj_reco=$nj + + if [ $nj -gt $n_reco ]; then + nj_reco=$n_reco + fi # Make graphs w.r.t. 
to the original text (usually recording-level) steps/cleanup/make_biased_lm_graphs.sh $graph_opts \ - --nj $nj --cmd "$cmd" $text \ + --nj $nj_reco --cmd "$cmd" $text \ $lang $dir $dir/graphs if [ -z "$utt2text" ]; then # and then copy it to the sub-segments. diff --git a/egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh b/egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh index ae355c9f753..f0df1e7730c 100755 --- a/egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh +++ b/egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh @@ -4,7 +4,8 @@ # 2016 Vimal Manohar # Apache 2.0 -# This script is similar to steps/cleanup/segment_long_utterances.sh, but + +# This script is similar to steps/cleanup/segment_long_utterances.sh, but # uses nnet3 acoustic model instead of GMM acoustic model for decoding. # This script performs segmentation of the input data based on the transcription # and outputs segmented data along with the corresponding aligned transcription. @@ -13,7 +14,7 @@ # are of manageable length for further processing, along with the portion of the # transcript that seems to match (aligns with) each segment. # This the light-supervised training scenario where the input transcription is -# not expected to be completely clean and may have significant errors. +# not expected to be completely clean and may have significant errors. # See "JHU Kaldi System for Arabic MGB-3 ASR Challenge using Diarization, # Audio-transcript Alignment and Transfer Learning": Vimal Manohar, Daniel # Povey, Sanjeev Khudanpur, ASRU 2017 @@ -39,24 +40,22 @@ seconds_per_spk_max=30 # Decode options graph_opts= +scale_opts= # for making the graphs beam=15.0 lattice_beam=1.0 lmwt=10 - acwt=0.1 # Just a default value, used for adaptation and beam-pruning.. -post_decode_acwt=1.0 # can be used in 'chain' systems to scale acoustics by 10 so the - # regular scoring script works. # Contexts must ideally match training extra_left_context=0 # Set to some large value, typically 40 for LSTM (must match training) -extra_right_context=0 +extra_right_context=0 extra_left_context_initial=-1 extra_right_context_final=-1 frames_per_chunk=150 # i-vector options -extractor= # i-Vector extractor. If provided, will extract i-vectors. - # Required if the network was trained with i-vector extractor. +extractor= # i-Vector extractor. If provided, will extract i-vectors. + # Required if the network was trained with i-vector extractor. use_vad=false # Use energy-based VAD for i-vector extraction # TF-IDF similarity search options @@ -116,12 +115,12 @@ it and eliminate data where the transcript doesn't seem to match. --segmentation-extra-opts 'opts' # Additional options to segment_ctm_edits_mild.py. # Please run steps/cleanup/internal/segment_ctm_edits_mild.py # without arguments to see allowed options. - --align-full-hyp # If true, align full hypothesis - i.e. trackback from the end to get the alignment. - This is different from the normal + --align-full-hyp # If true, align full hypothesis + i.e. trackback from the end to get the alignment. + This is different from the normal Smith-Waterman alignment, where the traceback will be from the maximum score. - --extractor # i-vector extractor directory if i-vector is + --extractor # i-vector extractor directory if i-vector is # to be used during decoding. Must match # the extractor used for training neural-network. 
--use-vad # If true, uses energy-based VAD to apply frame weights @@ -168,6 +167,23 @@ cp $srcdir/cmvn_opts $dir cp $srcdir/{splice_opts,delta_opts,final.mat,final.alimdl} $dir 2>/dev/null || true cp $srcdir/frame_subsampling_factor $dir 2>/dev/null || true +if [ -f $srcdir/frame_subsampling_factor ]; then + echo "$0: guessing that this is a chain system, checking parameters." + if [ -z $scale_opts ]; then + echo "$0: setting scale_opts" + scale_opts="--self-loop-scale=1.0 --transition-scale=1.0" + fi + if [ $acwt == 0.1 ]; then + echo "$0: setting acwt=1.0" + acwt=1.0 + fi + if [ $lmwt == 10 ]; then + echo "$0: setting lmwt=1.0" + lmwt=1 + fi +fi + + utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt cp $lang/phones.txt $dir @@ -219,9 +235,17 @@ if [ $stage -le 3 ]; then mkdir -p $graph_dir + n_reco=$(cat $text | wc -l) || exit 1 + nj_reco=$nj + + if [ $nj -gt $n_reco ]; then + nj_reco=$n_reco + fi + # Make graphs w.r.t. to the original text (usually recording-level) steps/cleanup/make_biased_lm_graphs.sh $graph_opts \ - --nj $nj --cmd "$cmd" $text \ + --scale-opts "$scale_opts" \ + --nj $nj_reco --cmd "$cmd" $text \ $lang $dir $dir/graphs if [ -z "$utt2text" ]; then # and then copy it to the sub-segments. @@ -267,7 +291,7 @@ if [ $stage -le 5 ]; then echo "$0: Decoding with biased language models..." steps/cleanup/decode_segmentation_nnet3.sh \ - --acwt $acwt --post-decode-acwt $post_decode_acwt \ + --acwt $acwt \ --beam $beam --lattice-beam $lattice_beam --nj $nj --cmd "$cmd --mem 4G" \ --skip-scoring true --allow-partial false \ --extra-left-context $extra_left_context \ diff --git a/egs/wsj/s5/steps/combine_ali_dirs.sh b/egs/wsj/s5/steps/combine_ali_dirs.sh index fe704da3dc6..d2cd5d8de2a 100755 --- a/egs/wsj/s5/steps/combine_ali_dirs.sh +++ b/egs/wsj/s5/steps/combine_ali_dirs.sh @@ -1,105 +1,209 @@ #!/bin/bash # Copyright 2016 Xiaohui Zhang Apache 2.0. +# Copyright 2019 SmartAction (kkm) -# This srcipt operates on alignment directories, such as exp/tri4a_ali -# the output is a new ali dir which has alignments from all the input ali dirs +# This script combines alignment directories, such as exp/tri4a_ali, and +# validates matching of the utterances and alignments after combining. # Begin configuration section. cmd=run.pl -extra_files= -num_jobs=4 +nj=4 +combine_lat=true +combine_ali=true +tolerance=10 # End configuration section. -echo "$0 $@" # Print the command line for logging +echo "$0 $@" # Print the command line for logging. -if [ -f path.sh ]; then . ./path.sh; fi -. parse_options.sh || exit 1; +[[ -f path.sh ]] && . ./path.sh +. parse_options.sh || exit 1 + +export LC_ALL=C if [[ $# -lt 3 ]]; then - echo "Usage: $0 [options] ..." - echo "e.g.: $0 --num-jobs 32 data/train exp/tri3_ali_combined exp/tri3_ali_1 exp_tri3_ali_2" - echo "Options:" - echo " --extra-files # specify addtional files in 'src-ali-dir1' to copy" - echo " --num-jobs # number of jobs used to split the data directory." - echo " Note, files that don't appear in the first source dir will not be added even if they appear in later ones." - echo " Other than alignments, only files from the first src ali dir are copied." + cat >&2 < ... + e.g.: $0 --nj 32 data/train exp/tri3_ali_combined exp/tri3_ali_1 exp_tri3_ali_2 +Options: + --nj # number of jobs to split combined archives [4] + --combine_ali false # merge ali.*.gz if present [true] + --combine_lat false # merge lat.*.gz if present [true] + --tolerance # maximum percentage of missing alignments or lattices + # w.r.t. 
total utterances in before error is + # reported [10] + +The script checks that certain important files are present and compatible in all +source directories (phones.txt, tree); other are copied from the first source +(cmvn_opts, final.mdl) without much checking. + +Both --combine_ali and --combine_lat are true by default, but the script +proceeds with a warning if directories do not contain either alignments or +alignment lattices. Check for files ali.1.gz and/or lat.1.gz in the +after the script completes if additional programmatic check is required. +EOF exit 1; fi -data=$1; -shift; -dest=$1; -shift; -first_src=$1; - -mkdir -p $dest; -rm $dest/{ali.*.gz,num_jobs} 2>/dev/null - -cp $first_src/phones.txt $dest 2>/dev/null - -export LC_ALL=C +if [[ ! $combine_lat && ! $combine_ali ]]; then + echo "$0: at least one of --combine_lat and --combine_ali must be true" + exit 1 +fi -for dir in $*; do - if [ ! -f $dir/ali.1.gz ]; then - echo "$0: check if alignments (ali.*.gz) are present in $dir." - exit 1; +data=$1 +dest=$2 +shift 2 +first_src=$1 + +do_ali=$combine_ali +do_lat=$combine_lat + +# Check if alignments and/or lattices are present. Since we combine both, +# whichever present, issue a warning only. Also verify that the target is +# different from any source; we cannot combine in-place, and a lot of damage +# could result. +for src in $@; do + if [[ "$(cd 2>/dev/null -P -- "$src" && pwd)" = \ + "$(cd 2>/dev/null -P -- "$dest" && pwd)" ]]; then + echo "$0: error: Source $src is same as target $dest." + exit 1 + fi + if $do_ali && [[ ! -f $src/ali.1.gz ]]; then + echo "$0: warning: Alignments (ali.*.gz) are not present in $src, not" \ + "combining. Consider '--combine_ali false' to suppress this warning." + do_ali=false + fi + if $do_lat && [[ ! -f $src/lat.1.gz ]]; then + echo "$0: warning: Alignment lattices (lat.*.gz) are not present in $src,"\ + "not combining. Consider '--combine_lat false' to suppress this warning." + do_lat=false fi done -for dir in $*; do - for f in tree; do - diff $first_src/$f $dir/$f 1>/dev/null 2>&1 - if [ $? -ne 0 ]; then - echo "$0: Cannot combine alignment directories with different $f files." - fi - done -done +if ! $do_ali && ! $do_lat; then + echo "$0: error: Cannot combine directories." + exit 1 +fi -for f in final.mdl tree cmvn_opts num_jobs $extra_files; do +# Verify that required files are present in the first directory. +for f in cmvn_opts final.mdl num_jobs phones.txt tree; do if [ ! -f $first_src/$f ]; then - echo "combine_ali_dir.sh: no such file $first_src/$f" - exit 1; + echo "$0: error: Required source file $first_src/$f is missing." + exit 1 fi - cp $first_src/$f $dest/ done -src_id=0 -temp_dir=$dest/temp -[ -d $temp_dir ] && rm -r $temp_dir; -mkdir -p $temp_dir -echo "$0: dumping alignments in each source directory as single archive and index." -for dir in $*; do - src_id=$((src_id + 1)) - cur_num_jobs=$(cat $dir/num_jobs) || exit 1; - alis=$(for n in $(seq $cur_num_jobs); do echo -n "$dir/ali.$n.gz "; done) - $cmd $dir/log/copy_alignments.log \ - copy-int-vector "ark:gunzip -c $alis|" \ - ark,scp:$temp_dir/ali.$src_id.ark,$temp_dir/ali.$src_id.scp || exit 1; +# Verify that phones and trees are compatible in all directories, and than +# num_jobs files are present, too. +for src in $@; do + if [[ $src != $first_src ]]; then + if [[ ! -f $src/num_jobs ]]; then + echo "$0: error: Required source file $src/num_jobs is missing." + exit 1 + fi + if ! 
cmp -s $first_src/tree $src/tree; then + echo "$0: error: tree $src/tree is either missing or not the" \ + "same as $first_src/tree." + exit 1 + fi + if [[ ! -f $src/phones.txt ]]; then + echo "$0: error: Required source file $src/phones.txt is missing." + exit 1 + fi + utils/lang/check_phones_compatible.sh $first_src/phones.txt \ + $src/phones.txt || exit 1 + fi done -sort -m $temp_dir/ali.*.scp > $temp_dir/ali.scp || exit 1; -echo "$0: splitting data to get reference utt2spk for individual ali.JOB.gz files." -utils/split_data.sh $data $num_jobs || exit 1; +# All checks passed, ok to prepare directory. Copy model and other files from +# the first source, as they either checked to be compatible, or we do not care +# if they are. +mkdir -p $dest || exit 1 +rm -f $dest/{cmvn_opts,final.mdl,num_jobs,phones.txt,tree} +$do_ali && rm -f $dest/ali.*.{gz,scp} +$do_lat && rm -f $dest/lat.*.{gz,scp} +cp $first_src/{cmvn_opts,final.mdl,phones.txt,tree} $dest/ || exit 1 +cp $first_src/frame_subsampling_factor $dest/ 2>/dev/null # If present. +echo $nj > $dest/num_jobs || exit 1 + +# Make temporary directory, delete on signal, but not on 'exit 1'. +temp_dir=$(mktemp -d $dest/temp.XXXXXX) || exit 1 +cleanup() { rm -rf "$temp_dir"; } +trap cleanup HUP INT TERM +echo "$0: note: Temporary directory $temp_dir will not be deleted in case of" \ + "script failure, so you could examine it for troubleshooting." + + +# This function may be called twice, once to combine alignments and the second +# time to combine lattices. The two invocations are as follows: +# do_combine ali alignments copy-int-vector $@ +# do_combine lat lattices lattice-copy $@ +# where 'ali'/'lat' is a prefix to archive name, 'alignments'/'lattices' go into +# log messages and logfile names, and 'copy-int-vector'/'lattice-copy' is the +# program used to copy corresponding objects. +do_combine() { + local ark=$1 entities=$2 copy_program=$3 + shift 3 + + echo "$0: Gathering $entities from each source directory." + # Assign all source gzipped archive names to an exported variable, one each + # per source directory, so that we can copy archives in a job per source. + src_id=0 + for src in $@; do + src_id=$((src_id + 1)) + nj_src=$(cat $src/num_jobs) || exit 1 + # Create and export variable src_arcs_${src_id} for the job runner. + # Each numbered variable will contain the list of archives, e. g.: + # src_arcs_1="exp/tri3_ali/ali.1.gz exp/tri3_ali/ali.1.gz ..." + # ('printf' repeats its format as long as there are more arguments). + printf -v src_arks_${src_id} "$src/$ark.%d.gz " $(seq $nj_src) + export src_arks_${src_id} + done -echo "$0: splitting the alignments to appropriate chunks according to the reference utt2spk files." -utils/filter_scps.pl JOB=1:$num_jobs \ - $data/split$num_jobs/JOB/utt2spk $temp_dir/ali.scp $temp_dir/ali.JOB.scp + # Gather archives in parallel jobs. + $cmd JOB=1:$src_id $dest/log/gather_$entities.JOB.log \ + $copy_program \ + "ark:gunzip -c \${src_arks_JOB} |" \ + "ark,scp:$temp_dir/$ark.JOB.ark,$temp_dir/$ark.JOB.scp" || exit 1 + + # Merge (presumed already sorted) scp's into a single script. + sort -m $temp_dir/$ark.*.scp > $temp_dir/$ark.scp || exit 1 + + echo "$0: Splitting combined $entities into $nj archives on speaker boundary." 
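For reference, a minimal sketch of the speaker-respecting split performed in the next step, using hypothetical paths; `utils/split_scp.pl --utt2spk=... --one-based -j N n` writes the n-th of N chunks of an scp while keeping all utterances of a speaker in the same chunk:

```bash
# Hypothetical example: produce the first of 4 speaker-respecting chunks of a
# combined alignment scp and package it as a gzipped archive, mirroring the
# per-job command below (paths under exp/tri3_ali_combined are illustrative).
copy-int-vector \
  "scp:utils/split_scp.pl --utt2spk=data/train/utt2spk --one-based -j 4 1 exp/tri3_ali_combined/temp.abc123/ali.scp |" \
  "ark:| gzip -c > exp/tri3_ali_combined/ali.1.gz"
```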
+ $cmd JOB=1:$nj $dest/log/chop_combined_$entities.JOB.log \ + $copy_program \ + "scp:utils/split_scp.pl --utt2spk=$data/utt2spk --one-based -j $nj JOB $temp_dir/$ark.scp |" \ + "ark:| gzip -c > $dest/$ark.JOB.gz" || exit 1 + + # Get some interesting stats, and signal an error if error threshold exceeded. + n_utt=$(wc -l <$data/utt2spk) + n_ali=$(wc -l <$temp_dir/$ark.scp) + n_ali_no_utt=$(join -j1 -v2 $data/utt2spk $temp_dir/$ark.scp | wc -l) + n_utt_no_ali=$(join -j1 -v1 $data/utt2spk $temp_dir/$ark.scp | wc -l) + n_utt_no_ali_pct=$(perl -e "print int($n_utt_no_ali/$n_utt * 100 + .5);") + echo "$0: Combined $n_ali $entities for $n_utt utterances." \ + "There were $n_utt_no_ali utterances (${n_utt_no_ali_pct}%) without" \ + "$entities, and $n_ali_no_utt $entities not matching any utterance." + + if (( $n_utt_no_ali_pct >= $tolerance )); then + echo "$0: error: Percentage of utterances missing $entities," \ + "${n_utt_no_ali_pct}%, is at or above error tolerance ${tolerance}%." + exit 1 + fi -for i in `seq 1 $num_jobs`; do - copy-int-vector scp:$temp_dir/ali.${i}.scp "ark:|gzip -c >$dest/ali.$i.gz" || exit 1; -done + return 0 +} -echo $num_jobs > $dest/num_jobs || exit 1 +# Do the actual combining. Do not check returned exit code, as +# the function always calls 'exit 1' on failure. +$do_ali && do_combine ali 'alignments' copy-int-vector "$@" +$do_lat && do_combine lat 'lattices' lattice-copy "$@" -echo "$0: checking the alignment files generated have at least 90% of the utterances." -for i in `seq 1 $num_jobs`; do - num_lines=`cat $temp_dir/ali.$i.scp | wc -l` || exit 1; - num_lines_tot=`cat $data/split$num_jobs/$i/utt2spk | wc -l` || exit 1; - python -c "import sys; -percent = 100.0 * float($num_lines) / $num_lines_tot -if percent < 90 : - print ('$dest/ali.$i.gz {0}% utterances missing.'.format(percent))" || exit 1; -done -rm -r $temp_dir 2>/dev/null +# Delete the temporary directory on success. +cleanup -echo "Combined alignments and stored in $dest" +what= +$do_ali && what+='alignments ' +$do_ali && $do_lat && what+='and ' +$do_lat && what+='lattices ' +echo "$0: Stored combined ${what}in $dest" # No period, interferes with + # copy/paste from tty emulator. exit 0 diff --git a/egs/wsj/s5/steps/combine_lat_dirs.sh b/egs/wsj/s5/steps/combine_lat_dirs.sh new file mode 120000 index 00000000000..9cc58c3a616 --- /dev/null +++ b/egs/wsj/s5/steps/combine_lat_dirs.sh @@ -0,0 +1 @@ +combine_ali_dirs.sh \ No newline at end of file diff --git a/egs/wsj/s5/steps/compute_cmvn_stats.sh b/egs/wsj/s5/steps/compute_cmvn_stats.sh index 6e7531394a2..6c05c66a0bc 100755 --- a/egs/wsj/s5/steps/compute_cmvn_stats.sh +++ b/egs/wsj/s5/steps/compute_cmvn_stats.sh @@ -81,7 +81,7 @@ required="$data/feats.scp $data/spk2utt" for f in $required; do if [ ! -f $f ]; then - echo "make_cmvn.sh: no such file $f" + echo "$0: no such file $f" exit 1; fi done diff --git a/egs/wsj/s5/steps/conf/get_ctm_conf.sh b/egs/wsj/s5/steps/conf/get_ctm_conf.sh index 8dbc9f449cd..5ce39b1ddb6 100755 --- a/egs/wsj/s5/steps/conf/get_ctm_conf.sh +++ b/egs/wsj/s5/steps/conf/get_ctm_conf.sh @@ -2,7 +2,8 @@ # Copyright Johns Hopkins University (Author: Daniel Povey) 2012. Apache 2.0. # This script produces CTM files from a decoding directory that has lattices -# present. This version gives you confidence scores. See also steps/get_ctm.sh +# present. This version gives you confidence scores using MBR decoding. +# See also steps/get_ctm.sh # begin configuration section. 
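To illustrate the `--beam` option introduced in steps/conf/get_ctm_conf.sh below (default 5, controlling lattice pruning before MBR decoding), a hedged invocation sketch; the decode directory is hypothetical and the positional arguments are assumed to follow the usual `<data> <lang> <decode-dir>` order of steps/get_ctm.sh:

```bash
# Hypothetical example: produce CTMs with MBR word confidences, pruning the
# lattices with a slightly wider beam than the default of 5 before MBR decoding.
steps/conf/get_ctm_conf.sh --cmd run.pl --beam 6 \
  data/eval2000 data/lang exp/tri4/decode_eval2000
```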
@@ -13,6 +14,7 @@ max_lmwt=20 use_segments=true # if we have a segments file, use it to convert # the segments to be relative to the original files. iter=final +beam=5 # pruning beam before MBR decoding #end configuration section. echo "$0 $@" # Print the command line for logging @@ -21,6 +23,8 @@ echo "$0 $@" # Print the command line for logging . parse_options.sh || exit 1; if [ $# -ne 3 ]; then + echo "This script produces CTM files from a decoding directory that has lattices " + echo "present. This version gives you confidence scores using MBR decoding." echo "Usage: $0 [options] " echo " Options:" echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." @@ -50,6 +54,7 @@ name=`basename $data`; # e.g. eval2000 mkdir -p $dir/scoring/log +frame_shift_opt= if [ -f $dir/../frame_shift ]; then frame_shift_opt="--frame-shift=$(cat $dir/../frame_shift)" echo "$0: $dir/../frame_shift exists, using $frame_shift_opt" @@ -68,10 +73,12 @@ if [ $stage -le 0 ]; then filter_cmd=cat fi + nj=$(cat $dir/num_jobs) + lats=$(for n in $(seq $nj); do echo -n "$dir/lat.$n.gz "; done) if [ -f $lang/phones/word_boundary.int ]; then $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/get_ctm.LMWT.log \ - mkdir -p $dir/score_LMWT/ '&&' \ - lattice-prune --inv-acoustic-scale=LMWT --beam=5 "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ + set -o pipefail '&&' mkdir -p $dir/score_LMWT/ '&&' \ + lattice-prune --inv-acoustic-scale=LMWT --beam=$beam "ark:gunzip -c $lats|" ark:- \| \ lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \ lattice-to-ctm-conf $frame_shift_opt --decode-mbr=true --inv-acoustic-scale=LMWT ark:- - \| \ utils/int2sym.pl -f 5 $lang/words.txt \| \ @@ -82,8 +89,8 @@ if [ $stage -le 0 ]; then exit 1; fi $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/get_ctm.LMWT.log \ - mkdir -p $dir/score_LMWT/ '&&' \ - lattice-prune --inv-acoustic-scale=LMWT --beam=5 "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ + set -o pipefail '&&' mkdir -p $dir/score_LMWT/ '&&' \ + lattice-prune --inv-acoustic-scale=LMWT --beam=$beam "ark:gunzip -c $lats|" ark:- \| \ lattice-align-words-lexicon $lang/phones/align_lexicon.int $model ark:- ark:- \| \ lattice-to-ctm-conf $frame_shift_opt --decode-mbr=true --inv-acoustic-scale=LMWT ark:- - \| \ utils/int2sym.pl -f 5 $lang/words.txt \| \ diff --git a/egs/wsj/s5/steps/copy_ali_dir.sh b/egs/wsj/s5/steps/copy_ali_dir.sh new file mode 100755 index 00000000000..60618a2f4bf --- /dev/null +++ b/egs/wsj/s5/steps/copy_ali_dir.sh @@ -0,0 +1,75 @@ +#!/bin/bash +# Copyright 2019 Phani Sankar Nidadavolu +# Apache 2.0. + +prefixes="reverb1 babble music noise" +include_original=true +max_jobs_run=50 +nj=100 +cmd=queue.pl +write_binary=true + +. ./path.sh +. 
utils/parse_options.sh + +if [ $# -ne 3 ]; then + echo "Usage: $0 " + echo "This script creates alignments for the aug dirs by copying " + echo " the alignments of original train dir" + echo "While copying it adds prefix to the utterances specified by prefixes option" + echo "Note that the original train dir does not have any prefix" + echo "To include the original training directory in the copied " + echo "version set the --include-original option to true" + echo "main options (for others, see top of script file)" + echo " --prefixes # All the prefixes of aug data to be included" + echo " --include-original # If true, will copy the alignements of original dir" + echo " --write-compact # Write lattices in compact mode" + exit 1 +fi + +data=$1 +src_dir=$2 +dir=$3 + +mkdir -p $dir + +num_jobs=$(cat $src_dir/num_jobs) + +rm -f $dir/ali_tmp.*.{ark,scp} 2>/dev/null + +# Copy the alignments temporarily +echo "creating temporary alignments in $dir" +$cmd --max-jobs-run $max_jobs_run JOB=1:$num_jobs $dir/log/copy_ali_temp.JOB.log \ + copy-int-vector --binary=$write_binary \ + "ark:gunzip -c $src_dir/ali.JOB.gz |" \ + ark,scp:$dir/ali_tmp.JOB.ark,$dir/ali_tmp.JOB.scp || exit 1 + +# Make copies of utterances for perturbed data +for p in $prefixes; do + cat $dir/ali_tmp.*.scp | awk -v p=$p '{print p"-"$0}' +done | sort -k1,1 > $dir/ali_out.scp.aug + +if [ "$include_original" == "true" ]; then + cat $dir/ali_tmp.*.scp | awk '{print $0}' | sort -k1,1 > $dir/ali_out.scp.clean + cat $dir/ali_out.scp.clean $dir/ali_out.scp.aug | sort -k1,1 > $dir/ali_out.scp +else + cat $dir/ali_out.scp.aug | sort -k1,1 > $dir/ali_out.scp +fi + +utils/split_data.sh ${data} $nj + +# Copy and dump the lattices for perturbed data +echo Creating alignments for augmented data by copying alignments from clean data +$cmd --max-jobs-run $max_jobs_run JOB=1:$nj $dir/log/copy_out_ali.JOB.log \ + copy-int-vector --binary=$write_binary \ + "scp:utils/filter_scp.pl ${data}/split$nj/JOB/utt2spk $dir/ali_out.scp |" \ + "ark:| gzip -c > $dir/ali.JOB.gz" || exit 1 + +rm $dir/ali_out.scp.{aug,clean} $dir/ali_out.scp +rm $dir/ali_tmp.* + +echo $nj > $dir/num_jobs + +for f in cmvn_opts tree splice_opts phones.txt final.mdl splice_opts tree frame_subsampling_factor; do + if [ -f $src_dir/$f ]; then cp $src_dir/$f $dir/$f; fi +done diff --git a/egs/wsj/s5/steps/copy_lat_dir.sh b/egs/wsj/s5/steps/copy_lat_dir.sh new file mode 100755 index 00000000000..dd1e10fb307 --- /dev/null +++ b/egs/wsj/s5/steps/copy_lat_dir.sh @@ -0,0 +1,74 @@ +#!/bin/bash +# Copyright 2019 Phani Sankar Nidadavolu +# Apache 2.0. + +prefixes="reverb1 babble music noise" +include_original=true +max_jobs_run=50 +nj=100 +cmd=queue.pl +write_compact=true + +. ./path.sh +. 
utils/parse_options.sh + +if [ $# -ne 3 ]; then + echo "Usage: $0 [options] " + echo "This script creates lattices for the aug dirs by copying the lattices of original train dir" + echo "While copying it adds prefix to the utterances specified by prefixes option" + echo "Note that the original train dir does not have any prefix" + echo "To include the original training directory in the copied " + echo "version set the --include-original option to true" + echo "main options (for others, see top of script file)" + echo " --prefixes # All the prefixes of aug data to be included" + echo " --include-original # If true, will copy the lattices of original dir" + echo " --write-compact # Write lattices in compact mode" + exit 1 +fi + +data=$1 +src_dir=$2 +dir=$3 + +mkdir -p $dir + +num_jobs=$(cat $src_dir/num_jobs) + +rm -f $dir/lat_tmp.*.{ark,scp} 2>/dev/null + +# Copy the alignments temporarily +echo "creating temporary lattices in $dir" +$cmd --max-jobs-run $max_jobs_run JOB=1:$num_jobs $dir/log/copy_lat_temp.JOB.log \ + lattice-copy --write-compact=$write_compact \ + "ark:gunzip -c $src_dir/lat.JOB.gz |" \ + ark,scp:$dir/lat_tmp.JOB.ark,$dir/lat_tmp.JOB.scp || exit 1 + +# Make copies of utterances for perturbed data +for p in $prefixes; do + cat $dir/lat_tmp.*.scp | awk -v p=$p '{print p"-"$0}' +done | sort -k1,1 > $dir/lat_out.scp.aug + +if [ "$include_original" == "true" ]; then + cat $dir/lat_tmp.*.scp | awk '{print $0}' | sort -k1,1 > $dir/lat_out.scp.clean + cat $dir/lat_out.scp.clean $dir/lat_out.scp.aug | sort -k1,1 > $dir/lat_out.scp +else + cat $dir/lat_out.scp.aug | sort -k1,1 > $dir/lat_out.scp +fi + +utils/split_data.sh ${data} $nj + +# Copy and dump the lattices for perturbed data +echo Creating lattices for augmented data by copying lattices from clean data +$cmd --max-jobs-run $max_jobs_run JOB=1:$nj $dir/log/copy_out_lat.JOB.log \ + lattice-copy --write-compact=$write_compact \ + "scp:utils/filter_scp.pl ${data}/split$nj/JOB/utt2spk $dir/lat_out.scp |" \ + "ark:| gzip -c > $dir/lat.JOB.gz" || exit 1 + +rm $dir/lat_out.scp.{aug,clean} $dir/lat_out.scp +rm $dir/lat_tmp.* + +echo $nj > $dir/num_jobs + +for f in cmvn_opts splice_opts final.mdl splice_opts tree frame_subsampling_factor; do + if [ -f $src_dir/$f ]; then cp $src_dir/$f $dir/$f; fi +done diff --git a/egs/wsj/s5/steps/data/augment_data_dir.py b/egs/wsj/s5/steps/data/augment_data_dir.py index 7edcdda2636..0274350e133 100755 --- a/egs/wsj/s5/steps/data/augment_data_dir.py +++ b/egs/wsj/s5/steps/data/augment_data_dir.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # Copyright 2017 David Snyder # 2017 Ye Bai +# 2019 Phani Sankar Nidadavolu # Apache 2.0 # # This script generates augmented data. It is based on @@ -10,11 +11,14 @@ from __future__ import print_function import sys, random, argparse, os, imp sys.path.append("steps/data/") -from reverberate_data_dir import ParseFileToDict -from reverberate_data_dir import WriteDictToFile +sys.path.insert(0, 'steps/') + +from reverberate_data_dir import parse_file_to_dict +from reverberate_data_dir import write_dict_to_file +import libs.common as common_lib data_lib = imp.load_source('dml', 'steps/data/data_dir_manipulation_lib.py') -def GetArgs(): +def get_args(): parser = argparse.ArgumentParser(description="Augment the data directory with additive noises. " "Noises are separated into background and foreground noises which are added together or " "separately. 
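As a usage sketch for the new steps/copy_lat_dir.sh above (and the analogous steps/copy_ali_dir.sh), with hypothetical directory names; the positional order `<data> <src-dir> <dir>` is assumed from the script's argument assignments:

```bash
# Hypothetical example: replicate clean-data lattices for an augmented copy of
# the training set whose utterance ids carry the listed prefixes, keeping the
# unprefixed originals as well.
steps/copy_lat_dir.sh --cmd run.pl --nj 40 \
  --prefixes "reverb1 babble music noise" --include-original true \
  data/train_aug exp/tri5a_lats exp/tri5a_lats_aug
```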
Background noises are added to the entire recording, and repeated as necessary " @@ -29,13 +33,29 @@ def GetArgs(): help='When foreground noises are being added, the script will iterate through these SNRs.') parser.add_argument('--bg-snrs', type=str, dest = "bg_snr_str", default = '20:10:0', help='When background noises are being added, the script will iterate through these SNRs.') - parser.add_argument('--num-bg-noises', type=str, dest = "num_bg_noises", default = '1', - help='Number of overlapping background noises that we iterate over. For example, if the input is "1:2:3" then the output wavs will have either 1, 2, or 3 randomly chosen background noises overlapping the entire recording') - parser.add_argument('--fg-interval', type=int, dest = "fg_interval", default = 0, - help='Number of seconds between the end of one foreground noise and the beginning of the next.') - parser.add_argument('--utt-suffix', type=str, dest = "utt_suffix", default = "aug", help='Suffix added to utterance IDs.') - parser.add_argument('--random-seed', type=int, dest = "random_seed", default = 123, help='Random seed.') - + parser.add_argument('--num-bg-noises', type=str, + dest = "num_bg_noises", default = '1', + help='Number of overlapping background noises that we iterate over.' + ' For example, if the input is "1:2:3" then the output wavs will have either ' + '1, 2, or 3 randomly chosen background noises overlapping the entire recording') + parser.add_argument('--fg-interval', type=int, + dest = "fg_interval", default = 0, + help='Number of seconds between the end of one ' + 'foreground noise and the beginning of the next.') + parser.add_argument('--utt-suffix', type=str, + dest = "utt_suffix", default = None, + help='Suffix added to utterance IDs.') + parser.add_argument('--utt-prefix', type=str, + dest = "utt_prefix", default = None, + help='Prefix added to utterance IDs.') + parser.add_argument('--random-seed', type=int, dest = "random_seed", + default = 123, help='Random seed.') + parser.add_argument("--modify-spk-id", type=str, + dest='modify_spk_id', default=False, + action=common_lib.StrToBoolAction, + choices=["true", "false"], + help='Utt prefix or suffix would be added to the spk id ' + 'also (used in ASR), in speaker id it is left unmodifed') parser.add_argument("--bg-noise-dir", type=str, dest="bg_noise_dir", help="Background noise data directory") parser.add_argument("--fg-noise-dir", type=str, dest="fg_noise_dir", @@ -45,10 +65,23 @@ def GetArgs(): print(' '.join(sys.argv)) args = parser.parse_args() - args = CheckArgs(args) + args = check_args(args) return args -def CheckArgs(args): +def check_args(args): + # Check args + if args.utt_suffix is None and args.utt_prefix is None: + args.utt_modifier_type = None + args.utt_modifier = "" + elif args.utt_suffix is None and args.utt_prefix is not None: + args.utt_modifier_type = "prefix" + args.utt_modifier = args.utt_prefix + elif args.utt_suffix is not None and args.utt_prefix is None: + args.utt_modifier_type = "suffix" + args.utt_modifier = args.utt_suffix + else: + raise Exception("Trying to add both prefix and suffix. 
Choose either of them") + if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) if not args.fg_interval >= 0: @@ -57,7 +90,7 @@ def CheckArgs(args): raise Exception("Either --fg-noise-dir or --bg-noise-dir must be specified") return args -def GetNoiseList(noise_wav_scp_filename): +def get_noise_list(noise_wav_scp_filename): noise_wav_scp_file = open(noise_wav_scp_filename, 'r').readlines() noise_wavs = {} noise_utts = [] @@ -68,7 +101,7 @@ def GetNoiseList(noise_wav_scp_filename): noise_wavs[toks[0]] = wav.rstrip() return noise_utts, noise_wavs -def AugmentWav(utt, wav, dur, fg_snr_opts, bg_snr_opts, fg_noise_utts, \ +def augment_wav(utt, wav, dur, fg_snr_opts, bg_snr_opts, fg_noise_utts, \ bg_noise_utts, noise_wavs, noise2dur, interval, num_opts): # This section is common to both foreground and background noises new_wav = "" @@ -117,25 +150,59 @@ def AugmentWav(utt, wav, dur, fg_snr_opts, bg_snr_opts, fg_noise_utts, \ + start_times_str + " " + snrs_str + " - - |" return new_wav -def CopyFileIfExists(utt_suffix, filename, input_dir, output_dir): - if os.path.isfile(input_dir + "/" + filename): - dict = ParseFileToDict(input_dir + "/" + filename, +def get_new_id(utt, utt_modifier_type, utt_modifier): + """ This function generates a new id from the input id + This is needed when we have to create multiple copies of the original data + E.g. get_new_id("swb0035", prefix="rvb", copy=1) returns a string "rvb1_swb0035" + """ + if utt_modifier_type == "suffix" and len(utt_modifier) > 0: + new_utt = utt + "-" + utt_modifier + elif utt_modifier_type == "prefix" and len(utt_modifier) > 0: + new_utt = utt_modifier + "-" + utt + else: + new_utt = utt + + return new_utt + +def copy_file_if_exists(input_file, output_file, utt_modifier_type, + utt_modifier, fields=[0]): + if os.path.isfile(input_file): + clean_dict = parse_file_to_dict(input_file, value_processor = lambda x: " ".join(x)) - if len(utt_suffix) > 0: - new_dict = {} - for key in dict.keys(): - new_dict[key + "-" + utt_suffix] = dict[key] - dict = new_dict - WriteDictToFile(dict, output_dir + "/" + filename) + new_dict = {} + for key in clean_dict.keys(): + modified_key = get_new_id(key, utt_modifier_type, utt_modifier) + if len(fields) > 1: + values = clean_dict[key].split(" ") + modified_values = values + for idx in range(1, len(fields)): + modified_values[idx-1] = get_new_id(values[idx-1], + utt_modifier_type, utt_modifier) + new_dict[modified_key] = " ".join(modified_values) + else: + new_dict[modified_key] = clean_dict[key] + write_dict_to_file(new_dict, output_file) + +def create_augmented_utt2uniq(input_dir, output_dir, + utt_modifier_type, utt_modifier): + clean_utt2spk_file = input_dir + "/utt2spk" + clean_utt2spk_dict = parse_file_to_dict(clean_utt2spk_file, + value_processor = lambda x: " ".join(x)) + augmented_utt2uniq_dict = {} + for key in clean_utt2spk_dict.keys(): + modified_key = get_new_id(key, utt_modifier_type, utt_modifier) + augmented_utt2uniq_dict[modified_key] = key + write_dict_to_file(augmented_utt2uniq_dict, output_dir + "/utt2uniq") def main(): - args = GetArgs() - fg_snrs = [int(i) for i in args.fg_snr_str.split(":")] - bg_snrs = [int(i) for i in args.bg_snr_str.split(":")] + args = get_args() input_dir = args.input_dir output_dir = args.output_dir + + fg_snrs = [int(i) for i in args.fg_snr_str.split(":")] + bg_snrs = [int(i) for i in args.bg_snr_str.split(":")] num_bg_noises = [int(i) for i in args.num_bg_noises.split(":")] - reco2dur = ParseFileToDict(input_dir + "/reco2dur", + reco2dur = 
parse_file_to_dict(input_dir + "/reco2dur", value_processor = lambda x: float(x[0])) wav_scp_file = open(input_dir + "/wav.scp", 'r').readlines() @@ -147,18 +214,18 @@ def main(): # Load background noises if args.bg_noise_dir: bg_noise_wav_filename = args.bg_noise_dir + "/wav.scp" - bg_noise_utts, bg_noise_wavs = GetNoiseList(bg_noise_wav_filename) - bg_noise_reco2dur = ParseFileToDict(args.bg_noise_dir + "/reco2dur", + bg_noise_utts, bg_noise_wavs = get_noise_list(bg_noise_wav_filename) + bg_noise_reco2dur = parse_file_to_dict(args.bg_noise_dir + "/reco2dur", value_processor = lambda x: float(x[0])) noise_wavs.update(bg_noise_wavs) noise_reco2dur.update(bg_noise_reco2dur) - # Load background noises + # Load foreground noises if args.fg_noise_dir: fg_noise_wav_filename = args.fg_noise_dir + "/wav.scp" fg_noise_reco2dur_filename = args.fg_noise_dir + "/reco2dur" - fg_noise_utts, fg_noise_wavs = GetNoiseList(fg_noise_wav_filename) - fg_noise_reco2dur = ParseFileToDict(args.fg_noise_dir + "/reco2dur", + fg_noise_utts, fg_noise_wavs = get_noise_list(fg_noise_wav_filename) + fg_noise_reco2dur = parse_file_to_dict(args.fg_noise_dir + "/reco2dur", value_processor = lambda x: float(x[0])) noise_wavs.update(fg_noise_wavs) noise_reco2dur.update(fg_noise_reco2dur) @@ -173,24 +240,58 @@ def main(): utt = toks[0] wav = " ".join(toks[1:]) dur = reco2dur[utt] - new_wav = AugmentWav(utt, wav, dur, fg_snrs, bg_snrs, fg_noise_utts, + new_wav = augment_wav(utt, wav, dur, fg_snrs, bg_snrs, fg_noise_utts, bg_noise_utts, noise_wavs, noise_reco2dur, args.fg_interval, num_bg_noises) - new_utt = utt + "-" + args.utt_suffix + + new_utt = get_new_id(utt, args.utt_modifier_type, args.utt_modifier) + new_utt2wav[new_utt] = new_wav if not os.path.exists(output_dir): os.makedirs(output_dir) - WriteDictToFile(new_utt2wav, output_dir + "/wav.scp") - CopyFileIfExists(args.utt_suffix, "reco2dur", input_dir, output_dir) - CopyFileIfExists(args.utt_suffix, "utt2dur", input_dir, output_dir) - CopyFileIfExists(args.utt_suffix, "utt2spk", input_dir, output_dir) - CopyFileIfExists(args.utt_suffix, "utt2lang", input_dir, output_dir) - CopyFileIfExists(args.utt_suffix, "text", input_dir, output_dir) - CopyFileIfExists(args.utt_suffix, "utt2spk", input_dir, output_dir) - CopyFileIfExists(args.utt_suffix, "vad.scp", input_dir, output_dir) - CopyFileIfExists("", "spk2gender", input_dir, output_dir) + write_dict_to_file(new_utt2wav, output_dir + "/wav.scp") + copy_file_if_exists(input_dir + "/reco2dur", output_dir + "/reco2dur", + args.utt_modifier_type, args.utt_modifier) + copy_file_if_exists(input_dir + "/utt2dur", output_dir + "/utt2dur", + args.utt_modifier_type, args.utt_modifier) + + # Check whether to modify the speaker id or not while creating utt2spk file + fields = ([0, 1] if args.modify_spk_id else [0]) + copy_file_if_exists(input_dir + "/utt2spk", output_dir + "/utt2spk", + args.utt_modifier_type, args.utt_modifier, fields=fields) + copy_file_if_exists(input_dir + "/utt2lang", output_dir + "/utt2lang", + args.utt_modifier_type, args.utt_modifier) + copy_file_if_exists(input_dir + "/utt2num_frames", output_dir + "/utt2num_frames", + args.utt_modifier_type, args.utt_modifier) + copy_file_if_exists(input_dir + "/text", output_dir + "/text", args.utt_modifier_type, + args.utt_modifier) + copy_file_if_exists(input_dir + "/segments", output_dir + "/segments", + args.utt_modifier_type, args.utt_modifier, fields=[0, 1]) + copy_file_if_exists(input_dir + "/vad.scp", output_dir + "/vad.scp", + args.utt_modifier_type, 
args.utt_modifier) + copy_file_if_exists(input_dir + "/reco2file_and_channel", + output_dir + "/reco2file_and_channel", + args.utt_modifier_type, args.utt_modifier, fields=[0, 1]) + + if args.modify_spk_id: + copy_file_if_exists(input_dir + "/spk2gender", output_dir + "/spk2gender", + args.utt_modifier_type, args.utt_modifier) + else: + copy_file_if_exists(input_dir + "/spk2gender", output_dir + "/spk2gender", None, "") + + # Create utt2uniq file + if os.path.isfile(input_dir + "/utt2uniq"): + copy_file_if_exists(input_dir + "/utt2uniq", output_dir + "/utt2uniq", + args.utt_modifier_type, args.utt_modifier, fields=[0]) + else: + create_augmented_utt2uniq(input_dir, output_dir, + args.utt_modifier_type, args.utt_modifier) + + data_lib.RunKaldiCommand("utils/utt2spk_to_spk2utt.pl <{output_dir}/utt2spk >{output_dir}/spk2utt" + .format(output_dir = output_dir)) + data_lib.RunKaldiCommand("utils/fix_data_dir.sh {output_dir}".format(output_dir = output_dir)) if __name__ == "__main__": diff --git a/egs/wsj/s5/steps/data/make_musan.py b/egs/wsj/s5/steps/data/make_musan.py new file mode 100755 index 00000000000..9165fd7e522 --- /dev/null +++ b/egs/wsj/s5/steps/data/make_musan.py @@ -0,0 +1,178 @@ +#!/usr/bin/env python3 +# Copyright 2015 David Snyder +# 2019 Phani Sankar Nidadavolu +# Apache 2.0. +# +# This file is meant to be invoked by make_musan.sh. + +import os, sys, argparse +sys.path.append("steps/data/") +sys.path.insert(0, 'steps/') +import libs.common as common_lib + +def get_args(): + parser = argparse.ArgumentParser(description="Create MUSAN corpus", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("--use-vocals", type=str, + dest='use_vocals', default=True, + action=common_lib.StrToBoolAction, + choices=["true", "false"], + help='use vocals from the music corpus') + parser.add_argument('--sampling-rate', type=int, default=16000, + help="Sampling rate of the source data. If a positive integer is specified with this option, " + "the MUSAN corpus will be resampled to the rate of the source data." + "Original MUSAN corpus is sampled at 16KHz. 
Defaults to 16000 Hz") + parser.add_argument("in_dir", help="Input data directory") + parser.add_argument("out_dir", help="Output data directory") + + print(' '.join(sys.argv)) + args = parser.parse_args() + args = check_args(args) + + return args + +def check_args(args): + if not os.path.exists(args.in_dir): + raise Exception('input dir {0} does not exist'.format(args.in_dir)) + if not os.path.exists(args.out_dir): + print("Preparing {0}/musan...".format(args.out_dir)) + os.makedirs(args.out_dir) + + return args + +def process_music_annotations(path): + utt2spk = {} + utt2vocals = {} + lines = open(path, 'r').readlines() + for line in lines: + utt, genres, vocals, musician = line.rstrip().split()[:4] + # For this application, the musican ID isn't important + utt2spk[utt] = utt + utt2vocals[utt] = vocals == "Y" + return utt2spk, utt2vocals + +def prepare_music(root_dir, use_vocals, sampling_rate): + utt2vocals = {} + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + music_dir = os.path.join(root_dir, "music") + for root, dirs, files in os.walk(music_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + elif str(file) == "ANNOTATIONS": + utt2spk_part, utt2vocals_part = process_music_annotations(file_path) + utt2spk.update(utt2spk_part) + utt2vocals.update(utt2vocals_part) + + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2vocals: + if utt in utt2wav: + if use_vocals or not utt2vocals[utt]: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + if sampling_rate == 16000: + utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" + else: + utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r" \ + " {fs} -t wav - |\n".format(fs=sampling_rate) + num_good_files += 1 + else: + print("Missing file {}".format(utt)) + num_bad_files += 1 + print("In music directory, processed {} files; {} had missing wav data".format( + num_good_files, num_bad_files)) + return utt2spk_str, utt2wav_str + + +def prepare_speech(root_dir, sampling_rate): + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + speech_dir = os.path.join(root_dir, "speech") + for root, dirs, files in os.walk(speech_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + utt2spk[utt] = utt + + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2spk: + if utt in utt2wav: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + if sampling_rate == 16000: + utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" + else: + utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r" \ + " {fs} -t wav - |\n".format(fs=sampling_rate) + num_good_files += 1 + else: + print("Missing file {}".format(utt)) + num_bad_files += 1 + print("In speech directory, processed {} files; {} had missing wav data".format( + num_good_files, num_bad_files)) + return utt2spk_str, utt2wav_str + + +def prepare_noise(root_dir, sampling_rate): + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + noise_dir = os.path.join(root_dir, "noise") + for root, dirs, files in os.walk(noise_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + utt2spk[utt] = utt + + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2spk: + if utt in utt2wav: + utt2spk_str 
= utt2spk_str + utt + " " + utt2spk[utt] + "\n" + if sampling_rate == 16000: + utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" + else: + utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r" \ + " {fs} -t wav - |\n".format(fs=sampling_rate) + num_good_files += 1 + else: + print("Missing file {}".format(utt)) + num_bad_files += 1 + print("In noise directory, processed {} files; {} had missing wav data".format( + num_good_files, num_bad_files)) + return utt2spk_str, utt2wav_str + + +def main(): + args = get_args() + in_dir = args.in_dir + out_dir = args.out_dir + use_vocals = args.use_vocals + sampling_rate = args.sampling_rate + + utt2spk_music, utt2wav_music = prepare_music(in_dir, use_vocals, sampling_rate) + utt2spk_speech, utt2wav_speech = prepare_speech(in_dir, sampling_rate) + utt2spk_noise, utt2wav_noise = prepare_noise(in_dir, sampling_rate) + + utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise + utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise + wav_fi = open(os.path.join(out_dir, "wav.scp"), 'w') + wav_fi.write(utt2wav) + utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), 'w') + utt2spk_fi.write(utt2spk) + + +if __name__=="__main__": + main() diff --git a/egs/wsj/s5/steps/data/make_musan.sh b/egs/wsj/s5/steps/data/make_musan.sh new file mode 100755 index 00000000000..40ec9b9a279 --- /dev/null +++ b/egs/wsj/s5/steps/data/make_musan.sh @@ -0,0 +1,71 @@ +#!/bin/bash +# Copyright 2015 David Snyder +# 2019 Phani Sankar Nidadavolu +# Apache 2.0. +# +# This script creates the MUSAN data directory. +# Consists of babble, music and noise files. +# Used to create augmented data +# The required dataset is freely available at http://www.openslr.org/17/ + +# The corpus can be cited as follows: +# @misc{musan2015, +# author = {David Snyder and Guoguo Chen and Daniel Povey}, +# title = {{MUSAN}: {A} {M}usic, {S}peech, and {N}oise {C}orpus}, +# year = {2015}, +# eprint = {1510.08484}, +# note = {arXiv:1510.08484v1} +# } + +set -e +use_vocals=true +sampling_rate=16000 +stage=0 + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. 
parse_options.sh || exit 1; + +if [ $# -ne 2 ]; then + echo USAGE: $0 input_dir output_dir + echo input_dir is the path where the MUSAN corpus is located + echo e.g: $0 /export/corpora/JHU/musan data + echo "main options (for others, see top of script file)" + echo " --sampling-rate # Sampling frequency of source dir" + echo " --use-vocals # Use vocals from music portion of MUSAN corpus" + exit 1; +fi + +in_dir=$1 +data_dir=$2 + +mkdir -p local/musan.tmp + +# The below script will create the musan corpus +steps/data/make_musan.py --use-vocals ${use_vocals} \ + --sampling-rate ${sampling_rate} \ + ${in_dir} ${data_dir}/musan || exit 1; + +utils/fix_data_dir.sh ${data_dir}/musan + +grep "music" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_music +grep "speech" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_speech +grep "noise" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_noise + +utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_music \ + ${data_dir}/musan ${data_dir}/musan_music +utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_speech \ + ${data_dir}/musan ${data_dir}/musan_speech +utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_noise \ + ${data_dir}/musan ${data_dir}/musan_noise + +utils/fix_data_dir.sh ${data_dir}/musan_music +utils/fix_data_dir.sh ${data_dir}/musan_speech +utils/fix_data_dir.sh ${data_dir}/musan_noise + +rm -rf local/musan.tmp + +for name in speech noise music; do + utils/data/get_reco2dur.sh ${data_dir}/musan_${name} +done diff --git a/egs/wsj/s5/steps/data/reverberate_data_dir.py b/egs/wsj/s5/steps/data/reverberate_data_dir.py index 189f4619ddb..64112032cdc 100755 --- a/egs/wsj/s5/steps/data/reverberate_data_dir.py +++ b/egs/wsj/s5/steps/data/reverberate_data_dir.py @@ -1,15 +1,15 @@ #!/usr/bin/env python3 # Copyright 2016 Tom Ko # 2018 David Snyder +# 2019 Phani Sankar Nidadavolu # Apache 2.0 # script to generate reverberated data -# we're using python 3.x style print but want it to work in python 2.x, import argparse, shlex, glob, math, os, random, sys, warnings, copy, imp, ast data_lib = imp.load_source('dml', 'steps/data/data_dir_manipulation_lib.py') -def GetArgs(): +def get_args(): # we add required arguments as named arguments for readability parser = argparse.ArgumentParser(description="Reverberate the data directory with an option " "to add isotropic and point source noises. " @@ -79,15 +79,11 @@ def GetArgs(): print(' '.join(sys.argv)) args = parser.parse_args() - args = CheckArgs(args) + args = check_args(args) return args -def CheckArgs(args): - if not os.path.exists(args.output_dir): - os.makedirs(args.output_dir) - - ## Check arguments +def check_args(args): if args.prefix is None: if args.num_replicas > 1 or args.include_original_data == "true": args.prefix = "rvb" @@ -121,43 +117,46 @@ def CheckArgs(args): class list_cyclic_iterator(object): - def __init__(self, list): - self.list_index = 0 - self.list = list - random.shuffle(self.list) - - def __next__(self): - item = self.list[self.list_index] - self.list_index = (self.list_index + 1) % len(self.list) - return item - - next = __next__ # for Python 2 - -# This functions picks an item from the collection according to the associated probability distribution. -# The probability estimate of each item in the collection is stored in the "probability" field of -# the particular item. 
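A hedged usage sketch for the new steps/data/make_musan.sh above, following its own usage message; the corpus path is the example path from that message, and the 8 kHz rate is an illustrative choice for telephone-bandwidth source data:

```bash
# Hypothetical example: prepare data/musan plus the musan_music, musan_speech
# and musan_noise subsets, resampling MUSAN from its native 16 kHz to 8 kHz.
steps/data/make_musan.sh --sampling-rate 8000 --use-vocals true \
  /export/corpora/JHU/musan data
```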
x : a collection (list or dictionary) where the values contain a field called probability -def PickItemWithProbability(x): - if isinstance(x, dict): - plist = list(set(x.values())) - else: - plist = x - total_p = sum(item.probability for item in plist) - p = random.uniform(0, total_p) - accumulate_p = 0 - for item in plist: - if accumulate_p + item.probability >= p: - return item - accumulate_p += item.probability - assert False, "Shouldn't get here as the accumulated probability should always equal to 1" - - -# This function parses a file and pack the data into a dictionary -# It is useful for parsing file like wav.scp, utt2spk, text...etc -def ParseFileToDict(file, assert2fields = False, value_processor = None): + def __init__(self, list): + self.list_index = 0 + self.list = list + random.shuffle(self.list) + + def __next__(self): + item = self.list[self.list_index] + self.list_index = (self.list_index + 1) % len(self.list) + return item + + next = __next__ # for Python 2 + +def pick_item_with_probability(x): + """ This functions picks an item from the collection according to the associated + probability distribution. The probability estimate of each item in the collection + is stored in the "probability" field of the particular item. x : a + collection (list or dictionary) where the values contain a field called probability + """ + if isinstance(x, dict): + plist = list(set(x.values())) + else: + plist = x + total_p = sum(item.probability for item in plist) + p = random.uniform(0, total_p) + accumulate_p = 0 + for item in plist: + if accumulate_p + item.probability >= p: + return item + accumulate_p += item.probability + assert False, "Shouldn't get here as the accumulated probability should always equal to 1" + + +def parse_file_to_dict(file, assert2fields = False, value_processor = None): + """ This function parses a file and pack the data into a dictionary + It is useful for parsing file like wav.scp, utt2spk, text...etc + """ if value_processor is None: value_processor = lambda x: x[0] dict = {} - for line in open(file, 'r'): + for line in open(file, 'r', encoding='utf-8'): parts = line.split() if assert2fields: assert(len(parts) == 2) @@ -165,9 +164,10 @@ def ParseFileToDict(file, assert2fields = False, value_processor = None): dict[parts[0]] = value_processor(parts[1:]) return dict -# This function creates a file and write the content of a dictionary into it -def WriteDictToFile(dict, file_name): - file = open(file_name, 'w') +def write_dict_to_file(dict, file_name): + """ This function creates a file and write the content of a dictionary into it + """ + file = open(file_name, 'w', encoding='utf-8') keys = sorted(dict.keys()) for key in keys: value = dict[key] @@ -180,11 +180,12 @@ def WriteDictToFile(dict, file_name): file.close() -# This function creates the utt2uniq file from the utterance id in utt2spk file -def CreateCorruptedUtt2uniq(input_dir, output_dir, num_replicas, include_original, prefix): +def create_corrupted_utt2uniq(input_dir, output_dir, num_replicas, include_original, prefix): + """This function creates the utt2uniq file from the utterance id in utt2spk file + """ corrupted_utt2uniq = {} # Parse the utt2spk to get the utterance id - utt2spk = ParseFileToDict(input_dir + "/utt2spk", value_processor = lambda x: " ".join(x)) + utt2spk = parse_file_to_dict(input_dir + "/utt2spk", value_processor = lambda x: " ".join(x)) keys = sorted(utt2spk.keys()) if include_original: start_index = 0 @@ -193,13 +194,13 @@ def CreateCorruptedUtt2uniq(input_dir, output_dir, 
num_replicas, include_origina for i in range(start_index, num_replicas+1): for utt_id in keys: - new_utt_id = GetNewId(utt_id, prefix, i) + new_utt_id = get_new_id(utt_id, prefix, i) corrupted_utt2uniq[new_utt_id] = utt_id - WriteDictToFile(corrupted_utt2uniq, output_dir + "/utt2uniq") + write_dict_to_file(corrupted_utt2uniq, output_dir + "/utt2uniq") -def AddPointSourceNoise(noise_addition_descriptor, # descriptor to store the information of the noise added +def add_point_source_noise(noise_addition_descriptor, # descriptor to store the information of the noise added room, # the room selected pointsource_noise_list, # the point source noise list pointsource_noise_addition_probability, # Probability of adding point-source noises @@ -211,8 +212,8 @@ def AddPointSourceNoise(noise_addition_descriptor, # descriptor to store the in if len(pointsource_noise_list) > 0 and random.random() < pointsource_noise_addition_probability and max_noises_recording >= 1: for k in range(random.randint(1, max_noises_recording)): # pick the RIR to reverberate the point-source noise - noise = PickItemWithProbability(pointsource_noise_list) - noise_rir = PickItemWithProbability(room.rir_list) + noise = pick_item_with_probability(pointsource_noise_list) + noise_rir = pick_item_with_probability(room.rir_list) # If it is a background noise, the noise will be extended and be added to the whole speech # if it is a foreground noise, the noise will not extended and be added at a random time of the speech if noise.bg_fg_type == "background": @@ -233,10 +234,7 @@ def AddPointSourceNoise(noise_addition_descriptor, # descriptor to store the in return noise_addition_descriptor -# This function randomly decides whether to reverberate, and sample a RIR if it does -# It also decides whether to add the appropriate noises -# This function return the string of options to the binary wav-reverberate -def GenerateReverberationOpts(room_dict, # the room dictionary, please refer to MakeRoomDict() for the format +def generate_reverberation_opts(room_dict, # the room dictionary, please refer to make_room_dict() for the format pointsource_noise_list, # the point source noise list iso_noise_dict, # the isotropic noise dictionary foreground_snrs, # the SNR for adding the foreground noises @@ -247,15 +245,19 @@ def GenerateReverberationOpts(room_dict, # the room dictionary, please refer to speech_dur, # duration of the recording max_noises_recording # Maximum number of point-source noises that can be added ): + """ This function randomly decides whether to reverberate, and sample a RIR if it does + It also decides whether to add the appropriate noises + This function return the string of options to the binary wav-reverberate + """ reverberate_opts = "" noise_addition_descriptor = {'noise_io': [], 'start_times': [], 'snrs': []} # Randomly select the room # Here the room probability is a sum of the probabilities of the RIRs recorded in the room. 
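For concreteness, a hedged sketch (all file names hypothetical) of the kind of `wav-reverberate` pipeline command that these options are assembled into for the corrupted wav.scp, following the generic form quoted in `generate_reverberated_wav_scp`'s docstring:

```bash
# Hypothetical example: reverberate the input with one RIR and mix in two
# point-source noises at the given start times and SNRs.
wav-reverberate --shift-output=true \
  --impulse-response="rirs/Room001-00001.wav" \
  --additive-signals='noise1.wav,noise2.wav' \
  --start-times='0,7.2' --snrs='15,10' \
  input.wav output.wav
```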
- room = PickItemWithProbability(room_dict) + room = pick_item_with_probability(room_dict) # Randomly select the RIR in the room - speech_rir = PickItemWithProbability(room.rir_list) + speech_rir = pick_item_with_probability(room.rir_list) if random.random() < speech_rvb_probability: # pick the RIR to reverberate the speech reverberate_opts += """--impulse-response="{0}" """.format(speech_rir.rir_rspecifier) @@ -265,7 +267,7 @@ def GenerateReverberationOpts(room_dict, # the room dictionary, please refer to rir_iso_noise_list = iso_noise_dict[speech_rir.room_id] # Add the corresponding isotropic noise associated with the selected RIR if len(rir_iso_noise_list) > 0 and random.random() < isotropic_noise_addition_probability: - isotropic_noise = PickItemWithProbability(rir_iso_noise_list) + isotropic_noise = pick_item_with_probability(rir_iso_noise_list) # extend the isotropic noise to the length of the speech waveform # check if the rspecifier is a pipe or not if len(isotropic_noise.noise_rspecifier.split()) == 1: @@ -275,7 +277,7 @@ def GenerateReverberationOpts(room_dict, # the room dictionary, please refer to noise_addition_descriptor['start_times'].append(0) noise_addition_descriptor['snrs'].append(next(background_snrs)) - noise_addition_descriptor = AddPointSourceNoise(noise_addition_descriptor, # descriptor to store the information of the noise added + noise_addition_descriptor = add_point_source_noise(noise_addition_descriptor, # descriptor to store the information of the noise added room, # the room selected pointsource_noise_list, # the point source noise list pointsource_noise_addition_probability, # Probability of adding point-source noises @@ -294,26 +296,23 @@ def GenerateReverberationOpts(room_dict, # the room dictionary, please refer to return reverberate_opts -# This function generates a new id from the input id -# This is needed when we have to create multiple copies of the original data -# E.g. GetNewId("swb0035", prefix="rvb", copy=1) returns a string "rvb1_swb0035" -def GetNewId(id, prefix=None, copy=0): +def get_new_id(id, prefix=None, copy=0): + """ This function generates a new id from the input id + This is needed when we have to create multiple copies of the original data + E.g. 
get_new_id("swb0035", prefix="rvb", copy=1) returns a string "rvb1-swb0035" + """ if prefix is not None: - new_id = prefix + str(copy) + "_" + id + new_id = prefix + str(copy) + "-" + id else: new_id = id return new_id -# This is the main function to generate pipeline command for the corruption -# The generic command of wav-reverberate will be like: -# wav-reverberate --duration=t --impulse-response=rir.wav -# --additive-signals='noise1.wav,noise2.wav' --snrs='snr1,snr2' --start-times='s1,s2' input.wav output.wav -def GenerateReverberatedWavScp(wav_scp, # a dictionary whose values are the Kaldi-IO strings of the speech recordings +def generate_reverberated_wav_scp(wav_scp, # a dictionary whose values are the Kaldi-IO strings of the speech recordings durations, # a dictionary whose values are the duration (in sec) of the speech recordings output_dir, # output directory to write the corrupted wav.scp - room_dict, # the room dictionary, please refer to MakeRoomDict() for the format + room_dict, # the room dictionary, please refer to make_room_dict() for the format pointsource_noise_list, # the point source noise list iso_noise_dict, # the isotropic noise dictionary foreground_snr_array, # the SNR for adding the foreground noises @@ -327,6 +326,11 @@ def GenerateReverberatedWavScp(wav_scp, # a dictionary whose values are the Kal pointsource_noise_addition_probability, # Probability of adding point-source noises max_noises_per_minute # maximum number of point-source noises that can be added to a recording according to its duration ): + """ This is the main function to generate pipeline command for the corruption + The generic command of wav-reverberate will be like: + wav-reverberate --duration=t --impulse-response=rir.wav + --additive-signals='noise1.wav,noise2.wav' --snrs='snr1,snr2' --start-times='s1,s2' input.wav output.wav + """ foreground_snrs = list_cyclic_iterator(foreground_snr_array) background_snrs = list_cyclic_iterator(background_snr_array) corrupted_wav_scp = {} @@ -345,7 +349,7 @@ def GenerateReverberatedWavScp(wav_scp, # a dictionary whose values are the Kal speech_dur = durations[recording_id] max_noises_recording = math.floor(max_noises_per_minute * speech_dur / 60) - reverberate_opts = GenerateReverberationOpts(room_dict, # the room dictionary, please refer to MakeRoomDict() for the format + reverberate_opts = generate_reverberation_opts(room_dict, # the room dictionary, please refer to make_room_dict() for the format pointsource_noise_list, # the point source noise list iso_noise_dict, # the isotropic noise dictionary foreground_snrs, # the SNR for adding the foreground noises @@ -363,16 +367,17 @@ def GenerateReverberatedWavScp(wav_scp, # a dictionary whose values are the Kal else: wav_corrupted_pipe = "{0} wav-reverberate --shift-output={1} {2} - - |".format(wav_original_pipe, shift_output, reverberate_opts) - new_recording_id = GetNewId(recording_id, prefix, i) + new_recording_id = get_new_id(recording_id, prefix, i) corrupted_wav_scp[new_recording_id] = wav_corrupted_pipe - WriteDictToFile(corrupted_wav_scp, output_dir + "/wav.scp") + write_dict_to_file(corrupted_wav_scp, output_dir + "/wav.scp") -# This function replicate the entries in files like segments, utt2spk, text -def AddPrefixToFields(input_file, output_file, num_replicas, include_original, prefix, field = [0]): - list = [x.strip() for x in open(input_file)] - f = open(output_file, "w") +def add_prefix_to_fields(input_file, output_file, num_replicas, include_original, prefix, field = [0]): + """ This function 
replicate the entries in files like segments, utt2spk, text + """ + list = [x.strip() for x in open(input_file, encoding='utf-8')] + f = open(output_file, "w" ,encoding='utf-8') if include_original: start_index = 0 else: @@ -383,17 +388,16 @@ def AddPrefixToFields(input_file, output_file, num_replicas, include_original, p if len(line) > 0 and line[0] != ';': split1 = line.split() for j in field: - split1[j] = GetNewId(split1[j], prefix, i) + split1[j] = get_new_id(split1[j], prefix, i) print(" ".join(split1), file=f) else: print(line, file=f) f.close() -# This function creates multiple copies of the necessary files, e.g. utt2spk, wav.scp ... -def CreateReverberatedCopy(input_dir, +def create_reverberated_copy(input_dir, output_dir, - room_dict, # the room dictionary, please refer to MakeRoomDict() for the format + room_dict, # the room dictionary, please refer to make_room_dict() for the format pointsource_noise_list, # the point source noise list iso_noise_dict, # the isotropic noise dictionary foreground_snr_string, # the SNR for adding the foreground noises @@ -407,43 +411,48 @@ def CreateReverberatedCopy(input_dir, pointsource_noise_addition_probability, # Probability of adding point-source noises max_noises_per_minute # maximum number of point-source noises that can be added to a recording according to its duration ): - - wav_scp = ParseFileToDict(input_dir + "/wav.scp", value_processor = lambda x: " ".join(x)) + """ This function creates multiple copies of the necessary files, + e.g. utt2spk, wav.scp ... + """ + if not os.path.exists(output_dir): + os.makedirs(output_dir) + wav_scp = parse_file_to_dict(input_dir + "/wav.scp", value_processor = lambda x: " ".join(x)) if not os.path.isfile(input_dir + "/reco2dur"): print("Getting the duration of the recordings..."); data_lib.RunKaldiCommand("utils/data/get_reco2dur.sh {}".format(input_dir)) - durations = ParseFileToDict(input_dir + "/reco2dur", value_processor = lambda x: float(x[0])) + durations = parse_file_to_dict(input_dir + "/reco2dur", value_processor = lambda x: float(x[0])) foreground_snr_array = [float(x) for x in foreground_snr_string.split(':')] background_snr_array = [float(x) for x in background_snr_string.split(':')] - GenerateReverberatedWavScp(wav_scp, durations, output_dir, room_dict, pointsource_noise_list, iso_noise_dict, + generate_reverberated_wav_scp(wav_scp, durations, output_dir, room_dict, pointsource_noise_list, iso_noise_dict, foreground_snr_array, background_snr_array, num_replicas, include_original, prefix, speech_rvb_probability, shift_output, isotropic_noise_addition_probability, pointsource_noise_addition_probability, max_noises_per_minute) - AddPrefixToFields(input_dir + "/utt2spk", output_dir + "/utt2spk", num_replicas, include_original, prefix, field = [0,1]) + add_prefix_to_fields(input_dir + "/utt2spk", output_dir + "/utt2spk", num_replicas, include_original, prefix, field = [0,1]) data_lib.RunKaldiCommand("utils/utt2spk_to_spk2utt.pl <{output_dir}/utt2spk >{output_dir}/spk2utt" .format(output_dir = output_dir)) if os.path.isfile(input_dir + "/utt2uniq"): - AddPrefixToFields(input_dir + "/utt2uniq", output_dir + "/utt2uniq", num_replicas, include_original, prefix, field =[0]) + add_prefix_to_fields(input_dir + "/utt2uniq", output_dir + "/utt2uniq", num_replicas, include_original, prefix, field =[0]) else: # Create the utt2uniq file - CreateCorruptedUtt2uniq(input_dir, output_dir, num_replicas, include_original, prefix) + create_corrupted_utt2uniq(input_dir, output_dir, num_replicas, 
include_original, prefix) if os.path.isfile(input_dir + "/text"): - AddPrefixToFields(input_dir + "/text", output_dir + "/text", num_replicas, include_original, prefix, field =[0]) + add_prefix_to_fields(input_dir + "/text", output_dir + "/text", num_replicas, include_original, prefix, field =[0]) if os.path.isfile(input_dir + "/segments"): - AddPrefixToFields(input_dir + "/segments", output_dir + "/segments", num_replicas, include_original, prefix, field = [0,1]) + add_prefix_to_fields(input_dir + "/segments", output_dir + "/segments", num_replicas, include_original, prefix, field = [0,1]) if os.path.isfile(input_dir + "/reco2file_and_channel"): - AddPrefixToFields(input_dir + "/reco2file_and_channel", output_dir + "/reco2file_and_channel", num_replicas, include_original, prefix, field = [0,1]) + add_prefix_to_fields(input_dir + "/reco2file_and_channel", output_dir + "/reco2file_and_channel", num_replicas, include_original, prefix, field = [0,1]) data_lib.RunKaldiCommand("utils/validate_data_dir.sh --no-feats --no-text {output_dir}" .format(output_dir = output_dir)) -# This function smooths the probability distribution in the list -def SmoothProbabilityDistribution(set_list, smoothing_weight=0.0, target_sum=1.0): +def smooth_probability_distribution(set_list, smoothing_weight=0.0, target_sum=1.0): + """ This function smooths the probability distribution in the list + """ if len(list(set_list)) > 0: num_unspecified = 0 accumulated_prob = 0 @@ -476,10 +485,11 @@ def SmoothProbabilityDistribution(set_list, smoothing_weight=0.0, target_sum=1.0 return set_list -# This function parse the array of rir set parameter strings. -# It will assign probabilities to those rir sets which don't have a probability -# It will also check the existence of the rir list files. -def ParseSetParameterStrings(set_para_array): +def parse_set_parameter_strings(set_para_array): + """ This function parse the array of rir set parameter strings. + It will assign probabilities to those rir sets which don't have a probability + It will also check the existence of the rir list files. 
+ """ set_list = [] for set_para in set_para_array: set = lambda: None @@ -495,14 +505,15 @@ def ParseSetParameterStrings(set_para_array): raise Exception(set.filename + " not found") set_list.append(set) - return SmoothProbabilityDistribution(set_list) + return smooth_probability_distribution(set_list) -# This function creates the RIR list -# Each rir object in the list contains the following attributes: -# rir_id, room_id, receiver_position_id, source_position_id, rt60, drr, probability -# Please refer to the help messages in the parser for the meaning of these attributes -def ParseRirList(rir_set_para_array, smoothing_weight, sampling_rate = None): +def parse_rir_list(rir_set_para_array, smoothing_weight, sampling_rate = None): + """ This function creates the RIR list + Each rir object in the list contains the following attributes: + rir_id, room_id, receiver_position_id, source_position_id, rt60, drr, probability + Please refer to the help messages in the parser for the meaning of these attributes + """ rir_parser = argparse.ArgumentParser() rir_parser.add_argument('--rir-id', type=str, required=True, help='This id is unique for each RIR and the noise may associate with a particular RIR by refering to this id') rir_parser.add_argument('--room-id', type=str, required=True, help='This is the room that where the RIR is generated') @@ -515,7 +526,7 @@ def ParseRirList(rir_set_para_array, smoothing_weight, sampling_rate = None): rir_parser.add_argument('rir_rspecifier', type=str, help="""rir rspecifier, it can be either a filename or a piped command. E.g. data/impulses/Room001-00001.wav or "sox data/impulses/Room001-00001.wav -t wav - |" """) - set_list = ParseSetParameterStrings(rir_set_para_array) + set_list = parse_set_parameter_strings(rir_set_para_array) rir_list = [] for rir_set in set_list: @@ -528,20 +539,23 @@ def ParseRirList(rir_set_para_array, smoothing_weight, sampling_rate = None): else: rir.rir_rspecifier = "{0} sox -t wav - -r {1} -t wav - |".format(rir.rir_rspecifier, sampling_rate) - rir_list += SmoothProbabilityDistribution(current_rir_list, smoothing_weight, rir_set.probability) + rir_list += smooth_probability_distribution(current_rir_list, smoothing_weight, rir_set.probability) return rir_list -# This dunction checks if the inputs are approximately equal assuming they are floats. def almost_equal(value_1, value_2, accuracy = 10**-8): + """ This function checks if the inputs are approximately equal assuming they are floats. + """ return abs(value_1 - value_2) < accuracy -# This function converts a list of RIRs into a dictionary of RIRs indexed by the room-id. -# Its values are objects with two attributes: a local RIR list -# and the probability of the corresponding room -# Please look at the comments at ParseRirList() for the attributes that a RIR object contains -def MakeRoomDict(rir_list): + +def make_room_dict(rir_list): + """ This function converts a list of RIRs into a dictionary of RIRs indexed by the room-id. 
+ Its values are objects with two attributes: a local RIR list + and the probability of the corresponding room + Please look at the comments at parse_rir_list() for the attributes that a RIR object contains + """ room_dict = {} for rir in rir_list: if rir.room_id not in room_dict: @@ -559,15 +573,15 @@ def MakeRoomDict(rir_list): return room_dict - -# This function creates the point-source noise list -# and the isotropic noise dictionary from the noise information file -# The isotropic noise dictionary is indexed by the room -# and its value is the corrresponding isotropic noise list -# Each noise object in the list contains the following attributes: -# noise_id, noise_type, bg_fg_type, room_linkage, probability, noise_rspecifier -# Please refer to the help messages in the parser for the meaning of these attributes -def ParseNoiseList(noise_set_para_array, smoothing_weight, sampling_rate = None): +def parse_noise_list(noise_set_para_array, smoothing_weight, sampling_rate = None): + """ This function creates the point-source noise list + and the isotropic noise dictionary from the noise information file + The isotropic noise dictionary is indexed by the room + and its value is the corrresponding isotropic noise list + Each noise object in the list contains the following attributes: + noise_id, noise_type, bg_fg_type, room_linkage, probability, noise_rspecifier + Please refer to the help messages in the parser for the meaning of these attributes + """ noise_parser = argparse.ArgumentParser() noise_parser.add_argument('--noise-id', type=str, required=True, help='noise id') noise_parser.add_argument('--noise-type', type=str, required=True, help='the type of noise; i.e. isotropic or point-source', choices = ["isotropic", "point-source"]) @@ -579,7 +593,7 @@ def ParseNoiseList(noise_set_para_array, smoothing_weight, sampling_rate = None) noise_parser.add_argument('noise_rspecifier', type=str, help="""noise rspecifier, it can be either a filename or a piped command. E.g. 
type5_noise_cirline_ofc_ambient1.wav or "sox type5_noise_cirline_ofc_ambient1.wav -t wav - |" """) - set_list = ParseSetParameterStrings(noise_set_para_array) + set_list = parse_set_parameter_strings(noise_set_para_array) pointsource_noise_list = [] iso_noise_dict = {} @@ -604,40 +618,42 @@ def ParseNoiseList(noise_set_para_array, smoothing_weight, sampling_rate = None) else: current_pointsource_noise_list.append(noise) - pointsource_noise_list += SmoothProbabilityDistribution(current_pointsource_noise_list, smoothing_weight, noise_set.probability) + pointsource_noise_list += smooth_probability_distribution(current_pointsource_noise_list, smoothing_weight, noise_set.probability) # ensure the point-source noise probabilities sum to 1 - pointsource_noise_list = SmoothProbabilityDistribution(pointsource_noise_list, smoothing_weight, 1.0) + pointsource_noise_list = smooth_probability_distribution(pointsource_noise_list, smoothing_weight, 1.0) if len(pointsource_noise_list) > 0: assert almost_equal(sum(noise.probability for noise in pointsource_noise_list), 1.0) # ensure the isotropic noise source probabilities for a given room sum to 1 for key in iso_noise_dict.keys(): - iso_noise_dict[key] = SmoothProbabilityDistribution(iso_noise_dict[key]) + iso_noise_dict[key] = smooth_probability_distribution(iso_noise_dict[key]) assert almost_equal(sum(noise.probability for noise in iso_noise_dict[key]), 1.0) return (pointsource_noise_list, iso_noise_dict) -def Main(): - args = GetArgs() +def main(): + args = get_args() + random.seed(args.random_seed) - rir_list = ParseRirList(args.rir_set_para_array, args.rir_smoothing_weight, args.source_sampling_rate) + rir_list = parse_rir_list(args.rir_set_para_array, args.rir_smoothing_weight, args.source_sampling_rate) print("Number of RIRs is {0}".format(len(rir_list))) pointsource_noise_list = [] iso_noise_dict = {} if args.noise_set_para_array is not None: - pointsource_noise_list, iso_noise_dict = ParseNoiseList(args.noise_set_para_array, args.noise_smoothing_weight, args.source_sampling_rate) + pointsource_noise_list, iso_noise_dict = parse_noise_list(args.noise_set_para_array, + args.noise_smoothing_weight, + args.source_sampling_rate) print("Number of point-source noises is {0}".format(len(pointsource_noise_list))) print("Number of isotropic noises is {0}".format(sum(len(iso_noise_dict[key]) for key in iso_noise_dict.keys()))) - room_dict = MakeRoomDict(rir_list) + room_dict = make_room_dict(rir_list) if args.include_original_data == "true": include_original = True else: include_original = False - - CreateReverberatedCopy(input_dir = args.input_dir, + create_reverberated_copy(input_dir = args.input_dir, output_dir = args.output_dir, room_dict = room_dict, pointsource_noise_list = pointsource_noise_list, @@ -653,6 +669,9 @@ def Main(): pointsource_noise_addition_probability = args.pointsource_noise_addition_probability, max_noises_per_minute = args.max_noises_per_minute) -if __name__ == "__main__": - Main() + data_lib.RunKaldiCommand("utils/validate_data_dir.sh --no-feats --no-text {output_dir}" + .format(output_dir = args.output_dir)) + +if __name__ == "__main__": + main() diff --git a/egs/wsj/s5/steps/dict/learn_lexicon_bayesian.sh b/egs/wsj/s5/steps/dict/learn_lexicon_bayesian.sh index 042f8f94da4..adff11dd1b4 100755 --- a/egs/wsj/s5/steps/dict/learn_lexicon_bayesian.sh +++ b/egs/wsj/s5/steps/dict/learn_lexicon_bayesian.sh @@ -329,7 +329,7 @@ if [ $stage -le 5 ]; then # Prune away pronunciations which have low acoustic evidence from the first pass of 
lattice alignment. $cmd $dir/lats_iter1/log/prune_pron_candidates.log steps/dict/internal/prune_pron_candidates.py \ --variant-counts-ratio $variant_counts_ratio \ - $dir/lats_iter1/pron_stats.txt $dir/lexicon_phonetic_decoding_pruned.txt $dir/lexiconp_g2p.txt $dir/ref_lexicon.txt \ + $dir/lats_iter1/pron_stats.txt $dir/lexicon_phonetic_decoding.txt $dir/lexiconp_g2p.txt $dir/ref_lexicon.txt \ $dir/lexicon_phonetic_decoding_pruned.txt $dir/lexicon_g2p_pruned.txt # Filter out words which don't appear in the acoustic training data diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index 1a038cc23f2..f230e12e96f 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -269,8 +269,7 @@ def validate_minibatch_size_str(minibatch_size_str): return False # check that the thing before the '=' sign is a positive integer try: - i = b[0] - if i <= 0: + if int(b[0]) <= 0: return False except: return False # not an integer at all. @@ -602,6 +601,16 @@ def get_model_combine_iters(num_iters, num_epochs, return models_to_combine +def get_current_num_jobs(it, num_it, start, step, end): + "Get number of jobs for iteration number 'it' of range('num_it')" + + ideal = float(start) + (end - start) * float(it) / num_it + if ideal < step: + return int(0.5 + ideal) + else: + return int(0.5 + ideal / step) * step + + def get_learning_rate(iter, num_jobs, num_iters, num_archives_processed, num_archives_to_process, initial_effective_lrate, final_effective_lrate): @@ -682,13 +691,11 @@ def remove_model(nnet_dir, iter, num_iters, models_to_combine=None, os.remove(file_name) -def self_test(): - assert halve_minibatch_size_str('64') == '32' - assert halve_minibatch_size_str('64,16:32') == '32,8:16' - assert halve_minibatch_size_str('1') == '1' - assert halve_minibatch_size_str('128=64/256=40,80:100') == '128=32/256=20,40:50' - assert validate_chunk_width('64') - assert validate_chunk_width('64,25,128') +def positive_int(arg): + val = int(arg) + if (val <= 0): + raise argparse.ArgumentTypeError("must be positive int: '%s'" % arg) + return val class CommonParser(object): @@ -845,6 +852,10 @@ def __init__(self, type=int, dest='num_jobs_final', default=8, help="Number of neural net jobs to run in " "parallel at the end of training") + self.parser.add_argument("--trainer.optimization.num-jobs-step", + type=positive_int, metavar='N', dest='num_jobs_step', default=1, + help="""Number of jobs increment, when exceeds this number. For + example, if N=3, the number of jobs may progress as 1, 2, 3, 6, 9...""") self.parser.add_argument("--trainer.optimization.max-models-combine", "--trainer.max-models-combine", type=int, dest='max_models_combine', @@ -983,5 +994,43 @@ def __init__(self, then only failure notifications are sent""") +import unittest + +class SelfTest(unittest.TestCase): + + def test_halve_minibatch_size_str(self): + self.assertEqual('32', halve_minibatch_size_str('64')) + self.assertEqual('32,8:16', halve_minibatch_size_str('64,16:32')) + self.assertEqual('1', halve_minibatch_size_str('1')) + self.assertEqual('128=32/256=20,40:50', halve_minibatch_size_str('128=64/256=40,80:100')) + + + def test_validate_chunk_width(self): + for s in [ '64', '64,25,128' ]: + self.assertTrue(validate_chunk_width(s), s) + + + def test_validate_minibatch_size_str(self): + # Good descriptors. 
+ for s in [ '32', '32,64', '1:32', '1:32,64', '64,1:32', '1:5,10:15', + '128=64:128/256=32,64', '1=2/3=4', '1=1/2=2/3=3/4=4' ]: + self.assertTrue(validate_minibatch_size_str(s), s) + # Bad descriptors. + for s in [ None, 42, (43,), '', '1:', ':2', '3,', ',4', '5:6,', ',7:8', + '9=', '10=10/', '11=11/11', '12=1:2//13=1:3' '14=/15=15', + '16/17=17', '/18=18', '/18', '//19', '/' ]: + self.assertFalse(validate_minibatch_size_str(s), s) + + + def test_get_current_num_jobs(self): + niters = 12 + self.assertEqual([2, 3, 3, 4, 4, 5, 6, 6, 7, 7, 8, 8], + [get_current_num_jobs(i, niters, 2, 1, 9) + for i in range(niters)]) + self.assertEqual([2, 3, 3, 3, 3, 6, 6, 6, 6, 6, 9, 9], + [get_current_num_jobs(i, niters, 2, 3, 9) + for i in range(niters)]) + + if __name__ == '__main__': - _self_test() + unittest.main() diff --git a/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py b/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py index 0ad93e5977d..0de9074517f 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py @@ -186,9 +186,22 @@ def _get_component_dropout(dropout_schedule, data_fraction): def _get_dropout_proportions(dropout_schedule, data_fraction): """Returns dropout proportions based on the dropout_schedule for the - fraction of data seen at this stage of training. + fraction of data seen at this stage of training. Returns a list of + pairs (pattern, dropout_proportion); for instance, it might return + the list ['*', 0.625] meaning a dropout proportion of 0.625 is to + be applied to all dropout components. + Returns None if dropout_schedule is None. + dropout_schedule might be (in the sample case using the default pattern of + '*'): '0.1,0.5@0.5,0.1', meaning a piecewise linear function that starts at + 0.1 when data_fraction=0.0, rises to 0.5 when data_fraction=0.5, and falls + again to 0.1 when data_fraction=1.0. It can also contain space-separated + items of the form 'pattern=schedule', for instance: + '*=0.0,0.5,0.0 lstm.*=0.0,0.3@0.75,0.0' + The more specific patterns should go later, otherwise they will be overridden + by the less specific patterns' commands. + Calls _get_component_dropout() for the different component name patterns in dropout_schedule. @@ -198,6 +211,7 @@ def _get_dropout_proportions(dropout_schedule, data_fraction): See _self_test() for examples. data_fraction: The fraction of data seen until this stage of training. + """ if dropout_schedule is None: return None @@ -213,6 +227,10 @@ def _get_dropout_proportions(dropout_schedule, data_fraction): def get_dropout_edit_string(dropout_schedule, data_fraction, iter_): """Return an nnet3-copy --edits line to modify raw_model_string to set dropout proportions according to dropout_proportions. + E.g. if _dropout_proportions(dropout_schedule, data_fraction) + returns [('*', 0.625)], this will return the string: + "nnet3-copy --edits='set-dropout-proportion name=* proportion=0.625'" + Arguments: dropout_schedule: Value for the --trainer.dropout-schedule option. 
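The schedule string documented above is easiest to see with numbers. As a minimal sketch (this is not the parser in dropout_schedule.py; the breakpoints below are simply the ones the docstring assigns to '0.1,0.5@0.5,0.1'), the proportion at a given data_fraction comes from linear interpolation between neighbouring breakpoints, and the result is wrapped in an nnet3-copy edit of the form shown in the docstring:

```python
# Minimal sketch of the piecewise-linear dropout schedule described above;
# illustrative only, not the code in dropout_schedule.py.  The breakpoints
# encode '0.1,0.5@0.5,0.1': 0.1 at data_fraction 0.0, 0.5 at 0.5, 0.1 at 1.0.
def interpolate_dropout(points, data_fraction):
    """points: sorted list of (data_fraction, proportion) pairs."""
    for (x0, y0), (x1, y1) in zip(points, points[1:]):
        if x0 <= data_fraction <= x1:
            t = (data_fraction - x0) / (x1 - x0)
            return y0 + t * (y1 - y0)
    raise ValueError("data_fraction must lie in [0, 1]")

points = [(0.0, 0.1), (0.5, 0.5), (1.0, 0.1)]
proportion = interpolate_dropout(points, data_fraction=0.75)  # -> 0.3
print("nnet3-copy --edits='set-dropout-proportion name=* proportion=%g'"
      % proportion)
```

When the schedule contains space-separated 'pattern=schedule' items, the same interpolation is applied per pattern, which is why the more specific patterns have to come last.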
diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index 5ac2ed59003..b540423e3cd 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -27,6 +27,7 @@ 'relu-batchnorm-layer' : xlayers.XconfigBasicLayer, 'relu-batchnorm-so-layer' : xlayers.XconfigBasicLayer, 'batchnorm-so-relu-layer' : xlayers.XconfigBasicLayer, + 'batchnorm-layer' : xlayers.XconfigBasicLayer, 'sigmoid-layer' : xlayers.XconfigBasicLayer, 'tanh-layer' : xlayers.XconfigBasicLayer, 'fixed-affine-layer' : xlayers.XconfigFixedAffineLayer, diff --git a/egs/wsj/s5/steps/make_fbank.sh b/egs/wsj/s5/steps/make_fbank.sh index 77c48be2e90..29153458f9b 100755 --- a/egs/wsj/s5/steps/make_fbank.sh +++ b/egs/wsj/s5/steps/make_fbank.sh @@ -1,6 +1,7 @@ #!/bin/bash -# Copyright 2012-2016 Karel Vesely Johns Hopkins University (Author: Daniel Povey) +# Copyright 2012-2016 Karel Vesely +# Copyright 2012-2016 Johns Hopkins University (Author: Daniel Povey) # Apache 2.0 # To be run from .. (one directory up from here) # see ../run.sh for example @@ -10,23 +11,28 @@ nj=4 cmd=run.pl fbank_config=conf/fbank.conf compress=true -write_utt2num_frames=false # if true writes utt2num_frames +write_utt2num_frames=true # If true writes utt2num_frames. +write_utt2dur=true # End configuration section. -echo "$0 $@" # Print the command line for logging +echo "$0 $@" # Print the command line for logging. if [ -f path.sh ]; then . ./path.sh; fi . parse_options.sh || exit 1; if [ $# -lt 1 ] || [ $# -gt 3 ]; then - echo "Usage: $0 [options] [ [] ]"; - echo "e.g.: $0 data/train exp/make_fbank/train mfcc" - echo "Note: defaults to /log, and defaults to /data" - echo "Options: " - echo " --fbank-config # config passed to compute-fbank-feats " - echo " --nj # number of parallel jobs" - echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." - echo " --write-utt2num-frames # If true, write utt2num_frames file." + cat >&2 < [ [] ] + e.g.: $0 data/train +Note: defaults to /log, and + defaults to /data +Options: + --fbank-config # config passed to compute-fbank-feats. + --nj # number of parallel jobs. + --cmd > # how to run jobs. + --write-utt2num-frames # If true, write utt2num_frames file. + --write-utt2dur # If true, write utt2dur file. +EOF exit 1; fi @@ -64,7 +70,7 @@ required="$scp $fbank_config" for f in $required; do if [ ! -f $f ]; then - echo "make_fbank.sh: no such file $f" + echo "$0: no such file $f" exit 1; fi done @@ -91,9 +97,15 @@ else write_num_frames_opt= fi +if $write_utt2dur; then + write_utt2dur_opt="--write-utt2dur=ark,t:$logdir/utt2dur.JOB" +else + write_utt2dur_opt= +fi + if [ -f $data/segments ]; then echo "$0 [info]: segments file exists: using that." - split_segments="" + split_segments= for n in $(seq $nj); do split_segments="$split_segments $logdir/segments.$n" done @@ -103,11 +115,11 @@ if [ -f $data/segments ]; then $cmd JOB=1:$nj $logdir/make_fbank_${name}.JOB.log \ extract-segments scp,p:$scp $logdir/segments.JOB ark:- \| \ - compute-fbank-feats $vtln_opts --verbose=2 --config=$fbank_config ark:- ark:- \| \ + compute-fbank-feats $vtln_opts $write_utt2dur_opt --verbose=2 \ + --config=$fbank_config ark:- ark:- \| \ copy-feats --compress=$compress $write_num_frames_opt ark:- \ ark,scp:$fbankdir/raw_fbank_$name.JOB.ark,$fbankdir/raw_fbank_$name.JOB.scp \ || exit 1; - else echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance." 
split_scps="" @@ -118,39 +130,57 @@ else utils/split_scp.pl $scp $split_scps || exit 1; $cmd JOB=1:$nj $logdir/make_fbank_${name}.JOB.log \ - compute-fbank-feats $vtln_opts --verbose=2 --config=$fbank_config scp,p:$logdir/wav.JOB.scp ark:- \| \ + compute-fbank-feats $vtln_opts $write_utt2dur_opt --verbose=2 \ + --config=$fbank_config scp,p:$logdir/wav.JOB.scp ark:- \| \ copy-feats --compress=$compress $write_num_frames_opt ark:- \ ark,scp:$fbankdir/raw_fbank_$name.JOB.ark,$fbankdir/raw_fbank_$name.JOB.scp \ || exit 1; - fi if [ -f $logdir/.error.$name ]; then - echo "Error producing fbank features for $name:" + echo "$0: Error producing filterbank features for $name:" tail $logdir/make_fbank_${name}.1.log exit 1; fi # concatenate the .scp files together. for n in $(seq $nj); do - cat $fbankdir/raw_fbank_$name.$n.scp || exit 1; -done > $data/feats.scp + cat $fbankdir/raw_fbank_$name.$n.scp || exit 1 +done > $data/feats.scp || exit 1 if $write_utt2num_frames; then for n in $(seq $nj); do - cat $logdir/utt2num_frames.$n || exit 1; + cat $logdir/utt2num_frames.$n || exit 1 done > $data/utt2num_frames || exit 1 - rm $logdir/utt2num_frames.* fi -rm $logdir/wav.*.scp $logdir/segments.* 2>/dev/null +if $write_utt2dur; then + for n in $(seq $nj); do + cat $logdir/utt2dur.$n || exit 1 + done > $data/utt2dur || exit 1 +fi + +# Store frame_shift and fbank_config along with features. +frame_shift=$(perl -ne 'if (/^--frame-shift=(\d+)/) { + printf "%.3f", 0.001 * $1; exit; }' $fbank_config) +echo ${frame_shift:-'0.01'} > $data/frame_shift +mkdir -p $data/conf && cp $fbank_config $data/conf/fbank.conf || exit 1 + +rm $logdir/wav_${name}.*.scp $logdir/segments.* \ + $logdir/utt2num_frames.* $logdir/utt2dur.* 2>/dev/null -nf=`cat $data/feats.scp | wc -l` -nu=`cat $data/utt2spk | wc -l` +nf=$(wc -l < $data/feats.scp) +nu=$(wc -l < $data/utt2spk) if [ $nf -ne $nu ]; then - echo "It seems not all of the feature files were successfully ($nf != $nu);" - echo "consider using utils/fix_data_dir.sh $data" + echo "$0: It seems not all of the feature files were successfully procesed" \ + "($nf != $nu); consider using utils/fix_data_dir.sh $data" +fi + +if (( nf < nu - nu/20 )); then + echo "$0: Less than 95% the features were successfully generated."\ + "Probably a serious error." + exit 1 fi -echo "Succeeded creating filterbank features for $name" +echo "$0: Succeeded creating filterbank features for $name" diff --git a/egs/wsj/s5/steps/make_fbank_pitch.sh b/egs/wsj/s5/steps/make_fbank_pitch.sh index b250128fd03..7f971df54ae 100755 --- a/egs/wsj/s5/steps/make_fbank_pitch.sh +++ b/egs/wsj/s5/steps/make_fbank_pitch.sh @@ -2,7 +2,7 @@ # Copyright 2013 The Shenzhen Key Laboratory of Intelligent Media and Speech, # PKU-HKUST Shenzhen Hong Kong Institution (Author: Wei Shi) -# 2016 Johns Hopkins University (author: Daniel Povey) +# 2016 Johns Hopkins University (Author: Daniel Povey) # Apache 2.0 # Combine filterbank and pitch features together # Note: This file is based on make_fbank.sh and make_pitch_kaldi.sh @@ -15,26 +15,31 @@ pitch_config=conf/pitch.conf pitch_postprocess_config= paste_length_tolerance=2 compress=true -write_utt2num_frames=false # if true writes utt2num_frames +write_utt2num_frames=true # If true writes utt2num_frames. +write_utt2dur=true # End configuration section. -echo "$0 $@" # Print the command line for logging +echo "$0 $@" # Print the command line for logging. if [ -f path.sh ]; then . ./path.sh; fi . 
parse_options.sh || exit 1; if [ $# -lt 1 ] || [ $# -gt 3 ]; then - echo "Usage: $0 [options] [ [] ]"; - echo "e.g.: $0 data/train exp/make_fbank/train mfcc" - echo "Note: defaults to /log, and defaults to /data" - echo "Options: " - echo " --fbank-config # config passed to compute-fbank-feats " - echo " --pitch-config # config passed to compute-kaldi-pitch-feats " - echo " --pitch-postprocess-config # config passed to process-kaldi-pitch-feats " - echo " --paste-length-tolerance # length tolerance passed to paste-feats" - echo " --nj # number of parallel jobs" - echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." - echo " --write-utt2num-frames # If true, write utt2num_frames file." + cat >&2 < [ [] ] + e.g.: $0 data/train +Note: defaults to /log, and + defaults to /data +Options: + --fbank-config # config passed to compute-fbank-feats. + --pitch-config # config passed to compute-kaldi-pitch-feats. + --pitch-postprocess-config # config passed to process-kaldi-pitch-feats. + --paste-length-tolerance # length tolerance passed to paste-feats. + --nj # number of parallel jobs. + --cmd > # how to run jobs. + --write-utt2num-frames # If true, write utt2num_frames file. + --write-utt2dur # If true, write utt2dur file. +EOF exit 1; fi @@ -72,19 +77,19 @@ required="$scp $fbank_config $pitch_config" for f in $required; do if [ ! -f $f ]; then - echo "make_fbank_pitch.sh: no such file $f" + echo "$0: no such file $f" exit 1; fi done +utils/validate_data_dir.sh --no-text --no-feats $data || exit 1; + if [ ! -z "$pitch_postprocess_config" ]; then postprocess_config_opt="--config=$pitch_postprocess_config"; else postprocess_config_opt= fi -utils/validate_data_dir.sh --no-text --no-feats $data || exit 1; - if [ -f $data/spk2warp ]; then echo "$0 [info]: using VTLN warp factors from $data/spk2warp" vtln_opts="--vtln-map=ark:$data/spk2warp --utt2spk=ark:$data/utt2spk" @@ -105,9 +110,15 @@ else write_num_frames_opt= fi +if $write_utt2dur; then + write_utt2dur_opt="--write-utt2dur=ark,t:$logdir/utt2dur.JOB" +else + write_utt2dur_opt= +fi + if [ -f $data/segments ]; then echo "$0 [info]: segments file exists: using that." 
- split_segments="" + split_segments= for n in $(seq $nj); do split_segments="$split_segments $logdir/segments.$n" done @@ -115,61 +126,89 @@ if [ -f $data/segments ]; then utils/split_scp.pl $data/segments $split_segments || exit 1; rm $logdir/.error 2>/dev/null - fbank_feats="ark:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | compute-fbank-feats $vtln_opts --verbose=2 --config=$fbank_config ark:- ark:- |" - pitch_feats="ark,s,cs:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | compute-kaldi-pitch-feats --verbose=2 --config=$pitch_config ark:- ark:- | process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |" + fbank_feats="ark:extract-segments scp,p:$scp $logdir/segments.JOB ark:- |\ + compute-fbank-feats $vtln_opts $write_utt2dur_opt --verbose=2 \ + --config=$fbank_config ark:- ark:- |" + pitch_feats="ark,s,cs:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | \ + compute-kaldi-pitch-feats --verbose=2 --config=$pitch_config ark:- ark:- | \ + process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |" $cmd JOB=1:$nj $logdir/make_fbank_pitch_${name}.JOB.log \ - paste-feats --length-tolerance=$paste_length_tolerance "$fbank_feats" "$pitch_feats" ark:- \| \ + paste-feats --length-tolerance=$paste_length_tolerance \ + "$fbank_feats" "$pitch_feats" ark:- \| \ copy-feats --compress=$compress $write_num_frames_opt ark:- \ ark,scp:$fbank_pitch_dir/raw_fbank_pitch_$name.JOB.ark,$fbank_pitch_dir/raw_fbank_pitch_$name.JOB.scp \ || exit 1; else echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance." - split_scps="" + split_scps= for n in $(seq $nj); do - split_scps="$split_scps $logdir/wav.$n.scp" + split_scps="$split_scps $logdir/wav_${name}.$n.scp" done utils/split_scp.pl $scp $split_scps || exit 1; - fbank_feats="ark:compute-fbank-feats $vtln_opts --verbose=2 --config=$fbank_config scp,p:$logdir/wav.JOB.scp ark:- |" - pitch_feats="ark,s,cs:compute-kaldi-pitch-feats --verbose=2 --config=$pitch_config scp,p:$logdir/wav.JOB.scp ark:- | process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |" + fbank_feats="ark:compute-fbank-feats $vtln_opts $write_utt2dur_opt \ + --verbose=2 --config=$fbank_config scp,p:$logdir/wav_${name}.JOB.scp ark:- |" + pitch_feats="ark,s,cs:compute-kaldi-pitch-feats --verbose=2 \ + --config=$pitch_config scp,p:$logdir/wav_${name}.JOB.scp ark:- | \ + process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |" $cmd JOB=1:$nj $logdir/make_fbank_pitch_${name}.JOB.log \ - paste-feats --length-tolerance=$paste_length_tolerance "$fbank_feats" "$pitch_feats" ark:- \| \ + paste-feats --length-tolerance=$paste_length_tolerance \ + "$fbank_feats" "$pitch_feats" ark:- \| \ copy-feats --compress=$compress $write_num_frames_opt ark:- \ ark,scp:$fbank_pitch_dir/raw_fbank_pitch_$name.JOB.ark,$fbank_pitch_dir/raw_fbank_pitch_$name.JOB.scp \ || exit 1; - fi if [ -f $logdir/.error.$name ]; then - echo "Error producing fbank & pitch features for $name:" + echo "$0: Error producing filterbank and pitch features for $name:" tail $logdir/make_fbank_pitch_${name}.1.log exit 1; fi -# concatenate the .scp files together. +# Concatenate the .scp files together. 
for n in $(seq $nj); do - cat $fbank_pitch_dir/raw_fbank_pitch_$name.$n.scp || exit 1; -done > $data/feats.scp + cat $fbank_pitch_dir/raw_fbank_pitch_$name.$n.scp || exit 1 +done > $data/feats.scp || exit 1 if $write_utt2num_frames; then for n in $(seq $nj); do - cat $logdir/utt2num_frames.$n || exit 1; + cat $logdir/utt2num_frames.$n || exit 1 done > $data/utt2num_frames || exit 1 - rm $logdir/utt2num_frames.* fi -rm $logdir/wav.*.scp $logdir/segments.* 2>/dev/null +if $write_utt2dur; then + for n in $(seq $nj); do + cat $logdir/utt2dur.$n || exit 1 + done > $data/utt2dur || exit 1 +fi + +# Store frame_shift, fbank_config and pitch_config along with features. +frame_shift=$(perl -ne 'if (/^--frame-shift=(\d+)/) { + printf "%.3f", 0.001 * $1; exit; }' $fbank_config) +echo ${frame_shift:-'0.01'} > $data/frame_shift +mkdir -p $data/conf && + cp $fbank_config $data/conf/fbank.conf && + cp $pitch_config $data/conf/pitch.conf || exit 1 -nf=`cat $data/feats.scp | wc -l` -nu=`cat $data/utt2spk | wc -l` +rm $logdir/wav_${name}.*.scp $logdir/segments.* \ + $logdir/utt2num_frames.* $logdir/utt2dur.* 2>/dev/null + +nf=$(wc -l < $data/feats.scp) +nu=$(wc -l < $data/utt2spk) if [ $nf -ne $nu ]; then - echo "It seems not all of the feature files were successfully processed ($nf != $nu);" - echo "consider using utils/fix_data_dir.sh $data" + echo "$0: It seems not all of the feature files were successfully procesed" \ + "($nf != $nu); consider using utils/fix_data_dir.sh $data" +fi + +if (( nf < nu - nu/20 )); then + echo "$0: Less than 95% the features were successfully generated."\ + "Probably a serious error." + exit 1 fi -echo "Succeeded creating filterbank & pitch features for $name" +echo "$0: Succeeded creating filterbank and pitch features for $name" diff --git a/egs/wsj/s5/steps/make_mfcc.sh b/egs/wsj/s5/steps/make_mfcc.sh index c88e0d65e65..37433f87dcd 100755 --- a/egs/wsj/s5/steps/make_mfcc.sh +++ b/egs/wsj/s5/steps/make_mfcc.sh @@ -10,23 +10,28 @@ nj=4 cmd=run.pl mfcc_config=conf/mfcc.conf compress=true -write_utt2num_frames=false # if true writes utt2num_frames +write_utt2num_frames=true # If true writes utt2num_frames. +write_utt2dur=true # End configuration section. -echo "$0 $@" # Print the command line for logging +echo "$0 $@" # Print the command line for logging. if [ -f path.sh ]; then . ./path.sh; fi . parse_options.sh || exit 1; if [ $# -lt 1 ] || [ $# -gt 3 ]; then - echo "Usage: $0 [options] [ [] ]"; - echo "e.g.: $0 data/train exp/make_mfcc/train mfcc" - echo "Note: defaults to /log, and defaults to /data" - echo "Options: " - echo " --mfcc-config # config passed to compute-mfcc-feats " - echo " --nj # number of parallel jobs" - echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." - echo " --write-utt2num-frames # If true, write utt2num_frames file." + cat >&2 < [ [] ] + e.g.: $0 data/train +Note: defaults to /log, and + defaults to /data. +Options: + --mfcc-config # config passed to compute-mfcc-feats. + --nj # number of parallel jobs. + --cmd > # how to run jobs. + --write-utt2num-frames # If true, write utt2num_frames file. + --write-utt2dur # If true, write utt2dur file. +EOF exit 1; fi @@ -63,10 +68,11 @@ required="$scp $mfcc_config" for f in $required; do if [ ! 
-f $f ]; then - echo "make_mfcc.sh: no such file $f" + echo "$0: no such file $f" exit 1; fi done + utils/validate_data_dir.sh --no-text --no-feats $data || exit 1; if [ -f $data/spk2warp ]; then @@ -75,6 +81,8 @@ if [ -f $data/spk2warp ]; then elif [ -f $data/utt2warp ]; then echo "$0 [info]: using VTLN warp factors from $data/utt2warp" vtln_opts="--vtln-map=ark:$data/utt2warp" +else + vtln_opts="" fi for n in $(seq $nj); do @@ -90,11 +98,16 @@ else write_num_frames_opt= fi +if $write_utt2dur; then + write_utt2dur_opt="--write-utt2dur=ark,t:$logdir/utt2dur.JOB" +else + write_utt2dur_opt= +fi if [ -f $data/segments ]; then echo "$0 [info]: segments file exists: using that." - split_segments="" + split_segments= for n in $(seq $nj); do split_segments="$split_segments $logdir/segments.$n" done @@ -104,14 +117,15 @@ if [ -f $data/segments ]; then $cmd JOB=1:$nj $logdir/make_mfcc_${name}.JOB.log \ extract-segments scp,p:$scp $logdir/segments.JOB ark:- \| \ - compute-mfcc-feats $vtln_opts --verbose=2 --config=$mfcc_config ark:- ark:- \| \ + compute-mfcc-feats $vtln_opts $write_utt2dur_opt --verbose=2 \ + --config=$mfcc_config ark:- ark:- \| \ copy-feats --compress=$compress $write_num_frames_opt ark:- \ ark,scp:$mfccdir/raw_mfcc_$name.JOB.ark,$mfccdir/raw_mfcc_$name.JOB.scp \ || exit 1; else echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance." - split_scps="" + split_scps= for n in $(seq $nj); do split_scps="$split_scps $logdir/wav_${name}.$n.scp" done @@ -123,44 +137,58 @@ else # utterances that have bad wave data. $cmd JOB=1:$nj $logdir/make_mfcc_${name}.JOB.log \ - compute-mfcc-feats $vtln_opts --verbose=2 --config=$mfcc_config \ - scp,p:$logdir/wav_${name}.JOB.scp ark:- \| \ - copy-feats $write_num_frames_opt --compress=$compress ark:- \ + compute-mfcc-feats $vtln_opts $write_utt2dur_opt --verbose=2 \ + --config=$mfcc_config scp,p:$logdir/wav_${name}.JOB.scp ark:- \| \ + copy-feats $write_num_frames_opt --compress=$compress ark:- \ ark,scp:$mfccdir/raw_mfcc_$name.JOB.ark,$mfccdir/raw_mfcc_$name.JOB.scp \ || exit 1; fi if [ -f $logdir/.error.$name ]; then - echo "Error producing mfcc features for $name:" + echo "$0: Error producing MFCC features for $name:" tail $logdir/make_mfcc_${name}.1.log exit 1; fi # concatenate the .scp files together. for n in $(seq $nj); do - cat $mfccdir/raw_mfcc_$name.$n.scp || exit 1; + cat $mfccdir/raw_mfcc_$name.$n.scp || exit 1 done > $data/feats.scp || exit 1 if $write_utt2num_frames; then for n in $(seq $nj); do - cat $logdir/utt2num_frames.$n || exit 1; + cat $logdir/utt2num_frames.$n || exit 1 done > $data/utt2num_frames || exit 1 - rm $logdir/utt2num_frames.* fi -rm $logdir/wav_${name}.*.scp $logdir/segments.* 2>/dev/null +if $write_utt2dur; then + for n in $(seq $nj); do + cat $logdir/utt2dur.$n || exit 1 + done > $data/utt2dur || exit 1 +fi + +# Store frame_shift and mfcc_config along with features. 
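The perl one-liner that follows (and its copies in the other make_*.sh scripts) scans the feature config for a `--frame-shift=<ms>` option, converts it to seconds, and the script falls back to the 10 ms default when the option is absent. A rough Python equivalent of that lookup, shown only as an illustration (the helper name is made up and is not part of these scripts):

```python
# Illustration of the frame-shift lookup done by the perl one-liner in the
# make_*.sh scripts: read an optional "--frame-shift=<ms>" line from the
# feature config and convert it to seconds, defaulting to 0.01 (10 ms).
import re

def read_frame_shift(config_path, default=0.01):
    with open(config_path) as f:
        for line in f:
            m = re.match(r'--frame-shift=(\d+)', line)
            if m:
                return 0.001 * int(m.group(1))  # e.g. 15 -> 0.015
    return default
```

The companion check `(( nf < nu - nu/20 ))` relies on shell integer arithmetic, so it trips once fewer than roughly 95% of the utterances in utt2spk ended up with features.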
+frame_shift=$(perl -ne 'if (/^--frame-shift=(\d+)/) { + printf "%.3f", 0.001 * $1; exit; }' $mfcc_config) +echo ${frame_shift:-'0.01'} > $data/frame_shift +mkdir -p $data/conf && cp $mfcc_config $data/conf/mfcc.conf || exit 1 + +rm $logdir/wav_${name}.*.scp $logdir/segments.* \ + $logdir/utt2num_frames.* $logdir/utt2dur.* 2>/dev/null -nf=`cat $data/feats.scp | wc -l` -nu=`cat $data/utt2spk | wc -l` +nf=$(wc -l < $data/feats.scp) +nu=$(wc -l < $data/utt2spk) if [ $nf -ne $nu ]; then - echo "It seems not all of the feature files were successfully processed ($nf != $nu);" - echo "consider using utils/fix_data_dir.sh $data" + echo "$0: It seems not all of the feature files were successfully procesed" \ + "($nf != $nu); consider using utils/fix_data_dir.sh $data" fi -if [ $nf -lt $[$nu - ($nu/20)] ]; then - echo "Less than 95% the features were successfully generated. Probably a serious error." - exit 1; +if (( nf < nu - nu/20 )); then + echo "$0: Less than 95% the features were successfully generated."\ + "Probably a serious error." + exit 1 fi -echo "Succeeded creating MFCC features for $name" + +echo "$0: Succeeded creating MFCC features for $name" diff --git a/egs/wsj/s5/steps/make_mfcc_pitch.sh b/egs/wsj/s5/steps/make_mfcc_pitch.sh index 98b670b82ae..dda31667d6a 100755 --- a/egs/wsj/s5/steps/make_mfcc_pitch.sh +++ b/egs/wsj/s5/steps/make_mfcc_pitch.sh @@ -1,7 +1,7 @@ #!/bin/bash -# Copyright 2013 The Shenzhen Key Laboratory of Intelligent Media and Speech, -# PKU-HKUST Shenzhen Hong Kong Institution (Author: Wei Shi) +# Copyright 2013 The Shenzhen Key Laboratory of Intelligent Media and Speech, +# PKU-HKUST Shenzhen Hong Kong Institution (Author: Wei Shi) # 2016 Johns Hopkins University (Author: Daniel Povey) # Apache 2.0 # Combine MFCC and pitch features together @@ -15,26 +15,31 @@ pitch_config=conf/pitch.conf pitch_postprocess_config= paste_length_tolerance=2 compress=true -write_utt2num_frames=false # if true writes utt2num_frames +write_utt2num_frames=true # If true writes utt2num_frames. +write_utt2dur=true # End configuration section. -echo "$0 $@" # Print the command line for logging +echo "$0 $@" # Print the command line for logging. if [ -f path.sh ]; then . ./path.sh; fi . parse_options.sh || exit 1; if [ $# -lt 1 ] || [ $# -gt 3 ]; then - echo "Usage: $0 [options] [ [] ]"; - echo "e.g.: $0 data/train exp/make_mfcc/train mfcc" - echo "Note: defaults to /log, and defaults to /data" - echo "Options: " - echo " --mfcc-config # config passed to compute-mfcc-feats " - echo " --pitch-config # config passed to compute-kaldi-pitch-feats " - echo " --pitch-postprocess-config # config passed to process-kaldi-pitch-feats " - echo " --paste-length-tolerance # length tolerance passed to paste-feats" - echo " --nj # number of parallel jobs" - echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." - echo " --write-utt2num-frames # If true, write utt2num_frames file." + cat >&2 < [ [] ] + e.g.: $0 data/train +Note: defaults to /log, and + defaults to /data +Options: + --mfcc-config # config passed to compute-mfcc-feats. + --pitch-config # config passed to compute-kaldi-pitch-feats. + --pitch-postprocess-config # config passed to process-kaldi-pitch-feats. + --paste-length-tolerance # length tolerance passed to paste-feats. + --nj # number of parallel jobs. + --cmd > # how to run jobs. + --write-utt2num-frames # If true, write utt2num_frames file. + --write-utt2dur # If true, write utt2dur file. 
+EOF exit 1; fi @@ -72,10 +77,11 @@ required="$scp $mfcc_config $pitch_config" for f in $required; do if [ ! -f $f ]; then - echo "make_mfcc_pitch.sh: no such file $f" + echo "$0: no such file $f" exit 1; fi done + utils/validate_data_dir.sh --no-text --no-feats $data || exit 1; if [ ! -z "$pitch_postprocess_config" ]; then @@ -104,9 +110,15 @@ else write_num_frames_opt= fi +if $write_utt2dur; then + write_utt2dur_opt="--write-utt2dur=ark,t:$logdir/utt2dur.JOB" +else + write_utt2dur_opt= +fi + if [ -f $data/segments ]; then echo "$0 [info]: segments file exists: using that." - split_segments="" + split_segments= for n in $(seq $nj); do split_segments="$split_segments $logdir/segments.$n" done @@ -114,66 +126,89 @@ if [ -f $data/segments ]; then utils/split_scp.pl $data/segments $split_segments || exit 1; rm $logdir/.error 2>/dev/null - mfcc_feats="ark:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | compute-mfcc-feats $vtln_opts --verbose=2 --config=$mfcc_config ark:- ark:- |" - pitch_feats="ark,s,cs:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | compute-kaldi-pitch-feats --verbose=2 --config=$pitch_config ark:- ark:- | process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |" + mfcc_feats="ark:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | \ + compute-mfcc-feats $vtln_opts $write_utt2dur_opt --verbose=2 \ + --config=$mfcc_config ark:- ark:- |" + pitch_feats="ark,s,cs:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | \ + compute-kaldi-pitch-feats --verbose=2 --config=$pitch_config ark:- ark:- | \ + process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |" $cmd JOB=1:$nj $logdir/make_mfcc_pitch_${name}.JOB.log \ - paste-feats --length-tolerance=$paste_length_tolerance "$mfcc_feats" "$pitch_feats" ark:- \| \ + paste-feats --length-tolerance=$paste_length_tolerance \ + "$mfcc_feats" "$pitch_feats" ark:- \| \ copy-feats --compress=$compress $write_num_frames_opt ark:- \ ark,scp:$mfcc_pitch_dir/raw_mfcc_pitch_$name.JOB.ark,$mfcc_pitch_dir/raw_mfcc_pitch_$name.JOB.scp \ || exit 1; else echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance." 
- split_scps="" + split_scps= for n in $(seq $nj); do split_scps="$split_scps $logdir/wav_${name}.$n.scp" done utils/split_scp.pl $scp $split_scps || exit 1; - mfcc_feats="ark:compute-mfcc-feats $vtln_opts --verbose=2 --config=$mfcc_config scp,p:$logdir/wav_${name}.JOB.scp ark:- |" - pitch_feats="ark,s,cs:compute-kaldi-pitch-feats --verbose=2 --config=$pitch_config scp,p:$logdir/wav_${name}.JOB.scp ark:- | process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |" + mfcc_feats="ark:compute-mfcc-feats $vtln_opts $write_utt2dur_opt --verbose=2 \ + --config=$mfcc_config scp,p:$logdir/wav_${name}.JOB.scp ark:- |" + pitch_feats="ark,s,cs:compute-kaldi-pitch-feats --verbose=2 \ + --config=$pitch_config scp,p:$logdir/wav_${name}.JOB.scp ark:- | \ + process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |" $cmd JOB=1:$nj $logdir/make_mfcc_pitch_${name}.JOB.log \ - paste-feats --length-tolerance=$paste_length_tolerance "$mfcc_feats" "$pitch_feats" ark:- \| \ + paste-feats --length-tolerance=$paste_length_tolerance \ + "$mfcc_feats" "$pitch_feats" ark:- \| \ copy-feats --compress=$compress $write_num_frames_opt ark:- \ ark,scp:$mfcc_pitch_dir/raw_mfcc_pitch_$name.JOB.ark,$mfcc_pitch_dir/raw_mfcc_pitch_$name.JOB.scp \ || exit 1; - fi if [ -f $logdir/.error.$name ]; then - echo "Error producing mfcc & pitch features for $name:" + echo "$0: Error producing MFCC and pitch features for $name:" tail $logdir/make_mfcc_pitch_${name}.1.log exit 1; fi -# concatenate the .scp files together. +# Concatenate the .scp files together. for n in $(seq $nj); do cat $mfcc_pitch_dir/raw_mfcc_pitch_$name.$n.scp || exit 1; -done > $data/feats.scp +done > $data/feats.scp || exit 1 if $write_utt2num_frames; then for n in $(seq $nj); do - cat $logdir/utt2num_frames.$n || exit 1; + cat $logdir/utt2num_frames.$n || exit 1 done > $data/utt2num_frames || exit 1 - rm $logdir/utt2num_frames.* fi -rm $logdir/wav_${name}.*.scp $logdir/segments.* 2>/dev/null +if $write_utt2dur; then + for n in $(seq $nj); do + cat $logdir/utt2dur.$n || exit 1 + done > $data/utt2dur || exit 1 +fi + +# Store frame_shift, mfcc_config and pitch_config along with features. +frame_shift=$(perl -ne 'if (/^--frame-shift=(\d+)/) { + printf "%.3f", 0.001 * $1; exit; }' $mfcc_config) +echo ${frame_shift:-'0.01'} > $data/frame_shift +mkdir -p $data/conf && + cp $mfcc_config $data/conf/mfcc.conf && + cp $pitch_config $data/conf/pitch.conf || exit 1 -nf=`cat $data/feats.scp | wc -l` -nu=`cat $data/utt2spk | wc -l` +rm $logdir/wav_${name}.*.scp $logdir/segments.* \ + $logdir/utt2num_frames.* $logdir/utt2dur.* 2>/dev/null + +nf=$(wc -l < $data/feats.scp) +nu=$(wc -l < $data/utt2spk) if [ $nf -ne $nu ]; then - echo "It seems not all of the feature files were successfully processed ($nf != $nu);" - echo "consider using utils/fix_data_dir.sh $data" + echo "$0: It seems not all of the feature files were successfully procesed" \ + "($nf != $nu); consider using utils/fix_data_dir.sh $data" fi -if [ $nf -lt $[$nu - ($nu/20)] ]; then - echo "Less than 95% the features were successfully generated. Probably a serious error." - exit 1; +if (( nf < nu - nu/20 )); then + echo "$0: Less than 95% the features were successfully generated."\ + "Probably a serious error." 
+ exit 1 fi -echo "Succeeded creating MFCC & Pitch features for $name" +echo "$0: Succeeded creating MFCC and pitch features for $name" diff --git a/egs/wsj/s5/steps/make_mfcc_pitch_online.sh b/egs/wsj/s5/steps/make_mfcc_pitch_online.sh index df51057a00b..001c1e4c6f4 100755 --- a/egs/wsj/s5/steps/make_mfcc_pitch_online.sh +++ b/egs/wsj/s5/steps/make_mfcc_pitch_online.sh @@ -1,7 +1,7 @@ #!/bin/bash -# Copyright 2013 The Shenzhen Key Laboratory of Intelligent Media and Speech, -# PKU-HKUST Shenzhen Hong Kong Institution (Author: Wei Shi) +# Copyright 2013 The Shenzhen Key Laboratory of Intelligent Media and Speech, +# PKU-HKUST Shenzhen Hong Kong Institution (Author: Wei Shi) # 2014-2016 Johns Hopkins University (Author: Daniel Povey) # Apache 2.0 # Combine MFCC and online-pitch features together @@ -14,25 +14,30 @@ mfcc_config=conf/mfcc.conf online_pitch_config=conf/online_pitch.conf paste_length_tolerance=2 compress=true +write_utt2num_frames=true # If true writes utt2num_frames. +write_utt2dur=true # End configuration section. -echo "$0 $@" # Print the command line for logging +echo "$0 $@" # Print the command line for logging. if [ -f path.sh ]; then . ./path.sh; fi . parse_options.sh || exit 1; if [ $# -lt 1 ] || [ $# -gt 3 ]; then - echo "Usage: $0 [options] [ [] ]"; - echo "e.g.: $0 data/train exp/make_mfcc/train mfcc" - echo "Note: defaults to /log, and defaults to /data" - echo "Options: " - echo " --mfcc-config # config passed to compute-mfcc-feats, default " - echo " # is conf/mfcc.conf" - echo " --online-pitch-config # config passed to compute-and-process-kaldi-pitch-feats, " - echo " # default is conf/online_pitch.conf" - echo " --paste-length-tolerance # length tolerance passed to paste-feats" - echo " --nj # number of parallel jobs" - echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + cat >&2 < [ [] ] + e.g.: $0 data/train +Note: defaults to /log, and + defaults to /data +Options: + --mfcc-config # config passed to compute-mfcc-feats [conf/mfcc.conf] + --online-pitch-config # config passed to compute-and-process-kaldi-pitch-feats [conf/online_pitch.conf] + --paste-length-tolerance # length tolerance passed to paste-feats. + --nj # number of parallel jobs. + --cmd > # how to run jobs. + --write-utt2num-frames # If true, write utt2num_frames file. + --write-utt2dur # If true, write utt2dur file. +EOF exit 1; fi @@ -90,9 +95,21 @@ for n in $(seq $nj); do utils/create_data_link.pl $mfcc_pitch_dir/raw_mfcc_online_pitch_$name.$n.ark done +if $write_utt2num_frames; then + write_num_frames_opt="--write-num-frames=ark,t:$logdir/utt2num_frames.JOB" +else + write_num_frames_opt= +fi + +if $write_utt2dur; then + write_utt2dur_opt="--write-utt2dur=ark,t:$logdir/utt2dur.JOB" +else + write_utt2dur_opt= +fi + if [ -f $data/segments ]; then echo "$0 [info]: segments file exists: using that." 
- split_segments="" + split_segments= for n in $(seq $nj); do split_segments="$split_segments $logdir/segments.$n" done @@ -100,58 +117,88 @@ if [ -f $data/segments ]; then utils/split_scp.pl $data/segments $split_segments || exit 1; rm $logdir/.error 2>/dev/null - mfcc_feats="ark:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | compute-mfcc-feats $vtln_opts --verbose=2 --config=$mfcc_config ark:- ark:- |" - pitch_feats="ark,s,cs:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | compute-and-process-kaldi-pitch-feats --verbose=2 --config=$online_pitch_config ark:- ark:- |" + mfcc_feats="ark:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | \ + compute-mfcc-feats $vtln_opts $write_utt2dur_opt --verbose=2 \ + --config=$mfcc_config ark:- ark:- |" + pitch_feats="ark,s,cs:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | \ + compute-and-process-kaldi-pitch-feats --verbose=2 \ + --config=$online_pitch_config ark:- ark:- |" $cmd JOB=1:$nj $logdir/make_mfcc_pitch_${name}.JOB.log \ - paste-feats --length-tolerance=$paste_length_tolerance "$mfcc_feats" "$pitch_feats" ark:- \| \ - copy-feats --compress=$compress ark:- \ + paste-feats --length-tolerance=$paste_length_tolerance \ + "$mfcc_feats" "$pitch_feats" ark:- \| \ + copy-feats --compress=$compress $write_num_frames_opt ark:- \ ark,scp:$mfcc_pitch_dir/raw_mfcc_online_pitch_$name.JOB.ark,$mfcc_pitch_dir/raw_mfcc_online_pitch_$name.JOB.scp \ || exit 1; else echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance." - split_scps="" + split_scps= for n in $(seq $nj); do split_scps="$split_scps $logdir/wav_${name}.$n.scp" done utils/split_scp.pl $scp $split_scps || exit 1; - mfcc_feats="ark:compute-mfcc-feats $vtln_opts --verbose=2 --config=$mfcc_config scp,p:$logdir/wav_${name}.JOB.scp ark:- |" - pitch_feats="ark,s,cs:compute-and-process-kaldi-pitch-feats --verbose=2 --config=$online_pitch_config scp,p:$logdir/wav_${name}.JOB.scp ark:- |" + mfcc_feats="ark:compute-mfcc-feats $vtln_opts $write_utt2dur_opt --verbose=2 \ + --config=$mfcc_config scp,p:$logdir/wav_${name}.JOB.scp ark:- |" + pitch_feats="ark,s,cs:compute-and-process-kaldi-pitch-feats --verbose=2 \ + --config=$online_pitch_config scp,p:$logdir/wav_${name}.JOB.scp ark:- |" $cmd JOB=1:$nj $logdir/make_mfcc_pitch_${name}.JOB.log \ - paste-feats --length-tolerance=$paste_length_tolerance "$mfcc_feats" "$pitch_feats" ark:- \| \ - copy-feats --compress=$compress ark:- \ + paste-feats --length-tolerance=$paste_length_tolerance \ + "$mfcc_feats" "$pitch_feats" ark:- \| \ + copy-feats --compress=$compress $write_num_frames_opt ark:- \ ark,scp:$mfcc_pitch_dir/raw_mfcc_online_pitch_$name.JOB.ark,$mfcc_pitch_dir/raw_mfcc_online_pitch_$name.JOB.scp \ || exit 1; fi if [ -f $logdir/.error.$name ]; then - echo "Error producing mfcc & pitch features for $name:" + echo "$0: Error producing MFCC and online-pitch features for $name:" tail $logdir/make_mfcc_pitch_${name}.1.log exit 1; fi -# concatenate the .scp files together. +# Concatenate the .scp files together. 
for n in $(seq $nj); do - cat $mfcc_pitch_dir/raw_mfcc_online_pitch_$name.$n.scp || exit 1; -done > $data/feats.scp + cat $mfcc_pitch_dir/raw_mfcc_online_pitch_$name.$n.scp || exit 1 +done > $data/feats.scp || exit 1 + +if $write_utt2num_frames; then + for n in $(seq $nj); do + cat $logdir/utt2num_frames.$n || exit 1 + done > $data/utt2num_frames || exit 1 +fi -rm $logdir/wav_${name}.*.scp $logdir/segments.* 2>/dev/null +if $write_utt2dur; then + for n in $(seq $nj); do + cat $logdir/utt2dur.$n || exit 1 + done > $data/utt2dur || exit 1 +fi -nf=`cat $data/feats.scp | wc -l` -nu=`cat $data/utt2spk | wc -l` +# Store frame_shift, mfcc_config and pitch_config_online along with features. +frame_shift=$(perl -ne 'if (/^--frame-shift=(\d+)/) { + printf "%.3f", 0.001 * $1; exit; }' $mfcc_config) +echo ${frame_shift:-'0.01'} > $data/frame_shift +mkdir -p $data/conf && + cp $mfcc_config $data/conf/mfcc.conf && + cp $online_pitch_config $data/conf/online_pitch.conf || exit 1 + +rm $logdir/wav_${name}.*.scp $logdir/segments.* \ + $logdir/utt2num_frames.* $logdir/utt2dur.* 2>/dev/null + +nf=$(wc -l < $data/feats.scp) +nu=$(wc -l < $data/utt2spk) if [ $nf -ne $nu ]; then - echo "It seems not all of the feature files were successfully processed ($nf != $nu);" - echo "consider using utils/fix_data_dir.sh $data" + echo "$0: It seems not all of the feature files were successfully procesed" \ + "($nf != $nu); consider using utils/fix_data_dir.sh $data" fi -if [ $nf -lt $[$nu - ($nu/20)] ]; then - echo "Less than 95% the features were successfully generated. Probably a serious error." - exit 1; +if (( nf < nu - nu/20 )); then + echo "$0: Less than 95% the features were successfully generated."\ + "Probably a serious error." + exit 1 fi -echo "Succeeded creating MFCC & online-pitch features for $name" +echo "$0: Succeeded creating MFCC and online-pitch features for $name" diff --git a/egs/wsj/s5/steps/make_plp.sh b/egs/wsj/s5/steps/make_plp.sh index 85b4a02fbb6..c4a987aaeeb 100755 --- a/egs/wsj/s5/steps/make_plp.sh +++ b/egs/wsj/s5/steps/make_plp.sh @@ -10,22 +10,28 @@ nj=4 cmd=run.pl plp_config=conf/plp.conf compress=true -write_utt2num_frames=false # if true writes utt2num_frames +write_utt2num_frames=true # If true writes utt2num_frames. +write_utt2dur=true # End configuration section. -echo "$0 $@" # Print the command line for logging +echo "$0 $@" # Print the command line for logging. if [ -f path.sh ]; then . ./path.sh; fi . parse_options.sh || exit 1; if [ $# -lt 1 ] || [ $# -gt 3 ]; then - echo "Usage: $0 [options] [ [] ]"; - echo "e.g.: $0 data/train exp/make_plp/train mfcc" - echo "Note: defaults to /log, and defaults to /data" - echo "Options: " - echo " --plp-config # config passed to compute-plp-feats " - echo " --nj # number of parallel jobs" - echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + cat >&2 < [ [] ] + e.g.: $0 data/train +Note: defaults to /log, and + defaults to /data +Options: + --plp-config # config passed to compute-plp-feats. + --nj # number of parallel jobs. + --cmd > # how to run jobs. + --write-utt2num-frames # If true, write utt2num_frames file. + --write-utt2dur # If true, write utt2dur file. +EOF exit 1; fi @@ -62,7 +68,7 @@ required="$scp $plp_config" for f in $required; do if [ ! 
-f $f ]; then - echo "make_plp.sh: no such file $f" + echo "$0: no such file $f" exit 1; fi done @@ -74,6 +80,8 @@ if [ -f $data/spk2warp ]; then elif [ -f $data/utt2warp ]; then echo "$0 [info]: using VTLN warp factors from $data/utt2warp" vtln_opts="--vtln-map=ark:$data/utt2warp" +else + vtln_opts= fi for n in $(seq $nj); do @@ -88,9 +96,15 @@ else write_num_frames_opt= fi +if $write_utt2dur; then + write_utt2dur_opt="--write-utt2dur=ark,t:$logdir/utt2dur.JOB" +else + write_utt2dur_opt= +fi + if [ -f $data/segments ]; then echo "$0 [info]: segments file exists: using that." - split_segments="" + split_segments= for n in $(seq $nj); do split_segments="$split_segments $logdir/segments.$n" done @@ -100,14 +114,15 @@ if [ -f $data/segments ]; then $cmd JOB=1:$nj $logdir/make_plp_${name}.JOB.log \ extract-segments scp,p:$scp $logdir/segments.JOB ark:- \| \ - compute-plp-feats $vtln_opts --verbose=2 --config=$plp_config ark:- ark:- \| \ + compute-plp-feats $vtln_opts $write_utt2dur_opt --verbose=2 \ + --config=$plp_config ark:- ark:- \| \ copy-feats --compress=$compress $write_num_frames_opt ark:- \ ark,scp:$plpdir/raw_plp_$name.JOB.ark,$plpdir/raw_plp_$name.JOB.scp \ || exit 1; else echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance." - split_scps="" + split_scps= for n in $(seq $nj); do split_scps="$split_scps $logdir/wav_${name}.$n.scp" done @@ -115,7 +130,8 @@ else utils/split_scp.pl $scp $split_scps || exit 1; $cmd JOB=1:$nj $logdir/make_plp_${name}.JOB.log \ - compute-plp-feats $vtln_opts --verbose=2 --config=$plp_config scp,p:$logdir/wav_${name}.JOB.scp ark:- \| \ + compute-plp-feats $vtln_opts $write_utt2dur_opt --verbose=2 \ + --config=$plp_config scp,p:$logdir/wav_${name}.JOB.scp ark:- \| \ copy-feats --compress=$compress $write_num_frames_opt ark:- \ ark,scp:$plpdir/raw_plp_$name.JOB.ark,$plpdir/raw_plp_$name.JOB.scp \ || exit 1; @@ -124,34 +140,48 @@ fi if [ -f $logdir/.error.$name ]; then - echo "Error producing plp features for $name:" + echo "$0: Error producing PLP features for $name:" tail $logdir/make_plp_${name}.1.log exit 1; fi # concatenate the .scp files together. for n in $(seq $nj); do - cat $plpdir/raw_plp_$name.$n.scp || exit 1; + cat $plpdir/raw_plp_$name.$n.scp || exit 1 done > $data/feats.scp if $write_utt2num_frames; then for n in $(seq $nj); do - cat $logdir/utt2num_frames.$n || exit 1; + cat $logdir/utt2num_frames.$n || exit 1 done > $data/utt2num_frames || exit 1 - rm $logdir/utt2num_frames.* fi -rm $logdir/wav_${name}.*.scp $logdir/segments.* 2>/dev/null +if $write_utt2dur; then + for n in $(seq $nj); do + cat $logdir/utt2dur.$n || exit 1 + done > $data/utt2dur || exit 1 +fi + +# Store frame_shift and plp_config along with features. 
+frame_shift=$(perl -ne 'if (/^--frame-shift=(\d+)/) { + printf "%.3f", 0.001 * $1; exit; }' $plp_config) +echo ${frame_shift:-'0.01'} > $data/frame_shift +mkdir -p $data/conf && cp $plp_config $data/conf/plp.conf || exit 1 + +rm $logdir/wav_${name}.*.scp $logdir/segments.* \ + $logdir/utt2num_frames.* $logdir/utt2dur.* 2>/dev/null -nf=`cat $data/feats.scp | wc -l` -nu=`cat $data/utt2spk | wc -l` +nf=$(wc -l < $data/feats.scp) +nu=$(wc -l < $data/utt2spk) if [ $nf -ne $nu ]; then - echo "It seems not all of the feature files were successfully ($nf != $nu);" - echo "consider using utils/fix_data_dir.sh $data" + echo "$0: It seems not all of the feature files were successfully procesed" \ + "($nf != $nu); consider using utils/fix_data_dir.sh $data" fi -if [ $nf -lt $[$nu - ($nu/20)] ]; then - echo "Less than 95% the features were successfully generated. Probably a serious error." - exit 1; + +if (( nf < nu - nu/20 )); then + echo "$0: Less than 95% the features were successfully generated."\ + "Probably a serious error." + exit 1 fi -echo "Succeeded creating PLP features for $name" +echo "$0: Succeeded creating PLP features for $name" diff --git a/egs/wsj/s5/steps/make_plp_pitch.sh b/egs/wsj/s5/steps/make_plp_pitch.sh index 40ddd314f6c..9f565d8a5bf 100755 --- a/egs/wsj/s5/steps/make_plp_pitch.sh +++ b/egs/wsj/s5/steps/make_plp_pitch.sh @@ -1,7 +1,7 @@ #!/bin/bash -# Copyright 2013 The Shenzhen Key Laboratory of Intelligent Media and Speech, -# PKU-HKUST Shenzhen Hong Kong Institution (Author: Wei Shi) +# Copyright 2013 The Shenzhen Key Laboratory of Intelligent Media and Speech, +# PKU-HKUST Shenzhen Hong Kong Institution (Author: Wei Shi) # 2016 Johns Hopkins University (Author: Daniel Povey) # Apache 2.0 # Combine PLP and pitch features together @@ -15,25 +15,31 @@ pitch_config=conf/pitch.conf pitch_postprocess_config= paste_length_tolerance=2 compress=true -write_utt2num_frames=false # if true writes utt2num_frames +write_utt2num_frames=true # If true writes utt2num_frames. +write_utt2dur=true # End configuration section. -echo "$0 $@" # Print the command line for logging +echo "$0 $@" # Print the command line for logging. if [ -f ./path.sh ]; then . ./path.sh; fi . parse_options.sh || exit 1; if [ $# -lt 1 ] || [ $# -gt 3 ]; then - echo "Usage: $0 [options] [ [] ]"; - echo "e.g.: $0 data/train exp/make_plp/train mfcc" - echo "Note: defaults to /log, and defaults to /data" - echo "Options: " - echo " --plp-config # config passed to compute-plp-feats " - echo " --pitch-config # config passed to compute-kaldi-pitch-feats " - echo " --pitch-postprocess-config # config passed to process-kaldi-pitch-feats " - echo " --paste-length-tolerance # length tolerance passed to paste-feats" - echo " --nj # number of parallel jobs" - echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + cat >&2 < [ [] ] + e.g.: $0 data/train +Note: defaults to /log, and + defaults to /data +Options: + --plp-config # config passed to compute-plp-feats. + --pitch-config # config passed to compute-kaldi-pitch-feats. + --pitch-postprocess-config # config passed to process-kaldi-pitch-feats. + --paste-length-tolerance # length tolerance passed to paste-feats. + --nj # number of parallel jobs. + --cmd > # how to run jobs. + --write-utt2num-frames # If true, write utt2num_frames file. + --write-utt2dur # If true, write utt2dur file. +EOF exit 1; fi @@ -70,7 +76,7 @@ required="$scp $plp_config $pitch_config" for f in $required; do if [ ! 
-f $f ]; then - echo "make_plp_pitch.sh: no such file $f" + echo "$0: no such file $f" exit 1; fi done @@ -102,9 +108,15 @@ else write_num_frames_opt= fi +if $write_utt2dur; then + write_utt2dur_opt="--write-utt2dur=ark,t:$logdir/utt2dur.JOB" +else + write_utt2dur_opt= +fi + if [ -f $data/segments ]; then echo "$0 [info]: segments file exists: using that." - split_segments="" + split_segments= for n in $(seq $nj); do split_segments="$split_segments $logdir/segments.$n" done @@ -112,67 +124,89 @@ if [ -f $data/segments ]; then utils/split_scp.pl $data/segments $split_segments || exit 1; rm $logdir/.error 2>/dev/null - plp_feats="ark:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | compute-plp-feats $vtln_opts --verbose=2 --config=$plp_config ark:- ark:- |" - pitch_feats="ark,s,cs:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | compute-kaldi-pitch-feats --verbose=2 --config=$pitch_config ark:- ark:- | process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |" + plp_feats="ark:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | \ + compute-plp-feats $vtln_opts $write_utt2dur_opt --verbose=2 \ + --config=$plp_config ark:- ark:- |" + pitch_feats="ark,s,cs:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | \ + compute-kaldi-pitch-feats --verbose=2 --config=$pitch_config ark:- ark:- | \ + process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |" $cmd JOB=1:$nj $logdir/make_plp_pitch_${name}.JOB.log \ - paste-feats --length-tolerance=$paste_length_tolerance "$plp_feats" "$pitch_feats" ark:- \| \ + paste-feats --length-tolerance=$paste_length_tolerance \ + "$plp_feats" "$pitch_feats" ark:- \| \ copy-feats --compress=$compress $write_num_frames_opt ark:- \ ark,scp:$plp_pitch_dir/raw_plp_pitch_$name.JOB.ark,$plp_pitch_dir/raw_plp_pitch_$name.JOB.scp \ || exit 1; else echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance." - split_scps="" + split_scps= for n in $(seq $nj); do split_scps="$split_scps $logdir/wav_${name}.$n.scp" done utils/split_scp.pl $scp $split_scps || exit 1; - - plp_feats="ark:compute-plp-feats $vtln_opts --verbose=2 --config=$plp_config scp,p:$logdir/wav_${name}.JOB.scp ark:- |" - pitch_feats="ark,s,cs:compute-kaldi-pitch-feats --verbose=2 --config=$pitch_config scp,p:$logdir/wav_${name}.JOB.scp ark:- | process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |" + plp_feats="ark:compute-plp-feats $vtln_opts $write_utt2dur_opt --verbose=2 \ + --config=$plp_config scp,p:$logdir/wav_${name}.JOB.scp ark:- |" + pitch_feats="ark,s,cs:compute-kaldi-pitch-feats --verbose=2 \ + --config=$pitch_config scp,p:$logdir/wav_${name}.JOB.scp ark:- | \ + process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |" $cmd JOB=1:$nj $logdir/make_plp_pitch_${name}.JOB.log \ - paste-feats --length-tolerance=$paste_length_tolerance "$plp_feats" "$pitch_feats" ark:- \| \ + paste-feats --length-tolerance=$paste_length_tolerance \ + "$plp_feats" "$pitch_feats" ark:- \| \ copy-feats --compress=$compress $write_num_frames_opt ark:- \ ark,scp:$plp_pitch_dir/raw_plp_pitch_$name.JOB.ark,$plp_pitch_dir/raw_plp_pitch_$name.JOB.scp \ || exit 1; - fi if [ -f $logdir/.error.$name ]; then - echo "Error producing plp & pitch features for $name:" + echo "$0: Error producing PLP and pitch features for $name:" tail $logdir/make_plp_pitch_${name}.1.log exit 1; fi -# concatenate the .scp files together. +# Concatenate the .scp files together. 
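Note on the frame_shift block added above (and mirrored below for the PLP+pitch script): the perl one-liner looks for a `--frame-shift=<ms>` line in the PLP config, converts the value to seconds, and falls back to 0.01 when the option is absent, so downstream scripts can read the frame shift from `$data/frame_shift`. A rough Python equivalent, shown only to make the idiom explicit (the function name and default are illustrative, not part of the patch):

```python
import re

def read_frame_shift(config_path, default=0.01):
    """Return the frame shift in seconds from a Kaldi feature config
    containing an optional line such as '--frame-shift=10' (milliseconds)."""
    with open(config_path) as f:
        for line in f:
            match = re.match(r"--frame-shift=(\d+)", line)
            if match:
                return 0.001 * int(match.group(1))  # first match wins, ms -> s
    return default  # Kaldi's usual 10 ms frame shift
```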
for n in $(seq $nj); do - cat $plp_pitch_dir/raw_plp_pitch_$name.$n.scp || exit 1; -done > $data/feats.scp + cat $plp_pitch_dir/raw_plp_pitch_$name.$n.scp || exit 1 +done > $data/feats.scp || exit 1 if $write_utt2num_frames; then for n in $(seq $nj); do - cat $logdir/utt2num_frames.$n || exit 1; + cat $logdir/utt2num_frames.$n || exit 1 done > $data/utt2num_frames || exit 1 - rm $logdir/utt2num_frames.* fi -rm $logdir/wav_${name}.*.scp $logdir/segments.* 2>/dev/null +if $write_utt2dur; then + for n in $(seq $nj); do + cat $logdir/utt2dur.$n || exit 1 + done > $data/utt2dur || exit 1 +fi + +# Store frame_shift, plp_config and pitch_config along with features. +frame_shift=$(perl -ne 'if (/^--frame-shift=(\d+)/) { + printf "%.3f", 0.001 * $1; exit; }' $plp_config) +echo ${frame_shift:-'0.01'} > $data/frame_shift +mkdir -p $data/conf && + cp $plp_config $data/conf/plp.conf && + cp $pitch_config $data/conf/pitch.conf || exit 1 -nf=`cat $data/feats.scp | wc -l` -nu=`cat $data/utt2spk | wc -l` +rm $logdir/wav_${name}.*.scp $logdir/segments.* \ + $logdir/utt2num_frames.* $logdir/utt2dur.* 2>/dev/null + +nf=$(wc -l < $data/feats.scp) +nu=$(wc -l < $data/utt2spk) if [ $nf -ne $nu ]; then - echo "It seems not all of the feature files were successfully processed ($nf != $nu);" - echo "consider using utils/fix_data_dir.sh $data" + echo "$0: It seems not all of the feature files were successfully procesed" \ + "($nf != $nu); consider using utils/fix_data_dir.sh $data" fi -if [ $nf -lt $[$nu - ($nu/20)] ]; then - echo "Less than 95% the features were successfully generated. Probably a serious error." - exit 1; +if (( nf < nu - nu/20 )); then + echo "$0: Less than 95% the features were successfully generated."\ + "Probably a serious error." + exit 1 fi -echo "Succeeded creating PLP & Pitch features for $name" +echo "$0: Succeeded creating PLP and pitch features for $name" diff --git a/egs/wsj/s5/steps/nnet/align.sh b/egs/wsj/s5/steps/nnet/align.sh index 78f45a0609c..5f700cf28ed 100755 --- a/egs/wsj/s5/steps/nnet/align.sh +++ b/egs/wsj/s5/steps/nnet/align.sh @@ -5,7 +5,7 @@ # Aligns 'data' to sequences of transition-ids using Neural Network based acoustic model. # Optionally produces alignment in lattice format, this is handy to get word alignment. -# Begin configuration section. +# Begin configuration section. nj=4 cmd=run.pl stage=0 @@ -71,28 +71,29 @@ done # PREPARE FEATURE EXTRACTION PIPELINE # import config, +online_cmvn_opts= cmvn_opts= delta_opts= D=$srcdir -[ -e $D/norm_vars ] && cmvn_opts="--norm-means=true --norm-vars=$(cat $D/norm_vars)" # Bwd-compatibility, +[ -e $D/online_cmvn_opts ] && online_cmvn_opts=$(cat $D/online_cmvn_opts) [ -e $D/cmvn_opts ] && cmvn_opts=$(cat $D/cmvn_opts) -[ -e $D/delta_order ] && delta_opts="--delta-order=$(cat $D/delta_order)" # Bwd-compatibility, [ -e $D/delta_opts ] && delta_opts=$(cat $D/delta_opts) # # Create the feature stream, feats="ark,s,cs:copy-feats scp:$sdata/JOB/feats.scp ark:- |" +# apply-cmvn-online (optional), +[ -n "$online_cmvn_opts" -a ! -f $nndir/global_cmvn_stats.mat ] && echo "$0: Missing $nndir/global_cmvn_stats.mat" && exit 1 +[ -n "$online_cmvn_opts" ] && feats="$feats apply-cmvn-online $online_cmvn_opts --spk2utt=ark:$srcdata/spk2utt $nndir/global_cmvn_stats.mat ark:- ark:- |" # apply-cmvn (optional), -[ ! -z "$cmvn_opts" -a ! -f $sdata/1/cmvn.scp ] && echo "$0: Missing $sdata/1/cmvn.scp" && exit 1 -[ ! 
-z "$cmvn_opts" ] && feats="$feats apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp ark:- ark:- |" +[ -n "$cmvn_opts" -a ! -f $sdata/1/cmvn.scp ] && echo "$0: Missing $sdata/1/cmvn.scp" && exit 1 +[ -n "$cmvn_opts" ] && feats="$feats apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp ark:- ark:- |" # add-deltas (optional), -[ ! -z "$delta_opts" ] && feats="$feats add-deltas $delta_opts ark:- ark:- |" -# add-pytel transform (optional), -[ -e $D/pytel_transform.py ] && feats="$feats /bin/env python $D/pytel_transform.py |" +[ -n "$delta_opts" ] && feats="$feats add-deltas $delta_opts ark:- ark:- |" # add-ivector (optional), if [ -e $D/ivector_dim ]; then [ -z $ivector ] && echo "Missing --ivector, they were used in training!" && exit 1 - # Get the tool, + # Get the tool, ivector_append_tool=append-vector-to-feats # default, [ -e $D/ivector_append_tool ] && ivector_append_tool=$(cat $D/ivector_append_tool) # Check dims, @@ -113,7 +114,7 @@ feats="$feats nnet-forward $nnet_forward_opts --feature-transform=$feature_trans echo "$0: aligning data '$data' using nnet/model '$srcdir', putting alignments in '$dir'" -# Map oovs in reference transcription, +# Map oovs in reference transcription, oov=`cat $lang/oov.int` || exit 1; [ -z "$text" ] && text=$sdata/JOB/text tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $text |"; diff --git a/egs/wsj/s5/steps/nnet/decode.sh b/egs/wsj/s5/steps/nnet/decode.sh index d2766661e12..ba368bfc56b 100755 --- a/egs/wsj/s5/steps/nnet/decode.sh +++ b/egs/wsj/s5/steps/nnet/decode.sh @@ -98,23 +98,24 @@ thread_string= # PREPARE FEATURE EXTRACTION PIPELINE # import config, +online_cmvn_opts= cmvn_opts= delta_opts= D=$srcdir -[ -e $D/norm_vars ] && cmvn_opts="--norm-means=true --norm-vars=$(cat $D/norm_vars)" # Bwd-compatibility, +[ -e $D/online_cmvn_opts ] && online_cmvn_opts=$(cat $D/online_cmvn_opts) [ -e $D/cmvn_opts ] && cmvn_opts=$(cat $D/cmvn_opts) -[ -e $D/delta_order ] && delta_opts="--delta-order=$(cat $D/delta_order)" # Bwd-compatibility, [ -e $D/delta_opts ] && delta_opts=$(cat $D/delta_opts) # # Create the feature stream, feats="ark,s,cs:copy-feats scp:$sdata/JOB/feats.scp ark:- |" +# apply-cmvn-online (optional), +[ -n "$online_cmvn_opts" -a ! -f $nndir/global_cmvn_stats.mat ] && echo "$0: Missing $nndir/global_cmvn_stats.mat" && exit 1 +[ -n "$online_cmvn_opts" ] && feats="$feats apply-cmvn-online $online_cmvn_opts --spk2utt=ark:$srcdata/spk2utt $nndir/global_cmvn_stats.mat ark:- ark:- |" # apply-cmvn (optional), -[ ! -z "$cmvn_opts" -a ! -f $sdata/1/cmvn.scp ] && echo "$0: Missing $sdata/1/cmvn.scp" && exit 1 -[ ! -z "$cmvn_opts" ] && feats="$feats apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp ark:- ark:- |" +[ -n "$cmvn_opts" -a ! -f $sdata/1/cmvn.scp ] && echo "$0: Missing $sdata/1/cmvn.scp" && exit 1 +[ -n "$cmvn_opts" ] && feats="$feats apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp ark:- ark:- |" # add-deltas (optional), -[ ! 
-z "$delta_opts" ] && feats="$feats add-deltas $delta_opts ark:- ark:- |" -# add-pytel transform (optional), -[ -e $D/pytel_transform.py ] && feats="$feats /bin/env python $D/pytel_transform.py |" +[ -n "$delta_opts" ] && feats="$feats add-deltas $delta_opts ark:- ark:- |" # add-ivector (optional), if [ -e $D/ivector_dim ]; then diff --git a/egs/wsj/s5/steps/nnet/make_bn_feats.sh b/egs/wsj/s5/steps/nnet/make_bn_feats.sh index 8489e824f2f..631f3d5243a 100755 --- a/egs/wsj/s5/steps/nnet/make_bn_feats.sh +++ b/egs/wsj/s5/steps/nnet/make_bn_feats.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash # Copyright 2012-2015 Brno University of Technology (author: Karel Vesely) # Apache 2.0 @@ -67,28 +67,29 @@ echo "Creating bn-feats into $data" # PREPARE FEATURE EXTRACTION PIPELINE # import config, +online_cmvn_opts= cmvn_opts= delta_opts= D=$nndir -[ -e $D/norm_vars ] && cmvn_opts="--norm-means=true --norm-vars=$(cat $D/norm_vars)" # Bwd-compatibility, +[ -e $D/online_cmvn_opts ] && online_cmvn_opts=$(cat $D/online_cmvn_opts) [ -e $D/cmvn_opts ] && cmvn_opts=$(cat $D/cmvn_opts) -[ -e $D/delta_order ] && delta_opts="--delta-order=$(cat $D/delta_order)" # Bwd-compatibility, [ -e $D/delta_opts ] && delta_opts=$(cat $D/delta_opts) # # Create the feature stream, feats="ark,s,cs:copy-feats scp:$sdata/JOB/feats.scp ark:- |" +# apply-cmvn-online (optional), +[ -n "$online_cmvn_opts" -a ! -f $nndir/global_cmvn_stats.mat ] && echo "$0: Missing $nndir/global_cmvn_stats.mat" && exit 1 +[ -n "$online_cmvn_opts" ] && feats="$feats apply-cmvn-online $online_cmvn_opts --spk2utt=ark:$srcdata/spk2utt $nndir/global_cmvn_stats.mat ark:- ark:- |" # apply-cmvn (optional), -[ ! -z "$cmvn_opts" -a ! -f $sdata/1/cmvn.scp ] && echo "$0: Missing $sdata/1/cmvn.scp" && exit 1 -[ ! -z "$cmvn_opts" ] && feats="$feats apply-cmvn $cmvn_opts --utt2spk=ark:$srcdata/utt2spk scp:$srcdata/cmvn.scp ark:- ark:- |" +[ -n "$cmvn_opts" -a ! -f $sdata/1/cmvn.scp ] && echo "$0: Missing $sdata/1/cmvn.scp" && exit 1 +[ -n "$cmvn_opts" ] && feats="$feats apply-cmvn $cmvn_opts --utt2spk=ark:$srcdata/utt2spk scp:$srcdata/cmvn.scp ark:- ark:- |" # add-deltas (optional), -[ ! -z "$delta_opts" ] && feats="$feats add-deltas $delta_opts ark:- ark:- |" -# add-pytel transform (optional), -[ -e $D/pytel_transform.py ] && feats="$feats /bin/env python $D/pytel_transform.py |" +[ -n "$delta_opts" ] && feats="$feats add-deltas $delta_opts ark:- ark:- |" # add-ivector (optional), if [ -e $D/ivector_dim ]; then [ -z $ivector ] && echo "Missing --ivector, they were used in training!" 
&& exit 1 - # Get the tool, + # Get the tool, ivector_append_tool=append-vector-to-feats # default, [ -e $D/ivector_append_tool ] && ivector_append_tool=$(cat $D/ivector_append_tool) # Check dims, @@ -115,7 +116,7 @@ if [ $htk_save == false ]; then done # check sentence counts, - N0=$(cat $srcdata/feats.scp | wc -l) + N0=$(cat $srcdata/feats.scp | wc -l) N1=$(cat $data/feats.scp | wc -l) [[ "$N0" != "$N1" ]] && echo "$0: sentence-count mismatch, $srcdata $N0, $data $N1" && exit 1 echo "Succeeded creating MLP-BN features '$data'" diff --git a/egs/wsj/s5/steps/nnet/train.sh b/egs/wsj/s5/steps/nnet/train.sh index c23a15362c7..a017ceac84d 100755 --- a/egs/wsj/s5/steps/nnet/train.sh +++ b/egs/wsj/s5/steps/nnet/train.sh @@ -22,6 +22,7 @@ nnet_proto= # (optional) use this NN prototype for initialization, # feature processing, splice=5 # (default) splice features both-ways along time axis, +online_cmvn_opts= # (optional) adds 'apply-cmvn-online' to input feature pipeline, see opts, cmvn_opts= # (optional) adds 'apply-cmvn' to input feature pipeline, see opts, delta_opts= # (optional) adds 'add-deltas' to input feature pipeline, see opts, ivector= # (optional) adds 'append-vector-to-feats', the option is rx-filename for the 2nd stream, @@ -34,7 +35,6 @@ splice_after_transf=5 # (feat_type=transf) splice after the linear transform, feature_transform_proto= # (optional) use this prototype for 'feature_transform', feature_transform= # (optional) directly use this 'feature_transform', -pytel_transform= # (BUT) use external python transform, # labels, labels= # (optional) specify non-default training targets, @@ -209,12 +209,11 @@ cp $data_cv/feats.scp $dir/cv.scp_non_local ###### OPTIONALLY IMPORT FEATURE SETTINGS (from pre-training) ###### ivector_dim= # no ivectors, -if [ ! -z $feature_transform ]; then +if [ -n $feature_transform ]; then D=$(dirname $feature_transform) echo "# importing feature settings from dir '$D'" - [ -e $D/norm_vars ] && cmvn_opts="--norm-means=true --norm-vars=$(cat $D/norm_vars)" # Bwd-compatibility, + [ -e $D/online_cmvn_opts ] && online_cmvn_opts=$(cat $D/online_cmvn_opts) [ -e $D/cmvn_opts ] && cmvn_opts=$(cat $D/cmvn_opts) - [ -e $D/delta_order ] && delta_opts="--delta-order=$(cat $D/delta_order)" # Bwd-compatibility, [ -e $D/delta_opts ] && delta_opts=$(cat $D/delta_opts) [ -e $D/ivector_dim ] && ivector_dim=$(cat $D/ivector_dim) [ -e $D/ivector_append_tool ] && ivector_append_tool=$(cat $D/ivector_append_tool) @@ -227,7 +226,14 @@ feats_tr="ark:copy-feats scp:$dir/train.scp ark:- |" feats_cv="ark:copy-feats scp:$dir/cv.scp ark:- |" # optionally add per-speaker CMVN, -if [ ! -z "$cmvn_opts" ]; then +[ -n "$online_cmvn_opts" -a -n "$cmvn_opts" ] && echo "Error: use \$online_cmvn_opts or \$cmvn_opts, not both!" && exit 1 +if [ -n "$online_cmvn_opts" ]; then + echo "# + 'apply-cmvn-online' with '$online_cmvn_opts' is used," + global_cmvn_stats=$dir/global_cmvn_stats.mat + matrix-sum --binary=false scp:$data/cmvn.scp $global_cmvn_stats + feats_tr="$feats_tr apply-cmvn-online $online_cmvn_opts $global_cmvn_stats ark:- ark:- |" + feats_cv="$feats_cv apply-cmvn-online $online_cmvn_opts $global_cmvn_stats ark:- ark:- |" +elif [ -n "$cmvn_opts" ]; then echo "# + 'apply-cmvn' with '$cmvn_opts' using statistics : $data/cmvn.scp, $data_cv/cmvn.scp" [ ! -r $data/cmvn.scp ] && echo "Missing $data/cmvn.scp" && exit 1; [ ! -r $data_cv/cmvn.scp ] && echo "Missing $data_cv/cmvn.scp" && exit 1; @@ -245,20 +251,11 @@ if [ ! -z "$delta_opts" ]; then fi # keep track of the config, -[ ! 
-z "$cmvn_opts" ] && echo "$cmvn_opts" >$dir/cmvn_opts -[ ! -z "$delta_opts" ] && echo "$delta_opts" >$dir/delta_opts +[ -n "$online_cmvn_opts" ] && echo "$online_cmvn_opts" >$dir/online_cmvn_opts +[ -n "$cmvn_opts" ] && echo "$cmvn_opts" >$dir/cmvn_opts +[ -n "$delta_opts" ] && echo "$delta_opts" >$dir/delta_opts # -# optionally append python feature transform, -if [ ! -z "$pytel_transform" ]; then - cp $pytel_transform $dir/pytel_transform.py - { echo; echo "### Comes from here: '$pytel_transform' ###"; } >> $dir/pytel_transform.py - pytel_transform=$dir/pytel_transform.py - feats_tr="$feats_tr /bin/env python $pytel_transform |" - feats_cv="$feats_cv /bin/env python $pytel_transform |" - echo "# + 'pytel-transform' from '$pytel_transform'" -fi - # temoprary pipeline with first 10k, feats_tr_10k="${feats_tr/train.scp/train.scp.10k}" @@ -273,13 +270,13 @@ echo "# feature dim : $feat_dim (input of 'feature_transform')" # So it has to be done by a single process (we are using exclusive mode). # This also reduces the CPU-GPU uploads/downloads to minimum. -if [ ! -z "$feature_transform" ]; then +if [ -n "$feature_transform" ]; then echo "# importing 'feature_transform' from '$feature_transform'" tmp=$dir/imported_$(basename $feature_transform) cp $feature_transform $tmp; feature_transform=$tmp else # Make default proto with splice, - if [ ! -z $feature_transform_proto ]; then + if [ -n $feature_transform_proto ]; then echo "# importing custom 'feature_transform_proto' from '$feature_transform_proto'" else echo "# + default 'feature_transform_proto' with splice +/-$splice frames," @@ -374,7 +371,7 @@ if [ ! -z $ivector ]; then echo $dim_ivec >$dir/ivector_dim # mark down the iVec dim! echo $ivector_append_tool >$dir/ivector_append_tool - # pasting the iVecs to the feaures, + # pasting the iVecs to the features, echo "# + ivector input '$ivector'" feats_tr="$feats_tr $ivector_append_tool ark:- '$ivector' ark:- |" feats_cv="$feats_cv $ivector_append_tool ark:- '$ivector' ark:- |" @@ -433,18 +430,6 @@ else ${bn_dim:+ --bottleneck-dim=$bn_dim} \ "$cnn_fea" $num_tgt $hid_layers $hid_dim >>$nnet_proto ;; - cnn2d) - delta_order=$([ -z $delta_opts ] && echo "0" || { echo $delta_opts | tr ' ' '\n' | grep "delta[-_]order" | sed 's:^.*=::'; }) - echo "Debug : $delta_opts, delta_order $delta_order" - utils/nnet/make_cnn2d_proto.py $cnn_proto_opts \ - --splice=$splice --delta-order=$delta_order --dir=$dir \ - $num_fea >$nnet_proto - cnn_fea=$(cat $nnet_proto | grep -v '^$' | tail -n1 | awk '{ print $5; }') - utils/nnet/make_nnet_proto.py $proto_opts \ - --no-smaller-input-weights \ - ${bn_dim:+ --bottleneck-dim=$bn_dim} \ - "$cnn_fea" $num_tgt $hid_layers $hid_dim >>$nnet_proto - ;; lstm) utils/nnet/make_lstm_proto.py $proto_opts \ $num_fea $num_tgt >$nnet_proto diff --git a/egs/wsj/s5/steps/nnet3/align_lats.sh b/egs/wsj/s5/steps/nnet3/align_lats.sh index e4ba7309435..201cc3552ba 100755 --- a/egs/wsj/s5/steps/nnet3/align_lats.sh +++ b/egs/wsj/s5/steps/nnet3/align_lats.sh @@ -92,12 +92,16 @@ if [ -f $srcdir/frame_subsampling_factor ]; then frame_subsampling_factor=$(cat $srcdir/frame_subsampling_factor) frame_subsampling_opt="--frame-subsampling-factor=$frame_subsampling_factor" cp $srcdir/frame_subsampling_factor $dir - if [ "$frame_subsampling_factor" -gt 1 ] && \ - [ "$scale_opts" == "--transition-scale=1.0 --self-loop-scale=0.1" ]; then - echo "$0: frame-subsampling-factor is not 1 (so likely a chain system)," - echo "... but the scale opts are the defaults. 
You probably want" - echo "--scale-opts '--transition-scale=1.0 --self-loop-scale=1.0'" - sleep 1 + if [[ $frame_subsampling_factor -gt 1 ]]; then + # Assume a chain system, check agrument sanity. + if [[ ! ($scale_opts == *--self-loop-scale=1.0* && + $scale_opts == *--transition-scale=1.0* && + $acoustic_scale = '1.0') ]]; then + echo "$0: ERROR: frame-subsampling-factor is not 1, assuming a chain system." + echo "... You should pass the following options to this script:" + echo " --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0'" \ + "--acoustic_scale 1.0" + fi fi fi diff --git a/egs/wsj/s5/steps/nnet3/chain/e2e/compute_biphone_stats.py b/egs/wsj/s5/steps/nnet3/chain/e2e/compute_biphone_stats.py new file mode 100755 index 00000000000..e009cc17a9b --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/e2e/compute_biphone_stats.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 + +# Copyright 2018 Hossein Hadian +# Apache 2.0 + +import argparse +from os.path import join +import sys +import copy +import random + +parser = argparse.ArgumentParser(description="""This script reads + sequences of phone ids from std input and counts mono/biphone stats + and writes the results to std out. The output can be used with + gmm-init-biphone to create a better tree. The first part of the + outupt is biphone counts with this format for each line: + + and the second part of the output is monophone counts with the + following format: + """) +parser.add_argument('langdir', type=str) +parser.add_argument('--shared-phones', type=str, choices=['true','false'], + default='true', + help="If true, stats will be collected for shared phones.") + +args = parser.parse_args() +args.shared_phones = True if args.shared_phones == 'true' else False + +# Read phone sets +phone_sets = [] +phones = [] +phone_to_shard_phone = {} +phone_to_shard_phone[0] = 0 # The no-left-context case +with open(join(args.langdir, 'phones/sets.int'), 'r', encoding='latin-1') as f: + for line in f: + phone_set = line.strip().split() + phone_sets.append(phone_set) + for phone in phone_set: + phones.append(phone) + phone_to_shard_phone[phone] = phone_set[0] + +print('Loaded {} phone-sets containing {} phones.'.format(len(phone_sets), + len(phones)), + file=sys.stderr) + +biphone_counts = {} +mono_counts = {} +for line in sys.stdin: + line = line.strip().split() + key = line[0] + line_phones = line[1:] + for pair in zip([0] + line_phones, line_phones): # 0 is for the no left-context case + if args.shared_phones: + pair = (phone_to_shard_phone[pair[0]], phone_to_shard_phone[pair[1]]) + if pair not in biphone_counts: + biphone_counts[pair] = 0 + biphone_counts[pair] += 1 + mono_counts[pair[1]] = 1 if pair[1] not in mono_counts else mono_counts[pair[1]] + 1 + +for phone1 in [0] + phones: + for phone2 in phones: + pair = (phone1, phone2) + shared_pair = ((phone_to_shard_phone[pair[0]], phone_to_shard_phone[pair[1]]) + if args.shared_phones else pair) + count = biphone_counts[shared_pair] if shared_pair in biphone_counts else 0 + if count != 0: + print('{} {} {}'.format(pair[0], pair[1], count)) +for phone in phones: + shared = phone_to_shard_phone[phone] if args.shared_phones else phone + count = mono_counts[shared] if shared in mono_counts else 0 + if count != 0: + print('{} {}'.format(phone, count)) diff --git a/egs/wsj/s5/steps/nnet3/chain/e2e/prepare_e2e.sh b/egs/wsj/s5/steps/nnet3/chain/e2e/prepare_e2e.sh index c211381bf8b..07d5ee8cfb8 100755 --- a/egs/wsj/s5/steps/nnet3/chain/e2e/prepare_e2e.sh +++ b/egs/wsj/s5/steps/nnet3/chain/e2e/prepare_e2e.sh 
@@ -14,13 +14,23 @@ cmd=run.pl nj=4 stage=0 shared_phones=true -treedir= # if specified, the tree and model will be copied from there +treedir= # If specified, the tree and model will be copied from there # note that it may not be flat start anymore. -type=mono # can be either mono or biphone -- either way +type=mono # Can be either mono or biphone -- either way # the resulting tree is full (i.e. it doesn't do any tying) -ci_silence=false # if true, silence phones will be treated as context independent +ci_silence=false # If true, silence phones will be treated as context independent scale_opts="--transition-scale=0.0 --self-loop-scale=0.0" +tie=false # If true, gmm-init-biphone will do some tying when + # creating the full biphone tree (it won't be full anymore). + # Specifically, it will revert to monophone if the data + # counts for a biphone are smaller than min_biphone_count. + # If the monophone count is also smaller than min_monophone_count, + # it will revert to a shared global phone. Note that this + # only affects biphone models (i.e., type=biphone) which + # use the special chain topology. +min_biphone_count=100 +min_monophone_count=20 # End configuration section. echo "$0 $@" # Print the command line for logging @@ -35,6 +45,7 @@ if [ $# != 3 ]; then echo " --config # config containing options" echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." echo " --type # context dependency type" + echo " --tie # enable/disable count-based tying" exit 1; fi @@ -69,12 +80,23 @@ if $ci_silence; then ci_opt="--ci-phones=$ciphonelist" fi +tie_opts= +if $tie && [[ "$type" = "biphone" ]]; then + cat $data/text | steps/chain/e2e/text_to_phones.py --edge-silprob 0 \ + --between-silprob 0 \ + $lang | \ + cut -d' ' -f 2- | utils/sym2int.pl $lang/phones.txt | \ + steps/chain/e2e/compute_biphone_stats.py $lang >$dir/phone-stats.txt + tie_opts="--min-biphone-count=$min_biphone_count \ +--min-monophone-count=$min_monophone_count --phone-counts=$dir/phone-stats.txt" +fi + if [ $stage -le 0 ]; then if [ -z $treedir ]; then echo "$0: Initializing $type system." # feat dim does not matter here. Just set it to 10 $cmd $dir/log/init_${type}_mdl_tree.log \ - gmm-init-$type $ci_opt $shared_phones_opt $lang/topo 10 \ + gmm-init-$type $tie_opts $ci_opt $shared_phones_opt $lang/topo 10 \ $dir/0.mdl $dir/tree || exit 1; else echo "$0: Copied tree/mdl from $treedir." 
>$dir/log/init_mdl_tree.log diff --git a/egs/wsj/s5/steps/nnet3/chain/e2e/train_e2e.py b/egs/wsj/s5/steps/nnet3/chain/e2e/train_e2e.py index e96f2a10820..d5fa89f3ce0 100755 --- a/egs/wsj/s5/steps/nnet3/chain/e2e/train_e2e.py +++ b/egs/wsj/s5/steps/nnet3/chain/e2e/train_e2e.py @@ -202,11 +202,10 @@ def process_args(args): "--trainer.deriv-truncate-margin.".format( args.deriv_truncate_margin)) - if (not os.path.exists(args.dir) - or not os.path.exists(args.dir+"/configs")): - raise Exception("This scripts expects {0} to exist and have a configs " - "directory which is the output of " - "make_configs.py script") + if (not os.path.exists(args.dir + "/configs")): + raise Exception("This scripts expects the directory specified with " + "--dir={0} to exist and have a configs/ directory which " + "is the output of make_configs.py script".format(args.dir)) # set the options corresponding to args.use_gpu run_opts = common_train_lib.RunOpts() @@ -423,9 +422,10 @@ def train(args, run_opts): if (args.exit_stage is not None) and (iter == args.exit_stage): logger.info("Exiting early due to --exit-stage {0}".format(iter)) return - current_num_jobs = int(0.5 + args.num_jobs_initial - + (args.num_jobs_final - args.num_jobs_initial) - * float(iter) / num_iters) + + current_num_jobs = common_train_lib.get_current_num_jobs( + iter, num_iters, + args.num_jobs_initial, args.num_jobs_step, args.num_jobs_final) if args.stage <= iter: model_file = "{dir}/{iter}.mdl".format(dir=args.dir, iter=iter) @@ -451,12 +451,13 @@ def train(args, run_opts): shrink_info_str = '' if shrinkage_value != 1.0: shrink_info_str = 'shrink: {0:0.5f}'.format(shrinkage_value) - logger.info("Iter: {0}/{1} " - "Epoch: {2:0.2f}/{3:0.1f} ({4:0.1f}% complete) " - "lr: {5:0.6f} {6}".format(iter, num_iters - 1, - epoch, args.num_epochs, - percent, - lrate, shrink_info_str)) + logger.info("Iter: {0}/{1} Jobs: {2} " + "Epoch: {3:0.2f}/{4:0.1f} ({5:0.1f}% complete) " + "lr: {6:0.6f} {7}".format(iter, num_iters - 1, + current_num_jobs, + epoch, args.num_epochs, + percent, + lrate, shrink_info_str)) chain_lib.train_one_iteration( dir=args.dir, diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh index 9996820d6d3..0185b9fbaad 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh @@ -151,36 +151,41 @@ mkdir -p $dir/log $dir/info # Get list of validation utterances. frame_shift=$(utils/data/get_frame_shift.sh $data) || exit 1 +if [ -f $data/utt2uniq ]; then + # Must hold out all augmented versions of the same utterance. + echo "$0: File $data/utt2uniq exists, so ensuring the hold-out set" \ + "includes all perturbed versions of the same source utterance." 
+ utils/utt2spk_to_spk2utt.pl $data/utt2uniq 2>/dev/null | \ + utils/shuffle_list.pl 2>/dev/null | \ + awk -v max_utt=$num_utts_subset '{ + for (n=2;n<=NF;n++) print $n; + printed += NF-1; + if (printed >= max_utt) nextfile; }' | + sort > $dir/valid_uttlist +else + awk '{print $1}' $data/utt2spk | \ + utils/shuffle_list.pl 2>/dev/null | \ + head -$num_utts_subset > $dir/valid_uttlist +fi +len_valid_uttlist=$(wc -l < $dir/valid_uttlist) + awk '{print $1}' $data/utt2spk | \ - utils/shuffle_list.pl 2>/dev/null | head -$num_utts_subset > $dir/valid_uttlist + utils/filter_scp.pl --exclude $dir/valid_uttlist | \ + utils/shuffle_list.pl 2>/dev/null | \ + head -$num_utts_subset > $dir/train_subset_uttlist +len_trainsub_uttlist=$(wc -l <$dir/train_subset_uttlist) -len_uttlist=$(wc -l < $dir/valid_uttlist) -if [ $len_uttlist -lt $num_utts_subset ]; then - echo "Number of utterances is very small. Please check your data." && exit 1; +if [[ $len_valid_uttlist -lt $num_utts_subset || + $len_trainsub_uttlist -lt $num_utts_subset ]]; then + echo "$0: Number of utterances is very small. Please check your data." && exit 1; fi -if [ -f $data/utt2uniq ]; then # this matters if you use data augmentation. - # because of this stage we can again have utts with lengths less than - # frames_per_eg - echo "File $data/utt2uniq exists, so augmenting valid_uttlist to" - echo "include all perturbed versions of the same 'real' utterances." - mv $dir/valid_uttlist $dir/valid_uttlist.tmp - utils/utt2spk_to_spk2utt.pl $data/utt2uniq > $dir/uniq2utt - cat $dir/valid_uttlist.tmp | utils/apply_map.pl $data/utt2uniq | \ - sort | uniq | utils/apply_map.pl $dir/uniq2utt | \ - awk '{for(n=1;n<=NF;n++) print $n;}' | sort > $dir/valid_uttlist - rm $dir/uniq2utt $dir/valid_uttlist.tmp -fi +echo "$0: Holding out $len_valid_uttlist utterances in validation set and" \ + "$len_trainsub_uttlist in training diagnostic set, out of total" \ + "$(wc -l < $data/utt2spk)." -echo "$0: creating egs. To ensure they are not deleted later you can do: touch $dir/.nodelete" -awk '{print $1}' $data/utt2spk | \ - utils/filter_scp.pl --exclude $dir/valid_uttlist | \ - utils/shuffle_list.pl 2>/dev/null | head -$num_utts_subset > $dir/train_subset_uttlist -len_uttlist=$(wc -l <$dir/train_subset_uttlist) -if [ $len_uttlist -lt $num_utts_subset ]; then - echo "Number of utterances is very small. Please check your data." && exit 1; -fi +echo "$0: creating egs. To ensure they are not deleted later you can do: touch $dir/.nodelete" ## Set up features. echo "$0: feature type is raw" @@ -342,9 +347,8 @@ if [ $stage -le 2 ]; then $egs_opts --normalization-fst-scale=$normalization_fst_scale \ $trans_mdl_opt $chaindir/normalization.fst \ "$train_subset_feats" ark,s,cs:- "ark:$dir/train_subset_all.cegs" || exit 1 - wait sleep 5 # wait for file system to sync. - echo "... Getting subsets of validation examples for diagnostics and combination." + echo "$0: Getting subsets of validation examples for diagnostics and combination." if $generate_egs_scp; then valid_diagnostic_output="ark,scp:$dir/valid_diagnostic.cegs,$dir/valid_diagnostic.scp" train_diagnostic_output="ark,scp:$dir/train_diagnostic.cegs,$dir/train_diagnostic.scp" @@ -365,7 +369,6 @@ if [ $stage -le 2 ]; then $cmd $dir/log/create_train_subset_diagnostic.log \ nnet3-chain-subset-egs --n=$num_egs_diagnostic ark:$dir/train_subset_all.cegs \ $train_diagnostic_output || exit 1 - wait sleep 5 # wait for file system to sync. 
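The reworked hold-out logic above ensures that, when `utt2uniq` exists (i.e. the data were augmented), all perturbed copies of a source utterance are held out together, and that the training-diagnostic subset is drawn only from utterances not already in the validation list. A minimal sketch of those two selections, assuming the maps are already loaded into Python dictionaries (the function names are illustrative):

```python
import random
from collections import defaultdict

def choose_valid_utts(utt2uniq, num_utts_subset, seed=0):
    """Hold out whole groups of perturbed copies, never splitting a group."""
    uniq2utts = defaultdict(list)
    for utt, uniq in utt2uniq.items():
        uniq2utts[uniq].append(utt)
    groups = list(uniq2utts.values())
    random.Random(seed).shuffle(groups)
    valid = []
    for group in groups:
        valid.extend(group)
        if len(valid) >= num_utts_subset:
            break
    return sorted(valid)

def choose_train_subset(all_utts, valid_utts, num_utts_subset, seed=1):
    """Training-diagnostic subset drawn from utterances not held out above."""
    held_out = set(valid_utts)
    candidates = [u for u in all_utts if u not in held_out]
    random.Random(seed).shuffle(candidates)
    return sorted(candidates[:num_utts_subset])
```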
if $generate_egs_scp; then cat $dir/valid_combine.cegs $dir/train_combine.cegs | \ @@ -375,7 +378,7 @@ if [ $stage -le 2 ]; then fi for f in $dir/{combine,train_diagnostic,valid_diagnostic}.cegs; do - [ ! -s $f ] && echo "No examples in file $f" && exit 1; + [ ! -s $f ] && echo "$0: No examples in file $f" && exit 1; done rm $dir/valid_all.cegs $dir/train_subset_all.cegs $dir/{train,valid}_combine.cegs ) || touch $dir/.error & @@ -412,7 +415,7 @@ if [ $stage -le 4 ]; then fi if [ -f $dir/.error ]; then - echo "Error detected while creating train/valid egs" && exit 1 + echo "$0: Error detected while creating train/valid egs" && exit 1 fi if [ $stage -le 5 ]; then @@ -485,11 +488,11 @@ fi wait if [ -f $dir/.error ]; then - echo "Error detected while creating train/valid egs" && exit 1 + echo "$0: Error detected while creating train/valid egs" && exit 1 fi if [ $stage -le 6 ]; then - echo "$0: removing temporary archives" + echo "$0: Removing temporary archives, alignments and lattices" ( cd $dir for f in $(ls -l . | grep 'cegs_orig' | awk '{ X=NF-1; Y=NF-2; if ($X == "->") print $Y, $NF; }'); do rm $f; done @@ -501,7 +504,6 @@ if [ $stage -le 6 ]; then # there are some extra soft links that we should delete. for f in $dir/cegs.*.*.ark; do rm $f; done fi - echo "$0: removing temporary alignments, lattices and transforms" rm $dir/ali.{ark,scp} 2>/dev/null rm $dir/lat_special.*.{ark,scp} 2>/dev/null fi diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 40b65afe273..91b7df4e8df 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -219,8 +219,9 @@ def process_args(args): args.deriv_truncate_margin)) if (not os.path.exists(args.dir)): - raise Exception("This script expects --dir={0} to exist.") - if (not os.path.exists(args.dir+"/configs") and + raise Exception("Directory specified with --dir={0} " + "does not exist.".format(args.dir)) + if (not os.path.exists(args.dir + "/configs") and (args.input_model is None or not os.path.exists(args.input_model))): raise Exception("Either --trainer.input-model option should be supplied, " "and exist; or the {0}/configs directory should exist." 
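Several of the training scripts changed in this patch (train_e2e.py above, chain/train.py and the nnet3 trainers below) replace the inline linear interpolation of the number of parallel jobs with a call to `common_train_lib.get_current_num_jobs(iter, num_iters, num_jobs_initial, num_jobs_step, num_jobs_final)`. The helper itself is not part of this excerpt; the sketch below only illustrates behaviour consistent with how it is called, assuming the intent is a linear ramp quantized to multiples of `num_jobs_step` (an assumption, not the verbatim library code):

```python
def get_current_num_jobs(iteration, num_iters, start, step, end):
    # Assumed behaviour: ramp linearly from 'start' to 'end' jobs over training,
    # rounding to a multiple of 'step' once the ideal value exceeds it.
    ideal = start + (end - start) * float(iteration) / num_iters
    if ideal < step:
        return int(0.5 + ideal)
    return int(0.5 + ideal / step) * step
```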
@@ -357,6 +358,13 @@ def train(args, run_opts): right_context_final >= 0 else -1) default_egs_dir = '{0}/egs'.format(args.dir) + + if (args.egs_dir is not None) and (args.cmvn_opts != "--norm-means=false --norm-vars=false"): + logger.warning("the --feat.cmvn-opts option has no effect because we are not dumping egs") + + if (args.egs_dir is not None) and (args.frames_per_iter != 800000): + logger.warning("the --trainer.frames-per-iter option has no effect because we are not dumping egs") + if ((args.stage <= -3) and args.egs_dir is None): logger.info("Generating egs") if (not os.path.exists("{0}/den.fst".format(args.dir)) or @@ -470,9 +478,10 @@ def train(args, run_opts): if (args.exit_stage is not None) and (iter == args.exit_stage): logger.info("Exiting early due to --exit-stage {0}".format(iter)) return - current_num_jobs = int(0.5 + args.num_jobs_initial - + (args.num_jobs_final - args.num_jobs_initial) - * float(iter) / num_iters) + + current_num_jobs = common_train_lib.get_current_num_jobs( + iter, num_iters, + args.num_jobs_initial, args.num_jobs_step, args.num_jobs_final) if args.stage <= iter: model_file = "{dir}/{iter}.mdl".format(dir=args.dir, iter=iter) @@ -501,12 +510,13 @@ def train(args, run_opts): shrink_info_str = '' if shrinkage_value != 1.0: shrink_info_str = 'shrink: {0:0.5f}'.format(shrinkage_value) - logger.info("Iter: {0}/{1} " - "Epoch: {2:0.2f}/{3:0.1f} ({4:0.1f}% complete) " - "lr: {5:0.6f} {6}".format(iter, num_iters - 1, - epoch, args.num_epochs, - percent, - lrate, shrink_info_str)) + logger.info("Iter: {0}/{1} Jobs: {2} " + "Epoch: {3:0.2f}/{4:0.1f} ({5:0.1f}% complete) " + "lr: {6:0.6f} {7}".format(iter, num_iters - 1, + current_num_jobs, + epoch, args.num_epochs, + percent, + lrate, shrink_info_str)) chain_lib.train_one_iteration( dir=args.dir, diff --git a/egs/wsj/s5/steps/nnet3/convert_nnet2_to_nnet3.py b/egs/wsj/s5/steps/nnet3/convert_nnet2_to_nnet3.py index 66ff633fbfc..edc2f7e4617 100755 --- a/egs/wsj/s5/steps/nnet3/convert_nnet2_to_nnet3.py +++ b/egs/wsj/s5/steps/nnet3/convert_nnet2_to_nnet3.py @@ -100,6 +100,7 @@ class Nnet3Model(object): def __init__(self): self.input_dim = -1 self.output_dim = -1 + self.ivector_dim = -1 self.counts = defaultdict(int) self.num_components = 0 self.components_read = 0 @@ -118,7 +119,10 @@ def add_component(self, component, pairs): Component = namedtuple("Component", "ident component pairs") if "" in pairs and self.input_dim == -1: - self.input_dim = pairs[""] + self.input_dim = int(pairs[""]) + + if "" in pairs and self.ivector_dim == -1: + self.ivector_dim = int(pairs[""]) # remove nnet2 specific tokens and catch descriptors if component == "" and "

" in pairs: @@ -159,13 +163,18 @@ def write_config(self, filename): config_string=config_string)) f.write("\n# Component nodes\n") - f.write("input-node name=input dim={0}\n".format(self.input_dim)) + if self.ivector_dim != -1: + f.write("input-node name=input dim={0}\n".format(self.input_dim-self.ivector_dim)) + f.write("input-node name=ivector dim={0}\n".format(self.ivector_dim)) + else: + f.write("input-node name=input dim={0}\n".format(self.input_dim)) previous_component = "input" for component in self.components: if component.ident == "splice": # Create splice string for the next node previous_component = make_splice_string(previous_component, - component.pairs[""]) + component.pairs[""], + component.pairs[""]) continue f.write("component-node name={name} component={name} " "input={inp}\n".format(name=component.ident, @@ -264,7 +273,7 @@ def parse_component(line, line_buffer): pairs = {} if component in SPLICE_COMPONENTS: - pairs = parse_splice_component(component, line, line_buffer) + line, pairs = parse_splice_component(component, line, line_buffer) elif component in AFFINE_COMPONENTS: pairs = parse_affine_component(component, line, line_buffer) elif component == "": @@ -335,7 +344,13 @@ def parse_splice_component(component, line, line_buffer): line = consume_token("", line) context = line.strip()[1:-1].split() - return {"" : input_dim, "" : context} + const_component_dim = 0 + line = next(line_buffer) # Context vector adds newline + line = consume_token("", line) + const_component_dim = int(line.strip().split()[0]) + + return line, {"" : input_dim, "" : context, + "" : const_component_dim} def parse_end_of_component(component, line, line_buffer): # Keeps reading until it hits the end tag for component @@ -422,7 +437,7 @@ def consume_token(token, line): return line.partition(token)[2] -def make_splice_string(nodename, context): +def make_splice_string(nodename, context, const_component_dim=0): """Generates splice string from a list of context. E.g. make_splice_string("renorm4", [-4, 4]) @@ -430,6 +445,8 @@ def make_splice_string(nodename, context): """ assert type(context) == list, "context argument must be a list" string = ["Offset({0}, {1})".format(nodename, i) for i in context] + if const_component_dim > 0: + string.append("ReplaceIndex(ivector, t, 0)") string = "Append(" + ", ".join(string) + ")" return string diff --git a/egs/wsj/s5/steps/nnet3/get_degs.sh b/egs/wsj/s5/steps/nnet3/get_degs.sh index 8098b59c4ad..7853daa4563 100755 --- a/egs/wsj/s5/steps/nnet3/get_degs.sh +++ b/egs/wsj/s5/steps/nnet3/get_degs.sh @@ -471,7 +471,6 @@ if [ $stage -le 10 ] && $cleanup; then fi -exit 0 - - echo "$0: Finished decoding and preparing training examples" + +exit 0 diff --git a/egs/wsj/s5/steps/nnet3/report/generate_plots.py b/egs/wsj/s5/steps/nnet3/report/generate_plots.py index 572e2cf08b7..d79db1604fd 100755 --- a/egs/wsj/s5/steps/nnet3/report/generate_plots.py +++ b/egs/wsj/s5/steps/nnet3/report/generate_plots.py @@ -23,76 +23,61 @@ import matplotlib.pyplot as plt import numpy as np from matplotlib.patches import Rectangle + # matplotlib issue https://github.com/matplotlib/matplotlib/issues/12513 + # plt.subplot() generates a false-positive warninig, suppress it for now. + from matplotlib.cbook import MatplotlibDeprecationWarning + warnings.filterwarnings('ignore', category=MatplotlibDeprecationWarning, + message='Adding an axes using the same arguments') g_plot = True except ImportError: - warnings.warn( - """This script requires matplotlib and numpy. 
- Please install them to generate plots. - Proceeding with generation of tables. - If you are on a cluster where you do not have admin rights you could - try using virtualenv.""") g_plot = False -logger = logging.getLogger('libs') -logger.setLevel(logging.INFO) -handler = logging.StreamHandler() -handler.setLevel(logging.INFO) -formatter = logging.Formatter("%(asctime)s [%(filename)s:%(lineno)s - " - "%(funcName)s - %(levelname)s ] %(message)s") -handler.setFormatter(formatter) -logger.addHandler(handler) -logger.info('Generating plots') +logging.basicConfig(format="%(filename)s:%(lineno)s:%(levelname)s:%(message)s", + level=logging.INFO) +logger = logging.getLogger(__name__) def get_args(): parser = argparse.ArgumentParser( - description="""Parses the training logs and generates a variety of - plots. - e.g. (deprecated): steps/nnet3/report/generate_plots.py - --comparison-dir exp/nnet3/tdnn1 --comparison-dir exp/nnet3/tdnn2 - exp/nnet3/tdnn exp/nnet3/tdnn/report - or (current): steps/nnet3/report/generate_plots.py - exp/nnet3/tdnn exp/nnet3/tdnn1 exp/nnet3/tdnn2 exp/nnet3/tdnn/report. - Look for the report.pdf in the output (report) directory.""") - - parser.add_argument("--comparison-dir", type=str, action='append', - help="other experiment directories for comparison. " - "These will only be used for plots, not tables" - "Note: this option is deprecated.") - parser.add_argument("--start-iter", type=int, - help="Iteration from which plotting will start", - default=1) - parser.add_argument("--is-chain", type=str, default=False, - action=common_lib.StrToBoolAction, - help="True if directory contains chain models") - parser.add_argument("--is-rnnlm", type=str, default=False, - action=common_lib.StrToBoolAction, - help="True if directory contains RNNLM.") - parser.add_argument("--output-nodes", type=str, default=None, + prog=sys.argv[0], # By default, prog is set this to filename only. + formatter_class=type('', (argparse.RawDescriptionHelpFormatter, + argparse.ArgumentDefaultsHelpFormatter), {}), + description="Parses the training logs and generates a variety of plots.\n" + "e.g.: %(prog)s \\\n" + " exp/nnet3/tdnn exp/nnet3/tdnn1 exp/nnet3/tdnn2 exp/nnet3/tdnn/report.\n" + "The report file 'report.pdf' will be generated in the directory.") + + parser.add_argument("--start-iter", type=int, metavar='N', default=1, + help="Iteration from which plotting will start.") + parser.add_argument("--is-chain", type=common_lib.str_to_bool, default='false', metavar='BOOL', + help="Set to 'true' if s contain chain models.") + parser.add_argument("--is-rnnlm", type=common_lib.str_to_bool, default='false', metavar='BOOL', + help="Set to 'true' if s contain RNNLM.") + parser.add_argument("--output-nodes", type=str, metavar='NODES', action=common_lib.NullstrToNoneAction, - help="""List of space separated - : entities, - one for each output node""") + help="List of space separated : entries, " + "one for each output node") + parser.add_argument("--comparison-dir", type=str, metavar='DIR', action='append', + help="[DEPRECATED] Experiment directories for comparison. " + "These will only be used for plots, not tables.") parser.add_argument("exp_dir", nargs='+', - help="the first dir is the experiment directory, " - "e.g. exp/nnet3/tdnn, the rest dirs (if exist) " - "are other experiment directories for comparison.") + help="The first is the current experiment directory, e.g. 
" + "'exp/nnet3/tdnn'; the rest are up to 6 optional directories of other " + "experiments to be graphed on same plots for comparison.") parser.add_argument("output_dir", - help="experiment directory, " - "e.g. exp/nnet3/tdnn/report") + help="output directory for reports, e.g. 'exp/nnet3/tdnn/report'") args = parser.parse_args() - if (args.comparison_dir is not None and len(args.comparison_dir) > 6) or \ - (args.exp_dir is not None and len(args.exp_dir) > 7): + if ((args.comparison_dir is not None and len(args.comparison_dir) > 6) or + (args.exp_dir is not None and len(args.exp_dir) > 7)): raise Exception( - """max 6 comparison directories can be specified. - If you want to compare with more comparison_dir, you would have to - carefully tune the plot_colors variable which specified colors used - for plotting.""") + "Up to 6 comparison directories may be specified. " + "If you want to compare with more experiments, you would have to carefully tune " + "the plot_colors variable which specified colors used for plotting.") assert args.start_iter >= 1 if args.is_chain and args.is_rnnlm: - raise Exception("""is_chain and is_rnnlm is not compatible.""") + raise Exception("Options --is-chain and --is-rnnlm cannot be both true.") return args @@ -104,23 +89,24 @@ class LatexReport(object): def __init__(self, pdf_file): self.pdf_file = pdf_file self.document = [] - self.document.append(""" + self.document.append(r""" \documentclass[prl,10pt,twocolumn]{revtex4} \usepackage{graphicx} % Used to import the graphics -\\begin{document} +\begin{document} """) def add_figure(self, figure_pdf, title): """we will have keep extending this replacement list based on errors during compilation escaping underscores in the title""" - title = "\\texttt{"+re.sub("_", "\_", title)+"}" - fig_latex = """ + + title = r"\texttt{"+re.sub("_", "\_", title)+"}" + fig_latex = r""" %... -\\newpage -\\begin{figure}[h] - \\begin{center} - \caption{""" + title + """} - \includegraphics[width=\\textwidth]{""" + figure_pdf + """} +\newpage +\begin{figure}[h] + \begin{center} + \caption{""" + title + r"""} + \includegraphics[width=\textwidth]{""" + figure_pdf + r"""} \end{center} \end{figure} \clearpage @@ -129,7 +115,7 @@ def add_figure(self, figure_pdf, title): self.document.append(fig_latex) def close(self): - self.document.append("\end{document}") + self.document.append(r"\end{document}") return self.compile() def compile(self): @@ -139,14 +125,15 @@ def compile(self): lat_file = open(latex_file, "w") lat_file.write("\n".join(self.document)) lat_file.close() - logger.info("Compiling the latex report.") + logger.info("Compiling the LaTeX report.") try: common_lib.execute_command( "pdflatex -interaction=batchmode " "-output-directory={0} {1}".format(dir_name, latex_file)) except Exception as e: - logger.warning("There was an error compiling the latex file {0}, " - "please do it manually: {1}".format(latex_file, e)) + logger.warning("There was an error compiling LaTeX file %s. " + "Check report.log generated by pdflatex in the same directory. %s", + latex_file, e) return False return True @@ -222,10 +209,11 @@ def generate_acc_logprob_plots(exp_dir, output_dir, plot, key='accuracy', # The name of five gates of lstmp g_lstm_gate = ['i_t_sigmoid', 'f_t_sigmoid', 'c_t_tanh', 'o_t_sigmoid', 'm_t_tanh'] -# The "extra" item looks like a placeholder. As each unit in python plot is +# The "extra" item is a placeholder. As each unit in python plot is # composed by a legend_handle(linestyle) and a legend_label(description). 
# For the unit which doesn't have linestyle, we use the "extra" placeholder. -extra = Rectangle((0, 0), 1, 1, facecolor="w", fill=False, edgecolor='none', linewidth=0) +if g_plot: + extra = Rectangle((0, 0), 1, 1, facecolor="w", fill=False, edgecolor='none', linewidth=0) # This function is used to insert a column to the legend, the column_index is 1-based def insert_a_column_legend(legend_handle, legend_label, lp, mp, hp, @@ -390,8 +378,7 @@ def generate_nonlin_stats_plots(exp_dir, output_dir, plot, comparison_dir=None, comp_data = stats_per_component_per_iter[component_name] comp_type = comp_data['type'] comp_stats = comp_data['stats'] - iters = comp_stats.keys() - iters.sort() + iters = sorted(comp_stats) iter_stats = [] for iter in iters: iter_stats.append([iter] + comp_stats[iter]) @@ -407,15 +394,16 @@ def generate_nonlin_stats_plots(exp_dir, output_dir, plot, comparison_dir=None, dir=output_dir, comp_name=component_name), "w") as f: if with_oderiv: # with oderiv-rms - f.write("Iteration\tValueMean\tValueStddev\tDerivMean\tDerivStddev\tOderivMean\tOderivStddev\t" - "Value_5th\tValue_50th\tValue_95th\t" - "Deriv_5th\tDeriv_50th\tDeriv_95th\t" - "Oderiv_5th\tOderiv_50th\tOderiv_95th\n") + f.write("Iteration\tValueMean\tValueStddev\tDerivMean\tDerivStddev\t" + "OderivMean\tOderivStddev\t" + "Value_5th\tValue_50th\tValue_95th\t" + "Deriv_5th\tDeriv_50th\tDeriv_95th\t" + "Oderiv_5th\tOderiv_50th\tOderiv_95th\n") else: # without oderiv-rms f.write("Iteration\tValueMean\tValueStddev\tDerivMean\tDerivStddev\t" - "Value_5th\tValue_50th\tValue_95th\t" - "Deriv_5th\tDeriv_50th\tDeriv_95th\n") + "Value_5th\tValue_50th\tValue_95th\t" + "Deriv_5th\tDeriv_50th\tDeriv_95th\n") iter_stat_report = [] iter_stats = main_stat_tables[component_name] for row in iter_stats: @@ -423,21 +411,18 @@ def generate_nonlin_stats_plots(exp_dir, output_dir, plot, comparison_dir=None, f.write("\n".join(iter_stat_report)) f.close() if plot: - main_component_names = list(main_stat_tables.keys()) - main_component_names.sort() - + main_component_names = sorted(main_stat_tables) plot_component_names = set(main_component_names) for dir in dirs: component_names = set(stats_per_dir[dir].keys()) plot_component_names = plot_component_names.intersection( component_names) - plot_component_names = list(plot_component_names) - plot_component_names.sort() + plot_component_names = sorted(plot_component_names) if plot_component_names != main_component_names: - logger.warning("""The components in all the neural networks in the - given experiment dirs are not the same, so comparison plots are - provided only for common component names. Make sure that these are - comparable experiments before analyzing these plots.""") + logger.warning("The components in all the neural networks in the " + "given experiment dirs are not the same, so comparison plots are " + "provided only for common component names. 
Make sure that these are " + "comparable experiments before analyzing these plots.") fig = plt.figure() @@ -510,9 +495,8 @@ def generate_clipped_proportion_plots(exp_dir, output_dir, plot, except log_parse.MalformedClippedProportionLineException as e: raise e except common_lib.KaldiCommandException as e: - warnings.warn("Could not extract the clipped proportions for {0}," - " this might be because there are no " - "ClipGradientComponents.".format(dir)) + logger.warning("Could not extract the clipped proportions for %s, " + "this might be because there are no ClipGradientComponents.", dir) continue if len(stats_per_dir[dir]) == 0: logger.warning("Couldn't find any rows for the" @@ -520,9 +504,8 @@ def generate_clipped_proportion_plots(exp_dir, output_dir, plot, try: main_cp_stats = stats_per_dir[exp_dir]['table'] except KeyError: - warnings.warn("The main experiment directory {0} does not have " - "clipped proportions. So not generating clipped " - "proportion plots.".format(exp_dir)) + logger.warning("The main experiment directory %s does not have clipped proportions. " + "Not generating clipped proportion plots.", exp_dir) return # this is the main experiment directory @@ -534,26 +517,22 @@ def generate_clipped_proportion_plots(exp_dir, output_dir, plot, file.close() if plot: - main_component_names = ( - list(stats_per_dir[exp_dir]['cp_per_iter_per_component'].keys())) - main_component_names.sort() + main_component_names = sorted(stats_per_dir[exp_dir]['cp_per_iter_per_component']) plot_component_names = set(main_component_names) for dir in dirs: try: - component_names = set( - stats_per_dir[dir]['cp_per_iter_per_component'].keys()) + component_names = set(stats_per_dir[dir]['cp_per_iter_per_component']) plot_component_names = ( plot_component_names.intersection(component_names)) except KeyError: continue - plot_component_names = list(plot_component_names) - plot_component_names.sort() + plot_component_names = sorted(plot_component_names) if plot_component_names != main_component_names: logger.warning( - """The components in all the neural networks in the given - experiment dirs are not the same, so comparison plots are - provided only for common component names. Make sure that these - are comparable experiments before analyzing these plots.""") + "The components in all the neural networks in the given " + "experiment dirs are not the same, so comparison plots are " + "provided only for common component names. Make sure that these " + "are comparable experiments before analyzing these plots.") fig = plt.figure() for component_name in main_component_names: @@ -638,32 +617,25 @@ def generate_parameter_diff_plots(exp_dir, output_dir, plot, iter_data.append("NA") if (float(total_missing_iterations)/len(component_names) > 20 and not gave_user_warning): - logger.warning("There are more than {0} missing " - "iterations per component. " - "Something might be wrong.".format( - float(total_missing_iterations)/ len(component_names))) + logger.warning("There are more than %.0f missing iterations per component. 
" + "Something might be wrong.", + float(total_missing_iterations)/ len(component_names)) gave_user_warning = True - f.write(" ".join(iter_data)+"\n") + f.write(" ".join(iter_data) + "\n") if plot: # get the component names diff_type = list(key_file.keys())[0] - main_component_names = list(stats_per_dir[exp_dir][diff_type][ - 'progress_per_component'].keys()) - main_component_names.sort() + main_component_names = sorted(stats_per_dir[exp_dir][diff_type]['progress_per_component']) plot_component_names = set(main_component_names) - for dir in dirs: try: - component_names = set(stats_per_dir[dir][diff_type][ - 'progress_per_component'].keys()) - plot_component_names = plot_component_names.intersection( - component_names) + component_names = set(stats_per_dir[dir][diff_type]['progress_per_component']) + plot_component_names = plot_component_names.intersection(component_names) except KeyError: continue - plot_component_names = list(plot_component_names) - plot_component_names.sort() + plot_component_names = sorted(plot_component_names) if plot_component_names != main_component_names: logger.warning("The components in all the neural networks in the " "given experiment dirs are not the same, " @@ -675,9 +647,8 @@ def generate_parameter_diff_plots(exp_dir, output_dir, plot, assert main_component_names fig = plt.figure() - logger.info("Generating parameter-difference plots for the " - "following components:{0}".format( - ', '.join(main_component_names))) + logger.info("Plotting parameter differences for components: " + + ", ".join(main_component_names)) for component_name in main_component_names: fig.clf() @@ -698,12 +669,9 @@ def generate_parameter_diff_plots(exp_dir, output_dir, plot, # this component is not available in this network so lets # not just plot it if dir == exp_dir: - raise Exception("No parameter differences were " - "available even in the main " - "experiment dir for the component " - "{0}. Something went wrong: " - "{1}.".format( - component_name, str(e))) + raise Exception("No parameter differences were available even in the main " + "experiment dir for the component {0}. 
Something went " + "wrong: {1}.".format(component_name, e)) continue ax = plt.subplot(211) mp, = ax.plot(iter_stats[0][:, 0], iter_stats[0][:, 1], @@ -755,35 +723,35 @@ def generate_plots(exp_dir, output_dir, output_names, comparison_dir=None, for (output_name, objective_type) in output_names: if objective_type == "linear": - logger.info("Generating accuracy plots") + logger.info("Generating accuracy plots for '%s'", output_name) generate_acc_logprob_plots( exp_dir, output_dir, g_plot, key='accuracy', file_basename='accuracy', comparison_dir=comparison_dir, start_iter=start_iter, latex_report=latex_report, output_name=output_name) - logger.info("Generating log-likelihood plots") + logger.info("Generating log-likelihood plots for '%s'", output_name) generate_acc_logprob_plots( exp_dir, output_dir, g_plot, key='log-likelihood', file_basename='loglikelihood', comparison_dir=comparison_dir, start_iter=start_iter, latex_report=latex_report, output_name=output_name) elif objective_type == "chain": - logger.info("Generating log-probability plots") + logger.info("Generating log-probability plots for '%s'", output_name) generate_acc_logprob_plots( exp_dir, output_dir, g_plot, key='log-probability', file_basename='log_probability', comparison_dir=comparison_dir, start_iter=start_iter, latex_report=latex_report, output_name=output_name) elif objective_type == "rnnlm_objective": - logger.info("Generating RNNLM objective plots") + logger.info("Generating RNNLM objective plots for '%s'", output_name) generate_acc_logprob_plots( exp_dir, output_dir, g_plot, key='rnnlm_objective', file_basename='objective', comparison_dir=comparison_dir, start_iter=start_iter, latex_report=latex_report, output_name=output_name) else: - logger.info("Generating " + objective_type + " objective plots") + logger.info("Generating %s objective plots for '%s'", objective_type, output_name) generate_acc_logprob_plots( exp_dir, output_dir, g_plot, key='objective', file_basename='objective', comparison_dir=comparison_dir, @@ -808,14 +776,19 @@ def generate_plots(exp_dir, output_dir, output_names, comparison_dir=None, if g_plot and latex_report is not None: has_compiled = latex_report.close() if has_compiled: - logger.info("Report has been generated. " - "You can find it at the location " - "{0}".format("{0}/report.pdf".format(output_dir))) + logger.info("Report file %s/report.pdf has been generated successfully.", output_dir) def main(): args = get_args() + if not g_plot: + logger.warning( + "This script requires matplotlib and numpy.\n" + "... Install these packages to generate plots.\n" + "... If you are on a cluster where you do not have admin rights, use venv.\n" + "... 
Generating text data table files only.") + output_nodes = [] if args.output_nodes is not None: diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index e72b29297a4..84817446b6e 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -117,8 +117,9 @@ def process_args(args): raise Exception("--trainer.rnn.num-chunk-per-minibatch has an invalid value") if (not os.path.exists(args.dir)): - raise Exception("This script expects --dir={0} to exist.") - if (not os.path.exists(args.dir+"/configs") and + raise Exception("Directory specified with --dir={0} " + "does not exist.".format(args.dir)) + if (not os.path.exists(args.dir + "/configs") and (args.input_model is None or not os.path.exists(args.input_model))): raise Exception("Either --trainer.input-model option should be supplied, " "and exist; or the {0}/configs directory should exist." @@ -321,9 +322,10 @@ def train(args, run_opts): if (args.exit_stage is not None) and (iter == args.exit_stage): logger.info("Exiting early due to --exit-stage {0}".format(iter)) return - current_num_jobs = int(0.5 + args.num_jobs_initial - + (args.num_jobs_final - args.num_jobs_initial) - * float(iter) / num_iters) + + current_num_jobs = common_train_lib.get_current_num_jobs( + iter, num_iters, + args.num_jobs_initial, args.num_jobs_step, args.num_jobs_final) if args.stage <= iter: lrate = common_train_lib.get_learning_rate(iter, current_num_jobs, @@ -344,12 +346,13 @@ def train(args, run_opts): shrink_info_str = '' if shrinkage_value != 1.0: shrink_info_str = 'shrink: {0:0.5f}'.format(shrinkage_value) - logger.info("Iter: {0}/{1} " - "Epoch: {2:0.2f}/{3:0.1f} ({4:0.1f}% complete) " - "lr: {5:0.6f} {6}".format(iter, num_iters - 1, - epoch, args.num_epochs, - percent, - lrate, shrink_info_str)) + logger.info("Iter: {0}/{1} Jobs: {2} " + "Epoch: {3:0.2f}/{4:0.1f} ({5:0.1f}% complete) " + "lr: {6:0.6f} {7}".format(iter, num_iters - 1, + current_num_jobs, + epoch, args.num_epochs, + percent, + lrate, shrink_info_str)) train_lib.common.train_one_iteration( dir=args.dir, diff --git a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py index ffccf443b99..af921048bb5 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py @@ -135,8 +135,9 @@ def process_args(args): raise Exception("--trainer.optimization.minibatch-size has an invalid value") if (not os.path.exists(args.dir)): - raise Exception("This script expects --dir={0} to exist.") - if (not os.path.exists(args.dir+"/configs") and + raise Exception("Directory specified with --dir={0} " + "does not exist.".format(args.dir)) + if (not os.path.exists(args.dir + "/configs") and (args.input_model is None or not os.path.exists(args.input_model))): raise Exception("Either --trainer.input-model option should be supplied, " "and exist; or the {0}/configs directory should exist." 
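The train_dnn.py and train_raw_dnn.py hunks above replace the inline job-count formula with a call to common_train_lib.get_current_num_jobs(). That helper's body is not part of this diff; the sketch below only illustrates the behaviour implied by the call sites, namely a linear ramp from args.num_jobs_initial to args.num_jobs_final quantized to args.num_jobs_step, and the exact rounding inside Kaldi's common_train_lib may differ.

```python
def get_current_num_jobs(iter_, num_iters, num_jobs_initial,
                         num_jobs_step, num_jobs_final):
    """Sketch: number of parallel jobs for iteration 'iter_', ramping
    linearly from num_jobs_initial to num_jobs_final and optionally
    rounded to a multiple of num_jobs_step."""
    ideal = (num_jobs_initial
             + (num_jobs_final - num_jobs_initial) * float(iter_) / num_iters)
    if num_jobs_step > 1:
        return num_jobs_step * int(0.5 + ideal / num_jobs_step)
    return int(0.5 + ideal)
```

With num_jobs_step equal to 1 this reduces to the removed expression int(0.5 + num_jobs_initial + (num_jobs_final - num_jobs_initial) * iter / num_iters).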
@@ -356,9 +357,10 @@ def train(args, run_opts): if (args.exit_stage is not None) and (iter == args.exit_stage): logger.info("Exiting early due to --exit-stage {0}".format(iter)) return - current_num_jobs = int(0.5 + args.num_jobs_initial - + (args.num_jobs_final - args.num_jobs_initial) - * float(iter) / num_iters) + + current_num_jobs = common_train_lib.get_current_num_jobs( + iter, num_iters, + args.num_jobs_initial, args.num_jobs_step, args.num_jobs_final) if args.stage <= iter: lrate = common_train_lib.get_learning_rate(iter, current_num_jobs, @@ -380,12 +382,13 @@ def train(args, run_opts): shrink_info_str = '' if shrinkage_value != 1.0: shrink_info_str = 'shrink: {0:0.5f}'.format(shrinkage_value) - logger.info("Iter: {0}/{1} " - "Epoch: {2:0.2f}/{3:0.1f} ({4:0.1f}% complete) " - "lr: {5:0.6f} {6}".format(iter, num_iters - 1, - epoch, args.num_epochs, - percent, - lrate, shrink_info_str)) + logger.info("Iter: {0}/{1} Jobs: {2} " + "Epoch: {3:0.2f}/{4:0.1f} ({5:0.1f}% complete) " + "lr: {6:0.6f} {7}".format(iter, num_iters - 1, + current_num_jobs, + epoch, args.num_epochs, + percent, + lrate, shrink_info_str)) train_lib.common.train_one_iteration( dir=args.dir, diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py index c704b0725d3..b2d55ac20e7 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -181,8 +181,9 @@ def process_args(args): raise Exception("--egs.chunk-right-context should be non-negative") if (not os.path.exists(args.dir)): - raise Exception("This script expects --dir={0} to exist.") - if (not os.path.exists(args.dir+"/configs") and + raise Exception("Directory specified with --dir={0} " + "does not exist.".format(args.dir)) + if (not os.path.exists(args.dir + "/configs") and (args.input_model is None or not os.path.exists(args.input_model))): raise Exception("Either --trainer.input-model option should be supplied, " "and exist; or the {0}/configs directory should exist." 
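The new "Jobs: {2}" field in the per-iteration log line is worth surfacing because the learning rate handed to each iteration depends on the current number of jobs: the models produced by the parallel jobs are averaged, so the per-job rate is scaled to keep the effective learning rate on schedule. The following is a hedged sketch of the kind of schedule common_train_lib.get_learning_rate() implements (exponential decay of the effective rate over the archives processed, multiplied by the job count); the argument names are illustrative and the authoritative formula lives in common_train_lib.

```python
import math

def get_learning_rate(iter_, num_jobs, num_iters,
                      num_archives_processed, num_archives_to_process,
                      initial_effective_lrate, final_effective_lrate):
    # Effective learning rate decays exponentially from the initial to the
    # final value as training data is consumed; the value given to each job
    # is multiplied by num_jobs because the per-job models are averaged at
    # the end of the iteration.
    if iter_ + 1 >= num_iters:
        effective = final_effective_lrate
    else:
        effective = initial_effective_lrate * math.exp(
            num_archives_processed
            * math.log(final_effective_lrate / initial_effective_lrate)
            / num_archives_to_process)
    return num_jobs * effective
```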
@@ -411,9 +412,10 @@ def train(args, run_opts): if (args.exit_stage is not None) and (iter == args.exit_stage): logger.info("Exiting early due to --exit-stage {0}".format(iter)) return - current_num_jobs = int(0.5 + args.num_jobs_initial - + (args.num_jobs_final - args.num_jobs_initial) - * float(iter) / num_iters) + + current_num_jobs = common_train_lib.get_current_num_jobs( + iter, num_iters, + args.num_jobs_initial, args.num_jobs_step, args.num_jobs_final) if args.stage <= iter: model_file = "{dir}/{iter}.raw".format(dir=args.dir, iter=iter) @@ -445,12 +447,13 @@ def train(args, run_opts): shrink_info_str = '' if shrinkage_value != 1.0: shrink_info_str = 'shrink: {0:0.5f}'.format(shrinkage_value) - logger.info("Iter: {0}/{1} " - "Epoch: {2:0.2f}/{3:0.1f} ({4:0.1f}% complete) " - "lr: {5:0.6f} {6}".format(iter, num_iters - 1, - epoch, args.num_epochs, - percent, - lrate, shrink_info_str)) + logger.info("Iter: {0}/{1} Jobs: {2} " + "Epoch: {3:0.2f}/{4:0.1f} ({5:0.1f}% complete) " + "lr: {6:0.6f} {7}".format(iter, num_iters - 1, + current_num_jobs, + epoch, args.num_epochs, + percent, + lrate, shrink_info_str)) train_lib.common.train_one_iteration( dir=args.dir, diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index ab2aa0c4d8d..6ed7197c22b 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -172,12 +172,12 @@ def process_args(args): raise Exception("--egs.chunk-right-context should be non-negative") if (not os.path.exists(args.dir)): - raise Exception("This script expects --dir={0} to exist.") - - if (not os.path.exists(args.dir+"/configs") and + raise Exception("Directory specified with --dir={0} " + "does not exist.".format(args.dir)) + if (not os.path.exists(args.dir + "/configs") and (args.input_model is None or not os.path.exists(args.input_model))): raise Exception("Either --trainer.input-model option should be supplied, " - "and exist; or the {0}/configs directory should exist." + "and exist; or the {0}/configs directory should exist. 
" "{0}/configs is the output of make_configs.py" "".format(args.dir)) @@ -396,9 +396,10 @@ def train(args, run_opts): if (args.exit_stage is not None) and (iter == args.exit_stage): logger.info("Exiting early due to --exit-stage {0}".format(iter)) return - current_num_jobs = int(0.5 + args.num_jobs_initial - + (args.num_jobs_final - args.num_jobs_initial) - * float(iter) / num_iters) + + current_num_jobs = common_train_lib.get_current_num_jobs( + iter, num_iters, + args.num_jobs_initial, args.num_jobs_step, args.num_jobs_final) if args.stage <= iter: model_file = "{dir}/{iter}.mdl".format(dir=args.dir, iter=iter) @@ -428,12 +429,13 @@ def train(args, run_opts): shrink_info_str = '' if shrinkage_value != 1.0: shrink_info_str = 'shrink: {0:0.5f}'.format(shrinkage_value) - logger.info("Iter: {0}/{1} " - "Epoch: {2:0.2f}/{3:0.1f} ({4:0.1f}% complete) " - "lr: {5:0.6f} {6}".format(iter, num_iters - 1, - epoch, args.num_epochs, - percent, - lrate, shrink_info_str)) + logger.info("Iter: {0}/{1} Jobs: {2} " + "Epoch: {3:0.2f}/{4:0.1f} ({5:0.1f}% complete) " + "lr: {6:0.6f} {7}".format(iter, num_iters - 1, + current_num_jobs, + epoch, args.num_epochs, + percent, + lrate, shrink_info_str)) train_lib.common.train_one_iteration( dir=args.dir, diff --git a/egs/wsj/s5/steps/nnet3/xconfig_to_config.py b/egs/wsj/s5/steps/nnet3/xconfig_to_config.py new file mode 100755 index 00000000000..952745cea9f --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/xconfig_to_config.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python3 + +# Copyright 2016-2018 Johns Hopkins University (Dan Povey) +# 2016 Vijayaditya Peddinti +# 2017 Google Inc. (vpeddinti@google.com) +# Apache 2.0. + +# This is like xconfig_to_configs.py but with a simpler interface; it writes +# to a single named file. + + +import argparse +import os +import sys +from collections import defaultdict + +sys.path.insert(0, 'steps/') +# the following is in case we weren't running this from the normal directory. +sys.path.insert(0, os.path.realpath(os.path.dirname(sys.argv[0])) + '/') + +import libs.nnet3.xconfig.parser as xparser +import libs.common as common_lib + + +def get_args(): + # we add compulsory arguments as named arguments for readability + parser = argparse.ArgumentParser( + description="Reads an xconfig file and creates config files " + "for neural net creation and training", + epilog='Search egs/*/*/local/{nnet3,chain}/*sh for examples') + parser.add_argument('--xconfig-file', required=True, + help='Filename of input xconfig file') + parser.add_argument('--existing-model', + help='Filename of previously trained neural net ' + '(e.g. final.mdl) which is useful in case of ' + 'using nodes from list of component-nodes in ' + 'already trained model ' + 'to generate new config file for new model.' + 'The context info is also generated using ' + 'a model generated by adding final.config ' + 'to the existing model.' + 'e.g. In Transfer learning: generate new model using ' + 'component nodes in existing model.') + parser.add_argument('--config-file-out', required=True, + help='Filename to write nnet config file.'); + parser.add_argument('--nnet-edits', type=str, default=None, + action=common_lib.NullstrToNoneAction, + help="""This option is useful in case the network you + are creating does not have an output node called + 'output' (e.g. for multilingual setups). You can set + this to an edit-string like: 'rename-node old-name=xxx + new-name=output' if node xxx plays the role of the + output node in this network. 
This is only used for + computing the left/right context.""") + + print(' '.join(sys.argv), file=sys.stderr) + + args = parser.parse_args() + + return args + + + +def write_config_file(config_file_out, all_layers): + # config_basename_to_lines is map from the basename of the + # config, as a string (i.e. 'ref', 'all', 'init') to a list of + # strings representing lines to put in the config file. + config_basename_to_lines = defaultdict(list) + + for layer in all_layers: + try: + pairs = layer.get_full_config() + for config_basename, line in pairs: + config_basename_to_lines[config_basename].append(line) + except Exception as e: + print("{0}: error producing config lines from xconfig " + "line '{1}': error was: {2}".format(sys.argv[0], + str(layer), repr(e)), + file=sys.stderr) + # we use raise rather than raise(e) as using a blank raise + # preserves the backtrace + raise + + with open(config_file_out, 'w') as f: + print('# This file was created by the command:\n' + '# {0} '.format(sys.argv), file=f) + lines = config_basename_to_lines['final'] + for line in lines: + print(line, file=f) + + +def main(): + args = get_args() + existing_layers = [] + if args.existing_model is not None: + existing_layers = xparser.get_model_component_info(args.existing_model) + all_layers = xparser.read_xconfig_file(args.xconfig_file, existing_layers) + write_config_file(args.config_file_out, all_layers) + + +if __name__ == '__main__': + main() + + +# test: +# (echo 'input dim=40 name=input'; echo 'output name=output input=Append(-1,0,1)') >xconfig; steps/nnet3/xconfig_to_config.py --xconfig-file=xconfig --config-file-out=foo diff --git a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py index f025eb5b343..4fb7ec63afd 100755 --- a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py +++ b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py @@ -243,15 +243,15 @@ def add_nnet_context_info(config_dir, nnet_edits=None, if nnet_edits is not None: model = "nnet3-copy --edits='{0}' {1} - |".format(nnet_edits, model) - out = common_lib.get_command_stdout('nnet3-info "{0}" | head -n 4 ' - .format(model)) + out = common_lib.get_command_stdout('nnet3-info "{0}"'.format(model)) # out looks like this # left-context: 7 # right-context: 0 # num-parameters: 90543902 # modulus: 1 + # ... info = {} - for line in out.split("\n"): + for line in out.split("\n")[:4]: # take 4 initial lines, parts = line.split(":") if len(parts) != 2: continue @@ -277,17 +277,17 @@ def check_model_contexts(config_dir, nnet_edits=None, existing_model=None): None else '', config_dir, file_name)) model = "{0}/{1}.raw".format(config_dir, file_name) - if nnet_edits is not None: + if nnet_edits is not None and file_name != 'init': model = "nnet3-copy --edits='{0}' {1} - |".format(nnet_edits, model) - out = common_lib.get_command_stdout('nnet3-info "{0}" | head -n 4 ' - .format(model)) + out = common_lib.get_command_stdout('nnet3-info "{0}"'.format(model)) # out looks like this # left-context: 7 # right-context: 0 # num-parameters: 90543902 # modulus: 1 - for line in out.split("\n"): + # ... + for line in out.split("\n")[:4]: # take 4 initial lines, parts = line.split(":") if len(parts) != 2: continue diff --git a/egs/wsj/s5/steps/online/nnet2/extract_ivectors.sh b/egs/wsj/s5/steps/online/nnet2/extract_ivectors.sh index a423be7aa20..858dd4b6730 100755 --- a/egs/wsj/s5/steps/online/nnet2/extract_ivectors.sh +++ b/egs/wsj/s5/steps/online/nnet2/extract_ivectors.sh @@ -64,6 +64,10 @@ if [ -f path.sh ]; then . 
./path.sh; fi if [ $# != 4 ] && [ $# != 5 ]; then echo "Usage: $0 [options] [||] " echo " e.g.: $0 data/test data/lang exp/nnet2_online/extractor exp/tri3/decode_test exp/nnet2_online/ivectors_test" + echo "If is provided, it is converted to frame-weights " + echo "giving silence frames a weight of --silence-weight (default: 0.0). " + echo "If is provided, it must be a single archive file compressed " + echo "(using gunzip) containing per-frame weights for each utterance." echo "main options (for others, see top of script file)" echo " --config # config containing options" echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." @@ -90,7 +94,7 @@ else # 5 arguments data=$1 lang=$2 srcdir=$3 - ali_or_decode_dir=$4 + ali_or_decode_dir_or_weights=$4 dir=$5 fi @@ -102,23 +106,23 @@ done mkdir -p $dir/log silphonelist=$(cat $lang/phones/silence.csl) || exit 1; -if [ ! -z "$ali_or_decode_dir" ]; then +if [ ! -z "$ali_or_decode_dir_or_weights" ]; then - if [ -f $ali_or_decode_dir/ali.1.gz ]; then - if [ ! -f $ali_or_decode_dir/${mdl}.mdl ]; then - echo "$0: expected $ali_or_decode_dir/${mdl}.mdl to exist." + if [ -f $ali_or_decode_dir_or_weights/ali.1.gz ]; then + if [ ! -f $ali_or_decode_dir_or_weights/${mdl}.mdl ]; then + echo "$0: expected $ali_or_decode_dir_or_weights/${mdl}.mdl to exist." exit 1; fi - nj_orig=$(cat $ali_or_decode_dir/num_jobs) || exit 1; + nj_orig=$(cat $ali_or_decode_dir_or_weights/num_jobs) || exit 1; if [ $stage -le 0 ]; then rm $dir/weights.*.gz 2>/dev/null $cmd JOB=1:$nj_orig $dir/log/ali_to_post.JOB.log \ - gunzip -c $ali_or_decode_dir/ali.JOB.gz \| \ + gunzip -c $ali_or_decode_dir_or_weights/ali.JOB.gz \| \ ali-to-post ark:- ark:- \| \ - weight-silence-post $silence_weight $silphonelist $ali_or_decode_dir/final.mdl ark:- ark:- \| \ + weight-silence-post $silence_weight $silphonelist $ali_or_decode_dir_or_weights/final.mdl ark:- ark:- \| \ post-to-weights ark:- "ark:|gzip -c >$dir/weights.JOB.gz" || exit 1; # put all the weights in one archive. @@ -126,10 +130,10 @@ if [ ! -z "$ali_or_decode_dir" ]; then rm $dir/weights.*.gz || exit 1; fi - elif [ -f $ali_or_decode_dir/lat.1.gz ]; then - nj_orig=$(cat $ali_or_decode_dir/num_jobs) || exit 1; - if [ ! -f $ali_or_decode_dir/../${mdl}.mdl ]; then - echo "$0: expected $ali_or_decode_dir/../${mdl}.mdl to exist." + elif [ -f $ali_or_decode_dir_or_weights/lat.1.gz ]; then + nj_orig=$(cat $ali_or_decode_dir_or_weights/num_jobs) || exit 1; + if [ ! -f $ali_or_decode_dir_or_weights/../${mdl}.mdl ]; then + echo "$0: expected $ali_or_decode_dir_or_weights/../${mdl}.mdl to exist." exit 1; fi @@ -138,19 +142,19 @@ if [ ! -z "$ali_or_decode_dir" ]; then rm $dir/weights.*.gz 2>/dev/null $cmd JOB=1:$nj_orig $dir/log/lat_to_post.JOB.log \ - lattice-best-path --acoustic-scale=$acwt "ark:gunzip -c $ali_or_decode_dir/lat.JOB.gz|" ark:/dev/null ark:- \| \ + lattice-best-path --acoustic-scale=$acwt "ark:gunzip -c $ali_or_decode_dir_or_weights/lat.JOB.gz|" ark:/dev/null ark:- \| \ ali-to-post ark:- ark:- \| \ - weight-silence-post $silence_weight $silphonelist $ali_or_decode_dir/../${mdl}.mdl ark:- ark:- \| \ + weight-silence-post $silence_weight $silphonelist $ali_or_decode_dir_or_weights/../${mdl}.mdl ark:- ark:- \| \ post-to-weights ark:- "ark:|gzip -c >$dir/weights.JOB.gz" || exit 1; # put all the weights in one archive. 
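For context on the renamed ali_or_decode_dir_or_weights argument: whichever form the fourth argument takes, the script reduces it to a single weights.gz archive holding one weight per frame per utterance, with silence frames scaled down to --silence-weight before the i-vector statistics are accumulated. The snippet below is only a conceptual illustration of what one of those weight vectors contains; the real pipeline (ali-to-post | weight-silence-post | post-to-weights) operates on transition-ids and posteriors rather than phone labels, and the function name here is invented for the example.

```python
def frame_weights(frame_phones, silence_phones, silence_weight=0.0):
    """Given a per-frame phone sequence for one utterance and the set of
    silence phones (cf. $lang/phones/silence.csl), return per-frame
    weights of the kind stored in weights.gz: silence frames get
    'silence_weight', speech frames get 1.0."""
    return [silence_weight if p in silence_phones else 1.0
            for p in frame_phones]

# e.g. with SIL as the only silence phone:
# frame_weights(["SIL", "SIL", "ah", "t", "SIL"], {"SIL"})
#   -> [0.0, 0.0, 1.0, 1.0, 0.0]
```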
for j in $(seq $nj_orig); do gunzip -c $dir/weights.$j.gz; done | gzip -c >$dir/weights.gz || exit 1; rm $dir/weights.*.gz || exit 1; fi - elif [ -f $ali_or_decode_dir ] && gunzip -c $ali_or_decode_dir >/dev/null; then - cp $ali_or_decode_dir $dir/weights.gz || exit 1; + elif [ -f $ali_or_decode_dir_or_weights ] && gunzip -c $ali_or_decode_dir_or_weights >/dev/null; then + cp $ali_or_decode_dir_or_weights $dir/weights.gz || exit 1; else - echo "$0: expected ali.1.gz or lat.1.gz to exist in $ali_or_decode_dir"; + echo "$0: expected ali.1.gz or lat.1.gz to exist in $ali_or_decode_dir_or_weights"; exit 1; fi fi @@ -169,7 +173,7 @@ if [ $sub_speaker_frames -gt 0 ]; then if [ $stage -le 1 ]; then # We work out 'fake' spk2utt files that possibly split each speaker into multiple pieces. - if [ ! -z "$ali_or_decode_dir" ]; then + if [ ! -z "$ali_or_decode_dir_or_weights" ]; then gunzip -c $dir/weights.gz | copy-vector ark:- ark,t:- | \ awk '{ sum=0; for (n=3;n $dir/utt_counts || exit 1; else @@ -230,7 +234,7 @@ else fi if [ $stage -le 2 ]; then - if [ ! -z "$ali_or_decode_dir" ]; then + if [ ! -z "$ali_or_decode_dir_or_weights" ]; then $cmd --num-threads $num_threads JOB=1:$nj $dir/log/extract_ivectors.JOB.log \ gmm-global-get-post --n=$num_gselect --min-post=$min_post $srcdir/final.dubm "$gmm_feats" ark:- \| \ weight-post ark:- "ark,s,cs:gunzip -c $dir/weights.gz|" ark:- \| \ diff --git a/egs/wsj/s5/steps/segmentation/ali_to_targets.sh b/egs/wsj/s5/steps/segmentation/ali_to_targets.sh index 78c76a8ea01..56d93df3c6b 100644 --- a/egs/wsj/s5/steps/segmentation/ali_to_targets.sh +++ b/egs/wsj/s5/steps/segmentation/ali_to_targets.sh @@ -82,9 +82,9 @@ nj=$(cat $ali_dir/num_jobs) || exit 1 $cmd JOB=1:$nj $dir/log/get_arc_info.JOB.log \ ali-to-phones --ctm-output --frame-shift=1 \ - $srcdir/final.mdl "ark:gunzip -c $ali_dir/lat.JOB.gz |" - \| \ + $srcdir/final.mdl "ark:gunzip -c $ali_dir/ali.JOB.gz |" - \| \ utils/int2sym.pl -f 5 $lang/phones.txt \| \ - awk '{print $1" "int($3)" "int($4)" 1.0 "$5}' \| \ + awk '{print $1" "int($3)" "int($4)" 1.0 "$5}' \> \ $dir/arc_info_sym.JOB.txt || exit 1 # make $dir an absolute pathname. diff --git a/egs/wsj/s5/steps/segmentation/internal/merge_targets.py b/egs/wsj/s5/steps/segmentation/internal/merge_targets.py index a14aef151c2..84b0c884f45 100755 --- a/egs/wsj/s5/steps/segmentation/internal/merge_targets.py +++ b/egs/wsj/s5/steps/segmentation/internal/merge_targets.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # Copyright 2017 Vimal Manohar # Apache 2.0 @@ -16,8 +16,6 @@ option. """ -from __future__ import print_function -from __future__ import division import argparse import logging import numpy as np @@ -111,7 +109,7 @@ def should_remove_frame(row, dim): # source[2] = [ 0 0 0 ] """ assert len(row) % dim == 0 - num_sources = len(row) / dim + num_sources = len(row) // dim max_idx = np.argmax(row) max_val = row[max_idx] diff --git a/egs/wsj/s5/steps/train_mono.sh b/egs/wsj/s5/steps/train_mono.sh index 141d128c329..5a0b79a4a1c 100755 --- a/egs/wsj/s5/steps/train_mono.sh +++ b/egs/wsj/s5/steps/train_mono.sh @@ -1,5 +1,6 @@ #!/bin/bash # Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# 2019 Xiaohui Zhang # Apache 2.0 @@ -13,6 +14,9 @@ cmd=run.pl scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" num_iters=40 # Number of iterations of training max_iter_inc=30 # Last iter to increase #Gauss on. 
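The merge_targets.py hunk above changes len(row) / dim to len(row) // dim as part of dropping Python 2 support. The distinction matters because true division returns a float under Python 3 and num_sources is used as a count; a minimal illustration:

```python
# Under Python 2, len(row) / dim was floor division; under Python 3 it
# returns a float, which breaks anything using num_sources as a count or
# range() bound.  '//' restores the old integer behaviour.
row = [0.0, 1.0, 0.0, 0.6, 0.4, 0.0]      # two sources, dim = 3
dim = 3
assert len(row) // dim == 2               # int, usable as range(2)
assert isinstance(len(row) / dim, float)  # 2.0 under Python 3
```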
+initial_beam=6 # beam used in the first iteration (set smaller to speed up initialization) +regular_beam=10 # beam used after the first iteration +retry_beam=40 totgauss=1000 # Target #Gaussians. careful=false boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment @@ -105,8 +109,7 @@ if [ $stage -le 0 ]; then rm $dir/0.*.acc fi - -beam=6 # will change to 10 below after 1st pass +beam=$initial_beam # will change to regular_beam below after 1st pass # note: using slightly wider beams for WSJ vs. RM. x=1 while [ $x -lt $num_iters ]; do @@ -116,7 +119,7 @@ while [ $x -lt $num_iters ]; do echo "$0: Aligning data" mdl="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $dir/$x.mdl - |" $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \ - gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$[$beam*4] --careful=$careful "$mdl" \ + gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam --careful=$careful "$mdl" \ "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" "ark,t:|gzip -c >$dir/ali.JOB.gz" \ || exit 1; fi @@ -132,7 +135,7 @@ while [ $x -lt $num_iters ]; do if [ $x -le $max_iter_inc ]; then numgauss=$[$numgauss+$incgauss]; fi - beam=10 + beam=$regular_beam x=$[$x+1] done diff --git a/egs/wsj/s5/utils/add_lex_disambig.pl b/egs/wsj/s5/utils/add_lex_disambig.pl index dd8a25de6e1..c4277e8dc06 100755 --- a/egs/wsj/s5/utils/add_lex_disambig.pl +++ b/egs/wsj/s5/utils/add_lex_disambig.pl @@ -122,6 +122,7 @@ if ($sil_probs) { shift @A; # Remove silprob shift @A; # Remove silprob + shift @A; # Remove silprob, there three numbers for sil_probs } while(@A > 0) { pop @A; # Remove last phone diff --git a/egs/wsj/s5/utils/combine_data.sh b/egs/wsj/s5/utils/combine_data.sh index a43cf9d77f3..8daffcea8c5 100755 --- a/egs/wsj/s5/utils/combine_data.sh +++ b/egs/wsj/s5/utils/combine_data.sh @@ -42,6 +42,20 @@ for dir in $*; do fi done +# Check that frame_shift are compatible, where present together with features. +dir_with_frame_shift= +for dir in $*; do + if [[ -f $dir/feats.scp && -f $dir/frame_shift ]]; then + if [[ $dir_with_frame_shift ]] && + ! cmp -s $dir_with_frame_shift/frame_shift $dir/frame_shift; then + echo "$0:error: different frame_shift in directories $dir and " \ + "$dir_with_frame_shift. Cannot combine features." + exit 1; + fi + dir_with_frame_shift=$dir + fi +done + # W.r.t. utt2uniq file the script has different behavior compared to other files # it is not compulsary for it to exist in src directories, but if it exists in # even one it should exist in all. We will create the files where necessary @@ -94,7 +108,7 @@ else echo "$0 [info]: not combining segments as it does not exist" fi -for file in utt2spk utt2lang utt2dur reco2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do +for file in utt2spk utt2lang utt2dur utt2num_frames reco2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do exists_somewhere=false absent_somewhere=false for d in $*; do @@ -121,6 +135,10 @@ done utils/utt2spk_to_spk2utt.pl <$dest/utt2spk >$dest/spk2utt +if [[ $dir_with_frame_shift ]]; then + cp $dir_with_frame_shift/frame_shift $dest +fi + if ! 
$skip_fix ; then utils/fix_data_dir.sh $dest || exit 1; fi diff --git a/egs/wsj/s5/utils/copy_data_dir.sh b/egs/wsj/s5/utils/copy_data_dir.sh index f3b885c5e79..fbd31203e34 100755 --- a/egs/wsj/s5/utils/copy_data_dir.sh +++ b/egs/wsj/s5/utils/copy_data_dir.sh @@ -103,6 +103,9 @@ fi if [ -f $srcdir/utt2dur ]; then utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2dur >$destdir/utt2dur fi +if [ -f $srcdir/utt2num_frames ]; then + utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2num_frames >$destdir/utt2num_frames +fi if [ -f $srcdir/reco2dur ]; then if [ -f $srcdir/segments ]; then cp $srcdir/reco2dur $destdir/reco2dur @@ -116,7 +119,7 @@ fi if [ -f $srcdir/cmvn.scp ]; then utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/cmvn.scp >$destdir/cmvn.scp fi -for f in stm glm ctm; do +for f in frame_shift stm glm ctm; do if [ -f $srcdir/$f ]; then cp $srcdir/$f $destdir fi @@ -126,7 +129,7 @@ rm $destdir/spk_map $destdir/utt_map echo "$0: copied data from $srcdir to $destdir" -for f in feats.scp cmvn.scp vad.scp utt2lang utt2uniq utt2dur utt2num_frames text wav.scp reco2file_and_channel stm glm ctm; do +for f in feats.scp cmvn.scp vad.scp utt2lang utt2uniq utt2dur utt2num_frames text wav.scp reco2file_and_channel frame_shift stm glm ctm; do if [ -f $destdir/$f ] && [ ! -f $srcdir/$f ]; then echo "$0: file $f exists in dest $destdir but not in src $srcdir. Moving it to" echo " ... $destdir/.backup/$f" diff --git a/egs/wsj/s5/utils/ctm/resolve_ctm_overlaps.py b/egs/wsj/s5/utils/ctm/resolve_ctm_overlaps.py index 61c9a3014aa..4463bc9fcf0 100755 --- a/egs/wsj/s5/utils/ctm/resolve_ctm_overlaps.py +++ b/egs/wsj/s5/utils/ctm/resolve_ctm_overlaps.py @@ -38,7 +38,8 @@ def get_args(): """gets command line arguments""" - usage = """ Python script to resolve overlaps in ctms """ + usage = """ Python script to resolve overlaps in ctms. May be used with + utils/data/subsegment_data_dir.sh. """ parser = argparse.ArgumentParser(usage) parser.add_argument('segments', type=argparse.FileType('r'), help='use segments to resolve overlaps') diff --git a/egs/wsj/s5/utils/data/get_frame_shift.sh b/egs/wsj/s5/utils/data/get_frame_shift.sh index eaf21b37ea6..c836bde1b18 100755 --- a/egs/wsj/s5/utils/data/get_frame_shift.sh +++ b/egs/wsj/s5/utils/data/get_frame_shift.sh @@ -14,12 +14,16 @@ . ./path.sh if [ $# != 1 ]; then - echo "Usage: $0 " - echo "e.g.:" - echo " $0 data/train" - echo "This script prints the frame-shift (e.g. 0.01) to the standard out." - echo "If does not contain utt2dur, this script may call utils/data/get_utt2dur.sh," - echo "which will require write permission to " + cat >&2 <) +e.g.: frame_shift=\$($0 data/train) + +This script prints the frame-shift in seconds (e.g. 0.01) to the standard out. +Its output is intended to be captured in a shell variable. + +If does not contain the file utt2dur, this script may invoke +utils/data/get_utt2dur.sh, which will require write permission to . +EOF exit 1 fi @@ -27,6 +31,15 @@ export LC_ALL=C dir=$1 +if [[ -s $dir/frame_shift ]]; then + cat $dir/frame_shift + exit +fi + +if [ ! -f $dir/feats.scp ]; then + echo "$0: $dir/feats.scp does not exist" 1>&2 + exit 1 +fi if [ ! -s $dir/utt2dur ]; then if [ ! -e $dir/wav.scp ] && [ ! -s $dir/segments ]; then @@ -35,37 +48,27 @@ if [ ! -s $dir/utt2dur ]; then exit 0 fi echo "$0: $dir/utt2dur does not exist: creating it" 1>&2 - utils/data/get_utt2dur.sh $dir 1>&2 + utils/data/get_utt2dur.sh 1>&2 $dir || exit 1 fi -if [ ! -s $dir/frame_shift ]; then - if [ ! 
-f $dir/feats.scp ]; then - echo "$0: $dir/feats.scp does not exist" 1>&2 - exit 1 - fi - - temp=$(mktemp /tmp/tmp.XXXX) +temp=$(mktemp /tmp/tmp.XXXX) || exit 1 - feat-to-len "scp:head -n 10 $dir/feats.scp|" ark,t:- > $temp +feat-to-len --print-args=false "scp:head -n 10 $dir/feats.scp|" ark,t:- > $temp - if [ -z $temp ]; then - echo "$0: error running feat-to-len" 1>&2 - exit 1 - fi - - frame_shift=$(head -n 10 $dir/utt2dur | paste - $temp | \ - awk '{ dur += $2; frames += $4; } END { shift = dur / frames; if (shift > 0.01 && shift < 0.0102) shift = 0.01; print shift; }') || exit 1; - - echo $frame_shift > $dir/frame_shift +if [[ ! -s $temp ]]; then rm $temp -fi - -frame_shift=$(cat $dir/frame_shift) -if [ -z "$frame_shift" ]; then - echo "$0: Could not read get frame shift from directory $dir" 1>&2 + echo "$0: error running feat-to-len" 1>&2 exit 1 fi -echo $frame_shift +frame_shift=$(head -n 10 $dir/utt2dur | paste - $temp | awk ' + { dur += $2; frames += $4; } + END { shift = dur / frames; + if (shift > 0.01 && shift < 0.0102) shift = 0.01; + print shift; }') || exit 1; +rm $temp + +echo $frame_shift > $dir/frame_shift +echo $frame_shift exit 0 diff --git a/egs/wsj/s5/utils/data/get_uniform_subsegments.py b/egs/wsj/s5/utils/data/get_uniform_subsegments.py index cc3015564a5..a963f35fa70 100755 --- a/egs/wsj/s5/utils/data/get_uniform_subsegments.py +++ b/egs/wsj/s5/utils/data/get_uniform_subsegments.py @@ -73,7 +73,7 @@ def run(args): new_utt = "{utt_id}-{s:08d}-{e:08d}".format( utt_id=utt_id, s=int(100 * start_relative), e=int(100 * end_relative)) - print ("{new_utt} {utt_id} {s} {e}".format( + print ("{new_utt} {utt_id} {s:.3f} {e:.3f}".format( new_utt=new_utt, utt_id=utt_id, s=start_relative, e=start_relative + args.max_segment_duration)) start += args.max_segment_duration - args.overlap_duration @@ -90,7 +90,7 @@ def run(args): new_utt = "{utt_id}-{s:08d}-{e:08d}".format( utt_id=utt_id, s=int(round(100 * (start - start_time))), e=int(round(100 * (end - start_time)))) - print ("{new_utt} {utt_id} {s} {e}".format( + print ("{new_utt} {utt_id} {s:.3f} {e:.3f}".format( new_utt=new_utt, utt_id=utt_id, s=start - start_time, e=end - start_time)) diff --git a/egs/wsj/s5/utils/data/get_utt2dur.sh b/egs/wsj/s5/utils/data/get_utt2dur.sh index 995136a5575..a760981d198 100755 --- a/egs/wsj/s5/utils/data/get_utt2dur.sh +++ b/egs/wsj/s5/utils/data/get_utt2dur.sh @@ -23,7 +23,8 @@ if [ $# != 1 ]; then echo " $0 data/train" echo " Options:" echo " --frame-shift # frame shift in seconds. Only relevant when we are" - echo " # getting duration from feats.scp (default: 0.01). " + echo " # getting duration from feats.scp, and only if the " + echo " # file frame_shift does not exist (default: 0.01). " exit 1 fi @@ -40,12 +41,17 @@ fi if [ -s $data/segments ]; then echo "$0: working out $data/utt2dur from $data/segments" awk '{len=$4-$3; print $1, len;}' < $data/segments > $data/utt2dur +elif [[ -s $data/frame_shift && -f $data/utt2num_frames ]]; then + echo "$0: computing $data/utt2dur from $data/{frame_shift,utt2num_frames}." + frame_shift=$(cat $data/frame_shift) || exit 1 + # The 1.5 correction is the typical value of (frame_length-frame_shift)/frame_shift. 
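Both rewritten scripts, get_frame_shift.sh and get_utt2dur.sh, lean on the same two pieces of arithmetic: the frame shift is approximately total duration divided by total frame count (snapped to 0.01 when it lands within rounding noise of that value), and a duration recovered from a frame count is (num_frames + 1.5) * frame_shift, where 1.5 is the typical (frame_length - frame_shift) / frame_shift for 25 ms windows at a 10 ms shift. The Python sketch below restates that arithmetic, with dictionaries standing in for the utt2dur and utt2num_frames files; the function names are illustrative only.

```python
def estimate_frame_shift(utt2dur, utt2num_frames, n=10):
    """Estimate seconds-per-frame the way utils/data/get_frame_shift.sh
    does: total duration over total frame count for the first few
    utterances, snapping near-0.01 values to exactly 0.01."""
    utts = list(utt2dur)[:n]
    total_dur = sum(utt2dur[u] for u in utts)
    total_frames = sum(utt2num_frames[u] for u in utts)
    shift = total_dur / total_frames
    if 0.01 < shift < 0.0102:
        shift = 0.01
    return shift

def utt2dur_from_frames(utt2num_frames, frame_shift):
    # The +1.5 accounts for the window/shift overlap, as in the
    # get_utt2dur.sh comment above.
    return {utt: (n + 1.5) * frame_shift for utt, n in utt2num_frames.items()}
```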
+ awk -v fs=$frame_shift '{ $2=($2+1.5)*fs; print }' <$data/utt2num_frames >$data/utt2dur elif [ -f $data/wav.scp ]; then echo "$0: segments file does not exist so getting durations from wave files" # if the wav.scp contains only lines of the form # utt1 /foo/bar/sph2pipe -f wav /baz/foo.sph | - if cat $data/wav.scp | perl -e ' + if perl <$data/wav.scp -e ' while (<>) { s/\|\s*$/ |/; # make sure final | is preceded by space. @A = split; if (!($#A == 5 && $A[1] =~ m/sph2pipe$/ && $A[2] eq "-f" && $A[3] eq "wav" && $A[5] eq "|")) { exit(1); } @@ -102,7 +108,13 @@ elif [ -f $data/wav.scp ]; then fi elif [ -f $data/feats.scp ]; then echo "$0: wave file does not exist so getting durations from feats files" - feat-to-len scp:$data/feats.scp ark,t:- | awk -v frame_shift=$frame_shift '{print $1, $2*frame_shift;}' >$data/utt2dur + if [[ -s $data/frame_shift ]]; then + frame_shift=$(cat $data/frame_shift) || exit 1 + echo "$0: using frame_shift=$frame_shift from file $data/frame_shift" + fi + # The 1.5 correction is the typical value of (frame_length-frame_shift)/frame_shift. + feat-to-len scp:$data/feats.scp ark,t:- | + awk -v frame_shift=$frame_shift '{print $1, ($2+1.5)*frame_shift}' >$data/utt2dur else echo "$0: Expected $data/wav.scp, $data/segments or $data/feats.scp to exist" exit 1 diff --git a/egs/wsj/s5/utils/data/get_utt2num_frames.sh b/egs/wsj/s5/utils/data/get_utt2num_frames.sh index a6d4f0ecb10..d8b006a5fc0 100755 --- a/egs/wsj/s5/utils/data/get_utt2num_frames.sh +++ b/egs/wsj/s5/utils/data/get_utt2num_frames.sh @@ -10,13 +10,14 @@ frame_shift=0.01 frame_overlap=0.015 . utils/parse_options.sh +. ./path.sh if [ $# -ne 1 ]; then echo "This script writes a file utt2num_frames with the " echo "number of frames in each utterance as measured based on the " echo "duration of the utterances (in utt2dur) and the specified " echo "frame_shift and frame_overlap." - echo "Usage: $0 " + echo "Usage: $0 " exit 1 fi diff --git a/egs/wsj/s5/utils/data/perturb_data_dir_volume.sh b/egs/wsj/s5/utils/data/perturb_data_dir_volume.sh index dae440b03a3..e357ba8cbfb 100755 --- a/egs/wsj/s5/utils/data/perturb_data_dir_volume.sh +++ b/egs/wsj/s5/utils/data/perturb_data_dir_volume.sh @@ -52,15 +52,15 @@ for line in sys.stdin.readlines(): parts = line.strip().split() if line.strip()[-1] == '|': if re.search('sox --vol', ' '.join(parts[-11:])): - print 'true' + print('true') sys.exit(0) elif re.search(':[0-9]+$', line.strip()) is not None: continue else: if ' '.join(parts[1:3]) == 'sox --vol': - print 'true' + print('true') sys.exit(0) -print 'false' +print('false') "` || exit 1 if $volume_perturb_done; then diff --git a/egs/wsj/s5/utils/data/subsegment_data_dir.sh b/egs/wsj/s5/utils/data/subsegment_data_dir.sh index 526fee0b4ef..d077b851d23 100755 --- a/egs/wsj/s5/utils/data/subsegment_data_dir.sh +++ b/egs/wsj/s5/utils/data/subsegment_data_dir.sh @@ -44,6 +44,7 @@ if [ $# != 4 ] && [ $# != 3 ]; then echo " # not just applied to the input segments file, is that" echo " # for purposes of computing the num-frames of the parts of" echo " # matrices in feats.scp, the padding should not be done." + echo " See also: resolve_ctm_overlaps.py" exit 1; fi @@ -147,7 +148,7 @@ if [ -f $srcdir/feats.scp ]; then frame_shift=$(cat $srcdir/frame_shift) fi echo "$0: note: frame shift is $frame_shift [affects feats.scp]" - + # The subsegments format is . # e.g. 
'utt_foo-1 utt_foo 7.21 8.93' # The first awk command replaces this with the format: @@ -167,31 +168,31 @@ if [ -f $srcdir/feats.scp ]; then # like pipes that might contain spaces, so it has to be able to produce output like the # following: # utt_foo-1 some command|[721:892] - # The 'end' frame is ensured to not exceed the feature archive size of + # The 'end' frame is ensured to not exceed the feature archive size of # . This is done using the script fix_subsegment_feats.pl. - # e.g if the number of frames in foo-bar.ark is 891, then the features are + # e.g if the number of frames in foo-bar.ark is 891, then the features are # truncated to that many frames. # utt_foo-1 foo-bar.ark:514231[721:890] # Lastly, utils/data/normalize_data_range.pl will only do something nontrivial if # the original data-dir already had data-ranges in square brackets. - + # Here, we computes the maximum 'end' frame allowed for each . # This is equal to the number of frames in the feature archive for . if [ ! -f $srcdir/utt2num_frames ]; then echo "$0: WARNING: Could not find $srcdir/utt2num_frames. It might take a long time to run get_utt2num_frames.sh." - echo "Increase the number of jobs or write this file while extracting features by passing --write-utt2num-frames true to steps/make_mfcc.sh etc." + echo "Increase the number of jobs or write this file while extracting features by passing --write-utt2num-frames true to steps/make_mfcc.sh etc." fi utils/data/get_utt2num_frames.sh --cmd "$cmd" --nj $nj $srcdir awk '{print $1" "$2}' $subsegments | \ utils/apply_map.pl -f 2 $srcdir/utt2num_frames > \ $dir/utt2max_frames - + awk -v s=$frame_shift '{print $1, $2, int(($3/s)+0.5), int(($4/s)-0.5);}' <$subsegments| \ utils/apply_map.pl -f 2 $srcdir/feats.scp | \ awk '{p=NF-1; for (n=1;n$dir/feats.scp || { echo "Failed to create $dir/feats.scp" && exit; } - + # Parse the frame ranges from feats.scp, which is in the form of [first-frame:last-frame] # and write the number-of-frames = last-frame - first-frame + 1 for the utterance. cat $dir/feats.scp | perl -ne 'm/^(\S+) .+\[(\d+):(\d+)\]$/; print "$1 " . ($3-$2+1) . "\n"' > \ @@ -222,8 +223,11 @@ fi if [ -f $srcdir/glm ]; then cp $srcdir/glm $dir fi +if [ -f $srcdir/stm ]; then + cp $srcdir/stm $dir +fi -for f in stm ctm; do +for f in ctm; do if [ -f $srcdir/$f ]; then echo "$0: not copying $srcdir/$f to $dir because sub-segmenting it is " echo " ... not implemented yet (and probably it's not needed.)" @@ -233,4 +237,3 @@ done rm $dir/new2old_utt echo "$0: subsegmented data from $srcdir to $dir" - diff --git a/egs/wsj/s5/utils/fix_data_dir.sh b/egs/wsj/s5/utils/fix_data_dir.sh index ca0972ca85b..aba9037a080 100755 --- a/egs/wsj/s5/utils/fix_data_dir.sh +++ b/egs/wsj/s5/utils/fix_data_dir.sh @@ -155,12 +155,22 @@ function filter_utts { maybe_reco2dur= [ ! -f $data/segments ] && maybe_wav=wav.scp # wav indexed by utts only if segments does not exist. [ -s $data/reco2dur ] && [ ! -f $data/segments ] && maybe_reco2dur=reco2dur # reco2dur indexed by utts - for x in feats.scp text segments utt2lang $maybe_wav; do + + maybe_utt2dur= + if [ -f $data/utt2dur ]; then + cat $data/utt2dur | \ + awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2dur.ok || exit 1 + maybe_utt2dur=utt2dur.ok + fi + + for x in feats.scp text segments utt2lang $maybe_wav $maybe_utt2dur; do if [ -f $data/$x ]; then utils/filter_scp.pl $data/$x $tmpdir/utts > $tmpdir/utts.tmp mv $tmpdir/utts.tmp $tmpdir/utts fi done + rm $data/utt2dur.ok 2>/dev/null || true + [ ! 
-s $tmpdir/utts ] && echo "fix_data_dir.sh: no utterances remained: not proceeding further." && \ rm $tmpdir/utts && exit 1; diff --git a/egs/wsj/s5/utils/lang/adjust_unk_graph.sh b/egs/wsj/s5/utils/lang/adjust_unk_graph.sh index 9a40a9960f2..c40f75ceec8 100755 --- a/egs/wsj/s5/utils/lang/adjust_unk_graph.sh +++ b/egs/wsj/s5/utils/lang/adjust_unk_graph.sh @@ -26,7 +26,7 @@ mkdir -p $graphdir_out required="HCLG.fst words.txt disambig_tid.int num_pdfs phones phones.txt words.txt" for f in $required; do - [ ! -f $graphdir_in/$f ] && echo "adjust_unk_graph.sh: expected $graphdir_in/$f to exist" && exit 1; + [ ! -e $graphdir_in/$f ] && echo "adjust_unk_graph.sh: expected $graphdir_in/$f to exist" && exit 1; cp -r $graphdir_in/$f $graphdir_out done diff --git a/egs/wsj/s5/utils/lang/extend_lang.sh b/egs/wsj/s5/utils/lang/extend_lang.sh index c13d5d3e78b..236e3ad6dd5 100755 --- a/egs/wsj/s5/utils/lang/extend_lang.sh +++ b/egs/wsj/s5/utils/lang/extend_lang.sh @@ -131,9 +131,14 @@ for n in $(seq 0 $ndisambig); do sym='#'$n; if ! grep -w -q "$sym" $dir/phones/disambig.txt; then echo "$sym"; fi done > $tmpdir/extra_disambig.txt highest_number=$(tail -n 1 $srcdir/phones.txt | awk '{print $2}') -awk -v start=$highest_number '{print $1, NR+start}' <$tmpdir/extra_disambig.txt >>$dir/words.txt +awk -v start=$highest_number '{print $1, NR+start}' <$tmpdir/extra_disambig.txt >>$dir/phones.txt echo "$0: added $(wc -l <$tmpdir/extra_disambig.txt) extra disambiguation symbols to phones.txt" +# add extra_disambig symbols into disambig.txt +cat $tmpdir/extra_disambig.txt >> $dir/phones/disambig.txt +utils/sym2int.pl $dir/phones.txt <$dir/phones/disambig.txt >$dir/phones/disambig.int +utils/sym2int.pl $dir/phones.txt <$dir/phones/disambig.txt | \ + awk '{printf(":%d", $1);} END{printf "\n"}' | sed s/:// > $dir/phones/disambig.csl silphone=`cat $srcdir/phones/optional_silence.txt` || exit 1; [ -z "$silphone" ] && \ diff --git a/egs/wsj/s5/utils/lang/make_lexicon_fst.py b/egs/wsj/s5/utils/lang/make_lexicon_fst.py index 790af2f2314..e22222db340 100755 --- a/egs/wsj/s5/utils/lang/make_lexicon_fst.py +++ b/egs/wsj/s5/utils/lang/make_lexicon_fst.py @@ -209,7 +209,7 @@ def write_fst_no_silence(lexicon, nonterminals=None, left_context_phones=None): if nonterminals is not None: next_state = write_nonterminal_arcs( - start_state, loop_state, next_state, + loop_state, loop_state, next_state, nonterminals, left_context_phones) print("{state}\t{final_cost}".format( diff --git a/egs/wsj/s5/utils/lang/make_position_dependent_subword_lexicon.py b/egs/wsj/s5/utils/lang/make_position_dependent_subword_lexicon.py new file mode 100755 index 00000000000..83aa145c946 --- /dev/null +++ b/egs/wsj/s5/utils/lang/make_position_dependent_subword_lexicon.py @@ -0,0 +1,107 @@ +#!/usr/bin/env python3 + +# 2019 Dongji Gao + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. 
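Regarding the extend_lang.sh fix above: the extra disambiguation symbols are now appended to phones.txt (previously they were wrongly appended to words.txt), and phones/disambig.{txt,int,csl} are regenerated so the new symbols appear in all three files. Below is a rough Python sketch of that bookkeeping, with dictionaries and lists standing in for the files; the helper name is invented for the example.

```python
def extend_disambig(phones, disambig, extra):
    """phones: dict symbol -> integer id (phones.txt);
    disambig: existing disambiguation symbols (#0, #1, ...);
    extra: new symbols to add.  Returns the updated disambig.txt symbols,
    the disambig.int ids and the colon-separated disambig.csl string."""
    next_id = max(phones.values()) + 1
    for sym in extra:                  # append to phones.txt
        phones[sym] = next_id
        next_id += 1
    disambig = list(disambig) + list(extra)   # phones/disambig.txt
    ints = [phones[sym] for sym in disambig]  # phones/disambig.int
    csl = ":".join(str(i) for i in ints)      # phones/disambig.csl
    return disambig, ints, csl
```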
+ +from make_lexicon_fst import read_lexiconp +import argparse +import math + +def get_args(): + parser = argparse.ArgumentParser(description="""This script creates a + position-dependent subword lexicon from a position-independent subword lexicon + by adding suffixes ("_B", "_I", "_E", "_S") to the related phones. + It assumes that the input lexicon does not contain disambiguation symbols.""") + parser.add_argument("--separator", type=str, default="@@", help="""Separator + indicates the position of a subword in a word. + Subword ends with separator can only appear at the beginning or middle of a word. + Subword without separator can only appear at the end of a word or is a word itself. + E.g. "international -> inter@@ nation@@ al"; + "nation -> nation" + The separator should match the separator used in the input lexicon.""") + parser.add_argument("lexiconp", type=str, help="""Filename of subword position-independent + lexicon with pronunciation probabilities, with lines of the form 'subword prob p1 p2 ...'""") + args = parser.parse_args() + return args + +def is_end(subword, separator): + """Return true if the subword can appear at the end of a word (i.e., the subword + does not end with separator). Return false otherwise.""" + return not subword.endswith(separator) + +def write_position_dependent_lexicon(lexiconp, separator): + """Print a position-dependent lexicon for each subword from the input lexiconp by adding + appropriate suffixes ("_B", "_I", "_E", "_S") to the phone sequence related to the subword. + There are 4 types of position-dependent subword: + 1) Beginning subword. It can only appear at the beginning of a word. + The first phone suffix should be "_B" and other suffixes should be "_I"s: + nation@@ 1.0 n_B ey_I sh_I ih_I n_I + n@@ 1.0 n_B + 2) Middle subword. It can only appear at the middle of a word. + All phone suffixes should be "_I"s: + nation@@ 1.0 n_I ey_I sh_I ih_I n_I + 3) End subword. It can only appear at the end of a word. + The last phone suffix should be "_E" and other suffixes should be "_I"s: + nation 1.0 n_I ey_I sh_I ih_I n_E + n 1.0 n_E + 4) Singleton subword (i.e., the subword is word it self). + The first phone suffix should be "_B" and the last suffix should be "_E". + All other suffixes should be "_I"s. If there is only one phone, its suffix should be "_S": + nation 1.0 n_B ey_I sh_I ih_I n_E + n 1.0 n_S + In most cases (i.e., subwords have more than 1 phones), the suffixes of phones in the middle are "_I"s. + So the suffix_list is initialized with all _I and we only replace the first and last phone suffix when + dealing with different cases when necessary. + """ + for (word, prob, phones) in lexiconp: + phones_length = len(phones) + + # suffix_list is initialized by all "_I"s. + suffix_list = ["_I" for i in range(phones_length)] + + if is_end(word, separator): + # print end subword lexicon by replacing the last phone suffix by "_E" + suffix_list[-1] = "_E" + phones_list = [phone + suffix for (phone, suffix) in zip(phones, suffix_list)] + print("{} {} {}".format(word, prob, ' '.join(phones_list))) + + # print singleton subword lexicon + # the phone suffix is "_S" if the there is only 1 phone. + if phones_length == 1: + suffix_list[0] = "_S" + phones_list = [phone + suffix for (phone, suffix) in zip(phones, suffix_list)] + print("{} {} {}".format(word, prob, ' '.join(phones_list))) + # the first phone suffix is "_B" is there is more than 1 phones. 
+ else: + suffix_list[0] = "_B" + phones_list = [phone + suffix for (phone, suffix) in zip(phones, suffix_list)] + print("{} {} {}".format(word, prob, ' '.join(phones_list))) + else: + # print middle subword lexicon + phones_list = [phone + suffix for (phone, suffix) in zip(phones, suffix_list)] + print("{} {} {}".format(word, prob, ' '.join(phones_list))) + + # print beginning subword lexicon by replacing the first phone suffix by "_B" + suffix_list[0] = "_B" + phones_list = [phone + suffix for (phone, suffix) in zip(phones, suffix_list)] + print("{} {} {}".format(word, prob, ' '.join(phones_list))) + +def main(): + args = get_args() + lexiconp = read_lexiconp(args.lexiconp) + write_position_dependent_lexicon(lexiconp, args.separator) + +if __name__ == "__main__": + main() diff --git a/egs/wsj/s5/utils/lang/make_subword_lexicon_fst.py b/egs/wsj/s5/utils/lang/make_subword_lexicon_fst.py new file mode 100755 index 00000000000..1beec500c13 --- /dev/null +++ b/egs/wsj/s5/utils/lang/make_subword_lexicon_fst.py @@ -0,0 +1,301 @@ +#!/usr/bin/env python3 + +# 2019 Dongji Gao +# Apache 2.0. + +from make_lexicon_fst import read_lexiconp +import argparse +import math +import sys + +# see get_args() below for usage mesage +def get_args(): + parser = argparse.ArgumentParser(description="""This script creates the + text form of a subword lexicon FST to be compiled by fstcompile using + the appropriate symbol tables (phones.txt and words.txt). It will mostly + be invoked indirectly via utils/prepare_lang_subword.sh. The output + goes to the stdout. This script is the subword version of make_lexicon_fst.py. + It only allows optional silence to appear after end-subword or singleton-subword, + (i.e., subwords without separator). In this version we do not support + pronunciation probability. (i.e., pron-prob = 1.0)""") + + parser.add_argument('--sil-phone', type=str, help="""Text form of + optional-silence phone, e.g. 'SIL'. See also the --sil-prob option.""") + parser.add_argument('--sil-prob', type=float, default=0.0, help="""Probability + of silence between words (including the beginning and end of word sequence). + Must be in range [0.0, 1.0). This refer to the optional silence inserted by + the lexicon; see the --sil-phone option.""") + parser.add_argument('--sil-disambig', type=str, help="""Disambiguation symbol + to disambiguate silence, e.g. #5. Will only be supplied if you are creating + the version of L.fst with disambiguation symbols, intended for use with cyclic + G.fst. This symbol was introduced to fix a rather obscure source of nondeterminism + of CLG.fst, that has to do with reordering of disambiguation symbols and phone symbols.""") + parser.add_argument('--position-dependent', action="store_true", help="""Whether + the input lexicon is position-dependent.""") + parser.add_argument("--separator", type=str, default="@@", help="""Separator + indicates the position of a subword in a word. + Subword followed by separator can only appear at the beginning or middle of a word. + Subword without separator can only appear at the end of a word or is a word itself. + E.g. "international -> inter@@ nation@@ al"; + "nation -> nation" + The separator should match the separator used in the input lexicon.""") + parser.add_argument('lexiconp', type=str, help="""Filename of lexicon with + pronunciation probabilities (normally lexiconp.txt), with lines of the + form 'subword prob p1 p2...', e.g. 
'a, 1.0 ay'""") + args = parser.parse_args() + return args + +def contain_disambig_symbol(phones): + """Return true if the phone sequence contains disambiguation symbol. + Return false otherwise. Disambiguation symbol is at the end of phones + in the form of #1, #2... There is at most one disambiguation + symbol for each phone sequence""" + return True if phones[-1].startswith("#") else False + +def print_arc(src, dest, phone, word, cost): + print('{}\t{}\t{}\t{}\t{}'.format(src, dest, phone, word, cost)) + +def is_end(word, separator): + """Return true if the subword can appear at the end of a word (i.e., the subword + does not end with separator). Return false otherwise.""" + return not word.endswith(separator) + +def get_suffix(phone): + """Return the suffix of a phone. The suffix is in the form of '_B', '_I'...""" + if len(phone) < 3: + print("{}: invalid phone {} (please check if the phone is position-dependent)".format( + sys.argv[0], phone), file=sys.stderr) + sys.exit(1) + return phone[-2:] + +def write_fst_no_silence(lexicon, position_dependent, separator): + """Writes the text format of L.fst to the standard output. This version is for + when --sil-prob=0.0, meaning there is no optional silence allowed. + loop_state here is the start and final state of the fst. It goes to word_start_state + via epsilon transition. + In position-independent case, there is no difference between beginning word and + middle word. So all subwords with separator would leave from and enter word_start_state. + All subword without separator would leave from word_start_state and enter loop_state. + This guarantees that optional silence can only follow a word-end subword. + + In position-dependent case, there are 4 types of position-dependent subword: + 1) Beginning subword. The first phone suffix should be "_B" and other suffixes should be "_I"s: + nation@@ 1.0 n_B ey_I sh_I ih_I n_I + n@@ 1.0 n_B + 2) Middle subword. All phone suffixes should be "_I"s: + nation@@ 1.0 n_I ey_I sh_I ih_I n_I + 3) End subword. The last phone suffix should be "_E" and other suffixes be should "_I"s: + nation 1.0 n_I ey_I sh_I ih_I n_E + n 1.0 n_E + 4) Singleton subword (i.e., the subword is word it self). + The first phone suffix should be "_B" and the last suffix should be "_E". + All other suffix should be "_I"s. If there is only one phone, its suffix should be "_S": + nation 1.0 n_B ey_I sh_I ih_I n_E + n 1.0 n_S + + So we need an extra word_internal_state. The beginning word + would leave from word_start_state and enter word_internal_state and middle word + would leave from and enter word_internal_state. The rest part is same. + + 'lexicon' is a list of 3-tuples (subword, pron-prob, prons) as returned by + 'position_dependent', which is true is the lexicon is position-dependent. + 'separator' is a symbol which indicates the position of a subword in word. 
+ """ + # regular setting + loop_state = 0 + word_start_state = 1 + next_state = 2 + + print_arc(loop_state, word_start_state, "", "", 0.0) + + # optional setting for word_internal_state + if position_dependent: + word_internal_state = next_state + next_state += 1 + + for (word, pron_prob, phones) in lexicon: + pron_cost = 0.0 # do not support pron_prob + phones_len = len(phones) + + # set start and end state for different cases + if position_dependent: + first_phone_suffix = get_suffix(phones[0]) + last_phone = phones[-2] if contain_disambig_symbol(phones) else phones[-1] + last_phone_suffix = get_suffix(last_phone) + + # singleton word + if first_phone_suffix == "_S": + current_state = word_start_state + end_state = loop_state + # set the current_state + elif first_phone_suffix == "_B": + current_state = word_start_state + elif first_phone_suffix == "_I" or first_phone_suffix == "_E": + current_state = word_internal_state + # then set the end_state + if last_phone_suffix == "_B" or last_phone_suffix == "_I": + end_state = word_internal_state + elif last_phone_suffix == "_E": + end_state = loop_state + else: + current_state = word_start_state + end_state = loop_state if is_end(word, separator) else word_start_state + + # print arcs (except the last one) for the subword + for i in range(phones_len - 1): + word = word if i == 0 else "" + cost = pron_cost if i == 0 else 0.0 + print_arc(current_state, next_state, phones[i], word, cost) + current_state = next_state + next_state += 1 + + # print the last arc + i = phones_len - 1 + phone = phones[i] if i >=0 else "" + word = word if i <= 0 else "" + cost = pron_cost if i <= 0 else 0.0 + print_arc(current_state, end_state, phone, word, cost) + + # set the final state + print("{state}\t{final_cost}".format(state=loop_state, final_cost=0.0)) + +def write_fst_with_silence(lexicon, sil_phone, sil_prob, sil_disambig, position_dependent, separator): + """Writes the text format of L.fst to the standard output. This version is for + when --sil-prob=0.0, meaning there is no optional silence allowed. + loop_state here is the start and final state of the fst. It goes to word_start_state + via epsilon transition. + + In position-independent case, there is no difference between beginning word and + middle word. So all subwords with separator would leave from and enter word_start_state. + All subword without separator would leave from word_start_state and enter sil_state. + This guarantees that optional silence can only follow a word-end subword and such subwords + must appear at the end of the whole subword sequence. + + In position-dependent case, there are 4 types of position-dependent subword: + 1) Beginning subword. The first phone suffix should be "_B" and other suffixes should be "_I"s: + nation@@ 1.0 n_B ey_I sh_I ih_I n_I + n@@ 1.0 n_B + 2) Middle subword. All phone suffixes should be "_I"s: + nation@@ 1.0 n_I ey_I sh_I ih_I n_I + 3) End subword. The last phone suffix should be "_E" and other suffixes be should "_I"s: + nation 1.0 n_I ey_I sh_I ih_I n_E + n 1.0 n_E + 4) Singleton subword (i.e., the subword is word it self). + The first phone suffix should be "_B" and the last suffix should be "_E". + All other suffix should be "_I"s. If there is only one phone, its suffix should be "_S": + nation 1.0 n_B ey_I sh_I ih_I n_E + n 1.0 n_S + + So we need an extra word_internal_state. The beginning word + would leave from word_start_state and enter word_internal_state and middle word + would leave from and enter word_internal_state. The rest part is same. 
+ + 'lexicon' is a list of 3-tuples (subword, pron-prob, prons) + as returned by read_lexiconp(). + 'sil_prob', which is expected to be strictly between 0.0 and 1.0, is the + probability of silence + 'sil_phone' is the silence phone, e.g. "SIL". + 'sil_disambig' is either None, or the silence disambiguation symbol, e.g. "#5". + 'position_dependent', which is True is the lexicion is position-dependent. + 'separator' is the symbol we use to indicate the position of a subword in word. + """ + + sil_cost = -math.log(sil_prob) + no_sil_cost = -math.log(1 - sil_prob) + + # regular setting + start_state = 0 + loop_state = 1 # also the final state + sil_state = 2 # words terminate here when followed by silence; this state + # has a licence transition to loop_state + word_start_state = 3 # subword leave from here + next_state = 4 # the next un-allocated state, will be incremented as we go + + print_arc(start_state, loop_state, "", "", no_sil_cost) + print_arc(start_state, sil_state, "", "", sil_cost) + print_arc(loop_state, word_start_state, "", "", 0.0) + + # optional setting for disambig_state + if sil_disambig is None: + print_arc(sil_state, loop_state, sil_phone, "", 0.0) + else: + disambig_state = next_state + next_state += 1 + print_arc(sil_state, disambig_state, sil_phone, "", 0.0) + print_arc(disambig_state, loop_state, sil_disambig, "", 0.0) + + # optional setting for word_internal_state + if position_dependent: + word_internal_state = next_state + next_state += 1 + + for (word, pron_prob, phones) in lexicon: + pron_cost = 0.0 # do not support pron_prob + phones_len = len(phones) + + # set start and end state for different cases + if position_dependent: + first_phone_suffix = get_suffix(phones[0]) + last_phone = phones[-2] if contain_disambig_symbol(phones) else phones[-1] + last_phone_suffix = get_suffix(last_phone) + + # singleton subword + if first_phone_suffix == "_S": + current_state = word_start_state + end_state_list = [loop_state, sil_state] + end_cost_list = [no_sil_cost, sil_cost] + # first set the current_state + elif first_phone_suffix == "_B": + current_state = word_start_state + elif first_phone_suffix == "_I" or first_phone_suffix == "_E": + current_state = word_internal_state + # then set the end_state (end_state_list) + if last_phone_suffix == "_B" or last_phone_suffix == "_I": + end_state_list = [word_internal_state] + end_cost_list = [0.0] + elif last_phone_suffix == "_E": + end_state_list = [loop_state, sil_state] + end_cost_list = [no_sil_cost, sil_cost] + else: + current_state = word_start_state + if is_end(word, separator): + end_state_list = [loop_state, sil_state] + end_cost_list = [no_sil_cost, sil_cost] + else: + end_state_list = [word_start_state] + end_cost_list = [0.0] + + # print arcs (except the last one) for the subword + for i in range(phones_len - 1): + word = word if i == 0 else "" + cost = pron_cost if i == 0 else 0.0 + print_arc(current_state, next_state, phones[i], word, cost) + current_state = next_state + next_state += 1 + + # print the last arc + i = phones_len - 1 + phone = phones[i] if i >= 0 else "" + word = word if i <= 0 else "" + cost = pron_cost if i <= 0 else 0.0 + for (end_state, end_cost) in zip(end_state_list, end_cost_list): + print_arc(current_state, end_state, phone, word, cost + end_cost) + + # set the final state + print("{state}\t{final_cost}".format(state=loop_state, final_cost=0.0)) + +def main(): + args = get_args() + if args.sil_prob < 0.0 or args.sil_prob >= 1.0: + print("{}: invalid value specified --sil-prob={}".format( + 
sys.argv[0], args.sil_prob), file=sys.stderr) + sys.exit(1) + lexicon = read_lexiconp(args.lexiconp) + if args.sil_prob == 0.0: + write_fst_no_silence(lexicon, args.position_dependent, args.separator) + else: + write_fst_with_silence(lexicon, args.sil_phone, args.sil_prob, + args.sil_disambig, args.position_dependent, args.separator) + +if __name__ == "__main__": + main() diff --git a/egs/wsj/s5/utils/nnet/gen_dct_mat.py b/egs/wsj/s5/utils/nnet/gen_dct_mat.py index 24139f1c9f8..77461112d0b 100755 --- a/egs/wsj/s5/utils/nnet/gen_dct_mat.py +++ b/egs/wsj/s5/utils/nnet/gen_dct_mat.py @@ -16,8 +16,8 @@ # limitations under the License. # ./gen_dct_mat.py -# script generates matrix with DCT transform, which is sparse -# and takes into account that data-layout is along frequency axis, +# script generates matrix with DCT transform, which is sparse +# and takes into account that data-layout is along frequency axis, # while DCT is done along temporal axis. from __future__ import division @@ -29,10 +29,7 @@ from optparse import OptionParser def print_on_same_line(text): - if (sys.version_info > (3,0)): - print(text, end=' ') - else: - print text, + print(text, end=' ') parser = OptionParser() parser.add_option('--fea-dim', dest='dim', help='feature dimension') @@ -69,7 +66,7 @@ def print_on_same_line(text): if(n==timeContext-1): print_on_same_line((dim-m-1)*'0 ') print() - print() + print() print(']') diff --git a/egs/wsj/s5/utils/nnet/gen_hamm_mat.py b/egs/wsj/s5/utils/nnet/gen_hamm_mat.py index d7e9d9b7493..110178c6702 100755 --- a/egs/wsj/s5/utils/nnet/gen_hamm_mat.py +++ b/egs/wsj/s5/utils/nnet/gen_hamm_mat.py @@ -27,10 +27,7 @@ from optparse import OptionParser def print_on_same_line(text): - if (sys.version_info > (3,0)): - print(text, end=' ') - else: - print text, + print(text, end=' ') parser = OptionParser() parser.add_option('--fea-dim', dest='dim', help='feature dimension') diff --git a/egs/wsj/s5/utils/nnet/gen_splice.py b/egs/wsj/s5/utils/nnet/gen_splice.py index 3fe76513df6..f3a2c8b39ac 100755 --- a/egs/wsj/s5/utils/nnet/gen_splice.py +++ b/egs/wsj/s5/utils/nnet/gen_splice.py @@ -26,10 +26,7 @@ from optparse import OptionParser def print_on_same_line(text): - if (sys.version_info > (3,0)): - print(text, end=' ') - else: - print text, + print(text, end=' ') parser = OptionParser() parser.add_option('--fea-dim', dest='dim_in', help='feature dimension') diff --git a/egs/wsj/s5/utils/nnet/make_cnn2d_proto.py b/egs/wsj/s5/utils/nnet/make_cnn2d_proto.py deleted file mode 100755 index 172660da825..00000000000 --- a/egs/wsj/s5/utils/nnet/make_cnn2d_proto.py +++ /dev/null @@ -1,259 +0,0 @@ -#!/usr/bin/python - -# Copyright 2014 Brno University of Technology (author: Karel Vesely) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# Generated Nnet prototype, to be initialized by 'nnet-initialize'. 
- -from __future__ import division -from __future__ import print_function -import math, random, sys, warnings -from optparse import OptionParser - -### -### Parse options -### -usage="%prog [options] >nnet-proto-file" -parser = OptionParser(usage) - -parser.add_option('--activation-type', dest='activation_type', - help='Select type of activation function : (|) [default: %default]', - default='', type='string'); - -parser.add_option('--cnn1-num-filters', dest='cnn1_num_filters', - help='Number of filters in first convolutional layer [default: %default]', - default=128, type='int') -# this is given by splice -# parser.add_option('--cnn1-fmap-x-len', dest='cnn1_fmap_x_len', -# help='Size of cnn1-fmap-x-len [default: %default]', -# default=11, type='int') - -# this should be equal to feat_raw_dim -# parser.add_option('--cnn1-fmap-y-len', dest='cnn1_fmap_y_len', -# help='Size of cnn1-fmap-y-len [default: %default]', -# default=32, type='int') - -parser.add_option('--cnn1-filt-x-len', dest='cnn1_filt_x_len', - help='Size of cnn1-filt-x-len [default: %default]', - default=9, type='int') -parser.add_option('--cnn1-filt-y-len', dest='cnn1_filt_y_len', - help='Size of cnn1-filt-y-len [default: %default]', - default=9, type='int') - -parser.add_option('--cnn1-filt-x-step', dest='cnn1_filt_x_step', - help='Size of cnn1-filt-x-step [default: %default]', - default=1, type='int') -parser.add_option('--cnn1-filt-y-step', dest='cnn1_filt_y_step', - help='Size of cnn1-filt-y-step [default: %default]', - default=1, type='int') -parser.add_option('--cnn1-connect-fmap', dest='cnn1_connect_fmap', - help='Size of cnn1-connect-fmap [default: %default]', - default=0, type='int') - -parser.add_option('--pool1-x-len', dest='pool1_x_len', - help='Size of pool1-filt-x-len [default: %default]', - default=1, type='int') -parser.add_option('--pool1-x-step', dest='pool1_x_step', - help='Size of pool1-x-step [default: %default]', - default=1, type='int') - - -# -parser.add_option('--pool1-y-len', dest='pool1_y_len', - help='Size of pool1-y-len [default: %default]', - default=3, type='int') -parser.add_option('--pool1-y-step', dest='pool1_y_step', - help='Size of pool1-y-step [default: %default]', - default=3, type='int') - -parser.add_option('--pool1-type', dest='pool1_type', - help='Type of pooling (Max || Average) [default: %default]', - default='Max', type='string') - -parser.add_option('--cnn2-num-filters', dest='cnn2_num_filters', - help='Number of filters in first convolutional layer [default: %default]', - default=256, type='int') -parser.add_option('--cnn2-filt-x-len', dest='cnn2_filt_x_len', - help='Size of cnn2-filt-x-len [default: %default]', - default=3, type='int') -parser.add_option('--cnn2-filt-y-len', dest='cnn2_filt_y_len', - help='Size of cnn2-filt-y-len [default: %default]', - default=4, type='int') -parser.add_option('--cnn2-filt-x-step', dest='cnn2_filt_x_step', - help='Size of cnn2-filt-x-step [default: %default]', - default=1, type='int') -parser.add_option('--cnn2-filt-y-step', dest='cnn2_filt_y_step', - help='Size of cnn2-filt-y-step [default: %default]', - default=1, type='int') -parser.add_option('--cnn2-connect-fmap', dest='cnn2_connect_fmap', - help='Size of cnn2-connect-fmap [default: %default]', - default=1, type='int') - -parser.add_option('--pitch-dim', dest='pitch_dim', - help='Number of features representing pitch [default: %default]', - default=0, type='int') -parser.add_option('--delta-order', dest='delta_order', - help='Order of delta features [default: %default]', - default=2, 
type='int') -parser.add_option('--splice', dest='splice', - help='Length of splice [default: %default]', - default=5,type='int') -parser.add_option('--dir', dest='dirct', - help='Directory, where network prototypes will be saved [default: %default]', - default='.', type='string') -parser.add_option('--num-pitch-neurons', dest='num_pitch_neurons', - help='Number of neurons in layers processing pitch features [default: %default]', - default='200', type='int') - - -(o,args) = parser.parse_args() -if len(args) != 1 : - parser.print_help() - sys.exit(1) - -feat_dim=int(args[0]) -### End parse options - -feat_raw_dim = feat_dim / (o.delta_order+1) / (o.splice*2+1) - o.pitch_dim # we need number of feats without deltas and splice and pitch -o.cnn1_fmap_y_len = feat_raw_dim -o.cnn1_fmap_x_len = o.splice*2+1 - -# Check -assert(feat_dim > 0) -assert(o.pool1_type == 'Max' or o.pool1_type == 'Average') - -## Extra checks if dimensions are matching, if not match them by -## producing a warning -# cnn1 -assert( (o.cnn1_fmap_y_len - o.cnn1_filt_y_len) % o.cnn1_filt_y_step == 0 ) -assert( (o.cnn1_fmap_x_len - o.cnn1_filt_x_len) % o.cnn1_filt_x_step == 0 ) - -# subsample1 -cnn1_out_fmap_y_len=(1 + (o.cnn1_fmap_y_len - o.cnn1_filt_y_len) / o.cnn1_filt_y_step) -cnn1_out_fmap_x_len=(1 + (o.cnn1_fmap_x_len - o.cnn1_filt_x_len) / o.cnn1_filt_x_step) - -# fix filt_len and filt_step -def fix_filt_step(inp_len, filt_len, filt_step): - - if ((inp_len - filt_len) % filt_step == 0): - return filt_step - else: - # filt_step <= filt_len - for filt_step in range(filt_len, 0, -1): - if ((inp_len - filt_len) % filt_step == 0): - return filt_step - -o.pool1_y_step = fix_filt_step(cnn1_out_fmap_y_len, o.pool1_y_len, o.pool1_y_step) -if o.pool1_y_step == 1 and o.pool1_y_len != 1: - warnings.warn('WARNING: Choose different pool1_y_len as subsampling is not happening'); - -o.pool1_x_step = fix_filt_step(cnn1_out_fmap_x_len, o.pool1_x_len, o.pool1_x_step) -if o.pool1_x_step == 1 and o.pool1_x_len != 1: - warnings.warn('WARNING: Choose different pool1_x_len as subsampling is not happening'); - - -### -### Print prototype of the network -### - -# Begin the prototype -print("") - -# Convolutional part of network -'''1st CNN layer''' -cnn1_input_dim=feat_raw_dim * (o.delta_order+1) * (o.splice*2+1) -cnn1_out_fmap_x_len=(1 + (o.cnn1_fmap_x_len - o.cnn1_filt_x_len) / o.cnn1_filt_x_step) -cnn1_out_fmap_y_len=(1 + (o.cnn1_fmap_y_len - o.cnn1_filt_y_len) / o.cnn1_filt_y_step) -cnn1_output_dim=o.cnn1_num_filters * cnn1_out_fmap_x_len * cnn1_out_fmap_y_len - -'''1st Pooling layer''' -pool1_input_dim=cnn1_output_dim -pool1_fmap_x_len=cnn1_out_fmap_x_len -pool1_out_fmap_x_len=(1 + (pool1_fmap_x_len - o.pool1_x_len) / o.pool1_x_step) -pool1_fmap_y_len=cnn1_out_fmap_y_len -pool1_out_fmap_y_len=(1 + (pool1_fmap_y_len - o.pool1_y_len) / o.pool1_y_step) -pool1_output_dim=o.cnn1_num_filters*pool1_out_fmap_x_len*pool1_out_fmap_y_len - -'''2nd CNN layer''' -cnn2_input_dim=pool1_output_dim -cnn2_fmap_x_len=pool1_out_fmap_x_len -cnn2_out_fmap_x_len=(1 + (cnn2_fmap_x_len - o.cnn2_filt_x_len) / o.cnn2_filt_x_step) -cnn2_fmap_y_len=pool1_out_fmap_y_len -cnn2_out_fmap_y_len=(1 + (cnn2_fmap_y_len - o.cnn2_filt_y_len) / o.cnn2_filt_y_step) -cnn2_output_dim=o.cnn2_num_filters * cnn2_out_fmap_x_len * cnn2_out_fmap_y_len - - -convolution_proto = '' - -convolution_proto += " %d %d %d %d %d %d %d %d %d %f %f %f\n" % \ - ( cnn1_input_dim, cnn1_output_dim, o.cnn1_fmap_x_len, o.cnn1_fmap_y_len, o.cnn1_filt_x_len, o.cnn1_filt_y_len, o.cnn1_filt_x_step, 
o.cnn1_filt_y_step, o.cnn1_connect_fmap, 0.0, 0.0, 0.01 ) -convolution_proto += "<%sPooling2DComponent> %d %d %d %d %d %d %d %d\n" % \ - ( o.pool1_type, pool1_input_dim, pool1_output_dim, pool1_fmap_x_len, pool1_fmap_y_len, o.pool1_x_len, o.pool1_y_len, o.pool1_x_step, o.pool1_y_step ) -convolution_proto += " %d %d %f\n" % \ - ( pool1_output_dim, pool1_output_dim, 1.0 ) -convolution_proto += " %d %d %f\n" % \ - ( pool1_output_dim, pool1_output_dim, 0.0 ) -convolution_proto += "%s %d %d\n" % \ - ( o.activation_type, pool1_output_dim, pool1_output_dim ) -convolution_proto += " %d %d %d %d %d %d %d %d %d %f %f %f\n" % \ - ( cnn2_input_dim, cnn2_output_dim, cnn2_fmap_x_len, cnn2_fmap_y_len, o.cnn2_filt_x_len, o.cnn2_filt_y_len, o.cnn2_filt_x_step, o.cnn2_filt_y_step, o.cnn2_connect_fmap, -2.0, 4.0, 0.1 ) -convolution_proto += " %d %d %f\n" % \ - ( cnn2_output_dim, cnn2_output_dim, 1.0) -convolution_proto += " %d %d %f\n" % \ - ( cnn2_output_dim, cnn2_output_dim, 0.0) -convolution_proto += "%s %d %d\n" % \ - ( o.activation_type, cnn2_output_dim, cnn2_output_dim) - -if (o.pitch_dim > 0): - # convolutional part - f_conv = open('%s/nnet.proto.convolution' % o.dirct, 'w') - f_conv.write('\n') - f_conv.write(convolution_proto) - f_conv.write('\n') - f_conv.close() - - # pitch part - f_pitch = open('%s/nnet.proto.pitch' % o.dirct, 'w') - f_pitch.write('\n') - f_pitch.write(' %d %d %f %f %f\n' % \ - ((o.pitch_dim * (o.delta_order+1) * (o.splice*2+1)), o.num_pitch_neurons, -2.0, 4.0, 0.109375)) - f_pitch.write('%s %d %d\n' % \ - (o.activation_type, o.num_pitch_neurons, o.num_pitch_neurons)) - f_pitch.write(' %d %d %f %f %f\n' % \ - (o.num_pitch_neurons, o.num_pitch_neurons, -2.0, 4.0, 0.109375)) - f_pitch.write('%s %d %d\n' % \ - (o.activation_type, o.num_pitch_neurons, o.num_pitch_neurons)) - f_pitch.write('\n') - f_pitch.close() - - # paralell part - vector = '' - for i in range(1, (feat_raw_dim + o.pitch_dim) * (o.delta_order+1) * (o.splice*2+1), feat_raw_dim + o.pitch_dim): - vector += '%d:1:%d ' % (i, i + feat_raw_dim - 1) - for i in range(feat_raw_dim+1, (feat_raw_dim + o.pitch_dim) * (o.delta_order+1) * (o.splice*2+1), feat_raw_dim + o.pitch_dim): - vector += '%d:1:%d ' % (i, i + o.pitch_dim - 1) - print(' %d %d %s ' % \ - ((feat_raw_dim + o.pitch_dim) * (o.delta_order+1) * (o.splice*2+1), (feat_raw_dim + o.pitch_dim) * (o.delta_order+1) * (o.splice*2+1), vector)) - print(' %d %d %s %s ' % \ - ((feat_raw_dim + o.pitch_dim) * (o.delta_order+1) * (o.splice*2+1), o.num_pitch_neurons + cnn2_output_dim, '%s/nnet.proto.convolution' % o.dirct, '%s/nnet.proto.pitch' % o.dirct)) - - num_convolution_output = o.num_pitch_neurons + cnn2_output_dim -else: # no pitch - print(convolution_proto) - -# We are done! -sys.exit(0) - - diff --git a/egs/wsj/s5/utils/parallel/limit_num_gpus.sh b/egs/wsj/s5/utils/parallel/limit_num_gpus.sh index d9707a816c4..9d7caddd1f6 100755 --- a/egs/wsj/s5/utils/parallel/limit_num_gpus.sh +++ b/egs/wsj/s5/utils/parallel/limit_num_gpus.sh @@ -18,8 +18,8 @@ if [ "$1" == "--num-gpus" ]; then shift fi -if ! printf "%d" "$num_gpus" >/dev/null || [ $num_gpus -le 0 ]; then - echo $0: Must pass a positive interger after --num-gpus +if ! printf "%d" "$num_gpus" >/dev/null || [ $num_gpus -le -1 ]; then + echo $0: Must pass a positive interger or 0 after --num-gpus echo e.g. 
$0 --num-gpus 2 local/tfrnnlm/run_lstm.sh exit 1 fi @@ -35,18 +35,24 @@ CUDA_VISIBLE_DEVICES= num_total_gpus=`nvidia-smi -L | wc -l` num_gpus_assigned=0 -for i in `seq 0 $[$num_total_gpus-1]`; do -# going over all GPUs and check if it is idle, and add to the list if yes - if nvidia-smi -i $i | grep "No running processes found" >/dev/null; then - CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}$i, && num_gpus_assigned=$[$num_gpus_assigned+1] - fi -# once we have enough GPUs, break out of the loop - [ $num_gpus_assigned -eq $num_gpus ] && break -done +if [ $num_gpus -eq 0 ] ; then + echo "$0: Running the job on CPU. Disabling submitting to gpu" + export CUDA_VISIBLE_DEVICES="" +else + for i in `seq 0 $[$num_total_gpus-1]`; do + # going over all GPUs and check if it is idle, and add to the list if yes + if nvidia-smi -i $i | grep "No running processes found" >/dev/null; then + CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}$i, && num_gpus_assigned=$[$num_gpus_assigned+1] + fi + # once we have enough GPUs, break out of the loop + [ $num_gpus_assigned -eq $num_gpus ] && break + done -[ $num_gpus_assigned -ne $num_gpus ] && echo Could not find enough idle GPUs && exit 1 + [ $num_gpus_assigned -ne $num_gpus ] && echo Could not find enough idle GPUs && exit 1 -export CUDA_VISIBLE_DEVICES=$(echo $CUDA_VISIBLE_DEVICES | sed "s=,$==g") + export CUDA_VISIBLE_DEVICES=$(echo $CUDA_VISIBLE_DEVICES | sed "s=,$==g") + + echo "$0: Running the job on GPU(s) $CUDA_VISIBLE_DEVICES" +fi -echo "$0: Running the job on GPU(s) $CUDA_VISIBLE_DEVICES" "$@" diff --git a/egs/wsj/s5/utils/parallel/run.pl b/egs/wsj/s5/utils/parallel/run.pl index f23bb8dc0b0..d648abd2382 100755 --- a/egs/wsj/s5/utils/parallel/run.pl +++ b/egs/wsj/s5/utils/parallel/run.pl @@ -72,13 +72,13 @@ $jobname = $1; $jobstart = $2; $jobend = $3; - shift; if ($jobstart > $jobend) { die "run.pl: invalid job range $ARGV[0]"; } if ($jobstart <= 0) { die "run.pl: invalid job range $ARGV[0], start must be strictly positive (this is required for GridEngine compatibility)."; } + shift; } elsif ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+)$/) { # e.g. JOB=1. $jobname = $1; $jobstart = $2; @@ -181,7 +181,7 @@ delete $active_pids{$r}; # print STDERR "Finished: $r/$jid " . Dumper(\%active_pids) . 
"\n"; } else { - die "run.pl: Cannot find the PID of the chold process that just finished."; + die "run.pl: Cannot find the PID of the child process that just finished."; } # In theory we could do a non-blocking waitpid over all jobs running just @@ -243,7 +243,7 @@ # Some sanity checks: # The $fail array should not contain undefined codes # The number of non-zeros in that array should be equal to $numfail -# We cannot do foreach() here, as the JOB ids do not necessarily start by zero +# We cannot do foreach() here, as the JOB ids do not start at zero $failed_jids=0; for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) { $job_return = $fail[$jobid]; diff --git a/egs/wsj/s5/utils/parse_options.sh b/egs/wsj/s5/utils/parse_options.sh index 34476fdb37a..335e69e9ac7 100755 --- a/egs/wsj/s5/utils/parse_options.sh +++ b/egs/wsj/s5/utils/parse_options.sh @@ -42,7 +42,7 @@ done ### -### No we process the command line options +### Now we process the command line options ### while true; do [ -z "${1:-}" ] && break; # break if there are no arguments diff --git a/egs/wsj/s5/utils/perturb_data_dir_speed.sh b/egs/wsj/s5/utils/perturb_data_dir_speed.sh index a50cdb04be4..924ebdc3473 100755 --- a/egs/wsj/s5/utils/perturb_data_dir_speed.sh +++ b/egs/wsj/s5/utils/perturb_data_dir_speed.sh @@ -73,7 +73,7 @@ if [ -f $srcdir/segments ]; then utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/segments | \ utils/apply_map.pl -f 2 $destdir/reco_map | \ awk -v factor=$factor \ - '{printf("%s %s %.2f %.2f\n", $1, $2, $3/factor, $4/factor);}' >$destdir/segments + '{s=$3/factor; e=$4/factor; if (e > s + 0.01) { printf("%s %s %.2f %.2f\n", $1, $2, $3/factor, $4/factor);} }' >$destdir/segments utils/apply_map.pl -f 1 $destdir/reco_map <$srcdir/wav.scp | sed 's/| *$/ |/' | \ # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" @@ -102,6 +102,9 @@ fi if [ -f $srcdir/spk2gender ]; then utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender fi +if [ -f $srcdir/utt2lang ]; then + utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2lang >$destdir/utt2lang +fi #prepare speed-perturbed utt2dur if [ ! -f $srcdir/utt2dur ]; then diff --git a/egs/wsj/s5/utils/segmentation.pl b/egs/wsj/s5/utils/segmentation.pl index 41d90f4bd9d..fa7c4429927 100755 --- a/egs/wsj/s5/utils/segmentation.pl +++ b/egs/wsj/s5/utils/segmentation.pl @@ -221,7 +221,8 @@ () if ($A[$p] == 0) { $num_sil++; } else { last; } } - $num_silence_phones[$n] = $p; + + $num_silence_phones[$n] = $num_sil; # should be the num of silence } } diff --git a/egs/wsj/s5/utils/split_scp.pl b/egs/wsj/s5/utils/split_scp.pl index 994c62e7a2d..dc798282f79 100755 --- a/egs/wsj/s5/utils/split_scp.pl +++ b/egs/wsj/s5/utils/split_scp.pl @@ -1,7 +1,9 @@ #!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter + # Copyright 2010-2011 Microsoft Corporation +# See ../../COPYING for clarification regarding multiple authors +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -16,7 +18,6 @@ # limitations under the License. - # This program splits up any kind of .scp or archive-type file. # If there is no utt2spk option it will work on any text file and # will split it up with an approximately equal number of lines in @@ -41,29 +42,43 @@ # [note: with this option, it assumes zero-based indexing of the split parts, # i.e. the second number must be 0 <= n < num-jobs.] 
+use warnings; + $num_jobs = 0; $job_id = 0; $utt2spk_file = ""; +$one_based = 0; -for ($x = 1; $x <= 2 && @ARGV > 0; $x++) { +for ($x = 1; $x <= 3 && @ARGV > 0; $x++) { if ($ARGV[0] eq "-j") { shift @ARGV; $num_jobs = shift @ARGV; $job_id = shift @ARGV; - if ($num_jobs <= 0 || $job_id < 0 || $job_id >= $num_jobs) { - die "Invalid num-jobs and job-id: $num_jobs and $job_id"; - } } - if ($ARGV[0] =~ "--utt2spk=(.+)") { + if ($ARGV[0] =~ /--utt2spk=(.+)/) { $utt2spk_file=$1; shift; } + if ($ARGV[0] eq '--one-based') { + $one_based = 1; + shift @ARGV; + } +} + +if ($num_jobs != 0 && ($num_jobs < 0 || $job_id - $one_based < 0 || + $job_id - $one_based >= $num_jobs)) { + die "$0: Invalid job number/index values for '-j $num_jobs $job_id" . + ($one_based ? " --one-based" : "") . "'\n" } +$one_based + and $job_id--; + if(($num_jobs == 0 && @ARGV < 2) || ($num_jobs > 0 && (@ARGV < 1 || @ARGV > 2))) { - die "Usage: split_scp.pl [--utt2spk=] in.scp out1.scp out2.scp ... \n" . - " or: split_scp.pl -j num-jobs job-id [--utt2spk=] in.scp [out.scp]\n" . - " ... where 0 <= job-id < num-jobs."; + die +"Usage: split_scp.pl [--utt2spk=] in.scp out1.scp out2.scp ... + or: split_scp.pl -j num-jobs job-id [--one-based] [--utt2spk=] in.scp [out.scp] + ... where 0 <= job-id < num-jobs, or 1 <= job-id <- num-jobs if --one-based.\n"; } $error = 0; @@ -82,21 +97,22 @@ } if ($utt2spk_file ne "") { # We have the --utt2spk option... - open(U, "<$utt2spk_file") || die "Failed to open utt2spk file $utt2spk_file"; - while() { + open($u_fh, '<', $utt2spk_file) || die "$0: Error opening utt2spk file $utt2spk_file: $!\n"; + while(<$u_fh>) { @A = split; - @A == 2 || die "Bad line $_ in utt2spk file $utt2spk_file"; + @A == 2 || die "$0: Bad line $_ in utt2spk file $utt2spk_file\n"; ($u,$s) = @A; $utt2spk{$u} = $s; } - open(I, "<$inscp") || die "Opening input scp file $inscp"; + close $u_fh; + open($i_fh, '<', $inscp) || die "$0: Error opening input scp file $inscp: $!\n"; @spkrs = (); - while() { + while(<$i_fh>) { @A = split; - if(@A == 0) { die "Empty or space-only line in scp file $inscp"; } + if(@A == 0) { die "$0: Empty or space-only line in scp file $inscp\n"; } $u = $A[0]; $s = $utt2spk{$u}; - if(!defined $s) { die "No such utterance $u in utt2spk file $utt2spk_file"; } + defined $s || die "$0: No utterance $u in utt2spk file $utt2spk_file\n"; if(!defined $spk_count{$s}) { push @spkrs, $s; $spk_count{$s} = 0; @@ -111,8 +127,8 @@ $numspks = @spkrs; # number of speakers. $numscps = @OUTPUTS; # number of output files. if ($numspks < $numscps) { - die "Refusing to split data because number of speakers $numspks is less " . - "than the number of output .scp files $numscps"; + die "$0: Refusing to split data because number of speakers $numspks " . + "is less than the number of output .scp files $numscps\n"; } for($scpidx = 0; $scpidx < $numscps; $scpidx++) { $scparray[$scpidx] = []; # [] is array reference. @@ -174,52 +190,57 @@ } # Now print out the files... for($scpidx = 0; $scpidx < $numscps; $scpidx++) { - $scpfn = $OUTPUTS[$scpidx]; - open(F, ">$scpfn") || die "Could not open scp file $scpfn for writing."; + $scpfile = $OUTPUTS[$scpidx]; + ($scpfile ne '-' ? open($f_fh, '>', $scpfile) + : open($f_fh, '>&', \*STDOUT)) || + die "$0: Could not open scp file $scpfile for writing: $!\n"; $count = 0; if(@{$scparray[$scpidx]} == 0) { - print STDERR "Error: split_scp.pl producing empty .scp file $scpfn (too many splits and too few speakers?)\n"; + print STDERR "$0: eError: split_scp.pl producing empty .scp file " . 
+ "$scpfile (too many splits and too few speakers?)\n"; $error = 1; } else { foreach $spk ( @{$scparray[$scpidx]} ) { - print F @{$spk_data{$spk}}; + print $f_fh @{$spk_data{$spk}}; $count += $spk_count{$spk}; } - if($count != $scpcount[$scpidx]) { die "Count mismatch [code error]"; } + $count == $scpcount[$scpidx] || die "Count mismatch [code error]"; } - close(F); + close($f_fh); } } else { # This block is the "normal" case where there is no --utt2spk # option and we just break into equal size chunks. - open(I, "<$inscp") || die "Opening input scp file $inscp"; + open($i_fh, '<', $inscp) || die "$0: Error opening input scp file $inscp: $!\n"; $numscps = @OUTPUTS; # size of array. @F = (); - while() { + while(<$i_fh>) { push @F, $_; } $numlines = @F; if($numlines == 0) { - print STDERR "split_scp.pl: error: empty input scp file $inscp , "; + print STDERR "$0: error: empty input scp file $inscp\n"; $error = 1; } $linesperscp = int( $numlines / $numscps); # the "whole part".. - $linesperscp >= 1 || die "You are splitting into too many pieces! [reduce \$nj]"; + $linesperscp >= 1 || die "$0: You are splitting into too many pieces! [reduce \$nj]\n"; $remainder = $numlines - ($linesperscp * $numscps); ($remainder >= 0 && $remainder < $numlines) || die "bad remainder $remainder"; # [just doing int() rounds down]. $n = 0; for($scpidx = 0; $scpidx < @OUTPUTS; $scpidx++) { $scpfile = $OUTPUTS[$scpidx]; - open(O, ">$scpfile") || die "Opening output scp file $scpfile"; + ($scpfile ne '-' ? open($o_fh, '>', $scpfile) + : open($o_fh, '>&', \*STDOUT)) || + die "$0: Could not open scp file $scpfile for writing: $!\n"; for($k = 0; $k < $linesperscp + ($scpidx < $remainder ? 1 : 0); $k++) { - print O $F[$n++]; + print $o_fh $F[$n++]; } - close(O) || die "Closing scp file $scpfile"; + close($o_fh) || die "$0: Eror closing scp file $scpfile: $!\n"; } - $n == $numlines || die "split_scp.pl: code error., $n != $numlines"; + $n == $numlines || die "$n != $numlines [code error]"; } -exit ($error ? 
1 : 0); +exit ($error); diff --git a/egs/wsj/s5/utils/subset_data_dir.sh b/egs/wsj/s5/utils/subset_data_dir.sh index 93ee0971b88..c05ca458221 100755 --- a/egs/wsj/s5/utils/subset_data_dir.sh +++ b/egs/wsj/s5/utils/subset_data_dir.sh @@ -34,42 +34,27 @@ shortest=false perspk=false -first_opt="" speakers=false -spk_list_specified=false -utt_list_specified=false - -if [ "$1" == "--per-spk" ]; then - perspk=true; - shift; -elif [ "$1" == "--shortest" ]; then - shortest=true; - shift; -elif [ "$1" == "--first" ]; then - first_opt="--first"; - shift; -elif [ "$1" == "--speakers" ]; then - speakers=true - shift; -elif [ "$1" == "--last" ]; then - first_opt="--last"; - shift; -elif [ "$1" == "--spk-list" ]; then - spk_list_specified=true - shift; -elif [ "$1" == "--utt-list" ]; then - utt_list_specified=true - shift; -fi - - - - -if [ $# != 3 ]; then - echo "Usage: " +first_opt= +spk_list= +utt_list= + +expect_args=3 +case $1 in + --first|--last) first_opt=$1; shift ;; + --per-spk) perspk=true; shift ;; + --shortest) shortest=true; shift ;; + --speakers) speakers=true; shift ;; + --spk-list) shift; spk_list=$1; shift; expect_args=2 ;; + --utt-list) shift; utt_list=$1; shift; expect_args=2 ;; + --*) echo "$0: invalid option '$1'"; exit 1 +esac + +if [ $# != $expect_args ]; then + echo "Usage:" echo " subset_data_dir.sh [--speakers|--shortest|--first|--last|--per-spk] " echo " subset_data_dir.sh [--spk-list ] " - echo " subset_data_dir.sh [--utt-list ] " + echo " subset_data_dir.sh [--utt-list ] " echo "By default, randomly selects utterances from the data directory." echo "With --speakers, randomly selects enough speakers that we have utterances" echo "With --per-spk, selects utterances per speaker, if available." @@ -77,120 +62,131 @@ if [ $# != 3 ]; then echo "With --last, selects the last utterances" echo "With --shortest, selects the shortest utterances." echo "With --spk-list, reads the speakers to keep from " + echo "With --utt-list, reads the utterances to keep from " exit 1; fi -if $spk_list_specified; then - spk_list=$1 - srcdir=$2 - destdir=$3 -elif $utt_list_specified; then - utt_list=$1 - srcdir=$2 - destdir=$3 +srcdir=$1 +if [[ $spk_list || $utt_list ]]; then + numutt= + destdir=$2 else - srcdir=$1 numutt=$2 destdir=$3 fi - export LC_ALL=C if [ ! -f $srcdir/utt2spk ]; then - echo "subset_data_dir.sh: no such file $srcdir/utt2spk" - exit 1; + echo "$0: no such file $srcdir/utt2spk" + exit 1 fi -function do_filtering { - # assumes the utt2spk and spk2utt files already exist. 
- [ -f $srcdir/feats.scp ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/feats.scp >$destdir/feats.scp - [ -f $srcdir/vad.scp ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/vad.scp >$destdir/vad.scp - [ -f $srcdir/utt2lang ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2lang >$destdir/utt2lang - [ -f $srcdir/utt2dur ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2dur >$destdir/utt2dur - [ -f $srcdir/utt2num_frames ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2num_frames >$destdir/utt2num_frames - [ -f $srcdir/utt2uniq ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2uniq >$destdir/utt2uniq - [ -f $srcdir/wav.scp ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/wav.scp >$destdir/wav.scp - [ -f $srcdir/spk2warp ] && utils/filter_scp.pl $destdir/spk2utt <$srcdir/spk2warp >$destdir/spk2warp - [ -f $srcdir/utt2warp ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2warp >$destdir/utt2warp - [ -f $srcdir/text ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/text >$destdir/text - [ -f $srcdir/spk2gender ] && utils/filter_scp.pl $destdir/spk2utt <$srcdir/spk2gender >$destdir/spk2gender - [ -f $srcdir/cmvn.scp ] && utils/filter_scp.pl $destdir/spk2utt <$srcdir/cmvn.scp >$destdir/cmvn.scp - if [ -f $srcdir/segments ]; then - utils/filter_scp.pl $destdir/utt2spk <$srcdir/segments >$destdir/segments - awk '{print $2;}' $destdir/segments | sort | uniq > $destdir/reco # recordings. - # The next line would override the command above for wav.scp, which would be incorrect. - [ -f $srcdir/wav.scp ] && utils/filter_scp.pl $destdir/reco <$srcdir/wav.scp >$destdir/wav.scp - [ -f $srcdir/reco2file_and_channel ] && \ - utils/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel - - # Filter the STM file for proper sclite scoring - # Copy over the comments from STM file - [ -f $srcdir/stm ] && grep "^;;" $srcdir/stm > $destdir/stm - [ -f $srcdir/stm ] && utils/filter_scp.pl $destdir/reco < $srcdir/stm >> $destdir/stm - - rm $destdir/reco - else - awk '{print $1;}' $destdir/wav.scp | sort | uniq > $destdir/reco - [ -f $srcdir/reco2file_and_channel ] && \ - utils/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel - - rm $destdir/reco - fi - srcutts=`cat $srcdir/utt2spk | wc -l` - destutts=`cat $destdir/utt2spk | wc -l` - echo "$0: reducing #utt from $srcutts to $destutts" -} +if [[ $numutt && $numutt -gt $(wc -l <$srcdir/utt2spk) ]]; then + echo "$0: cannot subset to more utterances than you originally had." + exit 1 +fi +if $shortest && [ ! -f $srcdir/feats.scp ]; then + echo "$0: you selected --shortest but no feats.scp exist." + exit 1 +fi + +mkdir -p $destdir || exit 1 -if $spk_list_specified; then - mkdir -p $destdir +if [[ $spk_list ]]; then utils/filter_scp.pl "$spk_list" $srcdir/spk2utt > $destdir/spk2utt || exit 1; utils/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk || exit 1; - do_filtering; # bash function. - exit 0; -elif $utt_list_specified; then - mkdir -p $destdir +elif [[ $utt_list ]]; then utils/filter_scp.pl "$utt_list" $srcdir/utt2spk > $destdir/utt2spk || exit 1; utils/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt || exit 1; - do_filtering; # bash function. 
- exit 0; elif $speakers; then - mkdir -p $destdir - utils/shuffle_list.pl < $srcdir/spk2utt | awk -v numutt=$numutt '{ if (tot < numutt){ print; } tot += (NF-1); }' | \ + utils/shuffle_list.pl < $srcdir/spk2utt | + awk -v numutt=$numutt '{ if (tot < numutt){ print; } tot += (NF-1); }' | sort > $destdir/spk2utt utils/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk - do_filtering; # bash function. - exit 0; elif $perspk; then - mkdir -p $destdir - awk '{ n='$numutt'; printf("%s ",$1); skip=1; while(n*(skip+1) <= NF-1) { skip++; } + awk '{ n='$numutt'; printf("%s ",$1); + skip=1; while(n*(skip+1) <= NF-1) { skip++; } for(x=2; x<=NF && x <= n*skip; x += skip) { printf("%s ", $x); } printf("\n"); }' <$srcdir/spk2utt >$destdir/spk2utt utils/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk - do_filtering; # bash function. - exit 0; else - if [ $numutt -gt `cat $srcdir/utt2spk | wc -l` ]; then - echo "subset_data_dir.sh: cannot subset to more utterances than you originally had." - exit 1; - fi - mkdir -p $destdir || exit 1; - - ## scripting note: $shortest evaluates to true or false - ## so this becomes the command true or false. if $shortest; then - # select the n shortest utterances. + # Select $numutt shortest utterances. . ./path.sh - [ ! -f $srcdir/feats.scp ] && echo "$0: you selected --shortest but no feats.scp exist." && exit 1; feat-to-len scp:$srcdir/feats.scp ark,t:$destdir/tmp.len || exit 1; - sort -n -k2 $destdir/tmp.len | awk '{print $1}' | head -$numutt >$destdir/tmp.uttlist + sort -n -k2 $destdir/tmp.len | + awk '{print $1}' | + head -$numutt >$destdir/tmp.uttlist utils/filter_scp.pl $destdir/tmp.uttlist $srcdir/utt2spk >$destdir/utt2spk rm $destdir/tmp.uttlist $destdir/tmp.len else + # Select $numutt random utterances. utils/subset_scp.pl $first_opt $numutt $srcdir/utt2spk > $destdir/utt2spk || exit 1; fi utils/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt - do_filtering; - exit 0; fi + +# Perform filtering. utt2spk and spk2utt files already exist by this point. +# Filter by utterance. +[ -f $srcdir/feats.scp ] && + utils/filter_scp.pl $destdir/utt2spk <$srcdir/feats.scp >$destdir/feats.scp +[ -f $srcdir/vad.scp ] && + utils/filter_scp.pl $destdir/utt2spk <$srcdir/vad.scp >$destdir/vad.scp +[ -f $srcdir/utt2lang ] && + utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2lang >$destdir/utt2lang +[ -f $srcdir/utt2dur ] && + utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2dur >$destdir/utt2dur +[ -f $srcdir/utt2num_frames ] && + utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2num_frames >$destdir/utt2num_frames +[ -f $srcdir/utt2uniq ] && + utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2uniq >$destdir/utt2uniq +[ -f $srcdir/wav.scp ] && + utils/filter_scp.pl $destdir/utt2spk <$srcdir/wav.scp >$destdir/wav.scp +[ -f $srcdir/utt2warp ] && + utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2warp >$destdir/utt2warp +[ -f $srcdir/text ] && + utils/filter_scp.pl $destdir/utt2spk <$srcdir/text >$destdir/text + +# Filter by speaker. +[ -f $srcdir/spk2warp ] && + utils/filter_scp.pl $destdir/spk2utt <$srcdir/spk2warp >$destdir/spk2warp +[ -f $srcdir/spk2gender ] && + utils/filter_scp.pl $destdir/spk2utt <$srcdir/spk2gender >$destdir/spk2gender +[ -f $srcdir/cmvn.scp ] && + utils/filter_scp.pl $destdir/spk2utt <$srcdir/cmvn.scp >$destdir/cmvn.scp + +# Filter by recording-id. +if [ -f $srcdir/segments ]; then + utils/filter_scp.pl $destdir/utt2spk <$srcdir/segments >$destdir/segments + # Recording-ids are in segments. 
+ awk '{print $2}' $destdir/segments | sort | uniq >$destdir/reco + # The next line overrides the command above for wav.scp, which would be incorrect. + [ -f $srcdir/wav.scp ] && + utils/filter_scp.pl $destdir/reco <$srcdir/wav.scp >$destdir/wav.scp +else + # No segments; recording-ids are in wav.scp. + awk '{print $1}' $destdir/wav.scp | sort | uniq >$destdir/reco +fi + +[ -f $srcdir/reco2file_and_channel ] && + utils/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel +[ -f $srcdir/reco2dur ] && + utils/filter_scp.pl $destdir/reco <$srcdir/reco2dur >$destdir/reco2dur + +# Filter the STM file for proper sclite scoring. +# Copy over the comments from STM file. +[ -f $srcdir/stm ] && + (grep "^;;" $srcdir/stm + utils/filter_scp.pl $destdir/reco $srcdir/stm) >$destdir/stm + +rm $destdir/reco + +# Copy frame_shift if present. +[ -f $srcdir/frame_shift ] && cp $srcdir/frame_shift $destdir + +srcutts=$(wc -l <$srcdir/utt2spk) +destutts=$(wc -l <$destdir/utt2spk) +echo "$0: reducing #utt from $srcutts to $destutts" +exit 0 diff --git a/egs/wsj/s5/utils/subword/prepare_lang_subword.sh b/egs/wsj/s5/utils/subword/prepare_lang_subword.sh new file mode 100755 index 00000000000..f2432e91825 --- /dev/null +++ b/egs/wsj/s5/utils/subword/prepare_lang_subword.sh @@ -0,0 +1,423 @@ +#!/bin/bash +# Copyright 2012-2013 Johns Hopkins University (Author: Daniel Povey); +# Arnab Ghoshal +# 2014 Guoguo Chen +# 2015 Hainan Xu +# 2016 FAU Erlangen (Author: Axel Horndasch) +# 2019 Dongji Gao + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +# This script prepares a directory (for subword) such as data/lang_subword/, in the standard format, +# given a source directory containing a subword dictionary lexicon.txt in a form like: +# subword phone1 phone2 ... phoneN +# per line (alternate prons would be separate lines), or a dictionary with probabilities +# called lexiconp.txt in a form: +# subword pron-prob phone1 phone2 ... phoneN +# (with 0.0 < pron-prob <= 1.0); note: if lexiconp.txt exists, we use it even if +# lexicon.txt exists. +# and also files silence_phones.txt, nonsilence_phones.txt, optional_silence.txt +# and extra_questions.txt +# Here, silence_phones.txt and nonsilence_phones.txt are lists of silence and +# non-silence phones respectively (where silence includes various kinds of +# noise, laugh, cough, filled pauses etc., and nonsilence phones includes the +# "real" phones.) +# In each line of those files is a list of phones, and the phones on each line +# are assumed to correspond to the same "base phone", i.e. they will be +# different stress or tone variations of the same basic phone. +# The file "optional_silence.txt" contains just a single phone (typically SIL) +# which is used for optional silence in the lexicon. +# extra_questions.txt might be empty; typically will consist of lists of phones, +# all members of each list with the same stress or tone; and also possibly a +# list for the silence phones. 
This will augment the automatically generated +# questions (note: the automatically generated ones will treat all the +# stress/tone versions of a phone the same, so will not "get to ask" about +# stress or tone). +# + +# This script adds word-position-dependent phones and constructs a host of other +# derived files, that go in data/lang_subword/. + +# Currently it only support the most basic functions. +# Begin configuration section. +num_sil_states=5 +num_nonsil_states=3 +position_dependent_phones=true +# position_dependent_phones is false also when position dependent phones and word_boundary.txt +# have been generated by another source +share_silence_phones=false # if true, then share pdfs of different silence + # phones together. +sil_prob=0.5 +num_extra_phone_disambig_syms=1 # Standard one phone disambiguation symbol is used for optional silence. + # Increasing this number does not harm, but is only useful if you later + # want to introduce this labels to L_disambig.fst +separator="@@" # Separator is a suffix or prefix of subword indicating the position of this subword in word. + # By default, subword which is not at the end of word would have separator as suffix. + # For example: international -> inter@@ nation@@ al + +# end configuration sections + +echo "$0 $@" # Print the command line for logging + +. utils/parse_options.sh + +if [ $# -ne 4 ]; then + echo "Usage: utils/prepare_lang.sh " + echo "e.g.: utils/prepare_lang.sh data/local/dict data/local/lang data/lang" + echo " should contain the following files:" + echo " extra_questions.txt lexicon.txt nonsilence_phones.txt optional_silence.txt silence_phones.txt" + echo "See http://kaldi-asr.org/doc/data_prep.html#data_prep_lang_creating for more info." + echo "options: " + echo " may also, for the grammar-decoding case (see http://kaldi-asr.org/doc/grammar.html)" + echo "contain a file nonterminals.txt containing symbols like #nonterm:contact_list, one per line." + echo " --num-sil-states # default: 5, #states in silence models." + echo " --num-nonsil-states # default: 3, #states in non-silence models." + echo " --position-dependent-phones (true|false) # default: true; if true, use _B, _E, _S & _I" + echo " # markers on phones to indicate word-internal positions. " + echo " --share-silence-phones (true|false) # default: false; if true, share pdfs of " + echo " # all silence phones. " + echo " --sil-prob # default: 0.5 [must have 0 <= silprob < 1]" + echo " --separator # default: @@" + exit 1; +fi + +srcdir=$1 +oov_word=$2 +tmpdir=$3 +dir=$4 +mkdir -p $dir $tmpdir $dir/phones + +silprob=false +[ -f $srcdir/lexiconp_silprob.txt ] && echo "$0: Currently we do not support word-dependent silence probability." && exit 1; + +if [ -f $srcdir/nonterminals.txt ]; then + echo "$0: Currently we do not support nonterminals" && exit 1; +else + grammar_opts= +fi + +[ -f path.sh ] && . ./path.sh + +# Validate dict directory +! utils/validate_dict_dir.pl $srcdir && \ + echo "*Error validating directory $srcdir*" && exit 1; + +if [[ ! -f $srcdir/lexicon.txt ]]; then + echo "**Creating $srcdir/lexicon.txt from $srcdir/lexiconp.txt" + perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' < $srcdir/lexiconp.txt > $srcdir/lexicon.txt || exit 1; +fi +if [[ ! -f $srcdir/lexiconp.txt ]]; then + echo "**Creating $srcdir/lexiconp.txt from $srcdir/lexicon.txt" + perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $srcdir/lexiconp.txt || exit 1; +fi + +# Currently The lexicon in dict directory have to be a subword lexicon. 
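(As an aside on the separator convention documented in the options above: a subword that is not word-final carries the separator as a suffix, so the original word sequence can be recovered by concatenation. The Python sketch below is purely illustrative and not part of the patch.)

```python
# Purely illustrative (not part of the patch): the "@@" separator convention.
# A subword that ends with the separator is word-internal; one without it ends
# a word, so e.g. "inter@@ nation@@ al" maps back to "international".

def join_subwords(subwords, separator="@@"):
    words, current = [], ""
    for piece in subwords:
        if piece.endswith(separator):
            current += piece[:-len(separator)]   # word continues
        else:
            words.append(current + piece)        # word ends here
            current = ""
    return words

print(join_subwords(["inter@@", "nation@@", "al"]))  # ['international']
```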
+# If the lexicon is for word and is not phonemic, we can not get a subword lexicon without knowing the alignment. +! grep -q $separator $srcdir/lexiconp.txt && \ +echo "$0: Warning, this lexicon contains no separator \"$separator\" and may not be a subword lexicon." && exit 1; + +# Write the separator into file for future use. +echo $separator > $dir/subword_separator.txt + +if ! utils/validate_dict_dir.pl $srcdir >&/dev/null; then + utils/validate_dict_dir.pl $srcdir # show the output. + echo "Validation failed (second time)" + exit 1; +fi + +if $position_dependent_phones; then + # Create $tmpdir/lexiconp.txt from $srcdir/lexiconp.txt (or + # $tmpdir/lexiconp_silprob.txt from $srcdir/lexiconp_silprob.txt) by + # adding the markers _B, _E, _S, _I depending on word position. + # In this recipe, these markers apply to silence also. + # Do this starting from lexiconp.txt only. + if "$silprob"; then + echo "$0: Currently we do not support word-dependent silence probability" && exit 1; + else + utils/lang/make_position_dependent_subword_lexicon.py $srcdir/lexiconp.txt > $tmpdir/lexiconp.txt || exit 1; + fi + + # create $tmpdir/phone_map.txt + # this has the format (on each line) + # ... + # where the versions depend on the position of the phone within a word. + # For instance, we'd have: + # AA AA_B AA_E AA_I AA_S + # for (B)egin, (E)nd, (I)nternal and (S)ingleton + # and in the case of silence + # SIL SIL SIL_B SIL_E SIL_I SIL_S + # [because SIL on its own is one of the variants; this is for when it doesn't + # occur inside a word but as an option in the lexicon.] + + # This phone map expands the phone lists into all the word-position-dependent + # versions of the phone lists. + cat <(set -f; for x in `cat $srcdir/silence_phones.txt`; do for y in "" "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \ + <(set -f; for x in `cat $srcdir/nonsilence_phones.txt`; do for y in "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \ + > $tmpdir/phone_map.txt +else + if "$silprob"; then + echo "$0: Currently we do not support word-dependent silence probability" && exit 1; + else + cp $srcdir/lexiconp.txt $tmpdir/lexiconp.txt + fi + + cat $srcdir/silence_phones.txt $srcdir/nonsilence_phones.txt | \ + awk '{for(n=1;n<=NF;n++) print $n; }' > $tmpdir/phones + paste -d' ' $tmpdir/phones $tmpdir/phones > $tmpdir/phone_map.txt +fi + +mkdir -p $dir/phones # various sets of phones... + +# Sets of phones for use in clustering, and making monophone systems. + +if $share_silence_phones; then + # build a roots file that will force all the silence phones to share the + # same pdf's. [three distinct states, only the transitions will differ.] + # 'shared'/'not-shared' means, do we share the 3 states of the HMM + # in the same tree-root? + # Sharing across models(phones) is achieved by writing several phones + # into one line of roots.txt (shared/not-shared doesn't affect this). + # 'not-shared not-split' means we have separate tree roots for the 3 states, + # but we never split the tree so they remain stumps, + # so all phones in the line correspond to the same model. + + cat $srcdir/silence_phones.txt | awk '{printf("%s ", $0); } END{printf("\n");}' | cat - $srcdir/nonsilence_phones.txt | \ + utils/apply_map.pl $tmpdir/phone_map.txt > $dir/phones/sets.txt + cat $dir/phones/sets.txt | \ + awk '{if(NR==1) print "not-shared", "not-split", $0; else print "shared", "split", $0;}' > $dir/phones/roots.txt +else + # different silence phones will have different GMMs. 
[note: here, all "shared split" means + # is that we may have one GMM for all the states, or we can split on states. because they're + # context-independent phones, they don't see the context.] + cat $srcdir/{,non}silence_phones.txt | utils/apply_map.pl $tmpdir/phone_map.txt > $dir/phones/sets.txt + cat $dir/phones/sets.txt | awk '{print "shared", "split", $0;}' > $dir/phones/roots.txt +fi + +cat $srcdir/silence_phones.txt | utils/apply_map.pl $tmpdir/phone_map.txt | \ + awk '{for(n=1;n<=NF;n++) print $n;}' > $dir/phones/silence.txt +cat $srcdir/nonsilence_phones.txt | utils/apply_map.pl $tmpdir/phone_map.txt | \ + awk '{for(n=1;n<=NF;n++) print $n;}' > $dir/phones/nonsilence.txt +cp $srcdir/optional_silence.txt $dir/phones/optional_silence.txt +cp $dir/phones/silence.txt $dir/phones/context_indep.txt + +# if extra_questions.txt is empty, it's OK. +cat $srcdir/extra_questions.txt 2>/dev/null | utils/apply_map.pl $tmpdir/phone_map.txt \ + >$dir/phones/extra_questions.txt + +# Want extra questions about the word-start/word-end stuff. Make it separate for +# silence and non-silence. Probably doesn't matter, as silence will rarely +# be inside a word. +if $position_dependent_phones; then + for suffix in _B _E _I _S; do + (set -f; for x in `cat $srcdir/nonsilence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt + done + for suffix in "" _B _E _I _S; do + (set -f; for x in `cat $srcdir/silence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt + done +fi + +# add_lex_disambig.pl is responsible for adding disambiguation symbols to +# the lexicon, for telling us how many disambiguation symbols it used, +# and and also for modifying the unknown-word's pronunciation (if the +# --unk-fst was provided) to the sequence "#1 #2 #3", and reserving those +# disambig symbols for that purpose. +# The #2 will later be replaced with the actual unk model. The reason +# for the #1 and the #3 is for disambiguation and also to keep the +# FST compact. If we didn't have the #1, we might have a different copy of +# the unk-model FST, or at least some of its arcs, for each start-state from +# which an transition comes (instead of per end-state, which is more compact); +# and adding the #3 prevents us from potentially having 2 copies of the unk-model +# FST due to the optional-silence [the last phone of any word gets 2 arcs]. + +if "$silprob"; then + echo "$0: Currently we do not support word-dependent silence probability" && exit 1; +else + ndisambig=$(utils/add_lex_disambig.pl $unk_opt --pron-probs $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt) +fi +ndisambig=$[$ndisambig+$num_extra_phone_disambig_syms]; # add (at least) one disambig symbol for silence in lexicon FST. +echo $ndisambig > $tmpdir/lex_ndisambig + +# Format of lexiconp_disambig.txt: +# !SIL 1.0 SIL_S +# 1.0 SPN_S #1 +# 1.0 SPN_S #2 +# 1.0 NSN_S +# !EXCLAMATION-POINT 1.0 EH2_B K_I S_I K_I L_I AH0_I M_I EY1_I SH_I AH0_I N_I P_I OY2_I N_I T_E + +( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) >$dir/phones/disambig.txt + +# Create phone symbol table. +echo "" | cat - $dir/phones/{silence,nonsilence,disambig}.txt | \ + awk '{n=NR-1; print $1, n;}' > $dir/phones.txt + +# Create a file that describes the word-boundary information for +# each phone. 5 categories. 
+if $position_dependent_phones; then + cat $dir/phones/{silence,nonsilence}.txt | \ + awk '/_I$/{print $1, "internal"; next;} /_B$/{print $1, "begin"; next; } + /_S$/{print $1, "singleton"; next;} /_E$/{print $1, "end"; next; } + {print $1, "nonword";} ' > $dir/phones/word_boundary_moved.txt +else + # word_boundary.txt might have been generated by another source + [ -f $srcdir/word_boundary.txt ] && cp $srcdir/word_boundary.txt $dir/phones/word_boundary_moved.txt +fi + +# Create word symbol table. +# and are only needed due to the need to rescore lattices with +# ConstArpaLm format language model. They do not normally appear in G.fst or +# L.fst. + +if "$silprob"; then + echo "$0: Currently we do not support word-dependent silence probability" && exit 1; +fi + +cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq | awk ' + BEGIN { + print " 0"; + } + { + if ($1 == "") { + print " is in the vocabulary!" | "cat 1>&2" + exit 1; + } + if ($1 == "") { + print " is in the vocabulary!" | "cat 1>&2" + exit 1; + } + printf("%s %d\n", $1, NR); + } + END { + printf("#0 %d\n", NR+1); + printf(" %d\n", NR+2); + printf(" %d\n", NR+3); + }' > $dir/words.txt || exit 1; + +# In case there are extra word-level disambiguation symbols they also +# need to be added to words.txt + +# format of $dir/words.txt: +# 0 +# a 1 +# aa 2 +# aarvark 3 +# ... + +silphone=`cat $srcdir/optional_silence.txt` || exit 1; +[ -z "$silphone" ] && \ + ( echo "You have no optional-silence phone; it is required in the current scripts" + echo "but you may use the option --sil-prob 0.0 to stop it being used." ) && \ + exit 1; + +# create $dir/phones/align_lexicon.{txt,int}. +# This is the method we use for lattice word alignment if we are not +# using word-position-dependent phones. + +# First remove pron-probs from the lexicon. +perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' <$tmpdir/lexiconp.txt >$tmpdir/align_lexicon.txt + +# Note: here, $silphone will have no suffix e.g. _S because it occurs as optional-silence, +# and is not part of a word. +[ ! -z "$silphone" ] && echo " $silphone" >> $tmpdir/align_lexicon.txt + +cat $tmpdir/align_lexicon.txt | \ + perl -ane '@A = split; print $A[0], " ", join(" ", @A), "\n";' | sort | uniq > $dir/phones/align_lexicon.txt + +# create phones/align_lexicon.int from phones/align_lexicon.txt +cat $dir/phones/align_lexicon.txt | utils/sym2int.pl -f 3- $dir/phones.txt | \ + utils/sym2int.pl -f 1-2 $dir/words.txt > $dir/phones/align_lexicon.int + +# Create the basic L.fst without disambiguation symbols, for use +# in training. + +if $silprob; then +# # Add silence probabilities (models the prob. of silence before and after each +# # word). On some setups this helps a bit. See utils/dict_dir_add_pronprobs.sh +# # and where it's called in the example scripts (run.sh). + echo "$0: Currently we do not support word-dependnet silence probability" && exit 1; +else + utils/lang/make_subword_lexicon_fst.py $grammar_opts --sil-prob=$sil_prob --sil-phone=$silphone --position-dependent\ + --separator=$separator $tmpdir/lexiconp.txt | \ + fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \ + --keep_isymbols=false --keep_osymbols=false | \ + fstarcsort --sort_type=olabel > $dir/L.fst || exit 1; +fi + +# The file oov.txt contains a word that we will map any OOVs to during +# training. +echo "$oov_word" > $dir/oov.txt || exit 1; +cat $dir/oov.txt | utils/sym2int.pl $dir/words.txt >$dir/oov.int || exit 1; +# integer version of oov symbol, used in some scripts. 
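The words.txt construction above boils down to: number the unique subwords from 1, reserve 0 for epsilon, and append the word-level disambiguation and sentence-boundary symbols at the end. A rough Python rendering follows; it is an illustration only, and it assumes the standard Kaldi special symbols <eps>, #0, <s> and </s>.

```python
# Rough Python rendering of the awk block that builds words.txt (illustration
# only, not part of the patch). Assumes the standard Kaldi special symbols:
# <eps> gets id 0, subwords are numbered from 1 in sorted order, and #0, <s>,
# </s> are appended after them.

def make_words_txt(subwords):
    for bad in ("<s>", "</s>"):
        if bad in subwords:
            raise ValueError("{} is in the vocabulary!".format(bad))
    table = [("<eps>", 0)]
    table += [(w, i) for i, w in enumerate(sorted(set(subwords)), start=1)]
    n = table[-1][1]
    table += [("#0", n + 1), ("<s>", n + 2), ("</s>", n + 3)]
    return table

for sym, idx in make_words_txt(["inter@@", "nation@@", "al", "n"]):
    print(sym, idx)
```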
+ +# the file wdisambig.txt contains a (line-by-line) list of the text-form of the +# disambiguation symbols that are used in the grammar and passed through by the +# lexicon. At this stage it's hardcoded as '#0', but we're laying the groundwork +# for more generality (which probably would be added by another script). +# wdisambig_words.int contains the corresponding list interpreted by the +# symbol table words.txt, and wdisambig_phones.int contains the corresponding +# list interpreted by the symbol table phones.txt. +echo '#0' >$dir/phones/wdisambig.txt + +utils/sym2int.pl $dir/phones.txt <$dir/phones/wdisambig.txt >$dir/phones/wdisambig_phones.int +utils/sym2int.pl $dir/words.txt <$dir/phones/wdisambig.txt >$dir/phones/wdisambig_words.int + +# Create these lists of phones in colon-separated integer list form too, +# for purposes of being given to programs as command-line options. +for f in silence nonsilence optional_silence disambig context_indep; do + utils/sym2int.pl $dir/phones.txt <$dir/phones/$f.txt >$dir/phones/$f.int + utils/sym2int.pl $dir/phones.txt <$dir/phones/$f.txt | \ + awk '{printf(":%d", $1);} END{printf "\n"}' | sed s/:// > $dir/phones/$f.csl || exit 1; +done + +for x in sets extra_questions; do + utils/sym2int.pl $dir/phones.txt <$dir/phones/$x.txt > $dir/phones/$x.int || exit 1; +done + +utils/sym2int.pl -f 3- $dir/phones.txt <$dir/phones/roots.txt \ + > $dir/phones/roots.int || exit 1; + +if [ -f $dir/phones/word_boundary_moved.txt ]; then + utils/sym2int.pl -f 1 $dir/phones.txt <$dir/phones/word_boundary_moved.txt \ + > $dir/phones/word_boundary_moved.int || exit 1; +fi + +silphonelist=`cat $dir/phones/silence.csl` +nonsilphonelist=`cat $dir/phones/nonsilence.csl` + +# Note: it's OK, after generating the 'lang' directory, to overwrite the topo file +# with another one of your choice if the 'topo' file you want can't be generated by +# utils/gen_topo.pl. We do this in the 'chain' recipes. Of course, the 'topo' file +# should cover all the phones. Try running utils/validate_lang.pl to check that +# everything is OK after modifying the topo file. +utils/gen_topo.pl $num_nonsil_states $num_sil_states $nonsilphonelist $silphonelist >$dir/topo + +# Create the lexicon FST with disambiguation symbols, and put it in lang_test. +# There is an extra step where we create a loop to "pass through" the +# disambiguation symbols from G.fst. + +if $silprob; then + echo "$0: Currently we do not support word-dependnet silence probability" && exit 1; +else + utils/lang/make_subword_lexicon_fst.py $grammar_opts \ + --sil-prob=$sil_prob --sil-phone=$silphone --sil-disambig='#'$ndisambig --position-dependent \ + --separator=$separator $tmpdir/lexiconp_disambig.txt | \ + fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \ + --keep_isymbols=false --keep_osymbols=false | \ + fstaddselfloops $dir/phones/wdisambig_phones.int $dir/phones/wdisambig_words.int | \ + fstarcsort --sort_type=olabel > $dir/L_disambig.fst || exit 1; +fi + +echo "$(basename $0): validating output directory" +! utils/validate_lang.pl $dir && echo "$(basename $0): error validating output" && exit 1; + +exit 0; diff --git a/egs/wsj/s5/utils/subword/prepare_subword_text.sh b/egs/wsj/s5/utils/subword/prepare_subword_text.sh new file mode 100755 index 00000000000..0f0ce68c44f --- /dev/null +++ b/egs/wsj/s5/utils/subword/prepare_subword_text.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# 2019 Dongji Gao + +# This script generates subword text form word text. 
+# For example, internatioal -> inter@@ nation@@ al +# @@ here is the separator indicate the poisition of subword in word. +# Subword directly followed by separator can only appear at he begining or middle of word. +# "" here can be reserved if added to the option "--glossaries" + +# Begin configuration section +separator="@@" +glossaries= +# End configuration section + +. utils/parse_options.sh + +echo "$0 $@" + +if [ $# -ne 3 ]; then + echo "Usage: utils/prepare_subword_text.sh " + echo "e.g.: utils/prepare_subword_text.sh data/train/text data/local/pair_code.txt data/train/text_subword" + echo " --seperator # default: @@" + echo " --glossaries # glossaries are words reserved" + exit 1; +fi + +word_text=$1 +pair_code=$2 +subword_text=$3 + +[ ! -f $word_text ] && echo "Word text $word_text does not exits." && exit 1; + +grep -q $separator $word_text && echo "$0: Error, word text file contains separator $separator. This might be a subword text file or you need to choose a different separator" && exit 1; + +glossaries_opt= +[ -z $glossaires ] && glossaries_opt="--glossaries $glossaries" +cut -d ' ' -f2- $word_text | \ + utils/lang/bpe/apply_bpe.py -c $pair_code --separator $separator $glossaires_opt > ${word_text}.sub + if [ $word_text == $subword_text ]; then + mv $word_text ${word_text}.old + cut -d ' ' -f1 ${word_text}.old | paste -d ' ' - ${word_text}.sub > $subword_text + else + cut -d ' ' -f1 $word_text | paste -d ' ' - ${word_text}.sub > $subword_text + fi + +rm ${word_text}.sub +echo "Subword text created." diff --git a/egs/wsj/s5/utils/validate_data_dir.sh b/egs/wsj/s5/utils/validate_data_dir.sh index dc06b6fa59e..c7e633ab57b 100755 --- a/egs/wsj/s5/utils/validate_data_dir.sh +++ b/egs/wsj/s5/utils/validate_data_dir.sh @@ -85,9 +85,7 @@ function check_sorted_and_uniq { } function partial_diff { - diff $1 $2 | head -n 6 - echo "..." - diff $1 $2 | tail -n 6 + diff -U1 $1 $2 | (head -n 6; echo "..."; tail -n 6) n1=`cat $1 | wc -l` n2=`cat $2 | wc -l` echo "[Lengths are $1=$n1 versus $2=$n2]" @@ -341,9 +339,23 @@ if [ -f $data/utt2dur ]; then exit 1; fi cat $data/utt2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line : " $0; exit(1) }}' || exit 1 + awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line utt2dur:" NR ":" $0; exit(1) }}' || exit 1 fi +if [ -f $data/utt2num_frames ]; then + check_sorted_and_uniq $data/utt2num_frames + cat $data/utt2num_frames | awk '{print $1}' > $tmpdir/utts.utt2num_frames + if ! cmp -s $tmpdir/utts{,.utt2num_frames}; then + echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2num_frames file" + echo "$0: differ, partial diff is:" + partial_diff $tmpdir/utts{,.utt2num_frames} + exit 1 + fi + awk <$data/utt2num_frames '{ + if (NF != 2 || !($2 > 0) || $2 != int($2)) { + print "Bad line utt2num_frames:" NR ":" $0 + exit 1 } }' || exit 1 +fi if [ -f $data/reco2dur ]; then check_sorted_and_uniq $data/reco2dur diff --git a/egs/wsj/s5/utils/validate_dict_dir.pl b/egs/wsj/s5/utils/validate_dict_dir.pl index 8f8534c329b..209f9fd40c1 100755 --- a/egs/wsj/s5/utils/validate_dict_dir.pl +++ b/egs/wsj/s5/utils/validate_dict_dir.pl @@ -35,7 +35,7 @@ sub get_utf8_or_bytestream { $is_utf_compatible = $is_utf_compatible && defined($decoded_text); push @unicode_lines, $decoded_text; } else { - #print STDERR "WARNING: the line $raw_text cannot be interpreted as UTF-8: $decoded_text\n"; + #print STDERR "WARNING: the line($.) 
$raw_text cannot be interpreted as UTF-8: $decoded_text\n"; ; } push @raw_lines, $raw_text; diff --git a/egs/wsj/s5/utils/validate_lang.pl b/egs/wsj/s5/utils/validate_lang.pl index ea2272f3cda..8dba2a0ca69 100755 --- a/egs/wsj/s5/utils/validate_lang.pl +++ b/egs/wsj/s5/utils/validate_lang.pl @@ -4,6 +4,7 @@ # Copyright 2012 Guoguo Chen # 2014 Neil Nelson # 2017 Johns Hopkins University (Jan "Yenda" Trmal ) +# 2019 Dongji Gao # # Validation script for data/lang @@ -101,6 +102,7 @@ sub check_allowed_whitespace { $skip_det_check = 0; $skip_disambig_check = 0; $skip_generate_words_check = 0; +$subword_check = 0; for ($x=0; $x <= 3; $x++) { if (@ARGV > 0 && $ARGV[0] eq "--skip-determinization-check") { @@ -121,6 +123,7 @@ sub check_allowed_whitespace { print "Usage: $0 [options] \n"; print "e.g.: $0 data/lang\n"; print "Options:\n"; + print " --skip-det-check (this flag causes it to skip a deterministic fst check).\n"; print " --skip-determinization-check (this flag causes it to skip a time consuming check).\n"; print " --skip-disambig-check (this flag causes it to skip a disambig check in phone bigram models).\n"; exit(1); @@ -131,6 +134,40 @@ sub check_allowed_whitespace { $lang = shift @ARGV; $exit = 0; $warning = 0; + +# Checking existence of separator file ------------------ +print "Checking existence of separator file\n"; +if (!-e "$lang/subword_separator.txt") { + print "separator file $lang/subword_separator.txt is empty or does not exist, deal in word case.\n"; +} else { + if (!open(S, "<$lang/subword_separator.txt")) { + print "--> ERROR: fail to open $lang/subword_separator.txt\n"; exit 1; + } else { + $line_num = `wc -l <$lang/subword_separator.txt`; + if ($line_num != 1) { + print "--> ERROR, $lang/subword_separator.txt should only contain one line.\n"; exit 1; + } else { + while () { + chomp; + my @col = split(" ", $_); + if (@col != 1) { + print "--> ERROR, invalid separator.\n"; exit 1; + } else { + $separator = shift @col; + $separator_length = length $separator; + $subword_check = 1; + } + } + } + } +} + +if (!$subword_check) { + $word_boundary = "word_boundary"; +} else { + $word_boundary = "word_boundary_moved"; +} + # Checking phones.txt ------------------------------- print "Checking $lang/phones.txt ...\n"; if (-z "$lang/phones.txt") { @@ -492,7 +529,7 @@ sub check_summation { my $ok = 1; foreach $p (keys %psymtab) { if (! 
defined $sum{$p} && $p !~ m/^#nonterm/) { - $exit = 1; $ok = 0; print("--> ERROR: phone $p is not in silence.txt, nonsilence.txt or disambig.txt..."); + $exit = 1; $ok = 0; print("--> ERROR: phone $p is not in silence.txt, nonsilence.txt or disambig.txt...\n"); } } @@ -530,8 +567,8 @@ sub check_summation { $exit = 1; } } -if (-e "$lang/phones/word_boundary.txt") { - check_txt_int("$lang/phones/word_boundary", \%psymtab, 0); print "\n"; +if (-e "$lang/phones/$word_boundary.txt") { + check_txt_int("$lang/phones/$word_boundary", \%psymtab, 0); print "\n"; } # Checking optional_silence.txt ------------------------------- @@ -634,10 +671,10 @@ sub check_summation { $end = ""; $internal = ""; $singleton = ""; -if (-s "$lang/phones/word_boundary.txt") { - print "Checking word_boundary.txt: silence.txt, nonsilence.txt, disambig.txt ...\n"; - if (!open (W, "<$lang/phones/word_boundary.txt")) { - $exit = 1; print "--> ERROR: fail to open $lang/phones/word_boundary.txt\n"; +if (-s "$lang/phones/$word_boundary.txt") { + print "Checking $word_boundary.txt: silence.txt, nonsilence.txt, disambig.txt ...\n"; + if (!open (W, "<$lang/phones/$word_boundary.txt")) { + $exit = 1; print "--> ERROR: fail to open $lang/phones/$word_boundary.txt\n"; } $idx = 1; %wb = (); @@ -660,7 +697,7 @@ sub check_summation { s/ singleton$//g; @col = split(" ", $_); if (@col == 1) {$singleton .= "$col[0] ";} } if (@col != 1) { - $exit = 1; print "--> ERROR: expect 1 column in $lang/phones/word_boundary.txt (line $idx)\n"; + $exit = 1; print "--> ERROR: expect 1 column in $lang/phones/$word_boundary.txt (line $idx)\n"; } $wb{shift @col} = 1; $idx ++; @@ -671,13 +708,13 @@ sub check_summation { $success1 = 1; if (@itset != 0) { $success1 = 0; - $exit = 1; print "--> ERROR: $lang/phones/word_boundary.txt has disambiguation symbols -- "; + $exit = 1; print "--> ERROR: $lang/phones/$word_boundary.txt has disambiguation symbols -- "; foreach (@itset) { print "$_ "; } print "\n"; } - $success1 == 0 || print "--> $lang/phones/word_boundary.txt doesn't include disambiguation symbols\n"; + $success1 == 0 || print "--> $lang/phones/$word_boundary.txt doesn't include disambiguation symbols\n"; %sum = (%silence, %nonsilence); @itset = intersect(\%sum, \%wb); @@ -685,7 +722,7 @@ sub check_summation { $success2 = 1; if (@itset < scalar(keys %sum)) { $success2 = 0; - $exit = 1; print "--> ERROR: phones in nonsilence.txt and silence.txt but not in word_boundary.txt -- "; + $exit = 1; print "--> ERROR: phones in nonsilence.txt and silence.txt but not in $word_boundary.txt -- "; foreach (keys %sum) { if (!$itset{$_}) { print "$_ "; @@ -695,7 +732,7 @@ sub check_summation { } if (@itset < scalar(keys %wb)) { $success2 = 0; - $exit = 1; print "--> ERROR: phones in word_boundary.txt but not in nonsilence.txt or silence.txt -- "; + $exit = 1; print "--> ERROR: phones in $word_boundary.txt but not in nonsilence.txt or silence.txt -- "; foreach (keys %wb) { if (!$itset{$_}) { print "$_ "; @@ -703,8 +740,8 @@ sub check_summation { } print "\n"; } - $success2 == 0 || print "--> $lang/phones/word_boundary.txt is the union of nonsilence.txt and silence.txt\n"; - $success1 != 1 or $success2 != 1 || print "--> $lang/phones/word_boundary.txt is OK\n"; + $success2 == 0 || print "--> $lang/phones/$word_boundary.txt is the union of nonsilence.txt and silence.txt\n"; + $success1 != 1 or $success2 != 1 || print "--> $lang/phones/$word_boundary.txt is OK\n"; print "\n"; } @@ -750,11 +787,11 @@ sub check_summation { close(P); my $len = @wdisambig, $len2; if (($len2 
= @wdisambig_words) != $len) { - print "--> ERROR: files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_words.int have different lengths"; + print "--> ERROR: files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_words.int have different lengths\n"; $exit = 1; return; } if (($len2 = @wdisambig_phones) != $len) { - print "--> ERROR: files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_phones.int have different lengths"; + print "--> ERROR: files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_phones.int have different lengths\n"; $exit = 1; return; } for (my $i = 0; $i < $len; $i++) { @@ -777,16 +814,23 @@ sub check_summation { } } - -if (-s "$lang/phones/word_boundary.int") { - print "Checking word_boundary.int and disambig.int\n"; - if (!open (W, "<$lang/phones/word_boundary.int")) { - $exit = 1; print "--> ERROR: fail to open $lang/phones/word_boundary.int\n"; +# Check validity of L.fst, L_disambig.fst, and word_boundary.int. +# First we generate a random word/subword sequence. We then compile it into fst and compose it with L.fst/L_disambig.fst. +# For subword case the last subword of the sequence must be a end-subword +# (i.e. the subword can only be at the end of word or is a single word itself) +# to guarantee the composition would not fail. +# We then get the corresponging phones sequence and apply a transition matrix on it to get the number of valid boundaries. +# In word case, the number of valid boundaries should be equal to the number of words. +# In subword case, the number of valid boundaries should be equal to the number of end-subwords. +if (-s "$lang/phones/$word_boundary.int") { + print "Checking $word_boundary.int and disambig.int\n"; + if (!open (W, "<$lang/phones/$word_boundary.int")) { + $exit = 1; print "--> ERROR: fail to open $lang/phones/$word_boundary.int\n"; } while () { @A = split; if (@A != 2) { - $exit = 1; print "--> ERROR: bad line $_ in $lang/phones/word_boundary.int\n"; + $exit = 1; print "--> ERROR: bad line $_ in $lang/phones/$word_boundary.int\n"; } $wbtype{$A[0]} = $A[1]; } @@ -814,23 +858,58 @@ sub check_summation { next; } $wlen = int(rand(100)) + 1; - print "--> generating a $wlen word sequence\n"; + $end_subword = 0; + print "--> generating a $wlen word/subword sequence\n"; $wordseq = ""; $sid = 0; $wordseq_syms = ""; - foreach (1 .. $wlen) { + # exclude disambiguation symbols, BOS and EOS, epsilon, and + # grammar-related symbols from the word sequence. + while ($sid < ($wlen - 1)) { $id = int(rand(scalar(keys %wint2sym))); - # exclude disambiguation symbols, BOS and EOS, epsilon, and - # grammar-related symbols from the word sequence. while (defined $wdisambig_words_hash{$id} or - $wint2sym{$id} eq "" or $wint2sym{$id} eq "" or - $wint2sym{$id} =~ m/^#nonterm/ or $id == 0) { + $wint2sym{$id} eq "" or $wint2sym{$id} eq "" or + $wint2sym{$id} =~ m/^#nonterm/ or $id == 0) { $id = int(rand(scalar(keys %wint2sym))); } $wordseq_syms = $wordseq_syms . $wint2sym{$id} . " "; $wordseq = $wordseq . "$sid ". ($sid + 1) . 
" $id $id 0\n"; $sid ++; + + if ($subword_check) { + $subword = $wint2sym{$id}; + $suffix = substr($subword, -$separator_length, $separator_length); + if ($suffix ne $separator) { + $end_subword ++; + } + } + } + + # generate the last word (subword) + $id = int(rand(scalar(keys %wint2sym))); + if ($subword_check) { + $subword = $wint2sym{$id}; + $suffix = substr($subword, -$separator_length, $separator_length); + # the last subword can not followed by separator + while (defined $wdisambig_words_hash{$id} or + $wint2sym{$id} eq "" or $wint2sym{$id} eq "" or + $wint2sym{$id} =~ m/^#nonterm/ or $id == 0 or $suffix eq $separator) { + $id = int(rand(scalar(keys %wint2sym))); + $subword = $wint2sym{$id}; + $suffix = substr($subword, -$separator_length, $separator_length); + } + $end_subword ++; + } else { + while (defined $wdisambig_words_hash{$id} or + $wint2sym{$id} eq "" or $wint2sym{$id} eq "" or + $wint2sym{$id} =~ m/^#nonterm/ or $id == 0) { + $id = int(rand(scalar(keys %wint2sym))); + } } + $wordseq_syms = $wordseq_syms . $wint2sym{$id} . " "; + $wordseq = $wordseq . "$sid ". ($sid + 1) . " $id $id 0\n"; + $sid ++; + $wordseq = $wordseq . "$sid 0"; $phoneseq = `. ./path.sh; echo \"$wordseq" | fstcompile | fstcompose $lang/$fst - | fstproject | fstrandgen | fstrmepsilon | fsttopsort | fstprint | awk '{if (NF > 2) {print \$3}}';`; $transition = { }; # empty assoc. array of allowed transitions between phone types. 1 means we count a word, @@ -861,10 +940,10 @@ sub check_summation { $state = $wbtype{$phone}; } if (!defined $state) { - $exit = 1; print "--> ERROR: phone $phone is not specified in $lang/phones/word_boundary.int\n"; + $exit = 1; print "--> ERROR: phone $phone is not specified in $lang/phones/$word_boundary.int\n"; last; } elsif (!defined $transition{$cur_state, $state}) { - $exit = 1; print "--> ERROR: transition from state $cur_state to $state indicates error in word_boundary.int or L.fst\n"; + $exit = 1; print "--> ERROR: transition from state $cur_state to $state indicates error in $word_boundary.int or L.fst\n"; last; } else { $num_words += $transition{$cur_state, $state}; @@ -873,10 +952,13 @@ sub check_summation { } } if (!$exit) { + if ($subword_check) { + $wlen = $end_subword; + } if ($num_words != $wlen) { $phoneseq_syms = ""; foreach my $id (split(" ", $phoneseq)) { $phoneseq_syms = $phoneseq_syms . " " . $pint2sym{$id}; } - $exit = 1; print "--> ERROR: number of reconstructed words $num_words does not match real number of words $wlen; indicates problem in $fst or word_boundary.int. phoneseq = $phoneseq_syms, wordseq = $wordseq_syms\n"; + $exit = 1; print "--> ERROR: number of reconstructed words $num_words does not match real number of words $wlen; indicates problem in $fst or $word_boundary.int. 
phoneseq = $phoneseq_syms, wordseq = $wordseq_syms\n"; } else { print "--> resulting phone sequence from $fst corresponds to the word sequence\n"; print "--> $fst is OK\n"; diff --git a/egs/yomdle_fa/v1/local/chain/run_cnn_e2eali_1b.sh b/egs/yomdle_fa/v1/local/chain/run_cnn_e2eali_1b.sh index e7c125d16de..700b57d9fce 100755 --- a/egs/yomdle_fa/v1/local/chain/run_cnn_e2eali_1b.sh +++ b/egs/yomdle_fa/v1/local/chain/run_cnn_e2eali_1b.sh @@ -131,7 +131,7 @@ if [ $stage -le 4 ]; then mkdir -p $dir echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) cnn_opts="l2-regularize=0.075" tdnn_opts="l2-regularize=0.075" output_opts="l2-regularize=0.1" diff --git a/egs/yomdle_fa/v1/local/prepare_dict.sh b/egs/yomdle_fa/v1/local/prepare_dict.sh index f1b1a8d70cc..8d14130d8c0 100755 --- a/egs/yomdle_fa/v1/local/prepare_dict.sh +++ b/egs/yomdle_fa/v1/local/prepare_dict.sh @@ -18,7 +18,7 @@ mkdir -p $dir local/prepare_lexicon.py --data-dir $data_dir $dir -sed -i '/^\s*$/d' $dir/lexicon.txt +perl -i -ne 'print if /\S/' $dir/lexicon.txt cut -d' ' -f2- $dir/lexicon.txt | sed 's/SIL//g' | tr ' ' '\n' | sort -u | sed '/^$/d' >$dir/nonsilence_phones.txt || exit 1; echo ' SIL' >> $dir/lexicon.txt diff --git a/egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1a.sh b/egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1a.sh index c43d7c669c1..03333f6d229 100755 --- a/egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1a.sh +++ b/egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1a.sh @@ -127,7 +127,7 @@ if [ $stage -le 4 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) cnn_opts="l2-regularize=0.075" tdnn_opts="l2-regularize=0.075" output_opts="l2-regularize=0.1" diff --git a/egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1b.sh b/egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1b.sh index 8fca9235f46..fd9cdc8921d 100755 --- a/egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1b.sh +++ b/egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1b.sh @@ -124,7 +124,7 @@ if [ $stage -le 4 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) cnn_opts="l2-regularize=0.03 dropout-proportion=0.0" tdnn_opts="l2-regularize=0.03" output_opts="l2-regularize=0.04" diff --git a/egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1a.sh b/egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1a.sh index 654880fcf59..f6b2c1bac42 100755 --- a/egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1a.sh +++ b/egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1a.sh @@ -143,7 +143,7 @@ if [ $stage -le 11 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $sup_tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + 
learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) cnn_opts="l2-regularize=0.075" tdnn_opts="l2-regularize=0.075" output_opts="l2-regularize=0.1" diff --git a/egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1b.sh b/egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1b.sh index eb688151665..8185fa2645d 100755 --- a/egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1b.sh +++ b/egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1b.sh @@ -142,7 +142,7 @@ if [ $stage -le 11 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $sup_tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) cnn_opts="l2-regularize=0.03 dropout-proportion=0.0" tdnn_opts="l2-regularize=0.03" output_opts="l2-regularize=0.04" diff --git a/egs/yomdle_russian/README.txt b/egs/yomdle_russian/README.txt new file mode 100644 index 00000000000..3bf4cc8cd2d --- /dev/null +++ b/egs/yomdle_russian/README.txt @@ -0,0 +1,3 @@ +This directory contains example scripts for OCR on the Yomdle and Slam datasets. +Training is done on the Yomdle dataset and testing is done on Slam. +LM rescoring is also done with extra corpus data obtained from various sources diff --git a/egs/yomdle_russian/v1/cmd.sh b/egs/yomdle_russian/v1/cmd.sh new file mode 100755 index 00000000000..3d69546dfe8 --- /dev/null +++ b/egs/yomdle_russian/v1/cmd.sh @@ -0,0 +1,12 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. +export cmd="queue.pl" diff --git a/egs/yomdle_russian/v1/image b/egs/yomdle_russian/v1/image new file mode 120000 index 00000000000..1668ee99922 --- /dev/null +++ b/egs/yomdle_russian/v1/image @@ -0,0 +1 @@ +../../cifar/v1/image/ \ No newline at end of file diff --git a/egs/yomdle_russian/v1/local/chain/compare_wer.sh b/egs/yomdle_russian/v1/local/chain/compare_wer.sh new file mode 100755 index 00000000000..80f31e0f311 --- /dev/null +++ b/egs/yomdle_russian/v1/local/chain/compare_wer.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b} + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora + +if [ $# == 0 ]; then + echo "Usage: $0: [ ... 
]" + echo "e.g.: $0 exp/chain/cnn{1a,1b}" + exit 1 +fi + +echo "# $0 $*" +used_epochs=false + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +echo -n "# WER " +for x in $*; do + wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# WER (rescored) " +for x in $*; do + wer=$(cat $x/decode_test_rescored/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# CER " +for x in $*; do + cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +echo -n "# CER (rescored) " +for x in $*; do + cer=$(cat $x/decode_test_rescored/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo diff --git a/egs/yomdle_russian/v1/local/chain/run_cnn_e2eali.sh b/egs/yomdle_russian/v1/local/chain/run_cnn_e2eali.sh new file mode 120000 index 00000000000..e2545b0186e --- /dev/null +++ b/egs/yomdle_russian/v1/local/chain/run_cnn_e2eali.sh @@ -0,0 +1 @@ +tuning/run_cnn_e2eali_1a.sh \ No newline at end of file diff --git a/egs/yomdle_russian/v1/local/chain/run_e2e_cnn.sh b/egs/yomdle_russian/v1/local/chain/run_e2e_cnn.sh new file mode 100755 index 00000000000..6f5742cd34b --- /dev/null +++ b/egs/yomdle_russian/v1/local/chain/run_e2e_cnn.sh @@ -0,0 +1,129 @@ +#!/bin/bash + +# Copyright 2017 Hossein Hadian +# This script does end2end chain training (i.e. from scratch) +# local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ +# System e2e_cnn_1a +# score_basic rescoring + nomalized +# WER 16.24 11.0 +# WER (rescored) 15.63 10.5 +# CER 5.98 5.6 +# CER (rescored) 5.66 5.3 +# Final train prob 0.1376 +# Final valid prob 0.1913 +# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a +# exp/chain/e2e_cnn_1a: num-iters=27 nj=5..8 num-params=3.0M dim=40->470 combine=0.091->0.091 (over 1) logprob:train/valid[17,26,final]=(0.135,0.137,0.138/0.191,0.191,0.191) + +set -e +# configs for 'chain' +stage=0 +nj=30 +train_stage=-10 +get_egs_stage=-10 +affix=1a + +# training options +tdnn_dim=450 +minibatch_size=150=64,32/300=32,16/600=16,8/1200=8,4 +cmvn_opts="--norm-means=false --norm-vars=false" +train_set=train +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 1 ]; then + steps/nnet3/chain/e2e/prepare_e2e.sh --nj $nj --cmd "$cmd" \ + --shared-phones true \ + --type mono \ + data/$train_set $lang $treedir + $cmd $treedir/log/make_phone_lm.log \ + cat data/$train_set/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \ + utils/sym2int.pl -f 2- data/lang/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=500 \ + ark:- $treedir/phone_lm.fst +fi + +if [ $stage -le 2 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + cnn_opts="l2-regularize=0.075" + tdnn_opts="l2-regularize=0.075" + output_opts="l2-regularize=0.1" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $output_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs +fi + +if [ $stage -le 3 ]; then + steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ + --cmd "$cmd" \ + --feat.cmvn-opts "$cmvn_opts" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.apply-deriv-weights true \ + --egs.stage $get_egs_stage \ + --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ + --chain.frame-subsampling-factor 4 \ + --chain.alignment-subsampling-factor 4 \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 3 \ + --trainer.optimization.momentum 0 \ + --trainer.optimization.num-jobs-initial 5 \ + --trainer.optimization.num-jobs-final 8 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.shrink-value 1.0 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir data/${train_set} \ + --tree-dir $treedir \ + --dir $dir || exit 1; +fi diff --git a/egs/yomdle_russian/v1/local/chain/tuning/run_cnn_e2eali_1a.sh 
b/egs/yomdle_russian/v1/local/chain/tuning/run_cnn_e2eali_1a.sh new file mode 100755 index 00000000000..cd582472993 --- /dev/null +++ b/egs/yomdle_russian/v1/local/chain/tuning/run_cnn_e2eali_1a.sh @@ -0,0 +1,203 @@ +#!/bin/bash + +# local/chain/compare_wer.sh exp/chain/cnn_e2eali_1a +# System cnn_e2eali_1a rescoring + nomalized +# WER 12.08 7.7 +# WER (rescored) 11.90 7.5 +# CER 3.60 3.4 +# CER (rescored) 3.42 3.2 +# Final train prob -0.0373 +# Final valid prob -0.0362 +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1a +# exp/chain/cnn_e2eali_1a: num-iters=74 nj=3..16 num-params=6.3M dim=40->848 combine=-0.039->-0.039 (over 1) xent:train/valid[48,73,final]=(-0.206,-0.153,-0.146/-0.191,-0.156,-0.151) logprob:train/valid[48,73,final]=(-0.044,-0.038,-0.037/-0.040,-0.037,-0.036) + +set -e -o pipefail +stage=0 +nj=30 +train_set=train +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=1000 +# we don't need extra left/right context for TDNN systems. +tdnn_dim=550 +# training options +srand=0 +remove_egs=false +dropout_schedule='0,0@0.20,0.2@0.50,0' +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + cnn_opts="l2-regularize=0.03 dropout-proportion=0.0" + tdnn_opts="l2-regularize=0.03" + output_opts="l2-regularize=0.04" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=90" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + conv-relu-batchnorm-dropout-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-dropout-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-dropout-layer name=cnn7 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + relu-batchnorm-dropout-layer name=tdnn1 input=Append(-8,-4,0,4,8) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=900" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=16 \ + --trainer.frames-per-iter=2000000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=16 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=32,16 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi diff --git a/egs/yomdle_russian/v1/local/check_tools.sh b/egs/yomdle_russian/v1/local/check_tools.sh new file mode 100755 index 00000000000..5b4d3107d3b --- /dev/null +++ b/egs/yomdle_russian/v1/local/check_tools.sh @@ -0,0 +1,43 @@ +#!/bin/bash -u + +# Copyright 2015 (c) Johns Hopkins University (Jan Trmal ) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +[ -f ./path.sh ] && . ./path.sh +set +e + +command -v python3 >&/dev/null \ + || { echo >&2 "python3 not found on PATH. You will have to install Python3, preferably >= 3.6"; exit 1; } + +python3 -c "import numpy" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs numpy installed." + exit 1 +fi + +python3 -c "import scipy" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs scipy installed." + exit 1 +fi + +python3 -c "import scipy.misc; scipy.misc.__dict__['imread']" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs scipy-image and Pillow installed." + exit 1 +fi + + +exit 0 diff --git a/egs/yomdle_russian/v1/local/extract_features.sh b/egs/yomdle_russian/v1/local/extract_features.sh new file mode 100755 index 00000000000..3880ebad3e8 --- /dev/null +++ b/egs/yomdle_russian/v1/local/extract_features.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# Copyright 2017 Yiwen Shao +# 2018 Ashish Arora + +# Apache 2.0 +# This script runs the make features script in parallel. + +nj=4 +cmd=run.pl +feat_dim=40 +augment='no_aug' +fliplr=false +echo "$0 $@" + +. ./cmd.sh +. 
./path.sh +. ./utils/parse_options.sh || exit 1; + +data=$1 +featdir=$data/data +scp=$data/images.scp +logdir=$data/log + +mkdir -p $logdir +mkdir -p $featdir + +# make $featdir an absolute pathname +featdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $featdir ${PWD}` + +for n in $(seq $nj); do + split_scps="$split_scps $logdir/images.$n.scp" +done + +# split images.scp +utils/split_scp.pl $scp $split_scps || exit 1; + +$cmd JOB=1:$nj $logdir/extract_features.JOB.log \ + image/ocr/make_features.py $logdir/images.JOB.scp \ + --allowed_len_file_path $data/allowed_lengths.txt \ + --feat-dim $feat_dim --fliplr $fliplr --augment_type $augment \| \ + copy-feats --compress=true --compression-method=7 \ + ark:- ark,scp:$featdir/images.JOB.ark,$featdir/images.JOB.scp + +## aggregates the output scp's to get feats.scp +for n in $(seq $nj); do + cat $featdir/images.$n.scp || exit 1; +done > $data/feats.scp || exit 1 diff --git a/egs/yomdle_russian/v1/local/prepare_dict.sh b/egs/yomdle_russian/v1/local/prepare_dict.sh new file mode 100755 index 00000000000..22db5ae834d --- /dev/null +++ b/egs/yomdle_russian/v1/local/prepare_dict.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash + +# Copyright 2017 Hossein Hadian +# 2017 Babak Rekabdar +# 2017 Chun Chieh Chang +# 2017 Ashish Arora + +# This script prepares the dictionary. + +set -e +dir=data/local/dict +. ./utils/parse_options.sh || exit 1; + +mkdir -p $dir + +local/prepare_lexicon.py $dir + +cut -d' ' -f2- $dir/lexicon.txt | sed 's/SIL//g' | tr ' ' '\n' | sort -u | sed '/^$/d' >$dir/nonsilence_phones.txt || exit 1; + +echo ' SIL' >> $dir/lexicon.txt + +echo SIL > $dir/silence_phones.txt + +echo SIL >$dir/optional_silence.txt + +echo -n "" >$dir/extra_questions.txt diff --git a/egs/yomdle_russian/v1/local/prepare_lexicon.py b/egs/yomdle_russian/v1/local/prepare_lexicon.py new file mode 100755 index 00000000000..a68b1cb49dd --- /dev/null +++ b/egs/yomdle_russian/v1/local/prepare_lexicon.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Babak Rekabdar +# 2017 Hossein Hadian +# 2017 Chun Chieh Chang +# 2017 Ashish Arora +# Apache 2.0 + +# This script prepares lexicon for BPE. It gets the set of all words that occur in data/train/text. +# Since this lexicon is based on BPE, it replaces '|' with silence. 
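The comment above summarizes the whole mapping: every character of a word becomes a "phone", the BPE word-boundary marker '|' becomes SIL, and '#' is dropped. A condensed sketch of that rule follows; it is simplified (edge cases such as spacing around a dropped '#' follow the real script below, not this helper):

```python
def bpe_token_to_pron(token):
    """Character-level 'pronunciation' of a BPE token, as described above:
    '|' (word-boundary marker) -> SIL, '#' dropped, every other char kept."""
    phones = []
    for ch in token:
        if ch == '|':
            phones.append('SIL')
        elif ch != '#':
            phones.append(ch)
    return ' '.join(phones)

print(bpe_token_to_pron('|the'))   # -> 'SIL t h e'
```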
+ +import argparse +import os +import unicodedata +parser = argparse.ArgumentParser(description="""Creates the list of characters and words in lexicon""") +parser.add_argument('dir', type=str, help='output path') +args = parser.parse_args() + +### main ### +lex = {} +text_path = os.path.join('data', 'train', 'text') +with open(text_path, 'r', encoding='utf-8') as f: + for line in f: + line_vect = line.strip().split(' ') + for i in range(1, len(line_vect)): + characters = list(line_vect[i]) + characters = " ".join([ 'SIL' if char == '|' else char for char in characters]) + characters = list(characters) + characters = "".join([ '' if char == '#' else char for char in characters]) + lex[line_vect[i]] = characters + +with open(os.path.join(args.dir, 'lexicon.txt'), 'w', encoding='utf-8') as fp: + for key in sorted(lex): + fp.write(key + " " + lex[key] + "\n") diff --git a/egs/yomdle_russian/v1/local/process_corpus.py b/egs/yomdle_russian/v1/local/process_corpus.py new file mode 100755 index 00000000000..b39030270b7 --- /dev/null +++ b/egs/yomdle_russian/v1/local/process_corpus.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 +# Copyright 2018 Ashish Arora +# Apache 2.0 +# This script reads valid phones and removes the lines in the corpus +# which have any other phone. + +import os +import sys, io + +phone_file = os.path.join('data/local/text/cleaned/phones.txt') +infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') +phone_dict = dict() +with open(phone_file, 'r', encoding='utf-8') as phone_fh: + for line in phone_fh: + line = line.strip().split()[0] + phone_dict[line] = line + +phone_dict[' '] = ' ' +corpus_text = list() +for line in infile: + text = line.strip() + skip_text = False + for phone in text: + if phone not in phone_dict.keys(): + skip_text = True + break + if not skip_text: + output.write(text+ '\n') + diff --git a/egs/yomdle_russian/v1/local/process_data.py b/egs/yomdle_russian/v1/local/process_data.py new file mode 100755 index 00000000000..d7546b0a803 --- /dev/null +++ b/egs/yomdle_russian/v1/local/process_data.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python3 + +# Copyright 2018 Ashish Arora +# 2018 Chun Chieh Chang + +""" This script reads the extracted Tamil OCR (yomdle and slam) database files + and creates the following files (for the data subset selected via --dataset): + text, utt2spk, images.scp. + Eg. local/process_data.py data/download/ data/local/splits/train.txt data/train + + Eg. 
text file: english_phone_books_0001_1 To sum up, then, it would appear that + utt2spk file: english_phone_books_0001_0 english_phone_books_0001 + images.scp file: english_phone_books_0001_0 \ + data/download/truth_line_image/english_phone_books_0001_0.png +""" + +import argparse +import os +import sys +import csv +import itertools +import unicodedata +import re +import string +import unicodedata +parser = argparse.ArgumentParser(description="Creates text, utt2spk, and images.scp files") +parser.add_argument('database_path', type=str, help='Path to data') +parser.add_argument('data_split', type=str, help='Path to file that contain datasplits') +parser.add_argument('out_dir', type=str, help='directory to output files') +args = parser.parse_args() + +### main ### +print("Processing '{}' data...".format(args.out_dir)) + +text_file = os.path.join(args.out_dir, 'text') +text_fh = open(text_file, 'w', encoding='utf-8') +utt2spk_file = os.path.join(args.out_dir, 'utt2spk') +utt2spk_fh = open(utt2spk_file, 'w', encoding='utf-8') +image_file = os.path.join(args.out_dir, 'images.scp') +image_fh = open(image_file, 'w', encoding='utf-8') + +with open(args.data_split) as f: + for line in f: + line = line.strip() + image_id = line + image_filename = image_id + '.png' + image_filepath = os.path.join(args.database_path, 'truth_line_image', image_filename) + if not os.path.isfile (image_filepath): + print("File does not exist {}".format(image_filepath)) + continue + line_id = int(line.split('_')[-1]) + csv_filename = '_'.join(line.split('_')[:-1]) + '.csv' + csv_filepath = os.path.join(args.database_path, 'truth_csv', csv_filename) + csv_file = open(csv_filepath, 'r', encoding='utf-8') + for row in csv.reader(csv_file): + if row[1] == image_filename: + text = row[11] + text_vect = text.split() # this is to avoid non-utf-8 spaces + text = " ".join(text_vect) + #text_normalized = unicodedata.normalize('NFD', text).replace('\n', '') + if not text: + continue + text_fh.write(image_id + ' ' + text + '\n') + utt2spk_fh.write(image_id + ' ' + '_'.join(line.split('_')[:-1]) + '\n') + image_fh.write(image_id + ' ' + image_filepath + '\n') diff --git a/egs/yomdle_russian/v1/local/score.sh b/egs/yomdle_russian/v1/local/score.sh new file mode 100755 index 00000000000..31564d25326 --- /dev/null +++ b/egs/yomdle_russian/v1/local/score.sh @@ -0,0 +1,5 @@ +#!/bin/bash + + +steps/scoring/score_kaldi_wer.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" diff --git a/egs/yomdle_russian/v1/local/train_lm.sh b/egs/yomdle_russian/v1/local/train_lm.sh new file mode 100755 index 00000000000..c73c42fb7dc --- /dev/null +++ b/egs/yomdle_russian/v1/local/train_lm.sh @@ -0,0 +1,127 @@ +#!/bin/bash + +# Copyright 2016 Vincent Nguyen +# 2016 Johns Hopkins University (author: Daniel Povey) +# 2017 Ashish Arora +# 2017 Hossein Hadian +# Apache 2.0 +# +# This script trains a LM on the training transcriptions and corpus text. +# It is based on the example scripts distributed with PocoLM + +# It will check if pocolm is installed and if not will proceed with installation + +set -e +stage=0 +dir=data/local/local_lm +order=6 +echo "$0 $@" # Print the command line for logging +. ./utils/parse_options.sh || exit 1; + +lm_dir=${dir}/data + + +mkdir -p $dir +. ./path.sh || exit 1; # for KALDI_ROOT +export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH +( # First make sure the pocolm toolkit is installed. + cd $KALDI_ROOT/tools || exit 1; + if [ -d pocolm ]; then + echo Not installing the pocolm toolkit since it is already there. 
+ else + echo "$0: Please install the PocoLM toolkit with: " + echo " cd ../../../tools; extras/install_pocolm.sh; cd -" + exit 1; + fi +) || exit 1; + +bypass_metaparam_optim_opt= +# If you want to bypass the metaparameter optimization steps with specific metaparameters +# un-comment the following line, and change the numbers to some appropriate values. +# You can find the values from output log of train_lm.py. +# These example numbers of metaparameters is for 4-gram model (with min-counts) +# running with train_lm.py. +# The dev perplexity should be close to the non-bypassed model. +#bypass_metaparam_optim_opt="--bypass-metaparameter-optimization=0.031,0.860,0.678,0.194,0.037,0.006,0.928,0.712,0.454,0.220,0.926,0.844,0.749,0.358,0.966,0.879,0.783,0.544,0.966,0.826,0.674,0.450" +# Note: to use these example parameters, you may need to remove the .done files +# to make sure the make_lm_dir.py be called and tain only 3-gram model +#for order in 3; do +#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done + +if [ $stage -le 0 ]; then + mkdir -p ${dir}/data + mkdir -p ${dir}/data/text + + echo "$0: Getting the Data sources" + + rm ${dir}/data/text/* 2>/dev/null || true + + # use the validation data as the dev set. + # Note: the name 'dev' is treated specially by pocolm, it automatically + # becomes the dev set. + + cat data/local/text/cleaned/bpe_val.txt > ${dir}/data/text/dev.txt + # use the training data as an additional data source. + # we can later fold the dev data into this. + cat data/train/text | cut -d " " -f 2- > ${dir}/data/text/train.txt + cat data/local/text/cleaned/bpe_corpus.txt > ${dir}/data/text/corpus_text.txt + # for reporting perplexities, we'll use the "real" dev set. + # (the validation data is used as ${dir}/data/text/dev.txt to work + # out interpolation weights.) + # note, we can't put it in ${dir}/data/text/, because then pocolm would use + # it as one of the data sources. + cut -d " " -f 2- < data/test/text > ${dir}/data/real_dev_set.txt + + # get the wordlist from train and corpus text + cat ${dir}/data/text/{train,corpus_text}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist +fi + +if [ $stage -le 1 ]; then + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. + # Note: if you have more than one order, use a certain amount of words as the + # vocab and want to restrict max memory for 'sort', + echo "$0: training the unpruned LM" + min_counts='train=1' + wordlist=${dir}/data/wordlist + + lm_name="`basename ${wordlist}`_${order}" + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + train_lm.py --wordlist=${wordlist} --num-splits=20 --warm-start-ratio=20 \ + --limit-unk-history=true \ + ${bypass_metaparam_optim_opt} \ + ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} + + get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_unpruned.arpa.gz +fi + +if [ $stage -le 2 ]; then + echo "$0: pruning the LM (to larger size)" + # Using 10 million n-grams for a big LM for rescoring purposes. 
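Before the pruning below (the `size=10000000` line that follows implements the 10-million-n-gram target mentioned in the comment above), it may help to see what the stage-0 word-list pipeline produces. Here is a rough Python equivalent of that tr/sort/uniq chain, only as a sketch assuming the ${dir}/data/text layout created above; the recipe itself runs the shell pipeline:

```python
#!/usr/bin/env python3
# Sketch of the stage-0 word-list construction: count every whitespace-separated
# token in the training and corpus text, then write counts sorted by frequency
# (word_count) and the bare vocabulary (wordlist).
from collections import Counter

counts = Counter()
for name in ['data/local/local_lm/data/text/train.txt',
             'data/local/local_lm/data/text/corpus_text.txt']:
    with open(name, encoding='utf-8') as f:
        for line in f:
            counts.update(line.split())

with open('word_count', 'w', encoding='utf-8') as wc, \
        open('wordlist', 'w', encoding='utf-8') as wl:
    for word, n in counts.most_common():
        wc.write('{} {}\n'.format(n, word))
        wl.write(word + '\n')
```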
+ size=10000000 + prune_lm_dir.py --target-num-ngrams=$size --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' + #[perplexity = 22.0613098868] over 151116.0 words + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz +fi + +if [ $stage -le 3 ]; then + echo "$0: pruning the LM (to smaller size)" + # Using 2 million n-grams for a smaller LM for graph building. Prune from the + # bigger-pruned LM, it'll be faster. + size=2000000 + prune_lm_dir.py --target-num-ngrams=$size ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' + #[perplexity = 23.4801171202] over 151116.0 words + format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz +fi diff --git a/egs/yomdle_russian/v1/local/wer_output_filter b/egs/yomdle_russian/v1/local/wer_output_filter new file mode 100755 index 00000000000..59e364e0231 --- /dev/null +++ b/egs/yomdle_russian/v1/local/wer_output_filter @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Hossein Hadian + +# Apache 2.0 +# This script converts a BPE-encoded text to normal text. It is used in scoring + +import sys, io +import string +infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') +for line in infile: + words = line.strip().split() + uttid = words[0] + transcript = ''.join(words[1:]) + transcript = transcript.replace('|', ' ') + output.write(uttid + ' ' + transcript + '\n') diff --git a/egs/yomdle_russian/v1/local/yomdle b/egs/yomdle_russian/v1/local/yomdle new file mode 120000 index 00000000000..2c4544c1399 --- /dev/null +++ b/egs/yomdle_russian/v1/local/yomdle @@ -0,0 +1 @@ +../../../yomdle_tamil/v1/local/yomdle/ \ No newline at end of file diff --git a/egs/yomdle_russian/v1/path.sh b/egs/yomdle_russian/v1/path.sh new file mode 100755 index 00000000000..2d17b17a84a --- /dev/null +++ b/egs/yomdle_russian/v1/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/yomdle_russian/v1/run_end2end.sh b/egs/yomdle_russian/v1/run_end2end.sh new file mode 100755 index 00000000000..12beebeaa05 --- /dev/null +++ b/egs/yomdle_russian/v1/run_end2end.sh @@ -0,0 +1,186 @@ +#!/bin/bash + +# Copyright 2018 Hossein Hadian +# Ashish Arora +# Jonathan Chang +# Apache 2.0 + +set -e +stage=0 +nj=30 + +language_main=Russian +slam_dir=/export/corpora5/slam/SLAM/ +yomdle_dir=/export/corpora5/slam/YOMDLE/ +corpus_dir=/export/corpora5/handwriting_ocr/corpus_data/ru/ +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +./local/check_tools.sh +# Start from stage=-2 for data preparation. This stage stores line images, +# csv files and splits{train,test,train_unsup} data/download/truth_line_image, +# data/download/truth_csv and data/local/splits respectively. 
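Stage 0 below creates the standard Kaldi data-directory files (text, images.scp, utt2spk and spk2utt). In this OCR setup the "speaker" is simply the page id, i.e. the utterance id with its trailing line index stripped, as done in local/process_data.py. The minimal sketch below shows the utt2spk/spk2utt relationship those files encode, using hypothetical ids; the recipe itself relies on image/fix_data_dir.sh and the standard utils/ tools:

```python
# Minimal sketch: derive spk2utt from utt2spk (ids below are hypothetical).
from collections import defaultdict

utt2spk = {
    'russian_book_0001_0': 'russian_book_0001',
    'russian_book_0001_1': 'russian_book_0001',
    'russian_book_0002_0': 'russian_book_0002',
}

spk2utt = defaultdict(list)
for utt, spk in sorted(utt2spk.items()):
    spk2utt[spk].append(utt)

for spk, utts in sorted(spk2utt.items()):
    # e.g. 'russian_book_0001 russian_book_0001_0 russian_book_0001_1'
    print(spk, ' '.join(utts))
```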
+if [ $stage -le -2 ]; then + echo "$0: $(date): preparing data, obtaining line images and csv files..." + local/yomdle/create_download_dir.sh --language_main $language_main \ + --slam_dir $slam_dir --yomdle_dir $yomdle_dir +fi + +if [ $stage -le -1 ]; then + echo "$0: $(date): getting corpus text for language modelling..." + mkdir -p data/local/text/cleaned + cat $corpus_dir/* > data/local/text/ru.txt + head -20000 data/local/text/ru.txt > data/local/text/cleaned/val.txt + tail -n +20000 data/local/text/ru.txt > data/local/text/cleaned/corpus.txt +fi + +mkdir -p data/{train,test}/data +if [ $stage -le 0 ]; then + echo "$0: stage 0: Processing train and test data.$(date)" + echo "$0: creating text, images.scp, utt2spk and spk2utt" + #local/prepare_data.sh data/download/ + for set in train test; do + local/process_data.py data/download/ \ + data/local/splits/${set}.txt data/${set} + image/fix_data_dir.sh data/${set} + done +fi + +if [ $stage -le 1 ]; then + echo "$0: $(date) stage 1: getting allowed image widths for e2e training..." + image/get_image2num_frames.py --feat-dim 40 data/train + image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train + for set in train test; do + echo "$0: $(date) Extracting features, creating feats.scp file" + local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/${set} + steps/compute_cmvn_stats.sh data/${set} || exit 1; + done + image/fix_data_dir.sh data/train +fi + +if [ $stage -le 3 ]; then + echo "$0: $(date) stage 3: BPE preparation" + # getting non-silence phones. + cut -d' ' -f2- data/train/text | \ +python3 <( +cat << "END" +import os, sys, io; +infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8'); +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8'); +phone_dict = dict(); +for line in infile: + line_vect = line.strip().split(); + for word in line_vect: + for phone in word: + phone_dict[phone] = phone; + +for phone in phone_dict.keys(): + output.write(phone+ '\n'); +END + ) > data/local/text/cleaned/phones.txt + + cut -d' ' -f2- data/train/text > data/local/text/cleaned/train.txt + + echo "$0: learning BPE..." + # it is currently learned with only training text but we can also use all corpus text + # to learn BPE. phones are added so that one isolated occurance of every phone exists. + cat data/local/text/cleaned/phones.txt data/local/text/cleaned/train.txt | \ + utils/lang/bpe/prepend_words.py | utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt || exit 1; +fi + +if [ $stage -le 4 ]; then + echo "$0: $(date) stage 4: applying BPE..." + echo "$0: applying BPE on train, test text..." + for set in test train; do + cut -d' ' -f1 data/$set/text > data/$set/ids + cut -d' ' -f2- data/$set/text | utils/lang/bpe/prepend_words.py | \ + utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt | \ + sed 's/@@//g' > data/$set/bpe_text + mv data/$set/text data/$set/text.old + paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text + rm -f data/$set/bpe_text data/$set/ids + done + + echo "$0: applying BPE to corpus text..." + cat data/local/text/cleaned/corpus.txt | utils/lang/bpe/prepend_words.py | \ + utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt | \ + sed 's/@@//g' > data/local/text/cleaned/bpe_corpus.txt + cat data/local/text/cleaned/val.txt | utils/lang/bpe/prepend_words.py | \ + utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt | \ + sed 's/@@//g' > data/local/text/cleaned/bpe_val.txt +fi + +if [ $stage -le 5 ]; then + echo "$0: $(date) stage 5: Preparing dictionary and lang..." 
+ local/prepare_dict.sh --dir data/local/dict + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 4 --sil-prob 0.0 --position-dependent-phones false \ + data/local/dict "" data/lang/temp data/lang + utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang +fi + +if [ $stage -le 6 ]; then + echo "$0: $(date) stage 6: Calling the flat-start chain recipe..." + local/chain/run_e2e_cnn.sh +fi + +if [ $stage -le 7 ]; then + echo "$0: $(date) stage 7: Aligning the training data using the e2e chain model..." + steps/nnet3/align.sh --nj $nj --cmd "$cmd" \ + --scale-opts '--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0' \ + data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train +fi + +chunk_width='340,300,200,100' +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g +if [ $stage -le 8 ]; then + echo "$0: $(date) stage 8: Building a tree and training a regular chain model using the e2e alignments..." + local/chain/run_cnn_e2eali.sh --chunk_width $chunk_width +fi + +if [ $stage -le 9 ]; then + echo "$0: $(date) stage 9: Estimating a language model for decoding..." + local/train_lm.sh + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_small.arpa.gz \ + data/local/dict/lexicon.txt data/lang + utils/build_const_arpa_lm.sh data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \ + data/lang data/lang_rescore_6g +fi + +if [ $stage -le 10 ] && $decode_e2e; then + echo "$0: $(date) stage 10: decoding end2end setup..." + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + exp/chain/e2e_cnn_1a/ exp/chain/e2e_cnn_1a/graph || exit 1; + + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 30 --cmd "$cmd" --beam 12 \ + exp/chain/e2e_cnn_1a/graph data/test exp/chain/e2e_cnn_1a/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test exp/chain/e2e_cnn_1a/decode_test{,_rescored} || exit 1 + + echo "$0: Done. Date: $(date). Results:" + local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ +fi + +if [ $stage -le 11 ] && $decode_chain; then + echo "$0: $(date) stage 11: decoding chain alignment setup..." + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + exp/chain/cnn_e2eali_1a/ exp/chain/cnn_e2eali_1a/graph || exit 1; + + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 30 --cmd "$cmd" --beam 12 \ + exp/chain/cnn_e2eali_1a/graph data/test exp/chain/cnn_e2eali_1a/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test exp/chain/cnn_e2eali_1a/decode_test{,_rescored} || exit 1 + + echo "$0: Done. Date: $(date). 
Results:" + local/chain/compare_wer.sh exp/chain/cnn_e2eali_1a +fi diff --git a/egs/yomdle_russian/v1/steps b/egs/yomdle_russian/v1/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/yomdle_russian/v1/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/yomdle_russian/v1/utils b/egs/yomdle_russian/v1/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/yomdle_russian/v1/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file diff --git a/egs/yomdle_tamil/v1/local/chain/tuning/run_cnn_e2eali_1a.sh b/egs/yomdle_tamil/v1/local/chain/tuning/run_cnn_e2eali_1a.sh index c43d7c669c1..03333f6d229 100755 --- a/egs/yomdle_tamil/v1/local/chain/tuning/run_cnn_e2eali_1a.sh +++ b/egs/yomdle_tamil/v1/local/chain/tuning/run_cnn_e2eali_1a.sh @@ -127,7 +127,7 @@ if [ $stage -le 4 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) cnn_opts="l2-regularize=0.075" tdnn_opts="l2-regularize=0.075" output_opts="l2-regularize=0.1" diff --git a/egs/yomdle_tamil/v1/local/chain/tuning/run_cnn_e2eali_1b.sh b/egs/yomdle_tamil/v1/local/chain/tuning/run_cnn_e2eali_1b.sh index 9a12a5a9e1e..fb15ce10dde 100755 --- a/egs/yomdle_tamil/v1/local/chain/tuning/run_cnn_e2eali_1b.sh +++ b/egs/yomdle_tamil/v1/local/chain/tuning/run_cnn_e2eali_1b.sh @@ -125,7 +125,7 @@ if [ $stage -le 4 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) cnn_opts="l2-regularize=0.03 dropout-proportion=0.0" tdnn_opts="l2-regularize=0.03" output_opts="l2-regularize=0.04" diff --git a/egs/yomdle_tamil/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1a.sh b/egs/yomdle_tamil/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1a.sh index 654880fcf59..f6b2c1bac42 100755 --- a/egs/yomdle_tamil/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1a.sh +++ b/egs/yomdle_tamil/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1a.sh @@ -143,7 +143,7 @@ if [ $stage -le 11 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $sup_tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) cnn_opts="l2-regularize=0.075" tdnn_opts="l2-regularize=0.075" output_opts="l2-regularize=0.1" diff --git a/egs/yomdle_tamil/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1b.sh b/egs/yomdle_tamil/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1b.sh index 08641f6a38a..17d59642b05 100755 --- a/egs/yomdle_tamil/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1b.sh +++ b/egs/yomdle_tamil/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1b.sh @@ -142,7 +142,7 @@ if [ $stage -le 11 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $sup_tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) cnn_opts="l2-regularize=0.03 
dropout-proportion=0.0" tdnn_opts="l2-regularize=0.03" output_opts="l2-regularize=0.04" diff --git a/egs/yomdle_zh/v1/local/chain/run_cnn_e2eali_1b.sh b/egs/yomdle_zh/v1/local/chain/run_cnn_e2eali_1b.sh index 4183aa74587..0a4e00d7aed 100755 --- a/egs/yomdle_zh/v1/local/chain/run_cnn_e2eali_1b.sh +++ b/egs/yomdle_zh/v1/local/chain/run_cnn_e2eali_1b.sh @@ -130,7 +130,7 @@ if [ $stage -le 4 ]; then mkdir -p $dir echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) cnn_opts="l2-regularize=0.075" tdnn_opts="l2-regularize=0.075" output_opts="l2-regularize=0.1" diff --git a/egs/yomdle_zh/v1/local/create_download.sh b/egs/yomdle_zh/v1/local/create_download.sh index a440a331747..1daad354473 100755 --- a/egs/yomdle_zh/v1/local/create_download.sh +++ b/egs/yomdle_zh/v1/local/create_download.sh @@ -43,4 +43,4 @@ local/create_line_image_from_page_image.py \ echo "Downloading table for CangJie." wget -P $download_dir/ $cangjie_url || exit 1; -sed -ie '1,8d' $download_dir/cj5-cc.txt +perl -n -i -e 'print if $. > 8' $download_dir/cj5-cc.txt diff --git a/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_1a.sh index 55e046dd55a..14b9a8d6c8e 100755 --- a/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_1a.sh @@ -156,7 +156,7 @@ if [ $stage -le 11 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) tdnn_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim-continuous=true" tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" diff --git a/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_opgru_1a.sh b/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_opgru_1a.sh index 44110888519..28b36243ba3 100755 --- a/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_opgru_1a.sh +++ b/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_opgru_1a.sh @@ -158,7 +158,7 @@ if [ $stage -le 11 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) gru_opts="dropout-per-frame=true dropout-proportion=0.0" mkdir -p $dir/configs diff --git a/scripts/rnnlm/choose_features.py b/scripts/rnnlm/choose_features.py index c6621e04494..595c1d85bc1 100755 --- a/scripts/rnnlm/choose_features.py +++ b/scripts/rnnlm/choose_features.py @@ -10,12 +10,8 @@ from collections import defaultdict sys.stdout = open(1, 'w', encoding='utf-8', closefd=False) -# because this script splits inside words, we cannot use latin-1; we actually need to know what -# what the encoding is. By default we make this utf-8; to handle encodings that are not compatible -# with utf-8 (e.g. gbk), we'll eventually have to make the encoding an option to this script. 
- import re -tab_or_space = re.compile('[ \t]+') + parser = argparse.ArgumentParser(description="This script chooses the sparse feature representation of words. " "To be more specific, it chooses the set of features-- you compute " @@ -92,7 +88,7 @@ def read_vocab(vocab_file): vocab = {} with open(vocab_file, 'r', encoding="utf-8") as f: for line in f: - fields = re.split(tab_or_space, line) + fields = line.split() assert len(fields) == 2 if fields[0] in vocab: sys.exit(sys.argv[0] + ": duplicated word({0}) in vocab: {1}" @@ -121,7 +117,7 @@ def read_unigram_probs(unigram_probs_file): unigram_probs = [] with open(unigram_probs_file, 'r', encoding="utf-8") as f: for line in f: - fields = re.split(tab_or_space, line) + fields = line.split() assert len(fields) == 2 idx = int(fields[0]) if idx >= len(unigram_probs): diff --git a/scripts/rnnlm/compute_perplexity.sh b/scripts/rnnlm/compute_perplexity.sh new file mode 100755 index 00000000000..17c441e6aea --- /dev/null +++ b/scripts/rnnlm/compute_perplexity.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# This script computes perplexity of text on the specified RNNLM model. + +[ -f ./path.sh ] && . ./path.sh +. utils/parse_options.sh + +if [ $# != 2 ]; then + echo "Usage: $0 <rnnlm-dir> <text-in>" + exit 1 +fi + +dir=$1 +text_in=$2 + +# the format of the $text_in file is one sentence per line, without explicit +# <s> or </s> symbols, and without utterance-id's, for example: + +# ====== begin file ====== +# well western new york is supposed to be used to this kind of weather but +# yeah you are right +# in um anaheim california you know just +# ====== end file ====== + +if [ -f $dir/word_embedding.final.mat ]; then + word_embedding=$dir/word_embedding.final.mat +else + [ ! -f $dir/feat_embedding.final.mat ] && + echo "$0: expect file $dir/feat_embedding.final.mat to exist" + word_embedding="rnnlm-get-word-embedding $dir/word_feats.txt $dir/feat_embedding.final.mat -|" +fi + +for x in final.raw config/words.txt; do + if [ ! -f $dir/$x ]; then + echo "$0: expected file $dir/$x to exist." 
+ exit 1; + fi +done + +special_symbol_opts=$(cat $dir/special_symbol_opts.txt) + +ppl=$(rnnlm-sentence-probs --normalize-probs=true \ + $special_symbol_opts $dir/final.raw "$word_embedding" \ + <(cat $text_in | sym2int.pl $dir/config/words.txt | awk '{print "utt_id ", $0}') | \ + awk '{for(i=2;i<=NF;i++) a+=$i; b+=NF-1}END{print exp(-a / b)}') + +echo "$0: perplexity is $ppl" + diff --git a/scripts/rnnlm/get_best_model.py b/scripts/rnnlm/get_best_model.py index 333ed8dbfc7..ed266346e06 100755 --- a/scripts/rnnlm/get_best_model.py +++ b/scripts/rnnlm/get_best_model.py @@ -21,7 +21,7 @@ num_iters = None try: - with open(args.rnnlm_dir + "/info.txt", encoding="latin-1") as f: + with open(args.rnnlm_dir + "/info.txt", encoding="utf-8") as f: for line in f: a = line.split("=") if a[0] == "num_iters": @@ -40,7 +40,7 @@ for i in range(1, num_iters): this_logfile = "{0}/log/compute_prob.{1}.log".format(args.rnnlm_dir, i) try: - f = open(this_logfile, 'r', encoding='latin-1') + f = open(this_logfile, 'r', encoding='utf-8') except: sys.exit(sys.argv[0] + ": could not open log-file {0}".format(this_logfile)) this_objf = -1000 diff --git a/scripts/rnnlm/get_embedding_dim.py b/scripts/rnnlm/get_embedding_dim.py index 63eaf307498..1d516e0edf5 100755 --- a/scripts/rnnlm/get_embedding_dim.py +++ b/scripts/rnnlm/get_embedding_dim.py @@ -45,7 +45,7 @@ left_context=0 right_context=0 for line in out_lines: - line = line.decode('latin-1') + line = line.decode('utf-8') m = re.search(r'input-node name=input dim=(\d+)', line) if m is not None: try: diff --git a/scripts/rnnlm/get_special_symbol_opts.py b/scripts/rnnlm/get_special_symbol_opts.py index 4310b116ad7..7ee0ca54c9a 100755 --- a/scripts/rnnlm/get_special_symbol_opts.py +++ b/scripts/rnnlm/get_special_symbol_opts.py @@ -9,7 +9,7 @@ import sys import re -tab_or_space = re.compile('[ \t]+') + parser = argparse.ArgumentParser(description="This script checks whether the special symbols " "appear in words.txt with expected values, if not, it will " @@ -28,9 +28,9 @@ lower_ids = {} upper_ids = {} -input_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='latin-1') +input_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') for line in input_stream: - fields = re.split(tab_or_space, line) + fields = line.split() assert(len(fields) == 2) sym = fields[0] if sym in special_symbols: diff --git a/scripts/rnnlm/get_unigram_probs.py b/scripts/rnnlm/get_unigram_probs.py index ab3f9bb382f..52e637a0e2d 100755 --- a/scripts/rnnlm/get_unigram_probs.py +++ b/scripts/rnnlm/get_unigram_probs.py @@ -8,7 +8,7 @@ import sys import re -tab_or_space = re.compile('[ \t]+') + parser = argparse.ArgumentParser(description="This script gets the unigram probabilities of words.", epilog="E.g. " + sys.argv[0] + " --vocab-file=data/rnnlm/vocab/words.txt " @@ -27,7 +27,7 @@ help="File that specifies multiplicities and weights for each data source: " "e.g. if contains foo.txt and bar.txt, then should have lines " "like 'foo 1 0.5' and 'bar 5 1.5'. These " - "don't have to sum to on.") + "don't have to sum to one.") parser.add_argument("--smooth-unigram-counts", type=float, default=1.0, help="Specify the constant for smoothing. 
We will add " "(smooth_unigram_counts * num_words_with_non_zero_counts / vocab_size) " @@ -77,10 +77,10 @@ def get_all_data_sources_except_dev(text_dir): # value is a tuple (repeated_times_per_epoch, weight) def read_data_weights(weights_file, data_sources): data_weights = {} - with open(weights_file, 'r', encoding="latin-1") as f: + with open(weights_file, 'r', encoding="utf-8") as f: for line in f: try: - fields = re.split(tab_or_space, line) + fields = line.split() assert len(fields) == 3 if fields[0] in data_weights: raise Exception("duplicated data source({0}) specified in " @@ -102,9 +102,9 @@ def read_data_weights(weights_file, data_sources): # return the vocab, which is a dict mapping the word to a integer id. def read_vocab(vocab_file): vocab = {} - with open(vocab_file, 'r', encoding="latin-1") as f: + with open(vocab_file, 'r', encoding="utf-8") as f: for line in f: - fields = re.split(tab_or_space, line) + fields = line.split() assert len(fields) == 2 if fields[0] in vocab: sys.exit(sys.argv[0] + ": duplicated word({0}) in vocab: {1}" @@ -131,9 +131,9 @@ def get_counts(data_sources, data_weights, vocab): if weight == 0.0: continue - with open(counts_file, 'r', encoding="latin-1") as f: + with open(counts_file, 'r', encoding="utf-8") as f: for line in f: - fields = re.split(tab_or_space, line) + fields = line.split() if len(fields) != 2: print("Warning, should be 2 cols:", fields, line, file=sys.stderr); assert(len(fields) == 2) word = fields[0] diff --git a/scripts/rnnlm/get_vocab.py b/scripts/rnnlm/get_vocab.py index 1502e915f9c..baafcb3a131 100755 --- a/scripts/rnnlm/get_vocab.py +++ b/scripts/rnnlm/get_vocab.py @@ -6,10 +6,10 @@ import os import argparse import sys -sys.stdout = open(1, 'w', encoding='latin-1', closefd=False) +sys.stdout = open(1, 'w', encoding='utf-8', closefd=False) import re -tab_or_space = re.compile('[ \t]+') + parser = argparse.ArgumentParser(description="This script get a vocab from unigram counts " "of words produced by get_unigram_counts.sh", @@ -28,10 +28,10 @@ # Add the count for every word in counts_file # the result is written into word_counts def add_counts(word_counts, counts_file): - with open(counts_file, 'r', encoding="latin-1") as f: + with open(counts_file, 'r', encoding="utf-8") as f: for line in f: line = line.strip(" \t\r\n") - word_and_count = re.split(tab_or_space, line) + word_and_count = line.split() assert len(word_and_count) == 2 if word_and_count[0] in word_counts: word_counts[word_and_count[0]] += int(word_and_count[1]) diff --git a/scripts/rnnlm/get_word_features.py b/scripts/rnnlm/get_word_features.py index aeb7a3ec6ae..cdcc0a77734 100755 --- a/scripts/rnnlm/get_word_features.py +++ b/scripts/rnnlm/get_word_features.py @@ -10,7 +10,7 @@ from collections import defaultdict import re -tab_or_space = re.compile('[ \t]+') + parser = argparse.ArgumentParser(description="This script turns the words into the sparse feature representation, " "using features from rnnlm/choose_features.py.", @@ -41,9 +41,9 @@ # return the vocab, which is a dict mapping the word to a integer id. 
def read_vocab(vocab_file): vocab = {} - with open(vocab_file, 'r', encoding="latin-1") as f: + with open(vocab_file, 'r', encoding="utf-8") as f: for line in f: - fields = re.split(tab_or_space, line) + fields = line.split() assert len(fields) == 2 if fields[0] in vocab: sys.exit(sys.argv[0] + ": duplicated word({0}) in vocab: {1}" @@ -62,9 +62,9 @@ def read_vocab(vocab_file): # return a list of unigram_probs, indexed by word id def read_unigram_probs(unigram_probs_file): unigram_probs = [] - with open(unigram_probs_file, 'r', encoding="latin-1") as f: + with open(unigram_probs_file, 'r', encoding="utf-8") as f: for line in f: - fields = re.split(tab_or_space, line) + fields = line.split() assert len(fields) == 2 idx = int(fields[0]) if idx >= len(unigram_probs): @@ -103,9 +103,9 @@ def read_features(features_file): feats['min_ngram_order'] = 10000 feats['max_ngram_order'] = -1 - with open(features_file, 'r', encoding="latin-1") as f: + with open(features_file, 'r', encoding="utf-8") as f: for line in f: - fields = re.split(tab_or_space, line) + fields = line.split() assert(len(fields) in [3, 4, 5]) feat_id = int(fields[0]) diff --git a/scripts/rnnlm/lmrescore_nbest.sh b/scripts/rnnlm/lmrescore_nbest.sh index 58b19b9fa79..a22d43961ab 100755 --- a/scripts/rnnlm/lmrescore_nbest.sh +++ b/scripts/rnnlm/lmrescore_nbest.sh @@ -58,7 +58,7 @@ elif [ ! -f $oldlm ]; then exit 1; fi -for f in $rnndir/final.raw $data/feats.scp $indir/lat.1.gz; do +for f in $rnndir/final.raw $indir/lat.1.gz; do [ ! -f $f ] && echo "$0: expected file $f to exist." && exit 1; done @@ -174,6 +174,7 @@ if [ $stage -le 5 ]; then $adir.$n/lmwt.lmonly || exit 1; done fi + if [ $stage -le 6 ]; then echo "$0: invoking rnnlm/compute_sentence_scores.sh which calls rnnlm to get RNN LM scores." $cmd JOB=1:$nj $dir/log/rnnlm_compute_scores.JOB.log \ diff --git a/scripts/rnnlm/lmrescore_pruned.sh b/scripts/rnnlm/lmrescore_pruned.sh index 9ba78415708..b6ec694ffd4 100755 --- a/scripts/rnnlm/lmrescore_pruned.sh +++ b/scripts/rnnlm/lmrescore_pruned.sh @@ -26,7 +26,7 @@ normalize=false # If true, we add a normalization step to the output of the RNNL # as in our RNNLM setup, a properly trained network would automatically # have its normalization term close to 1. The details of this # could be found at http://www.danielpovey.com/files/2018_icassp_rnnlm.pdf -lattice_prune_beam=4 # Beam used in pruned lattice composition +lattice_prune_beam=8 # Beam used in pruned lattice composition # This option affects speed and how large the composed lattice may be # End configuration section. 
diff --git a/scripts/rnnlm/prepare_split_data.py b/scripts/rnnlm/prepare_split_data.py index cceac48313e..427f043df98 100755 --- a/scripts/rnnlm/prepare_split_data.py +++ b/scripts/rnnlm/prepare_split_data.py @@ -9,7 +9,7 @@ import sys import re -tab_or_space = re.compile('[ \t]+') + parser = argparse.ArgumentParser(description="This script prepares files containing integerized text, " "for consumption by nnet3-get-egs.", @@ -66,10 +66,10 @@ def get_all_data_sources_except_dev(text_dir): # value is a tuple (repeated_times_per_epoch, weight) def read_data_weights(weights_file, data_sources): data_weights = {} - with open(weights_file, 'r', encoding="latin-1") as f: + with open(weights_file, 'r', encoding="utf-8") as f: for line in f: try: - fields = re.split(tab_or_space, line) + fields = line.split() assert len(fields) == 3 if fields[0] in data_weights: raise Exception("duplicated data source({0}) specified in " @@ -97,7 +97,7 @@ def distribute_to_outputs(source_filename, weight, output_filehandles): num_outputs = len(output_filehandles) n = 0 try: - f = open(source_filename, 'r', encoding="latin-1") + f = open(source_filename, 'r', encoding="utf-8") except Exception as e: sys.exit(sys.argv[0] + ": failed to open file {0} for reading: {1} ".format( source_filename, str(e))) @@ -124,7 +124,7 @@ def distribute_to_outputs(source_filename, weight, output_filehandles): os.makedirs(args.split_dir + "/info") # set up the 'num_splits' file, which contains an integer. -with open("{0}/info/num_splits".format(args.split_dir), 'w', encoding="latin-1") as f: +with open("{0}/info/num_splits".format(args.split_dir), 'w', encoding="utf-8") as f: print(args.num_splits, file=f) # e.g. set temp_files = [ 'foo/1.tmp', 'foo/2.tmp', ..., 'foo/5.tmp' ] @@ -136,7 +136,7 @@ def distribute_to_outputs(source_filename, weight, output_filehandles): temp_filehandles = [] for fname in temp_files: try: - temp_filehandles.append(open(fname, 'w', encoding="latin-1")) + temp_filehandles.append(open(fname, 'w', encoding="utf-8")) except Exception as e: sys.exit(sys.argv[0] + ": failed to open file: " + str(e) + ".. if this is a max-open-filehandles limitation, you may " diff --git a/scripts/rnnlm/rnnlm_cleanup.py b/scripts/rnnlm/rnnlm_cleanup.py index 40cbee7a496..6a304f7f4cb 100644 --- a/scripts/rnnlm/rnnlm_cleanup.py +++ b/scripts/rnnlm/rnnlm_cleanup.py @@ -69,7 +69,7 @@ def get_compute_prob_info(log_file): compute_prob_done = False # roughly based on code in get_best_model.py try: - f = open(log_file, "r", encoding="latin-1") + f = open(log_file, "r", encoding="utf-8") except: print(script_name + ": warning: compute_prob log not found for iteration " + str(iter) + ". Skipping", diff --git a/scripts/rnnlm/show_word_features.py b/scripts/rnnlm/show_word_features.py index 89b134adaf9..4335caed5d8 100755 --- a/scripts/rnnlm/show_word_features.py +++ b/scripts/rnnlm/show_word_features.py @@ -7,15 +7,10 @@ import argparse import sys -# The use of latin-1 encoding does not preclude reading utf-8. latin-1 encoding -# means "treat words as sequences of bytes", and it is compatible with utf-8 -# encoding as well as other encodings such as gbk, as long as the spaces are -# also spaces in ascii (which we check). It is basically how we emulate the -# behavior of python before python3. 
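The rationale in the comment removed above is that latin-1 maps every byte value to a code point, so decoding never fails and arbitrary bytes round-trip unchanged, which emulated Python 2's byte-string behavior; the downside is that it cannot detect invalid or truncated multi-byte text, which is what standardizing on utf-8 buys these scripts. A standalone sketch of both behaviors (illustration only, not part of the patch; assumes Python 3):

```python
data = "日本語 42\n".encode("utf-8")   # UTF-8 bytes of a CJK word plus a count

# latin-1 maps every byte to a code point, so decoding never fails and the
# bytes round-trip exactly -- the Python-2-like behavior the old comment describes.
text = data.decode("latin-1")
assert text.encode("latin-1") == data

# utf-8 actually validates the input: a truncated multi-byte character is
# reported instead of being silently mangled.
try:
    data[:4].decode("utf-8")
except UnicodeDecodeError as err:
    print("invalid UTF-8:", err)
```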
-sys.stdout = open(1, 'w', encoding='latin-1', closefd=False) +sys.stdout = open(1, 'w', encoding='utf-8', closefd=False) import re -tab_or_space = re.compile('[ \t]+') + parser = argparse.ArgumentParser(description="This script turns the word features to a human readable format.", epilog="E.g. " + sys.argv[0] + "exp/rnnlm/word_feats.txt exp/rnnlm/features.txt " @@ -36,9 +31,9 @@ def read_feature_type_and_key(features_file): feat_types = {} - with open(features_file, 'r', encoding="latin-1") as f: + with open(features_file, 'r', encoding="utf-8") as f: for line in f: - fields = re.split(tab_or_space, line) + fields = line.split() assert(len(fields) in [2, 3, 4]) feat_id = int(fields[0]) @@ -53,9 +48,9 @@ def read_feature_type_and_key(features_file): feat_type_and_key = read_feature_type_and_key(args.features_file) num_word_feats = 0 -with open(args.word_features_file, 'r', encoding="latin-1") as f: +with open(args.word_features_file, 'r', encoding="utf-8") as f: for line in f: - fields = re.split(tab_or_space, line) + fields = line.split() assert len(fields) % 2 == 1 print(int(fields[0]), end='\t') diff --git a/scripts/rnnlm/validate_features.py b/scripts/rnnlm/validate_features.py index 2a077da4758..e67f03207bb 100755 --- a/scripts/rnnlm/validate_features.py +++ b/scripts/rnnlm/validate_features.py @@ -8,7 +8,7 @@ import sys import re -tab_or_space = re.compile('[ \t]+') + parser = argparse.ArgumentParser(description="Validates features file, produced by rnnlm/choose_features.py.", epilog="E.g. " + sys.argv[0] + " exp/rnnlm/features.txt", @@ -24,7 +24,7 @@ if not os.path.isfile(args.features_file): sys.exit(sys.argv[0] + ": Expected file {0} to exist".format(args.features_file)) -with open(args.features_file, 'r', encoding="latin-1") as f: +with open(args.features_file, 'r', encoding="utf-8") as f: has_unigram = False has_length = False idx = 0 @@ -33,7 +33,7 @@ final_feats = {} word_feats = {} for line in f: - fields = re.split(tab_or_space, line) + fields = line.split() assert(len(fields) in [3, 4, 5]) assert idx == int(fields[0]) diff --git a/scripts/rnnlm/validate_text_dir.py b/scripts/rnnlm/validate_text_dir.py index 903e720bdf4..1f250d4c2f8 100755 --- a/scripts/rnnlm/validate_text_dir.py +++ b/scripts/rnnlm/validate_text_dir.py @@ -8,7 +8,7 @@ import sys import re -tab_or_space = re.compile('[ \t]+') + parser = argparse.ArgumentParser(description="Validates data directory containing text " "files from one or more data sources, including dev.txt.", @@ -40,7 +40,7 @@ def check_text_file(text_file): - with open(text_file, 'r', encoding="latin-1") as f: + with open(text_file, 'r', encoding="utf-8") as f: found_nonempty_line = False lineno = 0 if args.allow_internal_eos == 'true': @@ -54,7 +54,7 @@ def check_text_file(text_file): lineno += 1 if args.spot_check == 'true' and lineno > 10: break - words = re.split(tab_or_space, line) + words = line.split() if len(words) != 0: found_nonempty_line = True for word in words: @@ -76,9 +76,9 @@ def check_text_file(text_file): # with some kind of utterance-id first_field_set = set() other_fields_set = set() - with open(text_file, 'r', encoding="latin-1") as f: + with open(text_file, 'r', encoding="utf-8") as f: for line in f: - array = re.split(tab_or_space, line) + array = line.split() if len(array) > 0: first_word = array[0] if first_word in first_field_set or first_word in other_fields_set: diff --git a/scripts/rnnlm/validate_word_features.py b/scripts/rnnlm/validate_word_features.py index 205b934ae1b..372286d8d12 100755 --- 
a/scripts/rnnlm/validate_word_features.py +++ b/scripts/rnnlm/validate_word_features.py @@ -8,7 +8,7 @@ import sys import re -tab_or_space = re.compile('[ \t]+') + parser = argparse.ArgumentParser(description="Validates word features file, produced by rnnlm/get_word_features.py.", epilog="E.g. " + sys.argv[0] + " --features-file=exp/rnnlm/features.txt " @@ -28,9 +28,9 @@ unigram_feat_id = -1 length_feat_id = -1 max_feat_id = -1 -with open(args.features_file, 'r', encoding="latin-1") as f: +with open(args.features_file, 'r', encoding="utf-8") as f: for line in f: - fields = re.split(tab_or_space, line) + fields = line.split() assert(len(fields) in [3, 4, 5]) feat_id = int(fields[0]) @@ -52,9 +52,9 @@ if feat_id > max_feat_id: max_feat_id = feat_id -with open(args.word_features_file, 'r', encoding="latin-1") as f: +with open(args.word_features_file, 'r', encoding="utf-8") as f: for line in f: - fields = re.split(tab_or_space, line) + fields = line.split() assert len(fields) > 0 and len(fields) % 2 == 1 word_id = int(fields[0]) diff --git a/src/Doxyfile b/src/Doxyfile index a6c0b434ff2..0032214f803 100644 --- a/src/Doxyfile +++ b/src/Doxyfile @@ -457,7 +457,7 @@ INPUT = doc itf \ fstext hmm lm decoder lat cudamatrix nnet \ bin fstbin gmmbin fgmmbin featbin \ nnetbin latbin sgmm2 sgmm2bin nnet2 nnet2bin nnet3 nnet3bin \ - kwsbin ivector ivectorbin + kws kwsbin ivector ivectorbin online onlinebin online2 online2bin # If the value of the INPUT tag contains directories, you can use the # FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp diff --git a/src/Makefile b/src/Makefile index a49c912c6ed..07b7947f3b1 100644 --- a/src/Makefile +++ b/src/Makefile @@ -4,16 +4,16 @@ SHELL := /bin/bash - -SUBDIRS = base matrix util feat tree gmm transform \ +SUBDIRS = base matrix util feat cudafeat tree gmm transform \ fstext hmm lm decoder lat kws cudamatrix nnet \ - bin fstbin gmmbin fgmmbin featbin \ + bin fstbin gmmbin fgmmbin featbin cudafeatbin \ nnetbin latbin sgmm2 sgmm2bin nnet2 nnet3 rnnlm chain nnet3bin nnet2bin kwsbin \ - ivector ivectorbin online2 online2bin lmbin chainbin rnnlmbin + ivector ivectorbin online2 online2bin lmbin chainbin rnnlmbin \ + cudadecoder cudadecoderbin -MEMTESTDIRS = base matrix util feat tree gmm transform \ +MEMTESTDIRS = base matrix util feat cudafeat tree gmm transform \ fstext hmm lm decoder lat nnet kws chain \ - bin fstbin gmmbin fgmmbin featbin \ + bin fstbin gmmbin fgmmbin featbin cudafeatbin \ nnetbin latbin sgmm2 nnet2 nnet3 rnnlm nnet2bin nnet3bin sgmm2bin kwsbin \ ivector ivectorbin online2 online2bin lmbin @@ -143,9 +143,10 @@ $(EXT_SUBDIRS) : checkversion kaldi.mk mklibdir ext_depend ### Dependency list ### # this is necessary for correct parallel compilation #1)The tools depend on all the libraries -bin fstbin gmmbin fgmmbin sgmm2bin featbin nnetbin nnet2bin nnet3bin chainbin latbin ivectorbin lmbin kwsbin online2bin rnnlmbin: \ - base matrix util feat tree gmm transform sgmm2 fstext hmm \ - lm decoder lat cudamatrix nnet nnet2 nnet3 ivector chain kws online2 rnnlm +bin fstbin gmmbin fgmmbin sgmm2bin featbin cudafeatbin nnetbin nnet2bin nnet3bin chainbin latbin ivectorbin lmbin kwsbin online2bin rnnlmbin cudadecoderbin: \ + base matrix util feat cudafeat tree gmm transform sgmm2 fstext hmm \ + lm decoder lat cudamatrix nnet nnet2 nnet3 ivector chain kws online2 rnnlm \ + cudadecoder #2)The libraries have inter-dependencies base: base/.depend.mk @@ -159,18 +160,22 @@ sgmm2: base util matrix gmm tree transform hmm fstext: base util matrix 
tree hmm: base tree matrix util lm: base util matrix fstext -decoder: base util matrix gmm hmm tree transform lat +decoder: base util matrix gmm hmm tree transform lat fstext lat: base util hmm tree matrix cudamatrix: base util matrix nnet: base util hmm tree matrix cudamatrix nnet2: base util matrix lat gmm hmm tree transform cudamatrix -nnet3: base util matrix lat gmm hmm tree transform cudamatrix chain fstext +nnet3: base util matrix decoder lat gmm hmm tree transform cudamatrix chain fstext rnnlm: base util matrix cudamatrix nnet3 lm hmm chain: lat hmm tree fstext matrix cudamatrix util base ivector: base util matrix transform tree gmm #3)Dependencies for optional parts of Kaldi onlinebin: base matrix util feat tree gmm transform sgmm2 fstext hmm lm decoder lat cudamatrix nnet nnet2 online # python-kaldi-decoding: base matrix util feat tree gmm transform sgmm2 fstext hmm decoder lat online +cudafeat: base matrix util gmm transform tree feat cudamatrix online2 +cudafeatbin: base matrix util gmm transform tree feat cudamatrix cudafeat online2 online: decoder gmm transform feat matrix util base lat hmm tree online2: decoder gmm transform feat matrix util base lat hmm tree ivector cudamatrix nnet2 nnet3 chain kws: base util hmm tree matrix lat +cudadecoder: cudamatrix cudafeat online2 nnet3 ivector feat fstext lat chain transform +cudadecoderbin: cudadecoder cudafeat cudamatrix online2 nnet3 ivector feat fstext lat chain transform diff --git a/src/base/io-funcs-inl.h b/src/base/io-funcs-inl.h index 6b87f4c1a24..b703ef5addc 100644 --- a/src/base/io-funcs-inl.h +++ b/src/base/io-funcs-inl.h @@ -47,7 +47,7 @@ template void WriteBasicType(std::ostream &os, os << t << " "; } if (os.fail()) { - throw std::runtime_error("Write failure in WriteBasicType."); + KALDI_ERR << "Write failure in WriteBasicType."; } } @@ -122,7 +122,7 @@ inline void WriteIntegerPairVector(std::ostream &os, bool binary, os << "]\n"; } if (os.fail()) { - throw std::runtime_error("Write failure in WriteIntegerPairVector."); + KALDI_ERR << "Write failure in WriteIntegerPairVector."; } } @@ -224,7 +224,7 @@ template inline void WriteIntegerVector(std::ostream &os, bool binary, os << "]\n"; } if (os.fail()) { - throw std::runtime_error("Write failure in WriteIntegerVector."); + KALDI_ERR << "Write failure in WriteIntegerVector."; } } diff --git a/src/base/io-funcs.cc b/src/base/io-funcs.cc index 90988faf3ea..150f74099be 100644 --- a/src/base/io-funcs.cc +++ b/src/base/io-funcs.cc @@ -138,7 +138,7 @@ void WriteToken(std::ostream &os, bool binary, const char *token) { CheckToken(token); // make sure it's valid (can be read back) os << token << " "; if (os.fail()) { - throw std::runtime_error("Write failure in WriteToken."); + KALDI_ERR << "Write failure in WriteToken."; } } @@ -179,11 +179,8 @@ int PeekToken(std::istream &is, bool binary) { int ans = is.peek(); if (read_bracket) { if (!is.unget()) { - KALDI_WARN << "Error ungetting '<' in PeekToken"; - // Clear the bad bit. It seems to be possible for this code to be - // reached, and the C++ standard is very vague on whether even a single - // call to unget() should succeed; see - // http://www.cplusplus.com/reference/istream/istream/unget/ + // Clear the bad bit. This code can be (and is in fact) reached, since the + // C++ standard does not guarantee that a call to unget() must succeed. 
is.clear(); } } diff --git a/src/base/io-funcs.h b/src/base/io-funcs.h index 6c2b690f54c..895f661ecee 100644 --- a/src/base/io-funcs.h +++ b/src/base/io-funcs.h @@ -46,7 +46,7 @@ namespace kaldi { We also want to have control over whitespace in text mode without affecting the meaning of the file, for pretty-printing purposes. - Errors are handled by throwing an exception (std::runtime_error). + Errors are handled by throwing a KaldiFatalError exception. For integer and floating-point types (and boolean values): @@ -108,7 +108,7 @@ namespace kaldi { it doesn't throw. It's useful if a class can have various forms based on typedefs and virtual classes, and wants to know which version to read. - ReadToken allow the caller to obtain the next token. PeekToken works just + ReadToken allows the caller to obtain the next token. PeekToken works just like ReadToken, but seeks back to the beginning of the token. A subsequent call to ReadToken will read the same token again. This is useful when different object types are written to the same file; using PeekToken one can @@ -203,13 +203,18 @@ void WriteToken(std::ostream &os, bool binary, const std::string & token); /// value of the stream. int Peek(std::istream &is, bool binary); -/// ReadToken gets the next token and puts it in str (exception on failure). +/// ReadToken gets the next token and puts it in str (exception on failure). If +/// PeekToken() had been previously called, it is possible that the stream had +/// failed to unget the starting '<' character. In this case ReadToken() returns +/// the token string without the leading '<'. You must be prepared to handle +/// this case. ExpectToken() handles this internally, and is not affected. void ReadToken(std::istream &is, bool binary, std::string *token); /// PeekToken will return the first character of the next token, or -1 if end of /// file. It's the same as Peek(), except if the first character is '<' it will -/// skip over it and will return the next character. It will unget the '<' so -/// the stream is where it was before you did PeekToken(). +/// skip over it and will return the next character. It will attempt to unget +/// the '<' so the stream is where it was before you did PeekToken(), however, +/// this is not guaranteed (see ReadToken()). int PeekToken(std::istream &is, bool binary); /// ExpectToken tries to read in the given token, and throws an exception diff --git a/src/base/kaldi-error-test.cc b/src/base/kaldi-error-test.cc index 527de852cac..31440edf3f9 100644 --- a/src/base/kaldi-error-test.cc +++ b/src/base/kaldi-error-test.cc @@ -17,19 +17,14 @@ // See the Apache 2 License for the specific language governing permissions and // limitations under the License. - #include "base/kaldi-common.h" // testing that we get the stack trace. 
namespace kaldi { -void MyFunction2() { - KALDI_ERR << "Ignore this error"; -} +void MyFunction2() { KALDI_ERR << "Ignore this error"; } -void MyFunction1() { - MyFunction2(); -} +void MyFunction1() { MyFunction2(); } void UnitTestError() { { @@ -38,17 +33,50 @@ void UnitTestError() { } } +void VerifySymbolRange(const std::string &trace, const bool want_found, + const std::string &want_symbol) { + size_t begin, end; + const bool found = internal::LocateSymbolRange(trace, &begin, &end); + if (found != want_found) { + KALDI_ERR << "Found mismatch, got " << found << " want " << want_found; + } + if (!found) { + return; + } + const std::string symbol = trace.substr(begin, end - begin); + if (symbol != want_symbol) { + KALDI_ERR << "Symbol mismatch, got " << symbol << " want " << want_symbol; + } +} + +void TestLocateSymbolRange() { + VerifySymbolRange("", false, ""); + VerifySymbolRange( + R"TRACE(./kaldi-error-test(_ZN5kaldi13UnitTestErrorEv+0xb) [0x804965d])TRACE", + true, "_ZN5kaldi13UnitTestErrorEv"); + // It is ok thread_start is not found because it is a C symbol. + VerifySymbolRange( + R"TRACE(31 libsystem_pthread.dylib 0x00007fff6fe4e40d thread_start + 13)TRACE", + false, ""); + VerifySymbolRange( + R"TRACE(0 server 0x000000010f67614d _ZNK5kaldi13MessageLogger10LogMessageEv + 813)TRACE", + true, "_ZNK5kaldi13MessageLogger10LogMessageEv"); + VerifySymbolRange( + R"TRACE(29 libsystem_pthread.dylib 0x00007fff6fe4f2eb _pthread_body + 126)TRACE", + true, "_pthread_body"); +} -} // end namespace kaldi. +} // namespace kaldi int main() { - kaldi::g_program_name = "/foo/bar/kaldi-error-test"; + kaldi::TestLocateSymbolRange(); + + kaldi::SetProgramName("/foo/bar/kaldi-error-test"); try { kaldi::UnitTestError(); - KALDI_ASSERT(0); // should not happen. + KALDI_ASSERT(0); // should not happen. exit(1); - } catch(std::runtime_error &r) { - std::cout << "UnitTestError: the error we generated was: " << r.what(); + } catch (kaldi::KaldiFatalError &e) { + std::cout << "The error we generated was: '" << e.KaldiMessage() << "'\n"; } } - diff --git a/src/base/kaldi-error.cc b/src/base/kaldi-error.cc index 3eeebe01910..2dbc7318209 100644 --- a/src/base/kaldi-error.cc +++ b/src/base/kaldi-error.cc @@ -1,5 +1,7 @@ // base/kaldi-error.cc +// Copyright 2019 LAIX (Yi Sun) +// Copyright 2019 SmartAction LLC (kkm) // Copyright 2016 Brno University of Technology (author: Karel Vesely) // Copyright 2009-2011 Microsoft Corporation; Lukas Burget; Ondrej Glembek @@ -19,15 +21,15 @@ // limitations under the License. #ifdef HAVE_EXECINFO_H -#include // To get stack trace in error messages. +#include // To get stack trace in error messages. // If this #include fails there is an error in the Makefile, it does not // support your platform well. Make sure HAVE_EXECINFO_H is undefined, // and the code will compile. #ifdef HAVE_CXXABI_H -#include // For name demangling. +#include // For name demangling. // Useful to decode the stack trace, but only used if we have execinfo.h -#endif // HAVE_CXXABI_H -#endif // HAVE_EXECINFO_H +#endif // HAVE_CXXABI_H +#endif // HAVE_EXECINFO_H #include "base/kaldi-common.h" #include "base/kaldi-error.h" @@ -38,103 +40,131 @@ namespace kaldi { /***** GLOBAL VARIABLES FOR LOGGING *****/ int32 g_kaldi_verbose_level = 0; -const char *g_program_name = NULL; -static LogHandler g_log_handler = NULL; - -// If the program name was set (g_program_name != ""), GetProgramName -// returns the program name (without the path), e.g. "gmm-align". -// Otherwise it returns the empty string "". 
-const char *GetProgramName() { - return g_program_name == NULL ? "" : g_program_name; +static std::string program_name; +static LogHandler log_handler = NULL; + +void SetProgramName(const char *basename) { + // Using the 'static std::string' for the program name is mostly harmless, + // because (a) Kaldi logging is undefined before main(), and (b) no stdc++ + // string implementation has been found in the wild that would not be just + // an empty string when zero-initialized but not yet constructed. + program_name = basename; } /***** HELPER FUNCTIONS *****/ -// Given a filename like "/a/b/c/d/e/f.cc", GetShortFileName -// returns "e/f.cc". Does not currently work if backslash is -// the filename separator. -static const char *GetShortFileName(const char *filename) { - const char *last_slash = strrchr(filename, '/'); - if (!last_slash) { - return filename; - } else { - while (last_slash > filename && last_slash[-1] != '/') - last_slash--; - return last_slash; +// Trim filename to at most 1 trailing directory long. Given a filename like +// "/a/b/c/d/e/f.cc", return "e/f.cc". Support both '/' and '\' as the path +// separator. +static const char *GetShortFileName(const char *path) { + if (path == nullptr) + return ""; + + const char *prev = path, *last = path; + while ((path = std::strpbrk(path, "\\/")) != nullptr) { + ++path; + prev = last; + last = path; } + return prev; } +/***** STACK TRACE *****/ -/***** STACKTRACE *****/ +namespace internal { +bool LocateSymbolRange(const std::string &trace_name, size_t *begin, + size_t *end) { + // Find the first '_' with leading ' ' or '('. + *begin = std::string::npos; + for (size_t i = 1; i < trace_name.size(); i++) { + if (trace_name[i] != '_') { + continue; + } + if (trace_name[i - 1] == ' ' || trace_name[i - 1] == '(') { + *begin = i; + break; + } + } + if (*begin == std::string::npos) { + return false; + } + *end = trace_name.find_first_of(" +", *begin); + return *end != std::string::npos; +} +} // namespace internal +#ifdef HAVE_EXECINFO_H static std::string Demangle(std::string trace_name) { -#if defined(HAVE_CXXABI_H) && defined(HAVE_EXECINFO_H) - // at input the string looks like: +#ifndef HAVE_CXXABI_H + return trace_name; +#else // HAVE_CXXABI_H + // Try demangle the symbol. We are trying to support the following formats + // produced by different platforms: + // + // Linux: // ./kaldi-error-test(_ZN5kaldi13UnitTestErrorEv+0xb) [0x804965d] - // We want to extract the name e.g. '_ZN5kaldi13UnitTestErrorEv", - // demangle it and return it. - - // try to locate '(' and '+', take the string in between, - size_t begin(trace_name.find("(")), - end(trace_name.rfind("+")); - if (begin != std::string::npos && end != std::string::npos && begin < end) { - trace_name = trace_name.substr(begin+1,end-(begin+1)); + // + // Mac: + // 0 server 0x000000010f67614d _ZNK5kaldi13MessageLogger10LogMessageEv + 813 + // + // We want to extract the name e.g., '_ZN5kaldi13UnitTestErrorEv' and + // demangle it info a readable name like kaldi::UnitTextError. 
+ size_t begin, end; + if (!internal::LocateSymbolRange(trace_name, &begin, &end)) { + return trace_name; } - // demangle, + std::string symbol = trace_name.substr(begin, end - begin); int status; - char *demangled_name = abi::__cxa_demangle(trace_name.c_str(), 0, 0, &status); - std::string ans; - if (status == 0) { - ans = demangled_name; + char *demangled_name = abi::__cxa_demangle(symbol.c_str(), 0, 0, &status); + if (status == 0 && demangled_name != nullptr) { + symbol = demangled_name; free(demangled_name); - } else { - ans = trace_name; } - // return, - return ans; -#else - return trace_name; -#endif + return trace_name.substr(0, begin) + symbol + + trace_name.substr(end, std::string::npos); +#endif // HAVE_CXXABI_H } - +#endif // HAVE_EXECINFO_H static std::string KaldiGetStackTrace() { std::string ans; #ifdef HAVE_EXECINFO_H -#define KALDI_MAX_TRACE_SIZE 50 -#define KALDI_MAX_TRACE_PRINT 20 // must be even. - // buffer for the trace, + const size_t KALDI_MAX_TRACE_SIZE = 50; + const size_t KALDI_MAX_TRACE_PRINT = 50; // Must be even. + // Buffer for the trace. void *trace[KALDI_MAX_TRACE_SIZE]; - // get the trace, + // Get the trace. size_t size = backtrace(trace, KALDI_MAX_TRACE_SIZE); - // get the trace symbols, + // Get the trace symbols. char **trace_symbol = backtrace_symbols(trace, size); + if (trace_symbol == NULL) + return ans; - // Compose the 'string', + // Compose a human-readable backtrace string. ans += "[ Stack-Trace: ]\n"; if (size <= KALDI_MAX_TRACE_PRINT) { for (size_t i = 0; i < size; i++) { ans += Demangle(trace_symbol[i]) + "\n"; } - } else { // print out first+last (e.g.) 5. - for (size_t i = 0; i < KALDI_MAX_TRACE_PRINT/2; i++) { + } else { // Print out first+last (e.g.) 5. + for (size_t i = 0; i < KALDI_MAX_TRACE_PRINT / 2; i++) { ans += Demangle(trace_symbol[i]) + "\n"; } ans += ".\n.\n.\n"; - for (size_t i = size - KALDI_MAX_TRACE_PRINT/2; i < size; i++) { + for (size_t i = size - KALDI_MAX_TRACE_PRINT / 2; i < size; i++) { ans += Demangle(trace_symbol[i]) + "\n"; } if (size == KALDI_MAX_TRACE_SIZE) - ans += ".\n.\n.\n"; // stack was too long, probably a bug. + ans += ".\n.\n.\n"; // Stack was too long, probably a bug. } - // cleanup, - free(trace_symbol); // it's okay, just the pointers, not the strings. -#endif // HAVE_EXECINFO_H + // We must free the array of pointers allocated by backtrace_symbols(), + // but not the strings themselves. + free(trace_symbol); +#endif // HAVE_EXECINFO_H return ans; } - /***** KALDI LOGGING *****/ MessageLogger::MessageLogger(LogMessageEnvelope::Severity severity, @@ -142,132 +172,74 @@ MessageLogger::MessageLogger(LogMessageEnvelope::Severity severity, // Obviously, we assume the strings survive the destruction of this object. envelope_.severity = severity; envelope_.func = func; - envelope_.file = GetShortFileName(file); // Pointer inside 'file'. + envelope_.file = GetShortFileName(file); // Points inside 'file'. envelope_.line = line; } - -MessageLogger::~MessageLogger() noexcept(false) { - std::string str = GetMessage(); - // print the mesage (or send to logging handler), - MessageLogger::HandleMessage(envelope_, str.c_str()); -} - -std::string MessageLogger::GetMessage() const { - // remove trailing '\n', - std::string str = ss_.str(); - while (!str.empty() && str[str.length() - 1] == '\n') - str.resize(str.length() - 1); - return str; -} - - -void MessageLogger::HandleMessage(const LogMessageEnvelope &envelope, - const char *message) { - // Send to a logging handler if provided. 
- if (g_log_handler != NULL) { - g_log_handler(envelope, message); - } else { - // Otherwise, we use the default Kaldi logging. - // Build the log-message 'header', - std::stringstream header; - if (envelope.severity > LogMessageEnvelope::kInfo) { - header << "VLOG[" << envelope.severity << "] ("; - } else { - switch (envelope.severity) { - case LogMessageEnvelope::kInfo : - header << "LOG ("; - break; - case LogMessageEnvelope::kWarning : - header << "WARNING ("; - break; - case LogMessageEnvelope::kError : - header << "ERROR ("; - break; - case LogMessageEnvelope::kAssertFailed : - header << "ASSERTION_FAILED ("; - break; - default: - abort(); // coding error (unknown 'severity'), - } - } - // fill the other info from the envelope, - header << GetProgramName() << "[" KALDI_VERSION "]" << ':' - << envelope.func << "():" << envelope.file << ':' << envelope.line - << ")"; - - // Printing the message, - if (envelope.severity >= LogMessageEnvelope::kWarning) { - // VLOG, LOG, WARNING: - fprintf(stderr, "%s %s\n", header.str().c_str(), message); - } else { - // ERROR, ASSERT_FAILED (print with stack-trace): - fprintf(stderr, "%s %s\n\n%s\n", header.str().c_str(), message, - KaldiGetStackTrace().c_str()); - } +void MessageLogger::LogMessage() const { + // Send to the logging handler if provided. + if (log_handler != NULL) { + log_handler(envelope_, GetMessage().c_str()); + return; } -} - -FatalMessageLogger::FatalMessageLogger(LogMessageEnvelope::Severity severity, - const char *func, const char *file, - int32 line): - MessageLogger(severity, func, file, line) { - if (severity != LogMessageEnvelope::kAssertFailed && - severity != LogMessageEnvelope::kError) { - // Don't use KALDI_ERR, since that will recursively instantiate - // MessageLogger. - throw std::runtime_error("FatalMessageLogger should be called only with " - "severities kAssertFailed and kError"); - } -} - -[[ noreturn ]] FatalMessageLogger::~FatalMessageLogger() noexcept(false) { - std::string str = GetMessage(); - // print the mesage (or send to logging handler), - MessageLogger::HandleMessage(envelope_, str.c_str()); - - // Should we throw exception, or abort? - switch (envelope_.severity) { + // Otherwise, use the default Kaldi logging. + // Build the log-message header. + std::stringstream full_message; + if (envelope_.severity > LogMessageEnvelope::kInfo) { + full_message << "VLOG[" << envelope_.severity << "] ("; + } else { + switch (envelope_.severity) { + case LogMessageEnvelope::kInfo: + full_message << "LOG ("; + break; + case LogMessageEnvelope::kWarning: + full_message << "WARNING ("; + break; case LogMessageEnvelope::kAssertFailed: - abort(); // ASSERT_FAILED, + full_message << "ASSERTION_FAILED ("; break; case LogMessageEnvelope::kError: - if (!std::uncaught_exception()) { - // throw exception with empty message, - throw std::runtime_error(""); // KALDI_ERR, - } else { - // If we got here, this thread has already thrown exception, - // and this exception has not yet arrived to its 'catch' clause... - // Throwing a new exception would be unsafe! - // (can happen during 'stack unwinding', if we have 'KALDI_ERR << msg' - // in a destructor of some local object). - abort(); - } + default: // If not the ERROR, it still an error! + full_message << "ERROR ("; break; - default: // This should never happen, based on constructor's - // preconditions. But we place abort() here so that all - // possible pathways through this function do not return. - abort(); + } + } + // Add other info from the envelope and the message text. 
+ full_message << program_name.c_str() << "[" KALDI_VERSION "]" << ':' + << envelope_.func << "():" << envelope_.file << ':' + << envelope_.line << ") " << GetMessage().c_str(); + + // Add stack trace for errors and assertion failures, if available. + if (envelope_.severity < LogMessageEnvelope::kWarning) { + const std::string &stack_trace = KaldiGetStackTrace(); + if (!stack_trace.empty()) { + full_message << "\n\n" << stack_trace; + } } -} + // Print the complete message to stderr. + full_message << "\n"; + std::cerr << full_message.str(); +} /***** KALDI ASSERTS *****/ -void KaldiAssertFailure_(const char *func, const char *file, - int32 line, const char *cond_str) { - FatalMessageLogger ml(LogMessageEnvelope::kAssertFailed, func, file, line); - ml.stream() << ": '" << cond_str << "' "; +void KaldiAssertFailure_(const char *func, const char *file, int32 line, + const char *cond_str) { + MessageLogger::Log() = + MessageLogger(LogMessageEnvelope::kAssertFailed, func, file, line) + << "Assertion failed: (" << cond_str << ")"; + fflush(NULL); // Flush all pending buffers, abort() may not flush stderr. + std::abort(); } - /***** THIRD-PARTY LOG-HANDLER *****/ -LogHandler SetLogHandler(LogHandler new_handler) { - LogHandler old_handler = g_log_handler; - g_log_handler = new_handler; +LogHandler SetLogHandler(LogHandler handler) { + LogHandler old_handler = log_handler; + log_handler = handler; return old_handler; } -} // end namespace kaldi +} // namespace kaldi diff --git a/src/base/kaldi-error.h b/src/base/kaldi-error.h index c643902f01b..a9904a752cd 100644 --- a/src/base/kaldi-error.h +++ b/src/base/kaldi-error.h @@ -1,5 +1,7 @@ // base/kaldi-error.h +// Copyright 2019 LAIX (Yi Sun) +// Copyright 2019 SmartAction LLC (kkm) // Copyright 2016 Brno University of Technology (author: Karel Vesely) // Copyright 2009-2011 Microsoft Corporation; Ondrej Glembek; Lukas Burget; // Saarland University @@ -42,117 +44,128 @@ namespace kaldi { /// \addtogroup error_group /// @{ -/***** VERBOSITY LEVEL *****/ +/***** PROGRAM NAME AND VERBOSITY LEVEL *****/ -/// This is set by util/parse-options.{h, cc} if you set --verbose=? option. -extern int32 g_kaldi_verbose_level; +/// Called by ParseOptions to set base name (no directory) of the executing +/// program. The name is printed in logging code along with every message, +/// because in our scripts, we often mix together the stderr of many programs. +/// This function is very thread-unsafe. +void SetProgramName(const char *basename); -/// This is set by util/parse-options.{h, cc} (from argv[0]) and used (if set) -/// in error reporting code to display the name of the program (this is because -/// in our scripts, we often mix together the stderr of many programs). it is -/// the base-name of the program (no directory), followed by ':' We don't use -/// std::string, due to the static initialization order fiasco. -extern const char *g_program_name; +/// This is set by util/parse-options.{h,cc} if you set --verbose=? option. +/// Do not use directly, prefer {Get,Set}VerboseLevel(). +extern int32 g_kaldi_verbose_level; +/// Get verbosity level, usually set via command line '--verbose=' switch. inline int32 GetVerboseLevel() { return g_kaldi_verbose_level; } -/// This should be rarely used; command-line programs set the verbose level -/// automatically from ParseOptions. +/// This should be rarely used, except by programs using Kaldi as library; +/// command-line programs set the verbose level automatically from ParseOptions. 
inline void SetVerboseLevel(int32 i) { g_kaldi_verbose_level = i; } - /***** KALDI LOGGING *****/ /// Log message severity and source location info. struct LogMessageEnvelope { + /// Message severity. In addition to these levels, positive values (1 to 6) + /// specify verbose logging level. Verbose messages are produced only when + /// SetVerboseLevel() has been called to set logging level to at least the + /// corresponding value. enum Severity { - kAssertFailed = -3, - kError = -2, - kWarning = -1, - kInfo = 0, + kAssertFailed = -3, //!< Assertion failure. abort() will be called. + kError = -2, //!< Fatal error. KaldiFatalError will be thrown. + kWarning = -1, //!< Indicates a recoverable but abnormal condition. + kInfo = 0, //!< Informational message. }; - // An 'enum Severity' value, or a positive number indicating verbosity level. - int severity; - const char *func; - const char *file; - int32 line; + int severity; //!< A Severity value, or positive verbosity level. + const char *func; //!< Name of the function invoking the logging. + const char *file; //!< Source file name with up to 1 leading directory. + int32 line; // MessageLogger &operator<<(const T &val) { + ss_ << val; + return *this; + } + + // When assigned a MessageLogger, log its contents. + struct Log final { + void operator=(const MessageLogger &logger) { logger.LogMessage(); } + }; - [[ noreturn ]] ~FatalMessageLogger() noexcept(false); -}; + // When assigned a MessageLogger, log its contents and then throw + // a KaldiFatalError. + struct LogAndThrow final { + [[noreturn]] void operator=(const MessageLogger &logger) { + logger.LogMessage(); + throw KaldiFatalError(logger.GetMessage()); + } + }; -// The definition of the logging macros, -#define KALDI_ERR \ - ::kaldi::FatalMessageLogger(::kaldi::LogMessageEnvelope::kError, \ - __func__, __FILE__, __LINE__).stream() -#define KALDI_WARN \ - ::kaldi::MessageLogger(::kaldi::LogMessageEnvelope::kWarning, \ - __func__, __FILE__, __LINE__).stream() -#define KALDI_LOG \ - ::kaldi::MessageLogger(::kaldi::LogMessageEnvelope::kInfo, \ - __func__, __FILE__, __LINE__).stream() -#define KALDI_VLOG(v) if ((v) <= ::kaldi::g_kaldi_verbose_level) \ - ::kaldi::MessageLogger((::kaldi::LogMessageEnvelope::Severity)(v), \ - __func__, __FILE__, __LINE__).stream() +private: + std::string GetMessage() const { return ss_.str(); } + void LogMessage() const; + LogMessageEnvelope envelope_; + std::ostringstream ss_; +}; + +// Logging macros. 
+#define KALDI_ERR \ + ::kaldi::MessageLogger::LogAndThrow() = ::kaldi::MessageLogger( \ + ::kaldi::LogMessageEnvelope::kError, __func__, __FILE__, __LINE__) +#define KALDI_WARN \ + ::kaldi::MessageLogger::Log() = ::kaldi::MessageLogger( \ + ::kaldi::LogMessageEnvelope::kWarning, __func__, __FILE__, __LINE__) +#define KALDI_LOG \ + ::kaldi::MessageLogger::Log() = ::kaldi::MessageLogger( \ + ::kaldi::LogMessageEnvelope::kInfo, __func__, __FILE__, __LINE__) +#define KALDI_VLOG(v) \ + if ((v) <= ::kaldi::GetVerboseLevel()) \ + ::kaldi::MessageLogger::Log() = \ + ::kaldi::MessageLogger((::kaldi::LogMessageEnvelope::Severity)(v), \ + __func__, __FILE__, __LINE__) /***** KALDI ASSERTS *****/ -[[ noreturn ]] void KaldiAssertFailure_(const char *func, const char *file, - int32 line, const char *cond_str); +[[noreturn]] void KaldiAssertFailure_(const char *func, const char *file, + int32 line, const char *cond_str); -// Note on KALDI_ASSERT and KALDI_PARANOID_ASSERT -// The original (simple) version of the code was this -// -// #define KALDI_ASSERT(cond) if (!(cond)) -// kaldi::KaldiAssertFailure_(__func__, __FILE__, __LINE__, #cond); +// Note on KALDI_ASSERT and KALDI_PARANOID_ASSERT: // -// That worked well, but we were concerned that it -// could potentially cause a performance issue due to failed branch -// prediction (best practice is to have the if branch be the commonly -// taken one). -// Therefore, we decided to move the call into the else{} branch. // A single block {} around if /else does not work, because it causes // syntax error (unmatched else block) in the following code: // @@ -161,41 +174,58 @@ class FatalMessageLogger: public MessageLogger { // else // SomethingElse(); // -// do {} while(0) -- note there is no semicolon at the end! --- works nicely +// do {} while(0) -- note there is no semicolon at the end! -- works nicely, // and compilers will be able to optimize the loop away (as the condition // is always false). +// +// Also see KALDI_COMPILE_TIME_ASSERT, defined in base/kaldi-utils.h, and +// KALDI_ASSERT_IS_INTEGER_TYPE and KALDI_ASSERT_IS_FLOATING_TYPE, also defined +// there. #ifndef NDEBUG -#define KALDI_ASSERT(cond) do { if (cond) (void)0; else \ - ::kaldi::KaldiAssertFailure_(__func__, __FILE__, __LINE__, #cond); } while(0) +#define KALDI_ASSERT(cond) \ + do { \ + if (cond) \ + (void)0; \ + else \ + ::kaldi::KaldiAssertFailure_(__func__, __FILE__, __LINE__, #cond); \ + } while (0) #else #define KALDI_ASSERT(cond) (void)0 #endif -// Also see KALDI_COMPILE_TIME_ASSERT, defined in base/kaldi-utils.h, -// and KALDI_ASSERT_IS_INTEGER_TYPE and KALDI_ASSERT_IS_FLOATING_TYPE, -// also defined there. -// some more expensive asserts only checked if this defined + +// Some more expensive asserts only checked if this defined. #ifdef KALDI_PARANOID -#define KALDI_PARANOID_ASSERT(cond) do { if (cond) (void)0; else \ - ::kaldi::KaldiAssertFailure_(__func__, __FILE__, __LINE__, #cond); } while(0) +#define KALDI_PARANOID_ASSERT(cond) \ + do { \ + if (cond) \ + (void)0; \ + else \ + ::kaldi::KaldiAssertFailure_(__func__, __FILE__, __LINE__, #cond); \ + } while (0) #else #define KALDI_PARANOID_ASSERT(cond) (void)0 #endif - /***** THIRD-PARTY LOG-HANDLER *****/ -/// Type of third-party logging function, +/// Type of third-party logging function. typedef void (*LogHandler)(const LogMessageEnvelope &envelope, const char *message); /// Set logging handler. If called with a non-NULL function pointer, the -/// function pointed by it is called to send messages to a caller-provided -/// log. 
If called with NULL pointer, restores default Kaldi error logging to -/// stderr. SetLogHandler is obviously not thread safe. +/// function pointed by it is called to send messages to a caller-provided log. +/// If called with a NULL pointer, restores default Kaldi error logging to +/// stderr. This function is obviously not thread safe; the log handler must be. +/// Returns a previously set logging handler pointer, or NULL. LogHandler SetLogHandler(LogHandler); /// @} end "addtogroup error_group" -} // namespace kaldi +// Functions within internal is exported for testing only, do not use. +namespace internal { +bool LocateSymbolRange(const std::string &trace_name, size_t *begin, + size_t *end); +} // namespace internal +} // namespace kaldi -#endif // KALDI_BASE_KALDI_ERROR_H_ +#endif // KALDI_BASE_KALDI_ERROR_H_ diff --git a/src/base/kaldi-math.cc b/src/base/kaldi-math.cc index 991e46a590c..17271f3c46f 100644 --- a/src/base/kaldi-math.cc +++ b/src/base/kaldi-math.cc @@ -21,6 +21,7 @@ #include "base/kaldi-math.h" #ifndef _MSC_VER #include +#include #endif #include #include @@ -42,7 +43,7 @@ int32 RoundUpToNearestPowerOfTwo(int32 n) { static std::mutex _RandMutex; int Rand(struct RandomState* state) { -#if defined(_MSC_VER) || defined(__CYGWIN__) +#if !defined(_POSIX_THREAD_SAFE_FUNCTIONS) // On Windows and Cygwin, just call Rand() return rand(); #else @@ -109,10 +110,8 @@ int32 RandInt(int32 min_val, int32 max_val, struct RandomState* state) { return min_val + ( (unsigned int)( (Rand(state)+RAND_MAX*Rand(state))) % (unsigned int)(max_val+1-min_val)); } else { - throw std::runtime_error(std::string() - +"rand_int failed because we do not support " - +"such large random numbers. " - +"(Extend this function)."); + KALDI_ERR << "rand_int failed because we do not support such large " + "random numbers. (Extend this function)."; } } #else diff --git a/src/base/kaldi-math.h b/src/base/kaldi-math.h index 21665ddfc63..9f91a73cf08 100644 --- a/src/base/kaldi-math.h +++ b/src/base/kaldi-math.h @@ -158,7 +158,7 @@ inline float RandGauss(struct RandomState* state = NULL) { } // Returns poisson-distributed random number. Uses Knuth's algorithm. -// Take care: this takes time proportinal +// Take care: this takes time proportional // to lambda. Faster algorithms exist but are more complex. int32 RandPoisson(float lambda, struct RandomState* state = NULL); diff --git a/src/bin/align-text.cc b/src/bin/align-text.cc index 616dac858d7..1c695675274 100644 --- a/src/bin/align-text.cc +++ b/src/bin/align-text.cc @@ -86,28 +86,34 @@ int main(int argc, char *argv[]) { if (!text2_reader.HasKey(key)) { KALDI_WARN << "Key " << key << " is in " << text1_rspecifier - << ", but not in " << text2_rspecifier; + << ", but not in " << text2_rspecifier; n_fail++; continue; } const std::vector &text1 = text1_reader.Value(); const std::vector &text2 = text2_reader.Value(key); - // Checks if the special symbol is in the string. - KALDI_ASSERT(std::find(text1.begin(), - text1.end(), special_symbol) == text1.end()); - KALDI_ASSERT(std::find(text2.begin(), - text2.end(), special_symbol) == text2.end()); - if (std::find_if(text1.begin(), text1.end(), IsNotToken) != text1.end()) { - KALDI_ERR << "In text1, the utterance " << key << " contains unprintable characters." \ - << "That means there is a problem with the text (such as incorrect encoding)." << std::endl; - return -1; + KALDI_ERR << "In text1, the utterance " << key + << " contains unprintable characters. 
That means there is" + << " a problem with the text (such as incorrect encoding)."; } if (std::find_if(text2.begin(), text2.end(), IsNotToken) != text2.end()) { - KALDI_ERR << "In text2, the utterance " << key << " contains unprintable characters." \ - << "That means there is a problem with the text (such as incorrect encoding)." << std::endl; - return -1; + KALDI_ERR << "In text2, the utterance " << key + << " contains unprintable characters. That means there is" + << " a problem with the text (such as incorrect encoding)."; + } + + // Verify that the special symbol is not in the string. + if (std::find(text1.begin(), text1.end(), special_symbol) != text1.end()){ + KALDI_ERR << "In text1, the utterance " << key + << " contains the special symbol '" << special_symbol + << "'. This is not allowed."; + } + if (std::find(text2.begin(), text2.end(), special_symbol) != text2.end()){ + KALDI_ERR << "In text2, the utterance " << key + << " contains the special symbol '" << special_symbol + << "'. This is not allowed."; } std::vector > aligned; diff --git a/src/bin/compute-wer-bootci.cc b/src/bin/compute-wer-bootci.cc index b8b0697af75..ba2a4ce739c 100644 --- a/src/bin/compute-wer-bootci.cc +++ b/src/bin/compute-wer-bootci.cc @@ -162,7 +162,7 @@ int main(int argc, char *argv[]) { try { const char *usage = - "Compute a bootstrapping of WER to extract the 95\% confidence interval.\n" + "Compute a bootstrapping of WER to extract the 95% confidence interval.\n" "Take a reference and a transcription file, in integer or text format,\n" "and outputs overall WER statistics to standard output along with its\n" "confidence interval using the bootstrap method of Bisani and Ney.\n" @@ -234,12 +234,12 @@ int main(int argc, char *argv[]) { std::cout.precision(2); std::cerr.precision(2); std::cout << "Set1: %WER " << std::fixed << 100*mean_wer << - " 95\% Conf Interval [ " << 100*mean_wer-100*interval << + " 95% Conf Interval [ " << 100*mean_wer-100*interval << ", " << 100*mean_wer+100*interval << " ]" << '\n'; if(!hyp2_rspecifier.empty()) { std::cout << "Set2: %WER " << std::fixed << 100*mean_wer2 << - " 95\% Conf Interval [ " << 100*mean_wer2-100*interval2 << + " 95% Conf Interval [ " << 100*mean_wer2-100*interval2 << ", " << 100*mean_wer2+100*interval2 << " ]" << '\n'; std::cout << "Probability of Set2 improving Set1: " << std::fixed << diff --git a/src/bin/draw-tree.cc b/src/bin/draw-tree.cc index c9be5586933..d107ab1cfac 100644 --- a/src/bin/draw-tree.cc +++ b/src/bin/draw-tree.cc @@ -34,25 +34,23 @@ void MakeEvent(std::string &qry, fst::SymbolTable *phone_syms, if (key == kPdfClass) { value = static_cast(atoi(valstr.c_str())); if (value < 0) { // not valid pdf-class - KALDI_ERR << "Bad query: invalid pdf-class (" - << valstr << ')' << std::endl << std::endl; + KALDI_ERR << "Bad query: invalid pdf-class (" << valstr << ')'; } } else { value = static_cast(phone_syms->Find(valstr.c_str())); if (value == -1) { // fst::kNoSymbol - KALDI_ERR << "Bad query: invalid symbol (" - << valstr << ')' << std::endl << std::endl; + KALDI_ERR << "Bad query: invalid symbol (" << valstr << ')'; } } query_event->push_back(std::make_pair(key++, value)); old_found = found + 1; } std::string valstr = qry.substr(old_found); - EventValueType value = static_cast(phone_syms->Find(valstr.c_str())); + EventValueType value = + static_cast(phone_syms->Find(valstr.c_str())); if (value == -1) { // fst::kNoSymbol - KALDI_ERR << "Bad query: invalid symbol (" - << valstr << ')' << std::endl << std::endl; + KALDI_ERR << "Bad query: invalid 
symbol (" << valstr << ')'; } query_event->push_back(std::make_pair(key, value)); diff --git a/src/chain/chain-denominator.cc b/src/chain/chain-denominator.cc index b644e429b67..b9023f02f5e 100644 --- a/src/chain/chain-denominator.cc +++ b/src/chain/chain-denominator.cc @@ -24,6 +24,7 @@ namespace kaldi { namespace chain { + DenominatorComputation::DenominatorComputation( const ChainTrainingOptions &opts, const DenominatorGraph &den_graph, @@ -54,6 +55,18 @@ DenominatorComputation::DenominatorComputation( // log-space. KALDI_ASSERT(opts_.leaky_hmm_coefficient > 0.0 && opts_.leaky_hmm_coefficient < 1.0); + + if (RandInt(0, 99) == 0) { + // A check, that all values in nnet_output are in the range [-30, 30].. + // otherwise derivatives will be wrong (search below for 30). + BaseFloat max_val = nnet_output.Max(), min_val = nnet_output.Min(); + if (max_val > 30.0 || min_val < -30.0) { + KALDI_WARN << "Nnet outputs " << min_val << ", " + << max_val << + " outside the range [-30,30], derivs may be inaccurate."; + } + } + // make sure the alpha sums and beta sums are zeroed. alpha_.ColRange(den_graph_.NumStates() * num_sequences_, num_sequences_).SetZero(); @@ -294,6 +307,7 @@ bool DenominatorComputation::Backward( transposed_deriv_part.SetZero(); } } + return ok_; } diff --git a/src/chain/chain-denominator.h b/src/chain/chain-denominator.h index d76e4244ae2..217b7447621 100644 --- a/src/chain/chain-denominator.h +++ b/src/chain/chain-denominator.h @@ -218,7 +218,8 @@ class DenominatorComputation { BaseFloat Forward(); // this adds deriv_weight times (the derivative of the log-prob w.r.t. the - // nnet output), to 'nnet_output_deriv'. + // nnet output), to 'nnet_output_deriv'. Note: normally, deriv_weight + // will be -1, or some other negative number if we are doing data weighting. // returns true if everything seemed OK, false if a failure was detected. bool Backward(BaseFloat deriv_weight, CuMatrixBase *nnet_output_deriv); diff --git a/src/chain/chain-kernels-ansi.h b/src/chain/chain-kernels-ansi.h index 388c78ab2ee..f5814d7c11c 100644 --- a/src/chain/chain-kernels-ansi.h +++ b/src/chain/chain-kernels-ansi.h @@ -48,6 +48,12 @@ extern "C" { const BaseFloat *prev_alpha, BaseFloat *this_alpha); + void cuda_penalize_out_of_range(dim3 Gr, dim3 Bl, BaseFloat limit, + BaseFloat scale, const BaseFloat *in_data, + MatrixDim dim, int out_stride, + BaseFloat *out_deriv); + + } // extern "C" #endif // HAVE_CUDA diff --git a/src/chain/chain-kernels.cu b/src/chain/chain-kernels.cu index f093f21a5a5..a63944f0012 100644 --- a/src/chain/chain-kernels.cu +++ b/src/chain/chain-kernels.cu @@ -1,6 +1,6 @@ // chain/chain-kernels.cu -// Copyright 2015 Johns Hopkins University (author: Daniel Povey) +// Copyright 2015-2019 Johns Hopkins University (author: Daniel Povey) // Licensed under the Apache License, Version 2.0 (the "License"); @@ -287,3 +287,32 @@ void cuda_chain_hmm_backward(dim3 Gr, dim3 Bl, this_beta, log_prob_deriv, log_prob_deriv_stride); } + + +// See documentation for PenalizeOutOfRange() in chain-training.cc to see what +// this is about. 
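+// A brief note on the kernel below: each CUDA thread handles one matrix
+// element, with i indexing the column and j the row; 'dim' gives the size and
+// stride of the input matrix, while 'out_stride' allows the derivative matrix
+// to have a different stride. For elements outside [-limit, limit], a term of
+// size scale * (|val| - limit) is added to the corresponding derivative, with
+// the sign chosen to push the value back toward the allowed range.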
+__global__ +static void _penalize_out_of_range( + BaseFloat limit, BaseFloat scale, const BaseFloat *in_data, MatrixDim dim, + int out_stride, BaseFloat *out_deriv) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int in_index = i + j * dim.stride, + out_index = i + j * out_stride; + if (i < dim.cols && j < dim.rows) { + BaseFloat val = in_data[in_index]; + if (val < -limit) { + out_deriv[out_index] -= scale * (val + limit); + } else if (val > limit) { + out_deriv[out_index] -= scale * (val - limit); + } + } +} + +void cuda_penalize_out_of_range(dim3 Gr, dim3 Bl, BaseFloat limit, + BaseFloat scale, const BaseFloat *in_data, + MatrixDim dim, int out_stride, + BaseFloat *out_deriv) { + _penalize_out_of_range<<>>(limit, scale, in_data, + dim, out_stride, out_deriv); +} diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc index 6b4a7b593c2..d20ecfa4c1e 100644 --- a/src/chain/chain-training.cc +++ b/src/chain/chain-training.cc @@ -28,6 +28,62 @@ namespace kaldi { namespace chain { +/** + This is a rather special-purpose function which adds something to + the derivative in order to encourage the value to stay within + a specified range. This is something we use in chain training + in order to encourage the nnet outputs to stay within the + range [-30, 30] (needed because we don't do the forward-backward + denominator computation in log space). + + It's very similar to l2 regularization but only applied once you depart + the range [-limit, limit]. + + Basically, this function does as follows: + + (*out_deriv)(i,j) += 0 if -limit <= in_value(i,j) <= limit + (-limit - in_value(i,j)) * scale if in_value(i,j) < -limit + (limit - in_value(i,j)) * scale if in_value(i,j) > limit + If limit were zero, this would be the same as l2 regularization with scale 'scale'. + */ +static void PenalizeOutOfRange(const CuMatrixBase &in_value, + BaseFloat limit, + BaseFloat scale, + CuMatrixBase *out_deriv) { + KALDI_ASSERT(SameDim(in_value, *out_deriv) && limit > 0 && scale >= 0); + if (scale == 0) + return; +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + CuTimer tim; + dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); + dim3 dimGrid(n_blocks(in_value.NumCols(), CU2DBLOCK), + n_blocks(in_value.NumRows(), CU2DBLOCK)); + cuda_penalize_out_of_range(dimGrid, dimBlock, limit, scale, + in_value.Data(), in_value.Dim(), + out_deriv->Stride(), out_deriv->Data()); + CU_SAFE_CALL(cudaGetLastError()); + CuDevice::Instantiate().AccuProfile(__func__, tim); + } else +#endif + { + int32 num_rows = in_value.NumRows(), + num_cols = in_value.NumCols(); + for (int32 r = 0; r < num_rows; r++) { + const BaseFloat *in_row_data = in_value.RowData(r); + BaseFloat *out_row_data = out_deriv->RowData(r); + for (int32 c = 0; c < num_cols; c++) { + BaseFloat val = in_row_data[c]; + if (val < -limit) { + out_row_data[c] -= scale * (val + limit); + } else if (val > limit) { + out_row_data[c] -= scale * (val - limit); + } + } + } + } +} + void ComputeChainObjfAndDerivE2e(const ChainTrainingOptions &opts, const DenominatorGraph &den_graph, @@ -47,6 +103,14 @@ void ComputeChainObjfAndDerivE2e(const ChainTrainingOptions &opts, if (nnet_output_deriv != NULL) nnet_output_deriv->SetZero(); + if (nnet_output_deriv != NULL && RandInt(0, 1) == 0) { + // Only do this about every other frame, for efficiency; we'll multiply the + // scale by 2 to compensate. See docs for the function, for its purpose. 
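+    // Worked example (numbers illustrative only): with the default
+    // --out-of-range-regularize=0.01 the scale passed below is 0.02, so an
+    // output value of 33.0 has 0.02 * (33.0 - 30.0) = 0.06 subtracted from
+    // its derivative, nudging it back toward [-30, 30]; values already inside
+    // the range are left untouched.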
+ PenalizeOutOfRange(nnet_output, 30.0, + 2.0 * opts.out_of_range_regularize, + nnet_output_deriv); + } + { // Doing the denominator first helps to reduce the maximum // memory use, as we can set 'xent_deriv' to nonempty after // we've freed the memory in this object. @@ -172,6 +236,14 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, nnet_output_deriv); } + if (nnet_output_deriv != NULL && RandInt(0, 1) == 0) { + // Only do this about every other frame, for efficiency; we'll multiply the + // scale by 2 to compensate. See docs for the function, for its purpose. + PenalizeOutOfRange(nnet_output, 30.0, + 2.0 * opts.out_of_range_regularize, + nnet_output_deriv); + } + if (xent_output_deriv != NULL) { // the reason for kStrideEqualNumCols is so that we can share the memory // block with the memory that was used for exp_nnet_output_transposed_ from diff --git a/src/chain/chain-training.h b/src/chain/chain-training.h index 6ea70b5ca41..3e7efbb59a1 100644 --- a/src/chain/chain-training.h +++ b/src/chain/chain-training.h @@ -45,6 +45,14 @@ struct ChainTrainingOptions { // (squared so it's additive across the dimensions). e.g. try 0.0005. BaseFloat l2_regularize; + + // This is similar to an l2 regularization constant (like l2-regularize) but + // applied on the part of the nnet output matrix that exceeds the range + // [-30,30]... this is necessary to avoid things regularly going out of the + // range that we can do exp() on, since the denominator computation is not in + // log space and to avoid NaNs we limit the outputs to the range [-30,30]. + BaseFloat out_of_range_regularize; + // Coefficient for 'leaky hmm'. This means we have an epsilon-transition from // each state to a special state with probability one, and then another // epsilon-transition from that special state to each state, with probability @@ -62,13 +70,19 @@ struct ChainTrainingOptions { // should have a softmax as its final nonlinearity. BaseFloat xent_regularize; - ChainTrainingOptions(): l2_regularize(0.0), leaky_hmm_coefficient(1.0e-05), + ChainTrainingOptions(): l2_regularize(0.0), out_of_range_regularize(0.01), + leaky_hmm_coefficient(1.0e-05), xent_regularize(0.0) { } void Register(OptionsItf *opts) { opts->Register("l2-regularize", &l2_regularize, "l2 regularization " "constant for 'chain' training, applied to the output " "of the neural net."); + opts->Register("out-of-range-regularize", &out_of_range_regularize, + "Constant that controls how much we penalize the nnet output " + "being outside the range [-30,30]. This is needed because we " + "limit it to that range in the denominator computation (which " + "is to avoid NaNs because it is not done in log space."); opts->Register("leaky-hmm-coefficient", &leaky_hmm_coefficient, "Coefficient " "that allows transitions from each HMM state to each other " "HMM state, to ensure gradual forgetting of context (can " diff --git a/src/configure b/src/configure index c156f253376..e6ffdf337af 100755 --- a/src/configure +++ b/src/configure @@ -1,50 +1,54 @@ #!/bin/bash -# This configure script is hand-generated, not auto-generated. -# It creates the file kaldi.mk, which is %included by the Makefiles -# in the subdirectories. +# This configure script is hand-generated, not auto-generated. It creates the +# file kaldi.mk, which is %included by the Makefiles in the subdirectories. # The file kaldi.mk is editable by hand -- for example, you may want to -# remove the options -g -O0 -DKALDI_PARANOID, or edit the -# DOUBLE_PRECISION variable (to be 1 not 0). 
- +# uncomment the options -O0 -DKALDI_PARANOID, or edit the DOUBLE_PRECISION +# variable (to be 1 not 0). # Example command lines: -# ./configure --shared ## shared libraries. # ./configure -# ./configure --mkl-root=/opt/intel/mkl -# ./configure --mkl-root=/opt/intel/mkl --threaded-math=yes -# ./configure --mkl-root=/opt/intel/mkl --threaded-math=yes --mkl-threading=tbb -# # This is for MKL 11.3, which does not seem to provide Intel OMP libs -# ./configure --openblas-root=../tools/OpenBLAS/install -# # Before doing this, cd to ../tools and type "make openblas". +# ./configure --shared # Build shared Kaldi libraries. +# ./configure --mathlib=OPENBLAS # Build and use OpenBLAS. +# # Before doing this, cd to ../tools and type "make -j openblas". +# ./configure --openblas-root=/usr # Use system OpenBLAS. # # Note: this is not working correctly on all platforms, do "make test" # # and look out for segmentation faults. # ./configure --atlas-root=../tools/ATLAS/build # ./configure --use-cuda=no # disable CUDA detection (will build cpu-only -# # version of kaldi even on CUDA-enabled machine +# # version of kaldi even on CUDA-enabled machine. # ./configure --use-cuda --cudatk-dir=/usr/local/cuda/ --cuda-arch=-arch=sm_70 # # Use cuda in /usr/local/cuda and set the arch to sm_70 # ./configure --static --fst-root=/opt/cross/armv8hf \ -# --atlas-root=/opt/cross/armv8hf --host=armv8-rpi3-linux-gnueabihf -# # Cross compile for armv8hf, this assumes that you have openfst built +# --atlas-root=/opt/cross/armv8hf --host=armv8-rpi3-linux-gnueabihf +# # Cross-compile for armv8hf. This assumes that you have OpenFST built # # with the armv8-rpi3-linux-gnueabihf toolchain and installed to # # /opt/cross/armv8hf. It also assumes that you have an ATLAS library # # built for the target install to /opt/cross/armv8hf and that the -# # armv8-rpi3-linux-gnueabihf toolchain is available in your path +# # armv8-rpi3-linux-gnueabihf toolchain is available in your path. # ./configure --static --openblas-root=/opt/cross/arm-linux-androideabi \ -# --fst-root=/opt/cross/arm-linux-androideabi --fst-version=1.4.1 \ -# --android-incdir=/opt/cross/arm-linux-androideabi/sysroot/usr/include \ -# --host=arm-linux-androideabi -# # Cross compile for Android on arm. The only difference here is the +# --fst-root=/opt/cross/arm-linux-androideabi --fst-version=1.6.9 \ +# --android-incdir=/opt/cross/arm-linux-androideabi/sysroot/usr/include \ +# --host=arm-linux-androideabi +# # Cross-compile for Android on arm. The only difference here is the # # addition of the the --android-includes flag because the toolchains # # produced by the Android NDK don't always include the C++ stdlib -# # headers in the normal cross compile include path. -# --host=aarch64-linux-android -# # support for 64bit ARMv8(AArch64) architecture in Android. +# # headers in the normal cross-compile include path. +# --host=aarch64-linux-android +# # support for 64bit ARMv8 (AArch64) architecture in Android. # This should be incremented after any significant change to the configure # script, i.e. any change affecting kaldi.mk or the build system as a whole. -CONFIGURE_VERSION=10 +CONFIGURE_VERSION=11 + +# We support bash version 3.2 (Macs still ship with this version as of 2019) +# and above. +[[ $BASH_VERSION < '3.2' ]] && { + echo >&2 "bash version ${BASH_VERSION} is too old, cannot continue." \ + "You won't be able to run Kaldi recipes with it anyway." \ + "Please upgrade. bash version 3.2 or higher is required." + exit 1; +} if ! 
[ -x "$PWD/configure" ]; then echo 'You must run "configure" from the src/ directory.' @@ -67,13 +71,14 @@ Configuration options: --shared Build and link against shared libraries [default=no] --use-cuda Build with CUDA [default=yes] --cudatk-dir=DIR CUDA toolkit directory - --cuda-arch=FLAGS Override the default CUDA_ARCH flags. See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#nvcc-examples. + --cuda-arch=FLAGS Override the default CUDA_ARCH flags. See: + https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#nvcc-examples. --double-precision Build with BaseFloat set to double if yes [default=no], mostly useful for testing purposes. --static-fst Build with static OpenFst libraries [default=no] --fst-root=DIR OpenFst root directory [default=../tools/openfst/] --fst-version=STR OpenFst version string - --mathlib=LIB Math library [default=ATLAS] + --mathlib=LIB Math library [default=MKL|OPENBLAS, based on platform] Supported libraries: ATLAS, MKL, CLAPACK, OPENBLAS. --static-math Build with static math libraries [default=no] --threaded-math Build with multi-threaded math libraries [default=no] @@ -110,25 +115,42 @@ compiler/linker. EOF } +# E.g. Die "Invalid switch --foobar" +Die() { echo >&2 "$0: FATAL:" "$@"; exit 1; } + +# E.g. abspath=$(rel2abs "../tools") || exit 1 +# - Set 'abspath' to existing absolute path of $1, return 0. +# - print empty string if path does not exist, return non-0. function rel2abs { - if [ ! -z "$1" ]; then - local retval=`cd $1 2>/dev/null && pwd || exit 1` - echo $retval - fi + [[ $1 ]] && cd -P "$1" 2>/dev/null && pwd } -function read_value { - local val=`expr "X$1" : '[^=]*=\(.*\)'`; - echo $val +# E.g.: GetSwitchValue var --some-switch=foo +# Assign variable named 'var' to 'foo'. Return 0 iff value is not empty. +GetSwitchValue() { + IFS='=' read -r -- _ $1 <<< "$2" && [[ ${!1} ]] } -function read_dirname { - local dir_name=`read_value $1` - local retval=`rel2abs $dir_name` - [ -z $retval ] && echo "Bad option '$1': no such directory" && exit 1; - echo $retval +# E.g.: GetSwitchValueOrDie var --some-switch=foo +# Assign variable named 'var' to 'foo'. Die with a fatal error if value is empty. +GetSwitchValueOrDie() { + GetSwitchValue "$@" || + Die "'$2': switch requires a value. See '$0 --help'." } +# E.g.: GetSwitchExistingPathOrDie var --some-switch=../tools +# - Set 'var' to absolute path of '../tools' if exists, return 1. +# - Die with a fatal error if path does not exist or not given in switch. +GetSwitchExistingPathOrDie() { + GetSwitchValueOrDie "$@" # Already sets variable named $1 to path. + local path varname=$1 + path=$(rel2abs "${!varname}") && [[ -d $path ]] || + Die "'$2': switch must specify an existing directory. See '$0 --help'." + builtin printf -v $varname %s "$path" # Assign $path to variable '$varname'. +} + +# TODO(kkm): Kill this. `[[ ${var-} ]]' is the idiomatic equivalent in bash. +# Even better, do not rely on uninitialized variables. function is_set { local myvar=${1:-notset} if [ "$myvar" == "notset" ]; then @@ -138,6 +160,11 @@ function is_set { fi } +# Lowercase/uppercase argument. Only bash 4.2+ has internal faclilties for this, +# and we support versions down to 3.2. +lcase () { awk '{print tolower($0)}' <<<"$1" ; } +ucase () { awk '{print toupper($0)}' <<<"$1" ; } + function failure { echo "***configure failed: $* ***" >&2 if [ -f kaldi.mk ]; then rm kaldi.mk; fi @@ -145,7 +172,7 @@ function failure { } function check_exists { - if [ ! -f $1 ]; then failure "$1 not found."; fi + if [[ ! 
-f $1 ]]; then failure "$1 not found."; fi } function check_library { @@ -292,7 +319,7 @@ function linux_configure_mkl_extra { echo "$linkline ${extra_libs[$threaded]}" } -function linux_configure_threadinglibdir { +function linux_configure_mkl_threadinglibdir { local library=$1 local mklroot=$2 local mkllibdir=$3 @@ -342,9 +369,9 @@ function linux_configure_mkl_threading { if ! is_set $OMPLIBDIR ; then if $static ; then - OMPLIBDIR=`linux_configure_threadinglibdir $library "$MKLROOT" "$MKLLIBDIR" "a"` + OMPLIBDIR=`linux_configure_mkl_threadinglibdir $library "$MKLROOT" "$MKLLIBDIR" "a"` else - OMPLIBDIR=`linux_configure_threadinglibdir $library "$MKLROOT" "$MKLLIBDIR" "so"` + OMPLIBDIR=`linux_configure_mkl_threadinglibdir $library "$MKLROOT" "$MKLLIBDIR" "so"` fi fi @@ -406,18 +433,32 @@ function configure_cuda { 7_*) MIN_UNSUPPORTED_GCC_VER="5.0" MIN_UNSUPPORTED_GCC_VER_NUM=50000; + CUSOLVER=false ;; 8_*) MIN_UNSUPPORTED_GCC_VER="6.0" MIN_UNSUPPORTED_GCC_VER_NUM=60000; + CUSOLVER=false + ;; + 9_0) + MIN_UNSUPPORTED_GCC_VER="7.0" + MIN_UNSUPPORTED_GCC_VER_NUM=70000; + CUSOLVER=false ;; - 9_0 | 9_1) + 9_1) MIN_UNSUPPORTED_GCC_VER="7.0" MIN_UNSUPPORTED_GCC_VER_NUM=70000; + CUSOLVER=true ;; - 9_2 | 9_* | 10_*) + 9_2 | 9_* | 10_0) MIN_UNSUPPORTED_GCC_VER="8.0" MIN_UNSUPPORTED_GCC_VER_NUM=80000; + CUSOLVER=true + ;; + 10_1 | 10_*) + MIN_UNSUPPORTED_GCC_VER="9.0" + MIN_UNSUPPORTED_GCC_VER_NUM=90000; + CUSOLVER=true ;; *) echo "Unsupported CUDA_VERSION (CUDA_VERSION=$CUDA_VERSION), please report it to Kaldi mailing list, together with 'nvcc -h' or 'ptxas -h' which lists allowed -gencode values..."; exit 1; @@ -430,14 +471,27 @@ function configure_cuda { fi if [ -z "$CUDA_ARCH" ]; then - case $CUDA_VERSION in - 5_5) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35" ;; - 6_*) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50" ;; - 7_*) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_53,code=sm_53" ;; - 8_*) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_53,code=sm_53 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_62,code=sm_62" ;; - 9_*) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_53,code=sm_53 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_62,code=sm_62 -gencode arch=compute_70,code=sm_70" ;; - 10_*) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_53,code=sm_53 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_62,code=sm_62 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_72,code=sm_72 -gencode arch=compute_75,code=sm_75" ;; - *) echo "Unsupported CUDA_VERSION (CUDA_VERSION=$CUDA_VERSION), please report it to Kaldi mailing list, together with 'nvcc -h' or 'ptxas -h' which lists allowed -gencode values..."; exit 1 ;; + case `uname -m` in + x86_64|ppc64le) + case $CUDA_VERSION in + 5_5) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35" ;; + 6_*) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode 
arch=compute_50,code=sm_50" ;; + 7_*) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52" ;; + 8_*) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61" ;; + 9_*) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70" ;; + 10_*) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75" ;; + *) echo "Unsupported CUDA_VERSION (CUDA_VERSION=$CUDA_VERSION), please report it to Kaldi mailing list, together with 'nvcc -h' or 'ptxas -h' which lists allowed -gencode values..."; exit 1 ;; + esac + ;; + aarch64) + case $CUDA_VERSION in + 7_*) CUDA_ARCH="-gencode arch=compute_53,code=sm_53" ;; + 8_*|9_*) CUDA_ARCH="-gencode arch=compute_53,code=sm_53 -gencode arch=compute_62,code=sm_62" ;; + 10_*) CUDA_ARCH="-gencode arch=compute_53,code=sm_53 -gencode arch=compute_62,code=sm_62 -gencode arch=compute_72,code=sm_72" ;; + *) echo "Unsupported CUDA_VERSION (CUDA_VERSION=$CUDA_VERSION), please report it to Kaldi mailing list, together with 'nvcc -h' or 'ptxas -h' which lists allowed -gencode values..."; exit 1 ;; + esac + ;; + *) echo "Unsupported architecture for use of Kaldi with CUDA. Please report it to Kaldi mailing list."; exit 1 ;; esac fi @@ -448,9 +502,12 @@ function configure_cuda { echo CUDA = true >> kaldi.mk echo CUDATKDIR = $CUDATKDIR >> kaldi.mk echo "CUDA_ARCH = $CUDA_ARCH" >> kaldi.mk + + echo >> kaldi.mk - # 64bit/32bit? We do not support cross compilation with CUDA so, use direct calls to uname -m here + # 64bit/32bit? We do not support cross compilation with CUDA so, use direct + # calls to uname -m here if [ "`uname -m`" == "x86_64" ]; then if [ "`uname`" == "Darwin" ]; then sed 's/lib64/lib/g' < makefiles/cuda_64bit.mk >> kaldi.mk @@ -462,14 +519,22 @@ function configure_cuda { elif [ "`uname -m`" == "ppc64le" ]; then cat makefiles/cuda_64bit.mk >> kaldi.mk else - echo "CUDA will not be used! CUDA is not supported with 32-bit builds." + echo "\ +WARNING: CUDA will not be used! + CUDA is not supported with 32-bit builds." exit 1; fi + + #add cusolver flags for newer toolkits + if [ "$CUSOLVER" == "true" ]; then + echo "CUDA_LDLIBS += -lcusolver" >> kaldi.mk + fi else - echo "CUDA will not be used! If you have already installed cuda drivers " - echo "and cuda toolkit, try using --cudatk-dir=... option. Note: this is" - echo "only relevant for neural net experiments" + echo "\ +WARNING: CUDA will not be used! If you have already installed cuda drivers + and CUDA toolkit, try using the --cudatk-dir= option. A GPU and CUDA + are required to run neural net experiments in a realistic time." fi } @@ -489,8 +554,9 @@ function linux_configure_speex { spx_type=so fi if [ ! 
-f "$SPEEXLIBDIR/libspeex.${spx_type}" ];then - echo "Info: configuring Kaldi not to link with Speex (don't worry, it's only needed if you" - echo "intend to use 'compress-uncompress-speex', which is very unlikely)" + echo "\ +INFO: Configuring Kaldi not to link with Speex. Don't worry, it's only needed if + you intend to use 'compress-uncompress-speex', which is very unlikely." return fi @@ -511,17 +577,11 @@ function linux_configure_speex { fi } -function linux_atlas_failure { +function linux_configure_atlas_failure { echo ATLASINC = $ATLASROOT/include >> kaldi.mk echo ATLASLIBS = [somewhere]/liblapack.a [somewhere]/libcblas.a [somewhere]/libatlas.a [somewhere]/libf77blas.a $ATLASLIBDIR >> kaldi.mk echo >> kaldi.mk - if [[ "$TARGET_ARCH" == arm* ]]; then - cat makefiles/linux_atlas_arm.mk >> kaldi.mk - elif [[ "$TARGET_ARCH" == ppc64le ]]; then - cat makefiles/linux_atlas_ppc64le.mk >> kaldi.mk - else - cat makefiles/linux_atlas.mk >> kaldi.mk - fi + echo "** $* ***" echo "** ERROR **" echo "** Configure cannot proceed automatically." @@ -535,11 +595,11 @@ function linux_atlas_failure { echo "**" echo "** Otherwise (or if you prefer OpenBLAS for speed), you could go the OpenBLAS" echo "** route: cd to ../tools, type 'extras/install_openblas.sh', cd back to here," - echo "** and type './configure --openblas-root=../tools/OpenBLAS/install'" + echo "** and type './configure --mathlib=OPENBLAS'" exit 1; } -function linux_check_static { +function linux_atlas_check_static { # will exit with success if $dir seems to contain ATLAS libraries with # right architecture (compatible with default "nm") echo "int main(void) { return 0; }" > test_linking.cc; @@ -579,19 +639,10 @@ function linux_configure_atlas_generic { echo ATLASINC = $ATLASROOT/include >> kaldi.mk echo ATLASLIBS = $ATLASLIBS -Wl,-rpath=$libdir >> kaldi.mk echo >> kaldi.mk - if [[ "$TARGET_ARCH" == arm* ]]; then - cat makefiles/linux_atlas_arm.mk >> kaldi.mk - elif [[ "$TARGET_ARCH" == ppc64le ]]; then - cat makefiles/linux_atlas_ppc64le.mk >> kaldi.mk - else - cat makefiles/linux_atlas.mk >> kaldi.mk - fi echo "Successfully configured ATLAS with ATLASLIBS=$ATLASLIBS" - $use_cuda && configure_cuda - linux_configure_speex } -function linux_configure_redhat_fat { +function linux_configure_atlas_redhat_fat { # This is for when only two so-called 'fat' ATLAS libs are provided: # libsatlas.so.3 and libtatlas.so.3. # See http://stackoverflow.com/questions/13439296/build-shared-libraries-in-atlas. @@ -601,19 +652,11 @@ function linux_configure_redhat_fat { [ ! -f $f ] && return 1; done libdir=$(dirname $(echo $ATLASLIBS | awk '{print $1}')) - [ -z "$libdir" ] && echo "Error getting libdir in linux_configure_redhat_fat" && exit 1; + [ -z "$libdir" ] && echo "Error getting libdir in linux_configure_atlas_redhat_fat" && exit 1; echo ATLASINC = $ATLASROOT/include >> kaldi.mk echo ATLASLIBS = $ATLASLIBS -Wl,-rpath=$libdir >> kaldi.mk echo >> kaldi.mk - if [[ "$TARGET_ARCH" == arm* ]]; then - cat makefiles/linux_atlas_arm.mk >> kaldi.mk - elif [[ "$TARGET_ARCH" == ppc64le ]]; then - cat makefiles/linux_atlas_ppc64le.mk >> kaldi.mk - else - cat makefiles/linux_atlas.mk >> kaldi.mk - fi echo "Successfully configured for red hat [dynamic libraries, fat] with ATLASLIBS =$ATLASLIBS" - $use_cuda && configure_cuda } function linux_configure_atlas_static { @@ -622,7 +665,7 @@ function linux_configure_atlas_static { if [ -z $ATLASLIBDIR ]; then # Note: it'll pick up the last one below. 
for dir in /usr{,/local}/lib{64,}{,/atlas,/atlas-sse2,/atlas-sse3} \ /usr/local/atlas/lib{,64} `pwd`/../tools/ATLAS/build/install/lib/ $ATLASROOT/lib; do - linux_check_static && ATLASLIBDIR=$dir + linux_atlas_check_static && ATLASLIBDIR=$dir done if [ -z $ATLASLIBDIR ]; then # Note: it'll pick up the last one below. echo "Could not find libatlas.a in any of the generic-Linux places, but we'll try other stuff..." @@ -662,115 +705,24 @@ function linux_configure_atlas_static { echo ATLASINC = $ATLASROOT/include >> kaldi.mk echo ATLASLIBS = $ATLASLIBS >> kaldi.mk echo >> kaldi.mk - if [[ "$TARGET_ARCH" == arm* ]]; then - cat makefiles/linux_atlas_arm.mk >> kaldi.mk - elif [[ "$TARGET_ARCH" == ppc64le ]]; then - cat makefiles/linux_atlas_ppc64le.mk >> kaldi.mk - else - cat makefiles/linux_atlas.mk >> kaldi.mk - fi - $use_cuda && configure_cuda - linux_configure_speex echo "Successfully configured for Linux [static libraries] with ATLASLIBS =$ATLASLIBS" } -function linux_check_dynamic { - # will exit with success if $dir seems to contain ATLAS libraries with - # right architecture (compatible with default "nm") - if $threaded_atlas; then pt=t; else pt=s; fi - for atlas_libname in libatlas.so lib${pt}atlas.so; do - if [ -f $dir/$atlas_libname ]; then # candidate... - if nm --dynamic $dir/$atlas_libname 2>&1 | grep "File format not recognized" >/dev/null; then - echo "Directory $dir may contain dynamic ATLAS libraries but seems to be wrong architecture"; - return 1; - fi - echo "Atlas found in $dir"; - return 0; - fi - done - # echo "... no {libatlas,lib${pt}atlas}.so in $dir"; - return 1; -} - -function linux_configure_dynamic { - if $threaded_atlas; then pt=t; else pt=s; fi # relevant to "fat" libraries, will change later for separate ones - if [ -z $ATLASLIBDIR ]; then # Note: it'll pick up the last one below. - for dir in /usr{,/local}/lib{,64}{,/atlas,/atlas-sse2,/atlas-sse3,/x86_64-linux-gnu} \ - `pwd`/../tools/ATLAS/build/install/lib/ $ATLASROOT/lib; do - linux_check_dynamic && ATLASLIBDIR=$dir && ATLASLIBNAME=$atlas_libname - done - if [ -z $ATLASLIBDIR -o -z $ATLASLIBNAME ]; then - echo "Could not find {libatlas,lib${pt}atlas}.so in any of the obvious places, will most likely try static:" - return 1; - fi - fi - - # If using "fat" libraries we only need one file to link against - if [ $ATLASLIBNAME != libatlas.so ]; then - if [ -f $ATLASLIBDIR/$ATLASLIBNAME ]; then - ATLASLIBS="$ATLASLIBDIR/$ATLASLIBNAME" - else - echo "Configuring dynamic ATLAS library failed: library $ATLASLIBNAME not found in $ATLASLIBDIR" - return 1; - fi - else # with "thin" libraries, we have several object to link against, and different single/multi-thread names - if $threaded_atlas; then pt=pt; else pt=""; fi - echo "Validating presence of ATLAS libs in $ATLASLIBDIR" - ATLASLIBS= - # The Lapack part of ATLAS seems to appear under various different names.. but it - # should always have symbols like clapack_cgetrf and ATL_cgetrf defined, so we test for this. - for libname in lapack lapack_atlas clapack; do - if [ -f $ATLASLIBDIR/lib${libname}.so -a "$ATLASLIBS" == "" ]; then - if nm --dynamic $ATLASLIBDIR/lib${libname}.so | grep clapack_cgetrf >/dev/null && \ - nm --dynamic $ATLASLIBDIR/lib${libname}.so | grep ATL_cgetrf >/dev/null; then - ATLASLIBS="$ATLASLIBDIR/lib${libname}.so" - echo "Using library $ATLASLIBS as ATLAS's CLAPACK library." 
- fi - fi - done - if [ "$ATLASLIBS" == "" ]; then - echo Could not find any libraries $ATLASLIBDIR/{liblapack,liblapack_atlas,libclapack} that seem to be an ATLAS CLAPACK library. - return 1; - fi - - for x in ${pt}cblas atlas ${pt}f77blas; do - if [ ! -f $ATLASLIBDIR/lib$x.so ]; then - echo "Configuring dynamic ATLAS libraries failed: Could not find library $x in directory $ATLASLIBDIR" - return 1; - fi - ATLASLIBS="$ATLASLIBS $ATLASLIBDIR/lib${x}.so" - done - if $threaded_atlas; then ATLASLIBS="$ATLASLIBS"; fi - fi - - echo ATLASINC = $ATLASROOT/include >> kaldi.mk - echo ATLASLIBS = $ATLASLIBS >> kaldi.mk - echo ATLASLDFLAGS = -Wl,-rpath,$ATLASLIBDIR >> kaldi.mk - echo >> kaldi.mk - if [[ "$TARGET_ARCH" == arm* ]]; then - cat makefiles/linux_atlas_arm.mk >> kaldi.mk - elif [[ "$TARGET_ARCH" == ppc64le ]]; then - cat makefiles/linux_atlas_ppc64le.mk >> kaldi.mk - else - cat makefiles/linux_atlas.mk >> kaldi.mk - fi - $use_cuda && configure_cuda - linux_configure_speex - echo "Successfully configured for Linux [dynamic libraries] with ATLASLIBS =$ATLASLIBS" -} - ############################# CONFIGURATION ############################# # If configuration sets any of these variables, we will switch the external # math library. Here we unset them so that we can check later. -unset MKLROOT -unset CLAPACKROOT -unset OPENBLASROOT -unset MKLLIBDIR +#TODO(kkm): Maybe allow env vars to provide defaults? +ATLASROOT= +CLAPACKROOT= +MATHLIB= +MKLLIBDIR= +MKLROOT= +OPENBLASROOT= # This variable identifies the type of system where built programs and # libraries will run. It is set by the configure script when cross compiling. -unset HOST +HOST= # These environment variables can be used to override the default toolchain. CXX=${CXX:-g++} @@ -796,8 +748,6 @@ threaded_atlas=false mkl_threading=sequential android=false -MATHLIB='ATLAS' -ATLASROOT=`rel2abs ../tools/ATLAS_headers/` FSTROOT=`rel2abs ../tools/openfst` CUBROOT=`rel2abs ../tools/cub` @@ -831,7 +781,7 @@ do double_precision=false; shift ;; --atlas-root=*) - ATLASROOT=`read_dirname $1`; + GetSwitchExistingPathOrDie ATLASROOT "$1" shift ;; --threaded-atlas) threaded_atlas=true; @@ -886,56 +836,57 @@ do mkl_threading=sequential; shift ;; --mkl-threading=*) - mkl_threading=`read_value $1`; + GetSwitchValueOrDie mkl_threading "$1" threaded_atlas=true; shift ;; --fst-root=*) - FSTROOT=`read_dirname $1`; + GetSwitchExistingPathOrDie FSTROOT "$1" shift ;; --cub-root=*) - CUBROOT=`read_dirname $1`; + GetSwitchExistingPathOrDie CUBROOT "$1" shift ;; --clapack-root=*) - CLAPACKROOT=`read_dirname $1`; + GetSwitchExistingPathOrDie CLAPACKROOT "$1" shift ;; --openblas-root=*) - OPENBLASROOT=`read_dirname $1`; + GetSwitchExistingPathOrDie OPENBLASROOT "$1" shift ;; --mkl-root=*) - MKLROOT=`read_dirname $1`; + GetSwitchExistingPathOrDie MKLROOT "$1" shift ;; --mkl-libdir=*) - MKLLIBDIR=`read_dirname $1`; + GetSwitchExistingPathOrDie MKLLIBDIR "$1" shift ;; --speex-root=*) - SPEEXROOT=`read_dirname $1`; + GetSwitchExistingPathOrDie SPEEXROOT "$1" shift ;; --speex-libdir=*) - SPEEXLIBDIR=`read_dirname $1`; + GetSwitchExistingPathOrDie SPEEXLIBDIR "$1" shift ;; --speex-incdir=*) - SPEEXINCDIR=`read_dirname $1`; + GetSwitchExistingPathOrDie SPEEXINCDIR "$1" shift ;; --omp-libdir=*) - OMPLIBDIR=`read_dirname $1`; + GetSwitchExistingPathOrDie OMPLIBDIR "$1" shift ;; --mathlib=*) - MATHLIB=`read_value $1`; + GetSwitchValueOrDie MATHLIB "$1" shift ;; --cudatk-dir=*) - CUDATKDIR=`read_dirname $1`; - shift ;; #CUDA is used in src/cudamatrix and src/nnet{,bin} only + # 
CUDA is used in src/cudamatrix and src/nnet{,bin} only. + GetSwitchExistingPathOrDie CUDATKDIR "$1" + shift ;; --cuda-arch=*) - CUDA_ARCH=`read_value $1`; + GetSwitchValueOrDie CUDA_ARCH "$1" shift;; --fst-version=*) - OPENFST_VER=`read_value $1`; + GetSwitchValueOrDie OPENFST_VER "$1" shift;; --host=*) # The type of system where built programs and libraries will run. # It should be in the format cpu-vendor-os. If specified, this script # will infer the target architecture from the specified host triple. - HOST=`read_value $1`; + GetSwitchValueOrDie HOST "$1" shift ;; --android-incdir=*) android=true; @@ -944,7 +895,7 @@ do static_fst=true; dynamic_kaldi=false; MATHLIB='OPENBLAS'; - ANDROIDINC=`read_dirname $1`; + GetSwitchExistingPathOrDie ANDROIDINC "$1" shift;; *) echo "Unknown argument: $1, exiting"; usage; exit 1 ;; esac @@ -978,7 +929,8 @@ if is_set $HOST; then IFS='-' read -ra PARTS <<< "$HOST" # The first field in the PARTS list is the target architecture. TARGET_ARCH="$PARTS" - if [[ "$TARGET_ARCH" != aarch64* && "$TARGET_ARCH" != arm* && "$TARGET_ARCH" != ppc64le && "$TARGET_ARCH" != x86* ]] ; then + if [[ "$TARGET_ARCH" != aarch64* && "$TARGET_ARCH" != arm* && "$TARGET_ARCH" != ppc64le && \ + "$TARGET_ARCH" != x86* && "$TARGET_ARCH" != i686* ]] ; then # We currently only support building for x86[_64], arm*, aarch64* and ppc64le. # If TARGET_ARCH was read from the HOST variable, it must be one of these. failure "$TARGET_ARCH is not a supported architecture. @@ -988,13 +940,67 @@ else TARGET_ARCH="`uname -m`" fi -# If one of these variables is set, we switch the external math library. -is_set $MKLLIBDIR && echo "Configuring KALDI to use MKL" && export MATHLIB="MKL" -is_set $MKLROOT && echo "Configuring KALDI to use MKL"&& export MATHLIB="MKL" -is_set $CLAPACKROOT && echo "Configuring KALDI to use CLAPACK"&& export MATHLIB="CLAPACK" -is_set $OPENBLASROOT && echo "Configuring KALDI to use OPENBLAS"&& export MATHLIB="OPENBLAS" +#------------------------------------------------------------------------------ +# Matrix algebra library selection and validation. +#-------------- + +declare -a mathlibs # Contains e. g. 'atlas', 'mkl' +declare -a incompat # Contains mutually-inconsistent switches, if any. +auto_lib= # Deduced lib name, used when $MATHLIB is not set. + +# Validate the (optionally) provided MATHLIB value. +case $MATHLIB in + ''|ATLAS|CLAPACK|MKL|OPENBLAS) : ;; + *) failure "Unknown --mathlib='${MATHLIB}'. Supported libs: ATLAS CLAPACK MKL OPENBLAS" ;; +esac + +# See which library-root switches are set, what mathlib they imply, and whether +# there are any conflicts betweeh the switches. +[[ $MKLLIBDIR || $MKLROOT ]] && { mathlibs+=(mkl); auto_lib=MKL; } +[[ $CLAPACKROOT ]] && { mathlibs+=(clapack); auto_lib=CLAPACK; } +[[ $OPENBLASROOT ]] && { mathlibs+=(openblas); auto_lib=OPENBLAS; } +[[ $ATLASROOT ]] && { mathlibs+=(atlas); auto_lib=ATLAS; } + +# When --mathlib= is explicitly provided, and some mathlib(s) deduced, but +# MATHLIB is not among them, record a conflict for the --mathlib= value. +shopt -s nocasematch +[[ $MATHLIB && $mathlibs && ! " ${mathlibs[@]} " =~ " $MATHLIB " ]] && + incompat+=(--mathlib=$MATHLIB) +shopt -u nocasematch + +# If more than one library specified, or a conflict has been recorded above +# already, then add all deduced libraries as conflicting options (not all may +# be conflicting sensu stricto, but let the user deal with it). 
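+# For example, './configure --mkl-root=/opt/intel/mkl --openblas-root=/usr'
+# deduces both MKL and OpenBLAS and fails with '--mkl-root=' and
+# '--openblas-root=' reported as incompatible; likewise an explicit
+# '--mathlib=ATLAS' combined with '--mkl-root=...' is rejected.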
+if [[ ${#mathlibs[@]} -gt 1 || $incompat ]]; then + for libpfx in "${mathlibs[@]}"; do + # Handle --mkl-libdir out of common pattern. + [[ $libpfx == mkl && $MKLLIBDIR ]] && incompat+=(--mkl-libdir=) + # All other switches follow the pattern --$libpfx-root. + incompat+=(--$(lcase $libpfx)-root=) + done + failure "Incompatible configuration switches: ${incompat[@]}" +fi + +# When no library roots were provided, so that auto_lib is not deduced, and +# MATHLIB is also not explicitly provided by the user, then default to MKL. +[[ ! $auto_lib && ! $MATHLIB ]] && + case $TARGET_ARCH in + x86_64) auto_lib=MKL ;; + *) auto_lib=OPENBLAS ;; + esac +: ${MATHLIB:=$auto_lib} +export MATHLIB #TODO(kkm): Likely not needed. Briefly tested without, + # but left in the hotfix. Remove when doing the #3192. + +# Define default library roots where known (others may be found by probing). +case $MATHLIB in + MKL) [[ ! $MKLLIBDIR && ! $MKLROOT ]] && MKLROOT=/opt/intel/mkl ;; + ATLAS) : ${ATLASROOT:=$(rel2abs ../tools/ATLAS_headers/)} ;; +esac + +unset auto_lib incompat libpfx mathlibs -echo "Configuring ..." +echo "Configuring KALDI to use ${MATHLIB}." # Back up the old kaldi.mk in case we modified it if [ -f kaldi.mk ]; then @@ -1173,10 +1179,16 @@ elif [ "`uname`" == "Linux" ]; then linux_configure_atlas_generic /usr/lib64/atlas "so.3" || \ linux_configure_atlas_generic /usr/lib/x86_64-linux-gnu/ "so.3" || \ linux_configure_atlas_generic /usr/lib/x86_64-linux-gnu/ "so" || \ - linux_configure_redhat_fat 64 || \ - linux_configure_redhat_fat || \ + linux_configure_atlas_redhat_fat 64 || \ + linux_configure_atlas_redhat_fat || \ linux_configure_atlas_static || \ - linux_atlas_failure "Failed to configure ATLAS libraries"; + linux_configure_atlas_failure "Failed to configure ATLAS libraries"; + + case $TARGET_ARCH in + arm*) cat makefiles/linux_atlas_arm.mk ;; + ppc64le) cat makefiles/linux_atlas_ppc64le.mk ;; + *) cat makefiles/linux_atlas.mk ;; + esac >> kaldi.mk elif [ "$MATHLIB" == "MKL" ]; then if [ "$TARGET_ARCH" != "x86_64" ]; then @@ -1188,7 +1200,7 @@ elif [ "`uname`" == "Linux" ]; then echo -n "Configuring MKL library directory: " MKLLIBDIR=`linux_configure_mkllibdir $MKLROOT` if [ $? -ne 0 ]; then - failure "MKL libraries could not be found. Please use the switch --mkl-libdir " + failure "MKL libraries could not be found. Please use the switch --mkl-libdir or try another math library, e.g. --mathlib=ATLAS (would be slower)" else echo "Found: $MKLLIBDIR" fi @@ -1225,8 +1237,6 @@ elif [ "`uname`" == "Linux" ]; then cat makefiles/linux_x86_64_mkl.mk >> kaldi.mk echo "MKLFLAGS = ${MKL_LINK_LINE} ${THREADING_LINE} $EXTRA_LIBS " >> kaldi.mk echo "Successfully configured for Linux with MKL libs from $MKLROOT" - $use_cuda && configure_cuda - linux_configure_speex elif [ "$MATHLIB" == "CLAPACK" ]; then if [ -z "$CLAPACKROOT" ]; then @@ -1250,13 +1260,28 @@ elif [ "`uname`" == "Linux" ]; then fi echo "Warning (CLAPACK): this part of the configure process is not properly tested and may not work." echo "Successfully configured for Linux with CLAPACK libs from $CLAPACKROOT" - $use_cuda && configure_cuda - linux_configure_speex elif [ "$MATHLIB" == "OPENBLAS" ]; then - OPENBLASROOT=`rel2abs "$OPENBLASROOT"` - if [ -z "$OPENBLASROOT" ]; then - failure "Must specify the location of OPENBLAS with --openblas-root option (and it must exist)" + if [[ ! 
$OPENBLASROOT ]]; then + # Either the user specified --mathlib=OPENBLAS or we've autodetected the + # system where OpenBLAS is the preferred option (the parser for + # --openblas-root fails fatally if the path does not exist, so we trust + # that if set, the variable contains the existing path, converted to + # absolute form). + OPENBLASROOT="$(rel2abs ../tools/OpenBLAS/install)" || + Die "OpenBLAS not found in '../tools/OpenBLAS/install'. +** This is the only place we look for it. The best option is to build OpenBLAS +** tuned for your system and CPU. To do that, run the following commands: +** +** cd ../tools; extras/install_openblas.sh +** +** Another option is to specify the location of existing OpenBLAS directory +** with the switch '--openblas-root='. However, even if a package is provided +** for your system, the packaged version is almost always significantly slower +** and often older than the above commands can fetch and build. +** +** You can also use other matrix algebra libraries. For information, see: +** http://kaldi-asr.org/doc/matrixwrap.html" fi if [ -f $OPENBLASROOT/lib/libopenblas.so ]; then OPENBLASLIBDIR=$OPENBLASROOT/lib @@ -1272,17 +1297,21 @@ elif [ "`uname`" == "Linux" ]; then # in REDHAT/CentOS/Ubuntu package installs, the includes are located here OPENBLASINCDIR=$OPENBLASROOT/include/openblas else - echo "$0: ***** Using OpenBlas from $OPENBLASROOT but cblas.h is not found. " - echo " ****** Assuming openblas is aleady in a default include path, but" - echo " ***** if you get compilation messages about not finding files like cblas.h," - echo " ***** you should look into this (e.g. make sure to install the 'openblas-dev' package," - echo " ***** if it is a package-based install)." + echo "$0: ***** Using OpenBLAS from $OPENBLASROOT but cblas.h is not found. " + echo "** Assuming openblas is aleady in a default include path, but" + echo "** if you get compilation messages about not finding files like cblas.h," + echo "** you should look into this (e.g. make sure to install the 'openblas-dev' package," + echo "** if it is a package-based install)." OPENBLASINCDIR="/usr/include" fi echo "Your math library seems to be OpenBLAS from $OPENBLASROOT. Configuring appropriately." + # TODO(kkm): Probably, OpenBLAS required libgfortran.so.3 at some point, but + # no longer does. *My* linker does not complain about a missing library, but + # is it safe to keep the reference if no longer required? Try to figure out + # how long ago the dependency was dropped. 
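+  # Note on the static link line below: '-l:libopenblas.a' asks the GNU linker
+  # for that exact file name instead of the usual lib<name>.so/.a search, which
+  # keeps the -L/-l form consistent between the static and dynamic cases.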
if $static_math; then echo "Configuring static OpenBlas since --static-math=yes" - OPENBLASLIBS="$OPENBLASLIBDIR/libopenblas.a -lgfortran" + OPENBLASLIBS="-L$OPENBLASLIBDIR -l:libopenblas.a -lgfortran" else echo "Configuring dynamically loaded OpenBlas since --static-math=no (the default)" OPENBLASLIBS="-L$OPENBLASLIBDIR -lopenblas -lgfortran -Wl,-rpath=$OPENBLASLIBDIR" @@ -1290,22 +1319,19 @@ elif [ "`uname`" == "Linux" ]; then echo "OPENBLASINC = $OPENBLASINCDIR" >> kaldi.mk echo "OPENBLASLIBS = $OPENBLASLIBS" >> kaldi.mk echo >> kaldi.mk - if [[ "$TARGET_ARCH" == arm* ]]; then - cat makefiles/linux_openblas_arm.mk >> kaldi.mk - elif [[ "$TARGET_ARCH" == aarch64* ]]; then - cat makefiles/linux_openblas_aarch64.mk >> kaldi.mk - elif [[ "$TARGET_ARCH" == ppc64le ]]; then - cat makefiles/linux_openblas_ppc64le.mk >> kaldi.mk - else - cat makefiles/linux_openblas.mk >> kaldi.mk - fi - echo "Successfully configured for Linux with OpenBLAS from $OPENBLASROOT" - $use_cuda && configure_cuda - linux_configure_speex + case $TARGET_ARCH in + aarch64*) cat makefiles/linux_openblas_aarch64.mk ;; + arm*) cat makefiles/linux_openblas_arm.mk ;; + ppc64le) cat makefiles/linux_openblas_ppc64le.mk ;; + *) cat makefiles/linux_openblas.mk ;; + esac >> kaldi.mk + echo "Successfully configured for Linux with OpenBLAS from $OPENBLASROOT" else failure "Unsupported linear algebra library '$MATHLIB'" fi + $use_cuda && configure_cuda + linux_configure_speex else failure "Could not detect the platform or we have not yet worked out the appropriate configuration for this platform. Please contact the developers." @@ -1323,7 +1349,13 @@ if [ -n "$ENV_LDLIBS" ]; then echo "LDLIBS += $ENV_LDLIBS" >> kaldi.mk; fi # We check for slow exp implementation just before we exit. This check uses # and possibly modifies the kaldi.mk file that we just generated. check_for_slow_expf; -echo "SUCCESS" -echo "To compile: make clean -j; make depend -j; make -j" -echo " ... or e.g. -j 10, instead of -j, to use a specified number of CPUs" -exit 0; +echo "Kaldi has been successfully configured. To compile: + + make -j clean depend; make -j + +where is the number of parallel builds you can afford to do. If unsure, +use the smaller of the number of CPUs or the amount of RAM in GB divided by 2, +to stay within safe limits. 'make -j' without the numeric value may not limit +the number of parallel jobs at all, and overwhelm even a powerful workstation, +since Kaldi build is highly parallelized." 
+exit 0 diff --git a/src/cudadecoder/Makefile b/src/cudadecoder/Makefile new file mode 100644 index 00000000000..166f72e060f --- /dev/null +++ b/src/cudadecoder/Makefile @@ -0,0 +1,34 @@ +all: + +EXTRA_CXXFLAGS = -Wno-sign-compare +include ../kaldi.mk + +ifeq ($(CUDA), true) + +# Make sure we have CUDA_ARCH from kaldi.mk, +ifndef CUDA_ARCH + $(error CUDA_ARCH is undefined, run 'src/configure') +endif + +TESTFILES = + +OBJFILES = batched-threaded-nnet3-cuda-pipeline.o decodable-cumatrix.o \ + cuda-decoder.o cuda-decoder-kernels.o cuda-fst.o + +LDFLAGS += $(CUDA_LDFLAGS) +LDLIBS += $(CUDA_LDLIBS) + +LIBNAME = kaldi-cudadecoder + +ADDLIBS = ../cudamatrix/kaldi-cudamatrix.a ../base/kaldi-base.a ../matrix/kaldi-matrix.a \ + ../lat/kaldi-lat.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../gmm/kaldi-gmm.a \ + ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a ../gmm/kaldi-gmm.a ../transform/kaldi-transform.a \ + ../tree/kaldi-tree.a ../online2/kaldi-online2.a ../nnet3/kaldi-nnet3.a \ + ../cudafeat/kaldi-cudafeat.a + +# Implicit rule for kernel compilation +%.o : %.cu + $(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ -I$(OPENFSTINC) +endif + +include ../makefiles/default_rules.mk diff --git a/src/cudadecoder/README b/src/cudadecoder/README new file mode 100644 index 00000000000..64aeee3fa35 --- /dev/null +++ b/src/cudadecoder/README @@ -0,0 +1,141 @@ +CUDADECODER USAGE AND TUNING GUIDE + +INTRODUCTION: + +The CudaDecoder was developed by NVIDIA with coordination from Johns Hopkins. +This work was intended to demonstrate efficient GPU utilization across a range +of NVIDIA hardware from SM_35 and on. The following guide describes how to +use and tune the decoder for your models. + +A single speech-to-text is not enough work to fully saturate any NVIDIA GPUs. +To fully saturate GPUs we need to decode many audio files concurrently. The +solution provide does this through a combination of batching many audio files +into a single speech pipeline, running multiple pipelines in parallel on the +device, and using multiple CPU threads to perform feature extraction and +determinization. Users of the decoder will need to have a high level +understanding of the underlying implementation to know how to tune the +decoder. + +The interface to the decoder is defined in "batched-threaded-cuda-decoder.h". +A binary example can be found in cudadecoderbin/batched-wav-nnet3-cuda.cc". +Below is a simple usage example. +/* + * BatchedThreadedCudaDecoderConfig batchedDecoderConfig; + * batchedDecoderConfig.Register(&po); + * po.Read(argc, argv); + * ... + * BatchedThreadedCudaDecoder CudaDecoder(batchedDecoderConfig); + * CudaDecoder.Initialize(*decode_fst, am_nnet, trans_model); + * ... + * + * for (; !wav_reader.Done(); wav_reader.Next()) { + * std::string key = wav_reader.Key(); + * CudaDecoder.OpenDecodeHandle(key, wave_reader.Value()); + * ... + * } + * + * while (!processed.empty()) { + * CompactLattice clat; + * CudaDecoder.GetLattice(key, &clat); + * CudaDecoder.CloseDecodeHandle(key); + * ... + * } + * + * CudaDecoder.Finalize(); + */ + +In the code above we first declare a BatchedThreadedCudaDecoderConfig +and register its options. This enables us to tune the configuration +options. Next we declare the CudaDecoder with that configuration. +Before we can use the CudaDecoder we need to initalize it with an +FST, AmNnetSimple, and TransitionModel. + +Next we iterate through waves and enqueue them into the decoder by +calling OpenDecodeHandle. 
Note the key must be unique for each +decode. Once we have enqueued work we can query the results by calling +GetLattice on the same key we opened the handle on. This will automatticaly +wait for processing to complete before returning. + +The key to get performance is to have many decodes active at the same time +by opening many decode handles before querying for the lattices. + + +PERFORMANCE TUNING: + +The CudaDecoder has a lot of tuning parameters which should be used to +increase performance on various models and hardware. Note that it is +expected that the optimal parameters will vary according to both the hardware, +model, and data being decoded. + +The following will briefly describe each parameter: + +BatchedThreadedCudaDecoderOptions: + cuda-control-threads: Number of CPU threads simultaniously submitting work + to the device. For best performance this should be between 2-4. + cuda-worker-threads: CPU threads for worker tasks like determinization and + feature extraction. For best performance this should take up all spare + CPU threads available on the system. + max-batch-size: Maximum batch size in a single pipeline. This should be as + large as possible but is expected to be between 50-200. + batch-drain-size: How far to drain the batch before getting new work. + Draining the batch allows nnet3 to be better batched. Testing has + indicated that 10-30% of max-batch-size is ideal. + determinize-lattice: Use cuda-worker-threads to determinize the lattice. if + this is true then GetRawLattice can no longer be called. + max-outstanding-queue-length: The maximum number of decodes that can be + queued and not assigned before OpenDecodeHandle will automatically stall + the submitting thread. Raising this increases CPU resources. This should + be set to a few thousand at least. + +Decoder Options: + beam: The width of the beam during decoding + lattice-beam: The width of the lattice beam + ntokens-preallocated: number of tokens allocated in host buffers. If + this size is exceeded the buffer will reallocate larger consuming more + resources + max-tokens-per-frame: maximum tokens in GPU memory per frame. If this + value is exceeded the beam will tighten and accuracy may decrease. + max-active: at the end of each frame computation, we keep only its best max-active tokens (arc instantiations) + +Device Options: + use-tensor-cores: Enables tensor core (fp16 math) for gemms. This is + faster but less accurate. For inference the loss of accuracy is marginal + +GPU MEMORY USAGE: + +GPU memory is limited. Large GPUs have between 16-32GB of memory. Consumer +GPUs have much less. For best performance users should have as many +concurrent decodes as possible. Thus users should purchase GPUs with as +much memory as possible. GPUs with less memory may have to sacrifice either +performance or accuracy. On 16GB GPUs for example we are able to support +around 200 concurrent decodes at a time. This translates into 4 +cuda-control-threads and a max-batch-size of 50 (4x50). If your model is +larger or smaller than the models our models when testing you may have to +raise or lower this. + +There are a number of parameters which can be used to control GPU memory +usage. How they impact memory usage and accuracy is discussed below: + + max-tokens-per-frame: Controls how many buffers can be stored on the GPU for + each frame. This buffer size cannot be exceed or reallocated. As this + buffer gets closer to being exhausted the beam is reduced possibly reducing + quality. 
This should be tuned according to the model and data. For + example, a highly accurate model could set this values smaller to enable + more concurrent decodes. + + cuda-control-threads: Each control thread is a concurrent pipeline. Thus + the GPU memory scales linearly with this parameter. This should always be + at least 2 but should probably not be higher than 4 as more concurrent + pipelines leads to more driver contention reducing performance. + + max-batch-size: The number of concurrent decodes in each pipeline. The + memory usage also scales linear with this parameter. Setting this smaller + will reduce kernel runtime while increase launch latency overhead. + Ideally this should be as large as possible while still fitting into + memory. Note that currently the maximum allowed is 200. + +== Acknowledgement == + +We would like to thank Daniel Povey, Zhehuai Chen and Daniel Galvez for their help and expertise during the review process. + + diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc new file mode 100644 index 00000000000..d3ad909d80a --- /dev/null +++ b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc @@ -0,0 +1,931 @@ +// cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc +// +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// Hugo Braun, Justin Luitjens, Ryan Leary +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#define SLEEP_BACKOFF_NS 500 +#define SLEEP_BACKOFF_S ((double)SLEEP_BACKOFF_NS / 1e9) +#if HAVE_CUDA == 1 + +#include "cudadecoder/batched-threaded-nnet3-cuda-pipeline.h" +#include +#include "base/kaldi-utils.h" + +namespace kaldi { +namespace cuda_decoder { + +void BatchedThreadedNnet3CudaPipeline::Initialize( + const fst::Fst &decode_fst, const nnet3::AmNnetSimple &am_nnet, + const TransitionModel &trans_model) { + KALDI_LOG << "BatchedThreadedNnet3CudaPipeline Initialize with " + << config_.num_control_threads << " control threads, " + << config_.num_worker_threads << " worker threads" + << " and batch size " << config_.max_batch_size; + + am_nnet_ = &am_nnet; + trans_model_ = &trans_model; + cuda_fst_.Initialize(decode_fst, trans_model_); + + feature_info_ = new OnlineNnet2FeaturePipelineInfo(config_.feature_opts); + feature_info_->ivector_extractor_info.use_most_recent_ivector = true; + feature_info_->ivector_extractor_info.greedy_ivector_extractor = true; + + // initialize threads and save their contexts so we can join them later + thread_contexts_.resize(config_.num_control_threads); + + // create work queue + pending_task_queue_ = new TaskState *[config_.max_pending_tasks + 1]; + tasks_front_ = 0; + tasks_back_ = 0; + + // ensure all allocations/kernels above are complete before launching threads + // in different streams. 
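+  // (cudaStreamPerThread names the calling thread's default stream, so this
+  // waits only for the work queued above by this thread rather than doing a
+  // device-wide synchronization.)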
+ cudaStreamSynchronize(cudaStreamPerThread); + + // Create threadpool for CPU work + work_pool_ = new ThreadPool(config_.num_worker_threads); + + exit_ = false; + numStarted_ = 0; + + // start workers + for (int i = 0; i < config_.num_control_threads; i++) { + thread_contexts_[i] = + std::thread(&BatchedThreadedNnet3CudaPipeline::ExecuteWorker, this, i); + } + + // wait for threads to start to ensure allocation time isn't in the timings + while (numStarted_ < config_.num_control_threads) + kaldi::Sleep(SLEEP_BACKOFF_S); +} +void BatchedThreadedNnet3CudaPipeline::Finalize() { + // Tell threads to exit and join them + exit_ = true; + + for (int i = 0; i < config_.num_control_threads; i++) { + thread_contexts_[i].join(); + } + + cuda_fst_.Finalize(); + + delete feature_info_; + delete work_pool_; + delete[] pending_task_queue_; +} + +// query a specific key to see if compute on it is complete +bool BatchedThreadedNnet3CudaPipeline::isFinished(const std::string &key) { + bool finished; + { + std::lock_guard lock(tasks_lookup_mutex_); + auto it = tasks_lookup_.find(key); + KALDI_ASSERT(it != tasks_lookup_.end()); + finished = it->second.finished; + } + return finished; +} + +// remove an audio file from the decoding and clean up resources +void BatchedThreadedNnet3CudaPipeline::CloseDecodeHandle( + const std::string &key) { + TaskState *task; + decltype(tasks_lookup_.end()) it; + { + std::lock_guard lock(tasks_lookup_mutex_); + it = tasks_lookup_.find(key); + KALDI_ASSERT(it != tasks_lookup_.end()); + task = &it->second; + } + + // wait for task to finish processing + while (task->finished != true) kaldi::Sleep(SLEEP_BACKOFF_S); + + // Delete the group counter if necessary + std::lock_guard lk1(group_tasks_mutex_); + if (group_tasks_not_done_[task->group] == 0) + group_tasks_not_done_.erase(task->group); + + // remove it + { + std::lock_guard lock(tasks_lookup_mutex_); + std::string &group = task->group; + auto p = tasks_group_lookup_.equal_range(group); + bool found = false; + for (auto it = p.first; it != p.second; ++it) { + if (it->second == task) { + tasks_group_lookup_.erase(it); + found = true; + break; + } + } + KALDI_ASSERT(found); + tasks_lookup_.erase(it); + + if (tasks_lookup_.empty()) tasks_lookup_cv_.notify_all(); + } +} + +void BatchedThreadedNnet3CudaPipeline::WaitForAllTasks() { + std::unique_lock lk(group_tasks_mutex_); + group_done_cv_.wait(lk, [this] { return all_group_tasks_not_done_ == 0; }); +} + +void BatchedThreadedNnet3CudaPipeline::WaitForGroup(const std::string &group) { + std::unique_lock lk(group_tasks_mutex_); + group_done_cv_.wait( + lk, [this, &group] { return group_tasks_not_done_[group] == 0; }); + // Safe to delete entry from the map now. If the user creates new task in that + // group, + // the entry will be created once more + group_tasks_not_done_.erase(group); +} + +bool BatchedThreadedNnet3CudaPipeline::IsGroupCompleted( + const std::string &group) { + std::unique_lock lk(group_tasks_mutex_); + return (group_tasks_not_done_[group] == 0); // will unlock in destructor +} + +std::string BatchedThreadedNnet3CudaPipeline::WaitForAnyGroup() { + std::unique_lock lk(group_tasks_mutex_); + // Waiting for any group to be done. 
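+  // We scan group_tasks_not_done_ for a group whose pending-task counter has
+  // reached zero; group_done_cv_ is notified each time a group completes
+  // (see CompleteTask).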
+ const string *group_done; + auto predicate = [this, &group_done] { + for (auto it : group_tasks_not_done_) { + if (it.second == 0) { + group_done = &it.first; + return true; + } + } + return false; + }; + group_done_cv_.wait(lk, predicate); + return *group_done; +} + +bool BatchedThreadedNnet3CudaPipeline::IsAnyGroupCompleted(std::string *group) { + std::lock_guard lk(group_tasks_mutex_); + for (auto it : group_tasks_not_done_) { + if (it.second == 0) { + *group = it.first; + return true; + } + } + return false; // will unlock in destructor +} + +void BatchedThreadedNnet3CudaPipeline::CloseAllDecodeHandlesForGroup( + const std::string &group) { + WaitForGroup(group); + std::lock_guard lk1(tasks_lookup_mutex_); + auto p = tasks_group_lookup_.equal_range(group); + for (auto it = p.first; it != p.second; ++it) + tasks_lookup_.erase(it->second->key); + tasks_group_lookup_.erase(p.first, p.second); + std::lock_guard lk2(group_tasks_mutex_); + group_tasks_not_done_.erase(group); +} + +void BatchedThreadedNnet3CudaPipeline::CloseAllDecodeHandles() { + WaitForAllTasks(); + std::lock_guard lk1(tasks_lookup_mutex_); + tasks_lookup_.clear(); + tasks_group_lookup_.clear(); + std::lock_guard lk2(group_tasks_mutex_); + group_tasks_not_done_.clear(); +} + +int32 BatchedThreadedNnet3CudaPipeline::GetNumberOfTasksPending() { + int size; + { + std::lock_guard lk(group_tasks_mutex_); + size = all_group_tasks_not_done_; + } + return size; +} + +BatchedThreadedNnet3CudaPipeline::TaskState * +BatchedThreadedNnet3CudaPipeline::AddTask(const std::string &key, + const std::string &group) { + TaskState *task; + { + std::lock_guard lock(tasks_lookup_mutex_); + // ensure key is unique + KALDI_ASSERT(tasks_lookup_.end() == tasks_lookup_.find(key)); + + // Create a new task in lookup map + task = &tasks_lookup_[key]; + tasks_group_lookup_.insert({group, task}); + } + task->group = group; + + // Add the task to its group + { + std::lock_guard lk(group_tasks_mutex_); + ++all_group_tasks_not_done_; + ++group_tasks_not_done_[task->group]; + } + return task; +} + +// Adds a decoding task to the decoder +void BatchedThreadedNnet3CudaPipeline::OpenDecodeHandle( + const std::string &key, const WaveData &wave_data, const std::string &group, + const std::function &callback) { + TaskState *task = AddTask(key, group); + task->callback = std::move(callback); + task->Init(key, wave_data); + + if (config_.gpu_feature_extract) { + // Feature extraction done on device + AddTaskToPendingTaskQueue(task); + } else { + // Feature extraction done on host thread + work_pool_->enqueue(THREAD_POOL_LOW_PRIORITY, + &BatchedThreadedNnet3CudaPipeline::ComputeOneFeatureCPU, + this, task); + } +} + +void BatchedThreadedNnet3CudaPipeline::OpenDecodeHandle( + const std::string &key, const VectorBase &wave_data, + float sample_rate, const std::string &group, + const std::function &callback) { + TaskState *task = AddTask(key, group); + task->Init(key, wave_data, sample_rate); + task->callback = std::move(callback); + + if (config_.gpu_feature_extract) { + // Feature extraction done on device + AddTaskToPendingTaskQueue(task); + } else { + // Feature extraction done on host thread + work_pool_->enqueue(THREAD_POOL_LOW_PRIORITY, + &BatchedThreadedNnet3CudaPipeline::ComputeOneFeatureCPU, + this, task); + } +} + +bool BatchedThreadedNnet3CudaPipeline::GetRawLattice(const std::string &key, + Lattice *lat) { + nvtxRangePushA("GetRawLattice"); + TaskState *task; + { + std::lock_guard lock(tasks_lookup_mutex_); + auto it = tasks_lookup_.find(key); + 
KALDI_ASSERT(it != tasks_lookup_.end()); + task = &it->second; + } + + // wait for task to finish. This should happens automatically without + // intervention from the master thread. + while (task->finished == false) kaldi::Sleep(SLEEP_BACKOFF_S); + + // GetRawLattice on a determinized lattice is not supported (Per email from + // DanP) + KALDI_ASSERT(task->determinized == false); + + if (task->error) { + nvtxRangePop(); + return false; + } + // Store off the lattice + *lat = task->lat; + nvtxRangePop(); + return true; +} + +bool BatchedThreadedNnet3CudaPipeline::GetLattice(const std::string &key, + CompactLattice *clat) { + nvtxRangePushA("GetLattice"); + TaskState *task; + { + std::lock_guard lock(tasks_lookup_mutex_); + + auto it = tasks_lookup_.find(key); + KALDI_ASSERT(it != tasks_lookup_.end()); + task = &it->second; + } + // wait for task to finish. This should happens automatically without + // intervention from the master thread. + while (!task->finished) kaldi::Sleep(SLEEP_BACKOFF_S); + + if (task->error) { + nvtxRangePop(); + return false; + } + + // if user has not requested a determinized lattice from the decoder then we + // must + // determinize it here since it was done done already. + if (!config_.determinize_lattice && !task->determinized) { + // Determinzation was not done by worker threads so do it here + DeterminizeOneLattice(task); + } + + *clat = task->dlat; // grab compact lattice + nvtxRangePop(); + return true; +} + +// Adds task to the PendingTaskQueue +void BatchedThreadedNnet3CudaPipeline::AddTaskToPendingTaskQueue( + TaskState *task) { + std::lock_guard lk(tasks_add_mutex_); + if (NumPendingTasks() == config_.max_pending_tasks) { + // task queue is full launch a new thread to add this task and exit to make + // room for other work + work_pool_->enqueue( + THREAD_POOL_LOW_PRIORITY, + &BatchedThreadedNnet3CudaPipeline::AddTaskToPendingTaskQueue, this, + task); + } else { + // there is room so let's add it + // insert into pending task queue + pending_task_queue_[tasks_back_] = task; + // (int)tasks_back_); + tasks_back_ = (tasks_back_ + 1) % (config_.max_pending_tasks + 1); + } +} + +// Attempts to fill the batch from the task queue. May not fully fill the +// batch. 
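+// Each task taken from pending_task_queue_ is paired with a free decoder
+// channel from channel_state.free_channels, and the newly assigned channels
+// are then initialized with cuda_decoder.InitDecoding().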
+void BatchedThreadedNnet3CudaPipeline::AquireAdditionalTasks( + CudaDecoder &cuda_decoder, ChannelState &channel_state, + std::vector &tasks) { + std::vector &channels = channel_state.channels; + std::vector &free_channels = channel_state.free_channels; + + int tasksRequested = + std::min(free_channels.size(), config_.max_batch_size - channels.size()); + int tasksAssigned = 0; + + { + // lock required because front might change from other + // workers + std::lock_guard lock(tasks_mutex_); + { + // compute number of tasks to grab + int tasksAvailable = NumPendingTasks(); + tasksAssigned = std::min(tasksAvailable, tasksRequested); + + // grab tasks + for (int i = 0; i < tasksAssigned; i++) { + // pending_task_queue_[tasks_front_]); + tasks.push_back(pending_task_queue_[tasks_front_]); + tasks_front_ = (tasks_front_ + 1) % (config_.max_pending_tasks + 1); + } + } + } + + if (tasksAssigned > 0) { + // for each assigned tasks we have to do a little bookkeeping + + // list of channels that need initialization + std::vector init_channels(tasksAssigned); + + for (int i = 0; i < tasksAssigned; i++) { + // assign a free channel + ChannelId channel; + { + std::lock_guard lk(channel_state.free_channels_mutex); + KALDI_ASSERT(free_channels.size() > + 0); // it should always be true (cf std::min above) + channel = free_channels.back(); + free_channels.pop_back(); + } + // add channel to processing list + channels.push_back(channel); + // add new channel to initialization list + init_channels[i] = channel; + } + + // Setup cuda_decoder channels + cuda_decoder.InitDecoding(init_channels); + } +} + +// Computes NNET3 across the tasks[first,tasks.size()) +void BatchedThreadedNnet3CudaPipeline::ComputeBatchNnet( + nnet3::NnetBatchComputer &computer, int32 first, + std::vector &tasks) { + nvtxRangePushA("ComputeBatchNnet"); + + bool output_to_cpu = false; + int32 online_ivector_period = 0; + int max_pending_minibatches = + 0; // zero means unlimited. This API call should not block then. + + // list of nnet tasks for each batch + std::vector> nnet_tasks(tasks.size()); + + // for all new batches enqueue up nnet work. 
+ for (int i = first; i < tasks.size(); i++) { + TaskState &task = *tasks[i]; + std::shared_ptr &task_data = task.task_data; + std::vector &ntasks = nnet_tasks[i]; + + if (config_.gpu_feature_extract) { + CuVector &ivector_features = task_data->ivector_features; + CuMatrix &input_features = task_data->input_features; + + CuVector *ifeat = NULL; + if (ivector_features.Dim() > 0) { + ifeat = &ivector_features; + } + // create task list + computer.SplitUtteranceIntoTasks(output_to_cpu, input_features, ifeat, + NULL, online_ivector_period, &ntasks); + } else { + Vector &ivector_features = task_data->ivector_features_cpu; + Matrix &input_features = task_data->input_features_cpu; + + Vector *ifeat = NULL; + if (ivector_features.Dim() > 0) { + ifeat = &ivector_features; + } + // create task list + computer.SplitUtteranceIntoTasks(output_to_cpu, input_features, ifeat, + NULL, online_ivector_period, &ntasks); + } + + // Add tasks to computer + for (size_t j = 0; j < ntasks.size(); j++) { + computer.AcceptTask(&ntasks[j], max_pending_minibatches); + } + } + + // process all minibatches, we allow partial minibatches but this should only + // occur on the last iteration + bool allow_partial_minibatch = true; + while (computer.Compute(allow_partial_minibatch)) + ; + + // Extract Posteriors + for (int i = first; i < tasks.size(); i++) { + TaskState &task = *tasks[i]; + std::shared_ptr &task_data = task.task_data; + CuMatrix &posteriors = task_data->posteriors; + MergeTaskOutput(nnet_tasks[i], &posteriors); + + // nnet output is no longer necessary as we have copied the output out + nnet_tasks[i].resize(0); + + // featurs are no longer needed so free memory + task_data->ivector_features.Resize(0); + task_data->input_features.Resize(0, 0); + } + + nvtxRangePop(); +} + +// Computes Features for a single decode instance. 
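+// Used when gpu_feature_extract is false: features and (optional) i-vectors
+// are computed on a CPU worker thread using OnlineNnet2FeaturePipeline, and
+// the task is then added to the pending task queue for GPU decoding.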
+void BatchedThreadedNnet3CudaPipeline::ComputeOneFeatureCPU(TaskState *task_) { + nvtxRangePushA("ComputeOneFeatureCPU"); + TaskState &task = *task_; + std::shared_ptr &task_data = task.task_data; + Vector &ivector_features = task_data->ivector_features_cpu; + Matrix &input_features = task_data->input_features_cpu; + + // create decoding state + OnlineNnet2FeaturePipeline feature(*feature_info_); + + // Accept waveforms + feature.AcceptWaveform(task_data->sample_frequency, + SubVector(*task_data->wave_samples, 0, + task_data->wave_samples->Dim())); + feature.InputFinished(); + // All frames should be ready here + int32 numFrames = feature.NumFramesReady(); + // If we don't have anything to do, we must return now + if (numFrames == 0) { + task_->finished = true; + return; + } + int32 input_dim = feature.InputFeature()->Dim(); + + std::vector frames(numFrames); + // create list of frames + for (int j = 0; j < numFrames; j++) frames[j] = j; + + // Copy Features + input_features.Resize(numFrames, input_dim); + feature.InputFeature()->GetFrames(frames, &input_features); + + // Ivectors are optional, if they were not provided skip this step + if (feature.IvectorFeature() != NULL) { + int32 ivector_dim = feature.IvectorFeature()->Dim(); + ivector_features.Resize(ivector_dim); + + // Copy Features + feature.IvectorFeature()->GetFrame(numFrames - 1, &ivector_features); + } + + AddTaskToPendingTaskQueue(task_); + + nvtxRangePop(); +} + +// Computes features across the tasks[first,tasks.size() +void BatchedThreadedNnet3CudaPipeline::ComputeBatchFeatures( + int32 first, std::vector &tasks, + OnlineCudaFeaturePipeline &feature_pipeline) { + KALDI_ASSERT(config_.gpu_feature_extract == true); + nvtxRangePushA("CopyBatchWaves"); + // below we will pack waves into a single buffer for efficient transfer across + // device + + // first count the total number of elements and create a single large vector + int count = 0; + for (int i = first; i < tasks.size(); i++) { + count += tasks[i]->task_data->wave_samples->Dim(); + } + + // creating a thread local vector of pinned memory. + // wave data will be stagged through this memory to get + // more efficient non-blocking transfers to the device. + thread_local Vector pinned_vector; + + if (pinned_vector.Dim() < count) { + if (pinned_vector.Dim() != 0) { + cudaHostUnregister(pinned_vector.Data()); + } + // allocated array 2x size + pinned_vector.Resize(count * 2, kUndefined); + cudaHostRegister(pinned_vector.Data(), + pinned_vector.Dim() * sizeof(BaseFloat), 0); + } + + // We will launch a thread for each task in order to get better host memory + // bandwidth + std::vector> futures; // for syncing + + // vector copy function for threading below. + auto copy_vec = [](SubVector &dst, + const SubVector &src) { + nvtxRangePushA("CopyVec"); + dst.CopyFromVec(src); + nvtxRangePop(); + }; + + // next launch threads to copy all waves for each task in parallel + count = 0; + for (int i = first; i < tasks.size(); i++) { + std::shared_ptr &task_data = tasks[i]->task_data; + SubVector wave(pinned_vector, count, + task_data->wave_samples->Dim()); + count += task_data->wave_samples->Dim(); + futures.push_back( + work_pool_->enqueue(copy_vec, wave, *(task_data->wave_samples))); + } + + // wait for waves to be copied into place + for (int i = 0; i < futures.size(); i++) { + futures[i].get(); + } + + CuVector cu_waves(count, kUndefined); + // copy memory down asynchronously. Vector copy functions are synchronous so + // we do it manually. 
+ // It is important for this to happen asynchrously to help hide launch latency + // of smaller kernels + // that come in the future. + cudaMemcpyAsync(cu_waves.Data(), pinned_vector.Data(), + cu_waves.Dim() * sizeof(BaseFloat), cudaMemcpyHostToDevice, + cudaStreamPerThread); + nvtxRangePop(); + + nvtxRangePushA("ComputeBatchFeatures"); + // extract features for each wave + count = 0; + for (int i = first; i < tasks.size(); i++) { + TaskState &task = *tasks[i]; + std::shared_ptr &task_data = task.task_data; + + CuSubVector cu_wave(cu_waves, count, + task_data->wave_samples->Dim()); + count += task_data->wave_samples->Dim(); + feature_pipeline.ComputeFeatures(cu_wave, task_data->sample_frequency, + &task_data->input_features, + &task_data->ivector_features); + + int32 numFrames = task_data->input_features.NumRows(); + + if (numFrames == 0) { + // Make this a warning for now. Need to check how this is handled + KALDI_WARN << "Warning empty audio file"; + } + } + nvtxRangePop(); +} + +// Allocates decodables for tasks in the range of tasks[first,tasks.size()) +void BatchedThreadedNnet3CudaPipeline::AllocateDecodables( + int32 first, std::vector &tasks, + std::vector &decodables) { + // Create mapped decodable here + for (int i = first; i < tasks.size(); i++) { + std::shared_ptr &task_data = tasks[i]->task_data; + CuMatrix &posteriors = task_data->posteriors; + decodables.push_back( + new DecodableCuMatrixMapped(*trans_model_, posteriors, 0)); + } +} + +// Removes all completed channels from the channel list. +// Also enqueues up work for post processing +void BatchedThreadedNnet3CudaPipeline::RemoveCompletedChannels( + CudaDecoder &cuda_decoder, ChannelState &channel_state, + std::vector &decodables, + std::vector &tasks) { + std::vector &channels = channel_state.channels; + std::vector &completed_channels = channel_state.completed_channels; + + // Here we will reorder arrays to put finished decodes at the end + int cur = 0; // points to the current unchecked decode + int back = tasks.size() - completed_channels.size() - + 1; // points to the last unchecked decode + + // for each active channel + // scan channels to find finished decodes + // move finished decodes to the end + for (int i = 0; i < channels.size(); i++) { + ChannelId channel = channels[cur]; + int numDecoded = cuda_decoder.NumFramesDecoded(channel); + int toDecode = decodables[cur]->NumFramesReady(); + + if (toDecode == numDecoded) { // if current task is completed + // add channel to free and completed queues + completed_channels.push_back(channel); + + // Rearrange queues, + // move this element to end and end to this spot + std::swap(tasks[cur], tasks[back]); + std::swap(channels[cur], channels[back]); + std::swap(decodables[cur], decodables[back]); + + // back is a completed decode so decrement it + back--; + } else { + // not completed move to next task + cur++; + } // end if completed[cur] + } // end for loop + + // removing finished channels from list + channels.resize(cur); +} + +// Post decode some channels will be complete +// For those channels we need to +// free up the channel +// get and determinize the lattice +// +void BatchedThreadedNnet3CudaPipeline::PostDecodeProcessing( + CudaDecoder &cuda_decoder, ChannelState &channel_state, + std::vector &decodables, + std::vector &tasks) { + std::vector &channels = channel_state.channels; + std::vector &completed_channels = channel_state.completed_channels; + + /* + // Generate lattices for GetRawLattice + std::vector lattices(completed_channels.size()); + for (int i = 
0; i < completed_channels.size(); i++) { + // reverse order of lattices to match channel order + // tasks order was reversed when reordering to the back + lattices[i] = &(tasks[tasks.size() - i - 1]->lat); + } + */ + + // Prepare data for GetRawLattice + cuda_decoder.PrepareForGetRawLattice(completed_channels, true); + // clean up datastructures for completed tasks + for (int i = channels.size(); i < tasks.size(); i++) { + delete decodables[i]; + } + + // Calling GetRawLattice + Determinize (optional) on a CPU worker thread + for (int i = channels.size(); i < tasks.size(); i++) { + tasks[i]->ichannel = channels[i]; + work_pool_->enqueue(THREAD_POOL_NORMAL_PRIORITY, + &BatchedThreadedNnet3CudaPipeline::CompleteTask, this, + &cuda_decoder, &channel_state, tasks[i]); + } + + tasks.resize(channels.size()); + decodables.resize(channels.size()); + completed_channels.resize(0); +} + +void BatchedThreadedNnet3CudaPipeline::CompleteTask(CudaDecoder *cuda_decoder, + ChannelState *channel_state, + TaskState *task) { + // Calling GetRawLattice for that channel. PrepareForGetRawLattice was already + // called + cuda_decoder->ConcurrentGetRawLatticeSingleChannel(task->ichannel, + &task->lat); + // We are done using that channel. Putting it back into the free channels + { + std::lock_guard lk(channel_state->free_channels_mutex); + channel_state->free_channels.push_back(task->ichannel); + } + + // If necessary, determinize the lattice + if (config_.determinize_lattice) DeterminizeOneLattice(task); + + if (!config_.determinize_lattice) { + ConvertLattice(task->lat, &task->dlat); + } + + if (task->callback) // if callable + task->callback(task->dlat); + + task->finished = true; + // Clear working data (raw input, posteriors, etc.) + task->task_data.reset(); + + { + std::lock_guard lk(group_tasks_mutex_); + --all_group_tasks_not_done_; + int32 left_in_group = --group_tasks_not_done_[task->group]; + // std::cout << "left in group " << task->group << " " << left_in_group + // << std::endl; + if (left_in_group == 0) group_done_cv_.notify_all(); + } +} + +void BatchedThreadedNnet3CudaPipeline::DeterminizeOneLattice(TaskState *task) { + nvtxRangePushA("DeterminizeOneLattice"); + // Note this destroys the original raw lattice + DeterminizeLatticePhonePrunedWrapper(*trans_model_, &task->lat, + config_.decoder_opts.lattice_beam, + &(task->dlat), config_.det_opts); + task->determinized = true; + nvtxRangePop(); +} + +void BatchedThreadedNnet3CudaPipeline::ExecuteWorker(int threadId) { + // Initialize this threads device + CuDevice::Instantiate(); + + KALDI_LOG << "CudaDecoder batch_size=" << config_.max_batch_size + << " num_channels=" << config_.num_channels; + // Data structures that are reusable across decodes but unique to each thread + CudaDecoder cuda_decoder(cuda_fst_, config_.decoder_opts, + config_.max_batch_size, config_.num_channels); + if (config_.num_decoder_copy_threads > 0) + cuda_decoder.SetThreadPoolAndStartCPUWorkers( + work_pool_, config_.num_decoder_copy_threads); + nnet3::NnetBatchComputer computer(config_.compute_opts, am_nnet_->GetNnet(), + am_nnet_->Priors()); + + OnlineCudaFeaturePipeline feature_pipeline(config_.feature_opts); + + ChannelState channel_state; + + std::vector tasks; // The state for each decode + std::vector decodables; + + // Initialize reuseable data structures + { + channel_state.channels.reserve(config_.max_batch_size); + channel_state.completed_channels.reserve(config_.max_batch_size); + tasks.reserve(config_.max_batch_size); + decodables.reserve(config_.max_batch_size); 
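+    // All channels start out on the free list; AquireAdditionalTasks claims
+    // them as new work arrives and CompleteTask returns them once a decode
+    // finishes.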
+ { + std::lock_guard lk(channel_state.free_channels_mutex); + channel_state.free_channels.reserve(config_.num_channels); + // add all channels to free channel list + for (int i = 0; i < config_.num_channels; i++) { + channel_state.free_channels.push_back(i); + } + } + } + + numStarted_++; // Tell master I have started + + // main control loop. At each iteration a thread will see if it has been + // asked to shut + // down. If it has it will exit. This loop condition will only be processed + // if all + // other work assigned to this thread has been processed. + while (!exit_) { + // main processing loop. At each iteration the thread will do the + // following: + // 1) Attempt to grab more work. + // 2) Initialize any new work + // do + // 3) Process work in a batch + // while(free lanes < drain_count) + // 4) Postprocess any completed work + do { + // 1) attempt to fill the batch + if (tasks_front_ != tasks_back_) { // if work is available grab more work + + int start = tasks.size(); // Save the current assigned tasks size + + AquireAdditionalTasks(cuda_decoder, channel_state, tasks); + + // New tasks are now in the in tasks[start,tasks.size()) + if (start != tasks.size()) { // if there are new tasks + if (config_.gpu_feature_extract) + ComputeBatchFeatures(start, tasks, feature_pipeline); + ComputeBatchNnet(computer, start, tasks); + AllocateDecodables(start, tasks, decodables); + } + } // end if (tasks_front_!=tasks_back_) + + // check if there is no active work on this thread. + // This can happen if another thread was assigned the work. + if (tasks.size() == 0) { + // Thread is spinning waiting for work. Backoff. + kaldi::Sleep(SLEEP_BACKOFF_S); + break; + } + + // try/catch to catch and report errors inside decoder. + // errors can be recoverable or non-recoverable + // unrecoverable errors will assert + // recoverable errors will cancel the batch (output empty lattice) + // and print a warning. + // There should be no errors and this is just a sanity check + try { + // This is in a loop in case we want to drain the batch a little. + // Draining the batch will cause initialization tasks to be batched. + do { + // 3) Process outstanding work in a batch + // Advance decoding on all open channels + cuda_decoder.AdvanceDecoding(channel_state.channels, decodables); + + // Adjust channel state for all completed decodes + RemoveCompletedChannels(cuda_decoder, channel_state, decodables, + tasks); + // do loop repeates until we meet drain size or run out of work + } while (config_.max_batch_size - channel_state.channels.size() < + config_.batch_drain_size && + channel_state.channels.size() > 0); + // 4) Post process work. This reorders completed work to the end, + // copies results outs, and cleans up data structures + PostDecodeProcessing(cuda_decoder, channel_state, decodables, tasks); + + } catch (CudaDecoderException e) { + // Code to catch errors. Most errors are unrecoverable but a user can + // mark them + // recoverable which will cancel the entire batch but keep processing. + if (!e.recoverable) { + bool UNRECOVERABLE_EXCEPTION = false; + KALDI_LOG << "Error unrecoverable cuda decoder error '" << e.what() + << "'\n"; + KALDI_ASSERT(UNRECOVERABLE_EXCEPTION); + } else { + KALDI_LOG << "Error recoverable cuda decoder error '" << e.what() + << "'\n"; + KALDI_LOG << " Aborting batch for recovery. 
Canceling the " + "following decodes:\n"; + // Cancel all outstanding tasks + for (int i = 0; i < tasks.size(); i++) { + // move all channels to free channel queue + ChannelId channel = channel_state.channels[i]; + { + std::lock_guard lk(channel_state.free_channels_mutex); + channel_state.free_channels.push_back(channel); + } + TaskState &task = *(tasks[i]); + KALDI_LOG << " Canceled: " << task.key << "\n"; + + // set error flag + task.error = true; + task.error_string = e.what(); + + // cleanup memory + delete decodables[i]; + + // notifiy master decode is finished + task.finished = true; + } + tasks.resize(0); + channel_state.channels.resize(0); + decodables.resize(0); + } + } + } while (tasks.size() > 0); // more work don't check exit condition + } // end while(!exit_) +} // end ExecuteWorker + +} // end namespace cuda_decoder +} // end namespace kaldi + +#endif // HAVE_CUDA == 1 diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.h b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.h new file mode 100644 index 00000000000..6401b24b7db --- /dev/null +++ b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.h @@ -0,0 +1,379 @@ +// cudadecoder/batched-threaded-nnet3-cuda-pipeline.h +// +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// Hugo Braun, Justin Luitjens, Ryan Leary +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_CUDA_DECODER_BATCHED_THREADED_CUDA_DECODER_H_ +#define KALDI_CUDA_DECODER_BATCHED_THREADED_CUDA_DECODER_H_ + +#include +#include + +#include "cudadecoder/cuda-decoder.h" +#include "decodable-cumatrix.h" +#include "feat/wave-reader.h" +#include "lat/determinize-lattice-pruned.h" +#include "nnet3/nnet-batch-compute.h" +#include "online2/online-nnet2-feature-pipeline.h" +#include "cudafeat/online-cuda-feature-pipeline.h" +#include "thread-pool.h" + +// If num_channels sets to automatic, +// num_channels = [this define] * max_batch_size +#define KALDI_CUDA_DECODER_CHANNELS_BATCH_SIZE_RATIO 1.3 + +namespace kaldi { +namespace cuda_decoder { + +/* BatchedThreadedNnet3CudaPipelineConfig + * This class is a common configuration class for the various components + * of a batched cuda multi-threaded pipeline. It defines a single place + * to control all operations and ensures that the various componets + * match configurations + */ +// configuration options common to the BatchedThreadedNnet3CudaPipeline and +// BatchedThreadedNnet3CudaPipeline +struct BatchedThreadedNnet3CudaPipelineConfig { + BatchedThreadedNnet3CudaPipelineConfig() + : max_batch_size(200), + num_channels(-1), + batch_drain_size(10), + num_control_threads(2), + num_worker_threads(20), + determinize_lattice(true), + max_pending_tasks(4000), + num_decoder_copy_threads(2), + gpu_feature_extract(true) {}; + void Register(OptionsItf *po) { + po->Register("max-batch-size", &max_batch_size, + "The maximum batch size to be used by the decoder. " + "This is also the number of lanes in the CudaDecoder. 
" + "Larger = Faster and more GPU memory used."); + std::ostringstream num_channels_desc; + num_channels_desc + << "The number of channels " + "allocated to the cuda decoder. This should be larger " + "than max_batch_size. Each channel consumes a small " + "amount of memory but also allows us to better overlap " + "computation" + " (-1 = set to " + << KALDI_CUDA_DECODER_CHANNELS_BATCH_SIZE_RATIO << "*max-batch-size)."; + po->Register("num-channels", &num_channels, num_channels_desc.str()); + po->Register("batch-drain-size", &batch_drain_size, + "How far to drain the batch before refilling work. This " + "batches pre/post decode work."); + po->Register("cuda-control-threads", &num_control_threads, + "The number of pipeline control threads for the CUDA work. " + "e.g. 2 control threads -> 2 independent CUDA pipeline (nnet3 " + "and decoder)."); + po->Register( + "cuda-worker-threads", &num_worker_threads, + "The total number of CPU threads launched to process CPU tasks."); + po->Register("determinize-lattice", &determinize_lattice, + "Determinize the lattice before output."); + po->Register("max-outstanding-queue-length", &max_pending_tasks, + "Number of files to allow to be outstanding at a time. When " + "the number of files is larger than this handles will be " + "closed before opening new ones in FIFO order."); + po->Register("cuda-decoder-copy-threads", &num_decoder_copy_threads, + "Advanced - Number of worker threads used in the decoder for " + "the host to host copies."); + po->Register("gpu-feature-extract", &gpu_feature_extract, + "Extract features on the GPU. This reduces CPU overhead " + "leading to better scalability but may reduce overall " + "performance for a single GPU."); + + feature_opts.Register(po); + decoder_opts.Register(po); + det_opts.Register(po); + compute_opts.Register(po); + } + int max_batch_size; + int num_channels; + int batch_drain_size; + int num_control_threads; + int num_worker_threads; + bool determinize_lattice; + int max_pending_tasks; + int num_decoder_copy_threads; + bool gpu_feature_extract; + + void ComputeConfig() { + if (num_channels == -1) + num_channels = + max_batch_size * KALDI_CUDA_DECODER_CHANNELS_BATCH_SIZE_RATIO; + } + + OnlineNnet2FeaturePipelineConfig feature_opts; // constant readonly + CudaDecoderConfig decoder_opts; // constant readonly + fst::DeterminizeLatticePhonePrunedOptions det_opts; // constant readonly + nnet3::NnetBatchComputerOptions compute_opts; // constant readonly +}; + +/* + * BatchedThreadedNnet3CudaPipeline uses multiple levels of parallelism in order to + * decode quickly on CUDA GPUs. This is the primary interface for cuda decoding. 
+ * For examples of how to use this decoder see cudadecoder/README and + * cudadecoderbin/batched-wav-nnet3-cuda.cc + */ +class BatchedThreadedNnet3CudaPipeline { +public: + BatchedThreadedNnet3CudaPipeline( + const BatchedThreadedNnet3CudaPipelineConfig &config) + : config_(config), all_group_tasks_not_done_(0) { + config_.ComputeConfig(); + }; + + // allocates reusable objects that are common across all decodings + void Initialize(const fst::Fst &decode_fst, + const nnet3::AmNnetSimple &nnet, + const TransitionModel &trans_model); + + // deallocates reusable objects + void Finalize(); + + // query a specific key to see if compute on it is complete + bool isFinished(const std::string &key); + + // remove an audio file from the decoding and clean up resources + void CloseDecodeHandle(const std::string &key); + void CloseAllDecodeHandlesForGroup(const std::string &group); + void CloseAllDecodeHandles(); + + // Adds a decoding task to the decoder + // When passing in a vector of data, the caller must ensure the data exists + // until the CloseDecodeHandle/WaitForAllTasks is called + // callback is called once task is done and we pass it the final lattice + // callback can be used to compute lattice rescoring, find best path in + // lattice, writing lattice to disk, etc. + // Important: callback is launched in the threadpool. It must be threadsafe. + // For instance, if writing to disk, or to stdout, + // use a lock: + // e.g. : + // { + // std::lock_guard lock(global_mutex); + // // write lattice to disk + // // lock is released in the destructor of lock_guard<> + // } + void OpenDecodeHandle( + const std::string &key, const WaveData &wave_data, + const std::string &group = std::string(), + const std::function &callback = + std::function()); + // When passing in a vector of data, the caller must ensure the data exists + // until the CloseDecodeHandle is called + void OpenDecodeHandle( + const std::string &key, const VectorBase &wave_data, + float sample_rate, const std::string &group = std::string(), + const std::function &callback = + std::function()); + + // Copies the raw lattice for decoded handle "key" into lat + bool GetRawLattice(const std::string &key, Lattice *lat); + // Determinizes raw lattice and returns a compact lattice + bool GetLattice(const std::string &key, CompactLattice *lat); + + int32 GetNumberOfTasksPending(); + + // Wait for all tasks to complete + void WaitForAllTasks(); + // Wait for all tasks in the group to complete + void WaitForGroup(const std::string &group); + // Check if a group is available. Returns if not. + bool IsGroupCompleted(const std::string &group); + // Wait for any group to complete, then returns which group completed + std::string WaitForAnyGroup(); + // Check if any group is available. 
If one is available, set its name in *group + bool IsAnyGroupCompleted(std::string *group); + inline int NumPendingTasks() { + return (tasks_back_ - tasks_front_ + config_.max_pending_tasks + 1) % + (config_.max_pending_tasks + 1); + }; + +private: + // Task data used during computation + // Is cleared when task is completed + struct TaskData { + Vector raw_data; // Wave input data when wave_reader passed + std::shared_ptr> + wave_samples; // Used as a pointer to either the raw + // data or the samples passed + float sample_frequency; + Vector ivector_features_cpu; + Matrix input_features_cpu; + CuVector ivector_features; + CuMatrix input_features; + CuMatrix posteriors; + + TaskData(const WaveData &wave_data_in) + : wave_samples(NULL), sample_frequency(0) { + raw_data.Resize( + wave_data_in.Data().NumRows() * wave_data_in.Data().NumCols(), + kUndefined); + memcpy(raw_data.Data(), wave_data_in.Data().Data(), + raw_data.Dim() * sizeof(BaseFloat)); + wave_samples = + std::make_shared>(raw_data, 0, raw_data.Dim()); + sample_frequency = wave_data_in.SampFreq(); + }; + + // Init when raw data is passed in. This data is shallow copied. + TaskData(const VectorBase &wave_data_in, float sample_rate) { + wave_samples = std::make_shared>(wave_data_in, 0, + wave_data_in.Dim()); + sample_frequency = sample_rate; + } + }; + + // State needed for each decode task. + // This state can be passed around by reference or pointer safely + // and provides a convieniet way to store all decoding state. + struct TaskState { + std::string key; + std::string group; // group for that task. "" is default + bool error; + std::string error_string; + + std::shared_ptr task_data; + + int32 ichannel; // associated CudaDecoder channel + Lattice lat; // Raw Lattice output + CompactLattice dlat; // Determinized lattice output. Only set if + // determinize-lattice=true + std::atomic finished; // Tells master thread if task has finished + // execution + + bool determinized; + + // (optional) callback is called task is finished and we have a lattice + // ready + // that way we can compute all CPU tasks in the threadpool (lattice + // rescoring, find best path in lattice, etc.) + std::function callback; + + TaskState() : error(false), finished(false), determinized(false) {} + + // Init when wave data is passed directly in. This data is deep copied. + void Init(const std::string &key_in, const WaveData &wave_data_in) { + task_data = std::make_shared(wave_data_in); + key = key_in; + }; + // Init when raw data is passed in. This data is shallow copied. + void Init(const std::string &key_in, + const VectorBase &wave_data_in, float sample_rate) { + task_data = std::make_shared(wave_data_in, sample_rate); + key = key_in; + } + }; + + // Creating a new task in the hashmaps + TaskState *AddTask(const std::string &key, const std::string &group); + + // Holds the current channel state for a worker + struct ChannelState { + std::vector channels; + std::vector free_channels; + std::vector completed_channels; + std::mutex free_channels_mutex; + }; + + // Adds task to the PendingTaskQueue + void AddTaskToPendingTaskQueue(TaskState *task); + + // Attempts to fill the batch from the task queue. May not fully fill the + // batch. + void AquireAdditionalTasks(CudaDecoder &cuda_decoder, + ChannelState &channel_state, + std::vector &tasks); + + // Computes Features for a single decode instance. 
+ void ComputeOneFeatureCPU(TaskState *task); + + // Computes features across the tasks[first,tasks.size() + void ComputeBatchFeatures(int32 first, + std::vector &tasks, + OnlineCudaFeaturePipeline &feature_pipeline); + + // Computes Nnet across the current decode batch + void ComputeBatchNnet(nnet3::NnetBatchComputer &computer, int32 first, + std::vector &tasks); + + // Allocates decodables for tasks in the range of + // dstates[first,dstates.size()) + void AllocateDecodables(int32 first, std::vector &tasks, + std::vector &decodables); + + // Removes all completed channels from the channel list. + // Also enqueues up work for post processing + void + RemoveCompletedChannels(CudaDecoder &cuda_decoder, + ChannelState &channel_state, + std::vector &decodables, + std::vector &tasks); + + // For each completed decode perform post processing work and clean up + void PostDecodeProcessing(CudaDecoder &cuda_decoder, + ChannelState &channel_state, + std::vector &decodables, + std::vector &tasks); + + // Calls ConcurrentGetRawLatticeSingleChannel and Determinize + // on a dedicated CPU worker thread at the end of the decode + void CompleteTask(CudaDecoder *cuda_decoder, ChannelState *channel_state, + TaskState *state); + + // Determinize one lattice + void DeterminizeOneLattice(TaskState *task); + // Thread execution function. This is a single worker thread which processes + // input. + void ExecuteWorker(int threadId); + + BatchedThreadedNnet3CudaPipelineConfig config_; + + CudaFst cuda_fst_; + const TransitionModel *trans_model_; + const nnet3::AmNnetSimple *am_nnet_; + nnet3::DecodableNnetSimpleLoopedInfo *decodable_info_; + OnlineNnet2FeaturePipelineInfo *feature_info_; + + std::mutex tasks_mutex_; // protects tasks_front_ and pending_task_queue_ for + // workers + std::mutex tasks_add_mutex_; // protect OpenDecodeHandle if multiple threads + // access + std::mutex tasks_lookup_mutex_; // protext tasks_lookup map + std::condition_variable tasks_lookup_cv_; + std::atomic tasks_front_, tasks_back_; + TaskState **pending_task_queue_; + + std::atomic exit_; // signals threads to exit + std::atomic numStarted_; // signals master how many threads have started + + ThreadPool *work_pool_; // thread pool for CPU work + std::map group_tasks_not_done_; + int32 all_group_tasks_not_done_; + std::mutex group_tasks_mutex_; + std::condition_variable group_done_cv_; + std::unordered_multimap + tasks_group_lookup_; // group -> list of tasks + std::unordered_map + tasks_lookup_; // Contains a map of + // utterance to TaskState + std::vector thread_contexts_; // A list of thread contexts +}; + +} // end namespace cuda_decoder +} // end namespace kaldi. + +#endif // KALDI_CUDA_DECODER_BATCHED_THREADED_CUDA_DECODER_H_ diff --git a/src/cudadecoder/cuda-decodable-itf.h b/src/cudadecoder/cuda-decodable-itf.h new file mode 100644 index 00000000000..98d0619b6eb --- /dev/null +++ b/src/cudadecoder/cuda-decodable-itf.h @@ -0,0 +1,33 @@ +// cudadecoder/cuda-decodable-itf.h +// +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// Hugo Braun, Justin Luitjens, Ryan Leary +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_CUDA_DECODER_DECODABLE_ITF_H +#define KALDI_CUDA_DECODER_DECODABLE_ITF_H + +#include "itf/decodable-itf.h" + +namespace kaldi { +namespace cuda_decoder { + +class CudaDecodableInterface : public DecodableInterface { +public: + virtual BaseFloat *GetLogLikelihoodsCudaPointer(int32 subsampled_frame) = 0; +}; + +} // end namespace cuda_decoder +} // end namespace kaldi. +#endif // KALDI_CUDA_DECODER_DECODABLE_ITF_H diff --git a/src/cudadecoder/cuda-decoder-common.h b/src/cudadecoder/cuda-decoder-common.h new file mode 100644 index 00000000000..fc11ff894bb --- /dev/null +++ b/src/cudadecoder/cuda-decoder-common.h @@ -0,0 +1,564 @@ +// cudadecoder/cuda-decoder-common.h +// +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// Hugo Braun, Justin Luitjens, Ryan Leary +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_CUDA_DECODER_CUDA_DECODER_UTILS_H_ +#define KALDI_CUDA_DECODER_CUDA_DECODER_UTILS_H_ +#include "cudamatrix/cu-device.h" +#include "util/stl-utils.h" + +// A decoder channel is linked to one utterance. Frames +// from the same must be sent to the same channel. +// +// A decoder lane is where the computation actually happens +// a decoder lane is given a frame and its associated channel +// and does the actual computation +// +// An analogy would be lane -> a core, channel -> a software thread + +// Some config parameters can be computed using other parameters +// (e.g. 
we can set main_q_capacity using max-active) +// Those values are the different factors between parameters that we know +// and parameters we want to set +#define KALDI_CUDA_DECODER_MAX_ACTIVE_MAIN_Q_CAPACITY_FACTOR 4 +#define KALDI_CUDA_DECODER_AUX_Q_MAIN_Q_CAPACITIES_FACTOR 3 + +// If we're at risk of filling the tokens queue, +// the beam is reduced to keep only the best candidates in the +// remaining space +// We then slowly put the beam back to its default value +// beam_next_frame = min(default_beam, RECOVER_RATE * beam_previous_frame) +#define KALDI_CUDA_DECODER_ADAPTIVE_BEAM_RECOVER_RATE 1.2f + +// Defines for the cuda decoder kernels +// It shouldn't be necessary to change the DIMX of the kernels + +// Below that value, we launch the persistent kernel for NonEmitting +#define KALDI_CUDA_DECODER_NONEM_LT_MAX_NARCS 4096 + +// We know we will have at least X elements in the hashmap +// We allocate space for X*KALDI_CUDA_DECODER_HASHMAP_CAPACITY_FACTOR elements +// to avoid having too much collisions +#define KALDI_CUDA_DECODER_HASHMAP_CAPACITY_FACTOR 1 + +// Max size of the total kernel arguments +// 4kb for compute capability >= 2.0 +#define KALDI_CUDA_DECODER_MAX_KERNEL_ARGUMENTS_BYTE_SIZE (4096) + +// When applying the max-active, we need to compute a topk +// to perform that (soft) topk, we compute a histogram +// here we define the number of bins in that histogram +// it has to be less than the number of 1D threads +#define KALDI_CUDA_DECODER_HISTO_NBINS 255 + +// Number of "heavy duty" process non emitting kernels +// If more non emitting iterations are required, those will be done +// in the one-CTA persistent kernel +#define KALDI_CUDA_DECODER_N_NON_EMITTING_MAIN_ITERATIONS 2 + +// Adaptive beam parameters +// We will decrease the beam when we detect that we are generating too many +// tokens +// for the first segment of the aux_q, we don't do anything (keep the original +// beam) +// the first segment is made of (aux_q +// capacity)/KALDI_CUDA_DECODER_ADAPTIVE_BEAM_STATIC_SEGMENT +// then we will decrease the beam step by step, until 0. +// we will decrease the beam every m elements, with: +// x = (aux_q capacity)/KALDI_CUDA_DECODER_ADAPTIVE_BEAM_STATIC_SEGMENT (static +// segment +// y = (aux_q capacity) - x +// m = y / KALDI_CUDA_DECODER_ADAPTIVE_BEAM_NSTEPS +// For more information, please refer to the definition of GetAdaptiveBeam in +// cuda-decoder-kernels.cu +#define KALDI_CUDA_DECODER_ADAPTIVE_BEAM_STATIC_SEGMENT 4 +#define KALDI_CUDA_DECODER_ADAPTIVE_BEAM_NSTEPS 8 +// When applying max_active we don't keep exactly max_active_ tokens, +// but a bit more. 
And we can call ApplyMaxActiveAndReduceBeam multiple times +// in the first frame (the first times as a pre-filter, the last time at the +// very end of the frame) +// Because keeping a bit more than max_active_ is expected, we add the tolerance +// so that we can avoid triggering ApplyMaxActiveAndReduceBeam for just a few +// tokens above the limit +// at the end of the frame + +#define KALDI_CUDA_DECODER_DIV_ROUND_UP(a, b) ((a + b - 1) / b) + +#define KALDI_CUDA_DECODER_ASSERT(val, recoverable) \ + { \ + if ((val) != true) { \ + throw CudaDecoderException("KALDI_CUDA_DECODER_ASSERT", __FILE__, \ + __LINE__, recoverable) \ + } \ + } +// Macro for checking cuda errors following a cuda launch or api call +#ifdef NDEBUG +#define KALDI_DECODER_CUDA_CHECK_ERROR() +#else +#define KALDI_DECODER_CUDA_CHECK_ERROR() \ + { \ + cudaError_t e = cudaGetLastError(); \ + if (e != cudaSuccess) { \ + throw CudaDecoderException(cudaGetErrorName(e), __FILE__, __LINE__, \ + false); \ + } \ + } +#endif + +#define KALDI_DECODER_CUDA_API_CHECK_ERROR(e) \ + { \ + if (e != cudaSuccess) { \ + throw CudaDecoderException(cudaGetErrorName(e), __FILE__, __LINE__, \ + false); \ + } \ + } + +#define KALDI_CUDA_DECODER_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +#define KALDI_CUDA_DECODER_1D_BLOCK_OFFSET_KERNEL_LOOP(offset, th_idx, n) \ + for (int offset = blockIdx.x * blockDim.x, th_idx = threadIdx.x; \ + offset < (n); offset += blockDim.x * gridDim.x) + +#define KALDI_CUDA_DECODER_IS_LAST_1D_THREAD() (threadIdx.x == (blockDim.x - 1)) + +#define KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.y; i < (n); i += gridDim.y) + +#define KALDI_CUDA_DECODER_DIV_ROUND_UP(a, b) ((a + b - 1) / b) + +#define KALDI_CUDA_DECODER_1D_BLOCK 256 +#define KALDI_CUDA_DECODER_LARGEST_1D_BLOCK 1024 +#define KALDI_CUDA_DECODER_ONE_THREAD_BLOCK 1 +#define KALDI_CUDA_DECODER_MAX_CTA_COUNT 4096u +#define KALDI_CUDA_DECODER_MAX_CTA_PER_LANE 512u +namespace kaldi { +namespace cuda_decoder { + +// Returning the number of CTAs to launch for (N,M) elements to compute +// M is usually the batch size +inline dim3 KaldiCudaDecoderNumBlocks(int N, int M) { + dim3 grid; + grid.x = KALDI_CUDA_DECODER_DIV_ROUND_UP(N, KALDI_CUDA_DECODER_1D_BLOCK); + unsigned int max_CTA_per_lane = + std::max(KALDI_CUDA_DECODER_MAX_CTA_COUNT / M, 1u); + grid.x = std::min(grid.x, max_CTA_per_lane); + grid.y = M; + return grid; +} + +// Use a fixed number of blocks for nlanes +// Using the max number of CTAs possible for each lane, +// according to KALDI_CUDA_DECODER_MAX_CTA_COUNT +// and KALDI_CUDA_DECODER_MAX_CTA_PER_LANE +inline dim3 KaldiCudaDecoderNumBlocks(int nlanes) { + dim3 grid; + unsigned int n_CTA_per_lane = + std::max(KALDI_CUDA_DECODER_MAX_CTA_COUNT / nlanes, 1u); + if (n_CTA_per_lane == 0) n_CTA_per_lane = 1; + grid.x = std::min(KALDI_CUDA_DECODER_MAX_CTA_PER_LANE, n_CTA_per_lane); + grid.y = nlanes; + return grid; +} + +typedef int32 StateId; +typedef float CostType; +// IntegerCostType is the type used in the lookup table d_state_best_cost +// and the d_cutoff +// We use a 1:1 conversion between CostType <--> IntegerCostType +// IntegerCostType is used because it triggers native atomic operations +// (CostType does not) +typedef int32 IntegerCostType; +typedef int32 LaneId; +typedef int32 ChannelId; + +// On the device we compute everything by batch +// Data is stored as 2D matrices (BatchSize, 1D_Size) +// For example, for the token queue, (BatchSize, 
max_tokens_per_frame_) +// DeviceMatrix owns the data but is not used to access it. +// DeviceMatrix is inherited in DeviceLaneMatrix and DeviceChannelMatrix +// those two classes do the same thing, except that they belong either to a +// channel or lane +// that inheritance is done to clarify the code and help debugging +// +// To actually access the data, we should request an view through +// GetView +// That view contains both host cuda code to access the data. It does not own +// the data. +template +// if necessary, make a version that always use ncols_ as the next power of 2 +class DeviceMatrix { + T *data_; + void Allocate() { + KALDI_ASSERT(nrows_ > 0); + KALDI_ASSERT(ncols_ > 0); + KALDI_ASSERT(!data_); + data_ = static_cast(CuDevice::Instantiate().Malloc( + (size_t)nrows_ * ncols_ * sizeof(*data_))); + KALDI_ASSERT(data_); + } + void Free() { + KALDI_ASSERT(data_); + CuDevice::Instantiate().Free(data_); + } + + protected: + int32 ncols_; + int32 nrows_; + + public: + DeviceMatrix() : data_(NULL), ncols_(0), nrows_(0) {} + + virtual ~DeviceMatrix() { + if (data_) Free(); + } + + void Resize(int32 nrows, int32 ncols) { + if (data_) Free(); + KALDI_ASSERT(nrows > 0); + KALDI_ASSERT(ncols > 0); + nrows_ = nrows; + ncols_ = ncols; + Allocate(); + } + + T *MutableData() { + KALDI_ASSERT(data_); + return data_; + } + // abstract getInterface... +}; + +template +// if necessary, make a version that always use ncols_ as the next power of 2 +class HostMatrix { + T *data_; + void Allocate() { + KALDI_ASSERT(nrows_ > 0); + KALDI_ASSERT(ncols_ > 0); + KALDI_ASSERT(!data_); + cudaMallocHost((void **)&data_, (size_t)nrows_ * ncols_ * sizeof(*data_)); + KALDI_ASSERT(data_); + } + void Free() { + KALDI_ASSERT(data_); + cudaFreeHost(data_); + } + + protected: + int32 ncols_; + int32 nrows_; + + public: + HostMatrix() : data_(NULL), ncols_(0), nrows_(0) {} + + virtual ~HostMatrix() { + if (data_) Free(); + } + + void Resize(int32 nrows, int32 ncols) { + if (data_) Free(); + KALDI_ASSERT(nrows > 0); + KALDI_ASSERT(ncols > 0); + nrows_ = nrows; + ncols_ = ncols; + Allocate(); + } + + T *MutableData() { + KALDI_ASSERT(data_); + return data_; + } + // abstract getInterface... +}; + +// Views of DeviceMatrix +// Those views are created by either DeviceChannelMatrix or +// DeviceLaneMatrix +// We can access the data (the matrix) associated with that +// Device[Channel|Lane]Matrix without owning that data. +// Which means that we can pass those views by copy +// without triggering a cudaFree, for instance. +// Device[Channel|Lane]Matrix owns the data, [Channel|Lane]MatrixInterface just +// gives access to it +// Generating both host and device interfaces +template +struct LaneMatrixView { + T *data_; + int32 ncols_; + __host__ __device__ __inline__ T *lane(const int32 ilane) { + return &data_[ilane * ncols_]; + } +}; + +template +struct ChannelMatrixView { + T *data_; + int32 ncols_; + __host__ __device__ __inline__ T *channel(const int32 ichannel) { + return &data_[ichannel * ncols_]; + } +}; + +// Specializing DeviceMatrix into lane and channel variants. 
+// Helps with code clarity/debugging +template +class DeviceLaneMatrix : public DeviceMatrix { + public: + LaneMatrixView GetView() { return {this->MutableData(), this->ncols_}; } + + T *lane(const int32 ilane) { + return &this->MutableData()[ilane * this->ncols_]; + } +}; + +template +class HostLaneMatrix : public HostMatrix { + public: + LaneMatrixView GetView() { return {this->MutableData(), this->ncols_}; } + + T *lane(const int32 ilane) { + return &this->MutableData()[ilane * this->ncols_]; + } +}; + +template +class DeviceChannelMatrix : public DeviceMatrix { + public: + ChannelMatrixView GetView() { return {this->MutableData(), this->ncols_}; } + T *channel(const int32 ichannel) { + return &this->MutableData()[ichannel * this->ncols_]; + } +}; + +// LaneCounters/ChannelCounters +// The counters are all the singular values associated to a lane/channel +// For instance the main queue size. Or the min_cost of all tokens in that +// queue +// LaneCounters are used during computation +struct LaneCounters { + // hannel that this lane will compute for the current frame + ChannelId channel_to_compute; + // Pointer to the loglikelihoods array for this channel and current frame + BaseFloat *loglikelihoods; + // Contains both main_q_end and narcs + // End index of the main queue + // only tokens at index i with i < main_q_end + // are valid tokens + // Each valid token the subqueue main_q[main_q_local_offset, main_q_end[ has + // a number of outgoing arcs (out-degree) + // main_q_narcs is the sum of those numbers + // We sometime need to update both end and narcs at the same time using a + // single atomic, + // which is why they're packed together + int2 main_q_narcs_and_end; + // contains the requested queue length which can + // be larger then the actual queue length in the case of overflow + int32 main_q_requested; + int32 aux_q_requested; + int32 aux_q_end; + int32 post_expand_aux_q_end; // used for double buffering + // Some tokens in the same frame share the same token.next_state + // main_q_n_extra_prev_tokens is the count of those tokens + int32 main_q_n_extra_prev_tokens; + // Number of tokens created during the emitting stage + int32 main_q_n_emitting_tokens; + // Depending on the value of the parameter "max_tokens_per_frame" + // we can end up with an overflow when generating the tokens for a frame + // We try to prevent this from happening using an adaptive beam + // If an overflow happens, then the kernels no longer insert any data into + // the queues and set overflow flag to true. + // queue length. + // Even if that flag is set, we can continue the execution (quality + // of the output can be lowered) + // We use that flag to display a warning to the user + int32 q_overflow; + // ExpandArcs reads the tokens in the index range [main_q_local_offset, end[ + int32 main_q_local_offset; + // We transfer the tokens back to the host at the end of each frame. + // Which means that tokens at a frame n > 0 have an offset compared to to + // those + // in frame n-1. main_q_global_offset is the overall offset of the current + // main_q, + // since frame 0 + // It is used to set the prev_token index. + int32 main_q_global_offset; + // Same thing, but for main_q_n_extra_prev_tokens (those are also transfered + // back to host) + int32 main_q_extra_prev_tokens_global_offset; + // Minimum token for that frame + IntegerCostType min_int_cost; + // Current beam. 
Can be different from default_beam, + // because of the AdaptiveBeam process, or because of + // ApplyMaxActiveAndReduceBeam + IntegerCostType int_beam; + // Adaptive beam. The validity says until which index this adaptive beam is + // valid. + // After that index, we need to lower the adaptive beam + int2 adaptive_int_beam_with_validity_index; + // min_cost + beam + IntegerCostType int_cutoff; + // The histogram for max_active will be computed between min_histo_cost + // and max_histo_cost. Set for each frame after emitting stage + CostType min_histo_cost; + CostType max_histo_cost; + CostType histo_bin_width; + bool compute_max_active; + // offsets used by concatenate_lanes_data_kernel + int32 main_q_end_lane_offset; + int32 main_q_n_emitting_tokens_lane_offset; + int32 main_q_n_extra_prev_tokens_lane_offset; + + // --- Only valid after calling GetBestCost + // min_cost and its arg. Can be different than min_cost, because we may + // include final costs + int2 min_int_cost_and_arg; + // Number of final tokens with cost < best + lattice_beam + int32 n_within_lattice_beam; + int32 has_reached_final; // if there's at least one final token in the queue + int32 prev_arg_min_int_cost; +}; + +// Channel counters +// Their job is to save the state of a channel, when this channel is idle +// The channel counters are loaded into the lane counters during the context +// switches +struct ChannelCounters { + // All the following values are just saved values from LaneCounters + // from the latest context-switch + int2 prev_main_q_narcs_and_end; + int32 prev_main_q_n_extra_prev_tokens; + int32 prev_main_q_global_offset; + int32 prev_main_q_extra_prev_tokens_global_offset; + CostType prev_beam; + + // Only valid after calling GetBestCost + // different than min_int_cost : we include the "final" cost + int2 min_int_cost_and_arg_with_final; + int2 min_int_cost_and_arg_without_final; +}; + +class CudaDecoderException : public std::exception { + public: + CudaDecoderException(const char *str_, const char *file_, int line_, + const bool recoverable_) + : str(str_), + file(file_), + line(line_), + buffer(std::string(file) + ":" + std::to_string(line) + " :" + + std::string(str)), + recoverable(recoverable_) {} + const char *what() const throw() { return buffer.c_str(); } + + const char *str; + const char *file; + const int line; + const std::string buffer; + const bool recoverable; +}; + +// InfoToken contains data that needs to be saved for the backtrack +// in GetBestPath/GetRawLattice +// We don't need the token.cost or token.next_state. +struct __align__(8) InfoToken { + int32 prev_token; + int32 arc_idx; + bool IsUniqueTokenForStateAndFrame() { + // This is a trick used to save space and PCI-E bandwidth (cf + // preprocess_in_place kernel) + // This token is associated with a next_state s, created during the + // processing of frame f. + // If we have multiple tokens associated with the state s in the frame f, + // arc_idx < 0 and -arc_idx is the + // count of such tokens. 
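+    // For instance, arc_idx == -3 means that three tokens in this frame share
+    // that next_state s; prev_token then holds the offset of that group in
+    // the extra_prev_tokens list (cf. GetSameFSTStateTokensList below).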
We will then have to look at another list to read + // the actually arc_idx and prev_token values + // If the current token is the only one, prev_token and arc_idx are valid + // and can be used directly + return (arc_idx >= 0); + } + + // Called if this token is linked to others tokens in the same frame (cf + // comments for IsUniqueTokenForStateAndFrame) + // return the {offset,size} pair necessary to list those tokens in the + // extra_prev_tokens list + // They are stored at offset "offset", and we have "size" of those + std::pair GetSameFSTStateTokensList() { + KALDI_ASSERT(!IsUniqueTokenForStateAndFrame()); + + return {prev_token, -arc_idx}; + } +}; + +// Device function, used to set a in an InfoToken the [offset,size] related to +// InfoToken.GetSameFSTStateTokensList +__device__ __inline__ void SetSameFSTStateTokensList(int32 offset, int32 size, + InfoToken *info_token) { + // We always have size > 0 + *info_token = {offset, -size}; +} + +// Used to store the index in the GPU hashmap of that FST state +// The hashmap is only generated with the final main queue (post max_active_) of +// each frame +// Also stores the information or whether or not the owner of that object is the +// representative of this FSTState +typedef int32 FSTStateHashIndex; + +// 1:1 Conversion float <---> sortable int +// We convert floats to sortable ints in order +// to use native atomics operation +// Those are the host version, used when we transfer an int from the device +// and we want to convert it to a float +// (it was created on device by floatToOrderedInt, we'll use +// orderedIntToFloatHost on host to convert it back to a float) +__inline__ int32 floatToOrderedIntHost(float floatVal) { + int32 intVal; + // Should be optimized away by compiler + memcpy(&intVal, &floatVal, sizeof(float)); + return (intVal >= 0) ? intVal : intVal ^ 0x7FFFFFFF; +} + +__inline__ float orderedIntToFloatHost(int32 intVal) { + intVal = (intVal >= 0) ? intVal : intVal ^ 0x7FFFFFFF; + float floatVal; + // Should be optimized away by compiler + memcpy(&floatVal, &intVal, sizeof(float)); + return floatVal; +} + +// Hashmap value. Used when computing the hashmap in PostProcessingMainQueue +struct __align__(16) HashmapValueT { + // Map key : fst state + int32 key; + // Number of tokens associated to that state + int32 count; + // minimum cost for that state + argmin + unsigned long long min_and_argmin_int_cost_u64; +}; + +enum OVERFLOW_TYPE { + OVERFLOW_NONE = 0, + OVERFLOW_MAIN_Q = 1, + OVERFLOW_AUX_Q = 2 +}; + +enum QUEUE_ID { MAIN_Q = 0, AUX_Q = 1 }; + +} // end namespace cuda_decoder +} // end namespace kaldi + +#endif // KALDI_CUDA_DECODER_CUDA_DECODER_UTILS_H_ diff --git a/src/cudadecoder/cuda-decoder-kernels-utils.h b/src/cudadecoder/cuda-decoder-kernels-utils.h new file mode 100644 index 00000000000..c94b84f6360 --- /dev/null +++ b/src/cudadecoder/cuda-decoder-kernels-utils.h @@ -0,0 +1,264 @@ +// cudadecoder/cuda-decoder-kernels-utils.h +// +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// Hugo Braun, Justin Luitjens, Ryan Leary +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_CUDA_DECODER_CUDA_DECODER_KERNELS_UTILS_H_ +#define KALDI_CUDA_DECODER_CUDA_DECODER_KERNELS_UTILS_H_ + +// NO_KEY == -1 is ok, because all keys will be >= 0 (FST states) +#define KALDI_CUDA_DECODER_HASHMAP_NO_KEY -1 +#define KALDI_CUDA_DECODER_HASHMAP_NO_VAL \ + { \ + KALDI_CUDA_DECODER_HASHMAP_NO_KEY, 0, ULONG_MAX \ + } + +#include "util/stl-utils.h" + +namespace kaldi { +namespace cuda_decoder { + +// MinPlus and PlusPlus +// int2 operators used in Scan or Reduce operations +struct MinPlus { + __device__ int2 operator()(const int2 &a, const int2 &b) const { + int2 c; + c.x = min(a.x, b.x); + c.y = a.y + b.y; + return c; + } +}; +struct PlusPlus { + __device__ int2 operator()(const int2 &a, const int2 &b) const { + int2 c; + c.x = a.x + b.x; + c.y = a.y + b.y; + return c; + } +}; + +struct PlusPlusPlusPlus { + __device__ int4 operator()(const int4 &a, const int4 &b) const { + int4 c; + c.x = a.x + b.x; + c.y = a.y + b.y; + c.z = a.z + b.z; + c.w = a.w + b.w; + return c; + } +}; + +// 1:1 Conversion float <---> sortable int +// We convert floats to sortable ints in order +// to use native atomics operation, which are +// way faster than looping over atomicCAS +__device__ __forceinline__ int32 floatToOrderedInt(float floatVal) { + int32 intVal = __float_as_int(floatVal); + return (intVal >= 0) ? intVal : intVal ^ 0x7FFFFFFF; +} + +__device__ __forceinline__ float orderedIntToFloat(int32 intVal) { + return __int_as_float((intVal >= 0) ? intVal : intVal ^ 0x7FFFFFFF); +} + +// binsearch_maxle (device) +// With L=[all indexes low<=i<=high such as vec[i]<= val] +// binsearch_maxle returns max(L) +// the array vec must be sorted +// Finds that value using a binary search +__device__ __forceinline__ int32 binsearch_maxle(const int32 *vec, + const int32 val, int32 low, + int32 high) { + while (true) { + if (low == high) return low; // we know it exists + if ((low + 1) == high) return (vec[high] <= val) ? 
high : low; + + int32 mid = low + (high - low) / 2; + + if (vec[mid] > val) + high = mid - 1; + else + low = mid; + } +} + +// Atomic operations on int2 (device) +// atomicAddI2, atomicMinI2, atomicSubI2 +// +// union used +union UInt64UnionInt2 { + int2 i2; + unsigned long long int ull; +}; + +#if __CUDA_ARCH__ < 350 +__device__ __inline__ void atomicMinULL(unsigned long long *ptr, + unsigned long long val) { + unsigned long long old = *ptr, assumed; + do { + assumed = old; + old = atomicCAS(ptr, assumed, val); + } while (old > val && assumed != old); +} +#else +__device__ __forceinline__ void atomicMinULL(unsigned long long *ptr, + unsigned long long val) { + atomicMin(ptr, val); +} +#endif + +__device__ __inline__ int2 atomicAddI2(int2 *ptr, int2 val) { + unsigned long long int *ptr64 = + reinterpret_cast(ptr); + UInt64UnionInt2 uval, uold; + uval.i2 = val; + uold.ull = atomicAdd(ptr64, uval.ull); + return uold.i2; +} + +// We should switch to native atom64 on atomicMinI2 and atomicSubI2 +__device__ __inline__ void atomicMinI2(int2 *ptr, int2 val) { + unsigned long long int *ptr64 = + reinterpret_cast(ptr); + UInt64UnionInt2 old, assumed, value; + old.ull = *ptr64; + value.i2 = val; + if (old.i2.x <= val.x) return; + do { + assumed = old; + old.ull = atomicCAS(ptr64, assumed.ull, value.ull); + } while (old.ull != assumed.ull && old.i2.x > value.i2.x); +} + +__device__ void atomicSubI2(int2 *ptr, int2 sub) { + unsigned long long int *ptr64 = + reinterpret_cast(ptr); + UInt64UnionInt2 old, assumed, value; + old.ull = *ptr64; + do { + assumed = old; + value.i2.x = assumed.i2.x - sub.x; + value.i2.y = assumed.i2.y - sub.y; + old.ull = atomicCAS(ptr64, assumed.ull, value.ull); + } while (old.ull != assumed.ull); +} + +// Hash function used in the hashmap. +// Using identity for now. They keys are the FST states, some randomness already +// exists +__device__ __forceinline__ int hash_func(int key) { + return key; // using identity for now +} + +// Packing and unpacking a minimum + its argument into a single uint64 +// (min is first, used for sorting) +// Not using an union because documentation is not clear regarding reordering in structs +// (for instance, in int2, y is stored before x) + +__device__ __inline__ void PackArgminInUInt64(const uint32_t min, const uint32_t arg, unsigned long long *argmin) { + unsigned long long p = min; + p <<= 32; + p |= arg; + *argmin = p; +} + +__device__ __inline__ void GetMinFromPackedArgminUInt64(const unsigned long long argmin, uint32_t *min) { + *min = (uint32_t)((argmin & 0xFFFFFFFF00000000LL) >> 32); +} + +__device__ __inline__ void GetArgFromPackedArgminUInt64(const unsigned long long argmin, uint32_t *arg) { + *arg = (uint32_t)(argmin & 0xFFFFFFFFLL); +} + +// hashmap_insert_or_aggregate +// Inserting a new value into the hashmap. If the key already exists in the +// hashmap, +// we'll aggregate the existing value with the new one, and set the result as +// value for that key. +// The new value inserted at key is (1, (int_cost, arg_int_cost) +// With values being [count (int32), [min_cost, argmin_cost] (int2)] +// If a value already exists for a key, we will aggregate the two values: +// hashmap[key] = old_value +_ new_value +// with +_ being (integer +, argmin) +// It returns the hash_idx, i.e. where the key was inserted in the hashmap +// The owner will then use that to access the data, and clear it for future use +// It also returns local_idx, which informs how many values of that same key +// were inserted before that call. +// e.g. 
if thread 23 inserts the key 3, then thread 9 inserts the key 3, +// thread 23 will have local_idx=0, thread 9 will have local_idx=1 +// +// We use hashmap_insert in the context of a ReduceByKey. The same thread will +// always +// access the same key. Which is why we do not need a hashmap_find, and can +// simply remember the hash_idx +// from our last insert. +// +// Restriction: that function can only be used if we know that we will have +// enough space in the hashmap +// ie hashmap_capacity > total number of keys +// +// keys must be >= 0 (to avoid collisions with +// KALDI_CUDA_DECODER_HASHMAP_NO_KEY) +__device__ __inline__ void hashmap_insert_or_aggregate( + HashmapValueT *d_map_values, int key, int int_cost, int arg_int_cost, + int capacity, int *local_idx, int *out_hash_idx) { + int hash_idx = hash_func(key) % capacity; + int c = 0; + HashmapValueT *d_val = NULL; + do { + d_val = &d_map_values[hash_idx]; + // Looking for a spot in the hashmap + int old = atomicCAS(&d_val->key, KALDI_CUDA_DECODER_HASHMAP_NO_KEY, key); + if (old == KALDI_CUDA_DECODER_HASHMAP_NO_KEY || old == key) + break; // found a spot + hash_idx = (hash_idx + 1) % capacity; + ++c; + } while (c < capacity); + // The condition in which we use the hashmap always ensure that we have space + // asserting that we found a spot + assert(d_val); + + // Updating values + *local_idx = atomicAdd(&d_val->count, 1); + *out_hash_idx = hash_idx; + unsigned long long argmin_u64; + PackArgminInUInt64(int_cost, arg_int_cost, &argmin_u64); + atomicMinULL(&d_val->min_and_argmin_int_cost_u64, argmin_u64); +} + +// In FSTStateHashIndex, we store both the hash_idx and a boolean +// is_representative +// which tells if the current thread is responsible for the state stored at +// index hash_idx +// We use the bit sign for that +// Setter and getter +__device__ __inline__ void SetFSTStateHashIndex(int32 raw_hash_idx, + bool is_representative, + FSTStateHashIndex *hash_idx) { + *hash_idx = is_representative ? (-raw_hash_idx - 1) // -1 to force it < 0 + : raw_hash_idx; +} + +__device__ __inline__ void GetFSTStateHashIndex(FSTStateHashIndex &hash_idx, + int32 *raw_hash_idx, + bool *is_representative) { + *is_representative = (hash_idx < 0); + *raw_hash_idx = *is_representative ? (-(hash_idx + 1)) : hash_idx; +} + +} // end namespace cuda_decoder +} // end namespace kaldi + +#endif // KALDI_CUDA_DECODER_CUDA_DECODER_KERNELS_UTILS_H_ diff --git a/src/cudadecoder/cuda-decoder-kernels.cu b/src/cudadecoder/cuda-decoder-kernels.cu new file mode 100644 index 00000000000..f2a0d16d317 --- /dev/null +++ b/src/cudadecoder/cuda-decoder-kernels.cu @@ -0,0 +1,2091 @@ +// cudadecoder/cuda-decoder-kernels.cu +// +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// Hugo Braun, Justin Luitjens, Ryan Leary +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include "cuda-decoder-kernels.h" +#include "cuda-decoder-kernels-utils.h" + +namespace kaldi { +namespace cuda_decoder { + +// Initialize the hashmap with NO_VAL +// Called in InitDeviceData, when building the CudaDecoder object +__global__ void init_hashmap_kernel(DeviceParams cst_dev_params) { + const int max_nlanes = cst_dev_params.max_nlanes; + KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, max_nlanes) { + const int capacity = cst_dev_params.hashmap_capacity; + KALDI_CUDA_DECODER_1D_KERNEL_LOOP(idx, capacity) { + cst_dev_params.d_hashmap_values.lane(ilane)[idx] = + KALDI_CUDA_DECODER_HASHMAP_NO_VAL; + } + } +} + +// Initialize initial channel on device +// Called by ComputeInitialChannel +// It is NOT called in InitDecoding +// In InitDecoding we will clone the initial channel into the channel we called +// InitDecoding on +// Here we are actually creating this initial channel +// we do that once in the CudaDecoder constructor. +// +// The initial channel is the state of a channel when +// it will start decoding a new utterance +// thread (1, 1, 1) +// blocks(1, 1, 1); +__global__ void initialize_initial_lane_kernel(DeviceParams cst_dev_params) { + const int init_ichannel = cst_dev_params.init_channel_id; + const int init_ilane = 0; + ChannelCounters *init_channel_counters = + cst_dev_params.d_channels_counters.channel(init_ichannel); + LaneCounters *lane_counters = + cst_dev_params.d_lanes_counters.lane(init_ilane); + + // Making the data look like an ExpandArcsEmitting just executed, + // and put the StartState in the aux_q. We will then pick up a normal + // execution from there + // (calling PruneAndPreprocess, then ExpandArcsNonEmitting..) + lane_counters->aux_q_end = 0; + lane_counters->aux_q_requested = 0; + lane_counters->post_expand_aux_q_end = 1; + lane_counters->main_q_global_offset = 0; + lane_counters->main_q_local_offset = 0; + lane_counters->main_q_n_extra_prev_tokens = 0; + lane_counters->int_cutoff = INT_MAX; + lane_counters->main_q_n_emitting_tokens = 0; // all non emitting + lane_counters->int_beam = floatToOrderedInt(cst_dev_params.default_beam); + lane_counters->main_q_narcs_and_end = {0, 0}; + lane_counters->main_q_requested = 0; + lane_counters->prev_arg_min_int_cost = 0; + const StateId init_state = cst_dev_params.init_state; + const CostType init_cost = cst_dev_params.init_cost; + IntegerCostType int_init_cost = floatToOrderedInt(init_cost); + cst_dev_params.d_aux_q_state_and_cost.lane(init_ilane)[0] = {init_state, + int_init_cost}; + lane_counters->min_int_cost = int_init_cost; + CostType cutoff = orderedIntToFloat(int_init_cost); + lane_counters->int_cutoff = + floatToOrderedInt(cutoff + cst_dev_params.default_beam); + cst_dev_params.d_aux_q_info.lane(init_ilane)[0] = {INT_MIN, -1}; +} + +// Called by InitDecoding +// Called when some channels will start decoding a new utterance +// do everything that's needed to do on the device to start decoding a new +// utterance with those channels +// It clones the initial channel (created in initialize_initial_lane_kernel) +// into the channels we want to InitDecoding on +__global__ void init_decoding_on_device_kernel(DeviceParams cst_dev_params, + KernelParams params) { + const int init_ichannel = cst_dev_params.init_channel_id; + + const ChannelCounters *init_channel_counters = + cst_dev_params.d_channels_counters.channel(init_ichannel); + const int32 init_main_q_end = + init_channel_counters->prev_main_q_narcs_and_end.y; + const int32 nlanes = params.nlanes_used; + 
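+  // Each lane copies the tokens, degree prefix sums and arc offsets of the
+  // initial channel into the channel it is in charge of, then (for idx == 0)
+  // clones the channel counters.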
KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) { + KALDI_CUDA_DECODER_1D_KERNEL_LOOP(idx, init_main_q_end) { + const LaneCounters *lane_counters = + cst_dev_params.d_lanes_counters.lane(ilane); + const int32 ichannel = lane_counters->channel_to_compute; + cst_dev_params.d_main_q_state_and_cost.channel(ichannel)[idx] = + cst_dev_params.d_main_q_state_and_cost.channel(init_ichannel)[idx]; + cst_dev_params.d_main_q_degrees_prefix_sum.channel(ichannel)[idx] = + cst_dev_params.d_main_q_degrees_prefix_sum.channel( + init_ichannel)[idx]; + cst_dev_params.d_main_q_arc_offsets.channel(ichannel)[idx] = + cst_dev_params.d_main_q_arc_offsets.channel(init_ichannel)[idx]; + if (idx == 0) { + ChannelCounters *channel_counters = + cst_dev_params.d_channels_counters.channel(ichannel); + channel_counters->prev_main_q_narcs_and_end = + init_channel_counters->prev_main_q_narcs_and_end; + channel_counters->prev_main_q_n_extra_prev_tokens = + init_channel_counters->prev_main_q_n_extra_prev_tokens; + channel_counters->prev_main_q_global_offset = 0; + channel_counters->prev_main_q_extra_prev_tokens_global_offset = 0; + channel_counters->prev_beam = cst_dev_params.default_beam; + } + } + } +} + +// Context switch : load +// Called by LoadChannelsStateToLanes +// THREADS : (1, 1, 1) +// BLOCKS : (1, nlanes_used, 1) +__global__ void load_channels_state_in_lanes_kernel(DeviceParams cst_dev_params, + KernelParams params) { + const int nlanes = params.nlanes_used; + KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) { + LaneCounters *lane_counters = cst_dev_params.d_lanes_counters.lane(ilane); + const int32 ichannel = lane_counters->channel_to_compute; + const ChannelCounters *channel_counters = + cst_dev_params.d_channels_counters.channel(ichannel); + int2 main_q_narcs_and_end = channel_counters->prev_main_q_narcs_and_end; + lane_counters->main_q_narcs_and_end = main_q_narcs_and_end; + lane_counters->main_q_n_extra_prev_tokens = + channel_counters->prev_main_q_n_extra_prev_tokens; + CostType beam = channel_counters->prev_beam; + IntegerCostType int_beam = floatToOrderedInt(beam); + lane_counters->int_beam = int_beam; + lane_counters->adaptive_int_beam_with_validity_index.x = int_beam; + lane_counters->adaptive_int_beam_with_validity_index.y = + cst_dev_params.adaptive_beam_static_segment; + lane_counters->main_q_global_offset = + channel_counters + ->prev_main_q_global_offset; // we'll update it after emitting + lane_counters->main_q_extra_prev_tokens_global_offset = + channel_counters->prev_main_q_extra_prev_tokens_global_offset; + } +} + +// Context switch : store +// Called by SaveChannelsStateFromLanes +// THREADS : (1, 1, 1) +// BLOCKS : (1, nchannel_to_compute, 1) +__global__ void save_channels_state_from_lanes_kernel( + DeviceParams cst_dev_params, KernelParams params) { + const int nlanes = params.nlanes_used; + KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) { + const LaneCounters *lane_counters = + cst_dev_params.d_lanes_counters.lane(ilane); + const int32 ichannel = lane_counters->channel_to_compute; + ChannelCounters *channel_counters = + cst_dev_params.d_channels_counters.channel(ichannel); + channel_counters->prev_main_q_global_offset = + lane_counters->main_q_global_offset; + channel_counters->prev_main_q_extra_prev_tokens_global_offset = + lane_counters->main_q_extra_prev_tokens_global_offset; + channel_counters->prev_main_q_narcs_and_end = + lane_counters->main_q_narcs_and_end; + channel_counters->prev_main_q_n_extra_prev_tokens = + lane_counters->main_q_n_extra_prev_tokens; + 
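+    // The lane keeps the beam as an ordered int (like the other costs); it is
+    // saved back as a float in the channel.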
channel_counters->prev_beam = orderedIntToFloat(lane_counters->int_beam); + } +} + +// compute_lane_offsets_kernel +// the kernel concatenate_lanes_data concatenates multiple array into a single +// continuous array +// compute_lane_offsets_kernel computes the offset of each array into this +// continous array +// This kernel is 1D : the lanes are on the X dimension, because we want to +// compute the offset of those lanes +__global__ void compute_lane_offsets_kernel(DeviceParams cst_dev_params, + KernelParams params) { + typedef cub::BlockScan BlockScan; + __shared__ typename BlockScan::TempStorage temp_storage; + + const int nlanes = params.nlanes_used; + int4 sum_so_far = {0, 0, 0, 0}; + KALDI_CUDA_DECODER_1D_BLOCK_OFFSET_KERNEL_LOOP( + block_offset, thread_idx, + nlanes + 1) { // +1 because we are doing an exclusive sum, and we want + // all the values + int32 ilane = block_offset + thread_idx; + int4 zero4 = {0, 0, 0, 0}; + int4 lane_offsets = zero4; + if (ilane < nlanes) { // nlanes, not nlanes+1, because we cannot read +1 + // values (undefined) + LaneCounters *d_lane_counters = + cst_dev_params.d_lanes_counters.lane(ilane); + int32 main_q_end = d_lane_counters->main_q_narcs_and_end.y; + int32 n_emitting_tokens = d_lane_counters->main_q_n_emitting_tokens; + int32 main_q_n_extra_prev_tokens = + d_lane_counters->main_q_n_extra_prev_tokens; + lane_offsets = {main_q_end, n_emitting_tokens, main_q_n_extra_prev_tokens, + 0}; + } + int4 block_aggregate; + BlockScan(temp_storage) + .ExclusiveScan(lane_offsets, lane_offsets, zero4, PlusPlusPlusPlus(), + block_aggregate); + PlusPlusPlusPlus pppp; + lane_offsets = pppp(lane_offsets, sum_so_far); + sum_so_far = pppp(sum_so_far, block_aggregate); + if (ilane < (nlanes + 1)) { // nlanes+1, to write the output + LaneCounters *d_lane_counters = + cst_dev_params.d_lanes_counters.lane(ilane); + LaneCounters *h_lane_counters = + cst_dev_params.h_lanes_counters.lane(ilane); + h_lane_counters->main_q_end_lane_offset = + d_lane_counters->main_q_end_lane_offset = lane_offsets.x; + h_lane_counters->main_q_n_emitting_tokens_lane_offset = + d_lane_counters->main_q_n_emitting_tokens_lane_offset = + lane_offsets.y; + h_lane_counters->main_q_n_extra_prev_tokens_lane_offset = + d_lane_counters->main_q_n_extra_prev_tokens_lane_offset = + lane_offsets.z; + } + __syncthreads(); // reusing temp_storage + } +} + +// concatenate_lanes_data +// Called by PerformConcatenatedCopy +// Creates a concatenate array into concat, +// by concatenating all the arrays src.lane(ilane) +// for ilane=0..params.nlanes_used +// Used to prepare data for copy to Host. We want to avoid small Device2Host +// copies. 
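+// For instance, with 3 lanes whose main_q_end sizes are {5, 2, 4},
+// compute_lane_offsets_kernel stores the exclusive prefix sum {0, 5, 7, 11}
+// (one extra entry for the total size), and the tokens of lane 1 are copied
+// to concat[5..7).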
+template +__global__ void concatenate_lanes_data_kernel(DeviceParams cst_dev_params, + KernelParams params, + LaneMatrixView src, T *concat, + int32 *lane_offsets) { + const int nlanes = params.nlanes_used; + KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) { + const int32 stride = + sizeof(LaneCounters) / sizeof(int32); // offsets are in LaneCounters + int32 beg = *(lane_offsets + ilane * stride); + int32 end = *(lane_offsets + (ilane + 1) * stride); + int32 vec_size = end - beg; + KALDI_CUDA_DECODER_1D_KERNEL_LOOP(idx, vec_size) { + T d = src.lane(ilane)[idx]; + concat[beg + idx] = d; + } + } +} + +// nonemitting_preprocess_and_contract_kernel +// Called from PruneAndPreprocess +// This kernels prune the aux_q, move the survival tokens to the main_q, +// and add the preprocessing information necessary for the next ExpandArcs +// (the expand that follows PruneAndPreprocess is always non-emitting) +// It prunes the tokens using the cutoff, and prepare the data necessary for +// ExpandArcs: +// d_main_q_degrees_prefix_sum, d_main_q_arc_offsets_ +// The prefix sum is done in one-pass here, using a trick (we compute the prefix +// sum +// as we fill the main_q) +__global__ void nonemitting_preprocess_and_contract_kernel( + DeviceParams cst_dev_params, KernelParams params) { + typedef cub::BlockScan BlockScan; + __shared__ typename BlockScan::TempStorage sh_temp_storage; + // We need to move the survival tokens to the main_q + // + // sh_main_q_global_block_offset has two purposes : + // (1) to know where to store the survival tokens in the main_q + // (2) to perform the prefix sum degrees (of the survival tokens) + __shared__ int2 sh_main_q_global_block_offset; + const int nlanes = params.nlanes_used; + KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) { + LaneCounters *lane_counters = cst_dev_params.d_lanes_counters.lane(ilane); + const int32 aux_q_end = lane_counters->post_expand_aux_q_end; + const IntegerCostType int_cutoff = lane_counters->int_cutoff; + // Keeping whole CTA alive. We'll use __syncthreads() + KALDI_CUDA_DECODER_1D_BLOCK_OFFSET_KERNEL_LOOP(block_offset, thread_idx, + aux_q_end) { + const int32 aux_q_idx = block_offset + thread_idx; + const int32 ichannel = lane_counters->channel_to_compute; + int32 degree = 0; + int32 arc_start = -1; + StateId token_state; + IntegerCostType token_int_cost; + // We've kept the whole CTA alive. Now we keep only those will a valid + // token + if (aux_q_idx < aux_q_end) { + const int2 both = + cst_dev_params.d_aux_q_state_and_cost.lane(ilane)[aux_q_idx]; + token_state = both.x; + token_int_cost = both.y; + + if (token_int_cost < int_cutoff) { + // We'll keep that token. Loading its arc degree/csr offset now. + arc_start = cst_dev_params.d_arc_ne_offsets[token_state]; + const int32 arc_end = + cst_dev_params.d_arc_ne_offsets[token_state + 1]; + degree = arc_end - arc_start; + } + } + + // If we've set a different arc_start, + // this thread has a valid unpruned token + int32 is_pruned = (arc_start == -1); + + // We now know which tokens will be moved to the main_q, the remaining + // will be pruned + // we now compute a prefix sum inside the CUDA block to determine the + // local indexes of the unpruned tokens + // the first unpruned token will have a index of 0, the second 1, ... + // We also need to compute the prefix sum of the arc degrees + // we start by doing a local prefix sum inside the CUDA block + int2 block_prefix_sum_narcs_and_end = {degree, (is_pruned ? 
0 : 1)}; + const int2 zero2 = {0, 0}; + + // Computing the prefix sum (exclusive) + BlockScan(sh_temp_storage) + .ExclusiveScan(block_prefix_sum_narcs_and_end, + block_prefix_sum_narcs_and_end, zero2, PlusPlus()); + + if (KALDI_CUDA_DECODER_IS_LAST_1D_THREAD()) { + // This conditional branch is entered by the last thread + // Because it is the last, the prefix_sum of that thread contains the + // sum of all elements + + // We also add the value from this thread - the prefix sum is exclusive + // For the sum, we want it inclusive + int2 block_sum = block_prefix_sum_narcs_and_end; + block_sum.x += degree; + block_sum.y += is_pruned ? 0 : 1; + + // Doing two things at the same time : + // requesting a spot in the main_q to store the survival tokens from + // this CTA + // We also increment the narcs value. atomic64.x will contain the number + // of + // arcs in the main_q up until the atomic64.y index + // That's all we need to finish our prefix sum. We add this global + // offset. + + // First atomic to check if we are not overflowing main_q. + int block_offset = + atomicAdd(&lane_counters->main_q_requested, block_sum.y); + + // Verify that we do not overflow + if (block_offset + block_sum.y < cst_dev_params.main_q_capacity) { + // we don't overflow we can safely grab a spot in the main_q + sh_main_q_global_block_offset = + atomicAddI2(&lane_counters->main_q_narcs_and_end, block_sum); + } else { + // our update would overflow + lane_counters->q_overflow |= OVERFLOW_MAIN_Q; // for the host + sh_main_q_global_block_offset.y = + cst_dev_params.main_q_capacity; // used as flag to broadcast the + // information in the CTA + } + } + + // Syncing because : + // - Broadcasting sh_main_q_global_block_offset + // - We may reuse sh_temp_storage (cf CUB doc) + __syncthreads(); + + // Checking if we are overflowing the main_q + // All threads are executing the next line + if (sh_main_q_global_block_offset.y == cst_dev_params.main_q_capacity) + goto end_lane; // done for this lane + + // If we are executing the following lines it means that we are not + // overflowing the queue + // We then continue what we were doing + if (!is_pruned) { + bool moving_emitting_tokens = (lane_counters->main_q_local_offset == 0); + // we will move our unpruned token to the main_q, at index main_q_idx + InfoToken tok_info = cst_dev_params.d_aux_q_info.lane(ilane)[aux_q_idx]; + const int32 main_q_idx = + sh_main_q_global_block_offset.y + block_prefix_sum_narcs_and_end.y; + CostType acoustic_cost = 0.0f; + if (moving_emitting_tokens && tok_info.arc_idx != -1) { + const int32 arc_ilabel = + cst_dev_params.d_arc_pdf_ilabels[tok_info.arc_idx]; + acoustic_cost = -lane_counters->loglikelihoods[arc_ilabel]; + } + cst_dev_params.d_main_q_info.lane(ilane)[main_q_idx] = tok_info; + + // Moving the token to the main q + cst_dev_params.d_main_q_state_and_cost.channel(ichannel)[main_q_idx] = { + token_state, token_int_cost}; + cst_dev_params.d_main_q_acoustic_cost.lane(ilane)[main_q_idx] = + acoustic_cost; + // Saving the global prefix sum + const int32 prefix_sum_narcs = + sh_main_q_global_block_offset.x + block_prefix_sum_narcs_and_end.x; + cst_dev_params.d_main_q_degrees_prefix_sum.channel( + ichannel)[main_q_idx] = prefix_sum_narcs; + // Saving the CSR arc offset for that token's state + // it will be used by the expand kernel, and avoid doing a new random + // memory access in the expand kernel + cst_dev_params.d_main_q_arc_offsets.channel(ichannel)[main_q_idx] = + arc_start; + } + } + + end_lane:; // empty statement + } +} + +// 
GetAdaptiveBeam is used in ExpandArcs +// When we generate new tokens by traversing arcs, +// we can end up creating a lot of tokens, if the current frame +// generated loglikelihoods too uniform for instance (we don't have +// any good tokens that will reduce the cutoff, so we end up generating +// a lot of tokens) +// To avoid overflowing the aux_q, we apply a decreasing beam. +// With aux_q_end being the current aux_q size, we have a decrease function f, with +// adaptive_beam = f(aux_q_end) +// f is a decreasing piecewise constant function +// Please note that when processing tokens, we usually have dozens of thousands of threads +// generating tokens. Those are already in flight, and will not reload the beam immediatly. +// It means that we need to start reducing the beam as soon as we detect that we are generating more tokens than +// expected. +// We can configure the function f using KALDI_CUDA_DECODER_ADAPTIVE_BEAM_STATIC_SEGMENT +// and KALDI_CUDA_DECODER_ADAPTIVE_BEAM_NSTEPS. +// We will use default_beam for the first max_tokens_per_frame/KALDI_CUDA_DECODER_ADAPTIVE_BEAM_STATIC_SEGMENT +// tokens in the aux_q. +// Once we reach that number, we will decrease the adaptive beam linearly from default_beam to 0, +// using KALDI_CUDA_DECODER_ADAPTIVE_BEAM_NSTEPS steps +// +// x-axis : aux_q_end. How much tokens are already in the aux_q +// y-axis : adaptive_beam = f(aux_q_end) +// default_beam _| ________________ +// | /\ _________ +// | | _________ +// 0 _| static_segment _________ +// |________________________________________________ +// | | +// aux_q_end= 0 max_tokens_per_frame +// We have : +// static_segment = max_tokens_per_frame/KALDI_CUDA_DECODER_ADAPTIVE_BEAM_STATIC_SEGMENT +// and KALDI_CUDA_DECODER_ADAPTIVE_BEAM_NSTEPS = 3 +__device__ void UpdateAdaptiveBeam(const DeviceParams &cst_dev_params, + const int aux_q_index_block_offset, + IntegerCostType min_int_cost, + int2 *adaptive_int_beam_with_validity_index, + LaneCounters *lane_counters) { + int32 beam_valid_until_idx = adaptive_int_beam_with_validity_index->y; + if (aux_q_index_block_offset < beam_valid_until_idx) return; // nothing to do + + CostType beam = orderedIntToFloat(adaptive_int_beam_with_validity_index->x); + while (aux_q_index_block_offset >= beam_valid_until_idx) { + beam /= 2; + beam_valid_until_idx += cst_dev_params.adaptive_beam_bin_width; + } + + IntegerCostType new_int_cutoff = (min_int_cost < INT_MAX) + ? floatToOrderedInt(orderedIntToFloat(min_int_cost) + beam) + : INT_MAX; + IntegerCostType int_beam = floatToOrderedInt(beam); + adaptive_int_beam_with_validity_index->x = int_beam; + adaptive_int_beam_with_validity_index->y = beam_valid_until_idx; + // We can have races between the two atomics + // However the worst than can happen is a CTA might delay updating the beam + // This is not a critical bug. 
However, once we have a floatToOrderedInt + // that is generating unsigned ints, we could merge the two atomics into a + // single atomic64 + atomicMin(&lane_counters->adaptive_int_beam_with_validity_index.x, int_beam); + atomicMax(&lane_counters->adaptive_int_beam_with_validity_index.y, + beam_valid_until_idx); + atomicMin(&lane_counters->int_cutoff, new_int_cutoff); +} + +// One CTA / lane +__global__ void reset_for_frame_and_estimate_cutoff_kernel( + DeviceParams cst_dev_params, KernelParams params) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + const int nlanes = params.nlanes_used; + KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) { + LaneCounters *lane_counters = cst_dev_params.d_lanes_counters.lane(ilane); + const int32 ichannel = lane_counters->channel_to_compute; + ChannelCounters *channel_counters = + cst_dev_params.d_channels_counters.channel(ichannel); + if (threadIdx.x == 0) { + const CostType current_beam = orderedIntToFloat(lane_counters->int_beam); + // Do some initialization + lane_counters->q_overflow = OVERFLOW_NONE; + lane_counters->main_q_n_emitting_tokens = INT_MAX; + lane_counters->int_cutoff = INT_MAX; + lane_counters->min_int_cost = INT_MAX; + lane_counters->q_overflow = OVERFLOW_NONE; + lane_counters->aux_q_requested = 0; + lane_counters->main_q_requested = 0; + lane_counters->main_q_local_offset = 0; + lane_counters->compute_max_active = + false; // will be set to true if necessary + channel_counters->min_int_cost_and_arg_with_final.x = + INT_MAX; // it will be set with atomicMins + const CostType new_beam = + fmin(cst_dev_params.default_beam, + current_beam * KALDI_CUDA_DECODER_ADAPTIVE_BEAM_RECOVER_RATE); + lane_counters->int_beam = floatToOrderedInt(new_beam); + } + const int32 prev_arg_min = lane_counters->prev_arg_min_int_cost; + int2 both = + cst_dev_params.d_main_q_state_and_cost.channel(ichannel)[prev_arg_min]; + int32 int_cost = both.y; + CostType previous_cost = orderedIntToFloat(int_cost); + const int32 prev_arg_min_state = both.x; + int32 arc_start = cst_dev_params.d_arc_e_offsets[prev_arg_min_state]; + int32 arc_end = cst_dev_params.d_arc_e_offsets[prev_arg_min_state + 1]; + int32 narcs = arc_end - arc_start; + // no loop - we only process the first KALDI_CUDA_DECODER_1D_BLOCK arcs + // we just want an estimate + CostType total_cost = FLT_MAX; + if (threadIdx.x < narcs) { + int32 iarc = arc_start + threadIdx.x; + CostType arc_fixed_cost = cst_dev_params.d_arc_weights[iarc]; + const int32 arc_ilabel = cst_dev_params.d_arc_pdf_ilabels[iarc]; + CostType acoustic_cost = -lane_counters->loglikelihoods[arc_ilabel]; + total_cost = previous_cost + arc_fixed_cost + + acoustic_cost; // +0.0f, best prev cost is normalized to 0 + } + + KALDI_CUDA_DECODER_1D_KERNEL_LOOP(bin_id, KALDI_CUDA_DECODER_HISTO_NBINS) { + cst_dev_params.d_histograms.lane(ilane)[bin_id] = 0; // reset for this frame + } + + CostType min = BlockReduce(temp_storage).Reduce(total_cost, cub::Min()); + if (narcs > 0 && threadIdx.x == 0) { + // narcs > 0 to have at least one valid element in the reduce + CostType new_cutoff = min + orderedIntToFloat(lane_counters->int_beam); + IntegerCostType new_int_cutoff = floatToOrderedInt(new_cutoff); + lane_counters->int_cutoff = new_int_cutoff; + lane_counters->min_int_cost = floatToOrderedInt(min); + } + } +} +// ExpandArc kernel +// This kernel does the actual work of traversing arcs +// +// Pseudo code : +// for all token tok in main_q[main_q_offset...end]: +// u = tok.next_state +// for 
all arc a(u->v) in the FST: +// v_cost = tok.cost + a.cost + accoustic_cost +// +// if v_cost < cutoff and v_cost < best_state_cost[v] +// generate token associated to v, add to aux_q +// if necessary update cutoff +// if aux_q is getting full, reduce beam +// +// For more information please refer to http://kaldi-asr.org/doc/decoders.html +// +// ExpandArc rely on some preprocessed data to be able to function +// for instance, it needs the prefix sum of the arc degree of all token.state in +// the main_q +// We need to call a Preprocess kernel before ExpandArc +// +// ExpandArc is used for both emitting and nonemitting phases +// Differences between emitting and nonemitting : +// 1) params.d_q_arc_offset contains offsets to either emitting or +// nonemitting arcs. +// It is transparent for this kernel. The differentiation was done in +// the Preprocess kernel, +// which is responsible for filling the params.d_q_arc_offset array +// 2) Computation of the acoustic cost. If nonemitting, it is equal to 0. +// If emitting, we need +// to use values from the acoustic model (through the d_loglikelihoods +// array) +// +// Note : ExpandArc is not the only kernel able to traverse arcs. +// FinalizeProcessNonemitting contains a simplified version of expand for only +// one CUDA block +template +__global__ void expand_arcs_kernel(DeviceParams cst_dev_params, + KernelParams params) { + // BlockScan that we will use to compute token indexes in the output queue, + // and to find the min cost in the block + typedef cub::BlockScan BlockScan; + __shared__ typename BlockScan::TempStorage sh_temp_storage_scan; + + // This kernel writes the new token to the output queue aux_q + // We will request a spot to store all the new tokens created by threads in + // this CUDA block + // sh_aux_q_index_block_offset indicates where to store them in the aux_q + // tokens created in this CUDA block will be store in : + // aux_q[sh_aux_q_index_block_offset], aux_q[sh_aux_q_index_block_offset + 1], + __shared__ int32 sh_aux_q_index_block_offset; + const int nlanes = params.nlanes_used; + KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) { + LaneCounters *lane_counters = cst_dev_params.d_lanes_counters.lane(ilane); + const int32 main_q_offset = lane_counters->main_q_local_offset; + const int32 main_q_end = lane_counters->main_q_narcs_and_end.y; + const int32 total_narcs = lane_counters->main_q_narcs_and_end.x; + KALDI_CUDA_DECODER_1D_BLOCK_OFFSET_KERNEL_LOOP(block_offset, thread_idx, + total_narcs) { + int2 adaptive_int_beam_with_validity_index = + lane_counters->adaptive_int_beam_with_validity_index; + const int32 ichannel = lane_counters->channel_to_compute; + // Important : this thread is not responsible for a token in the input + // queue main_q + // but for an arc, going out of a token in the main_q + // The main_q contains in total total_narcs + // and this thread will compute the main_q_arc_index-th arc of the main_q + // For instance, first thread in the grid with threadIdx.x == 0 and + // blockIdx.x == 0 + // will process the first arc of the token in main_q[main_q_offset + 0] + // (if that token has at least one arc) + // + // This insure a perfect one thread = one arc load balancing + // but we have work to do to know exactly which arc is the + // main_q_arc_index-th arc + // (what's its source ? its destination ? its arc_idx the FST CSR ?) 
+ int32 main_q_arc_index = block_offset + thread_idx; + // We'll need those variables later in the kernel + // we declare them outside of the "valid_input" scope + // to be able to access them later + int32 main_q_idx; + int32 arc_idx; + StateId arc_next_state; + IntegerCostType int_total_cost = INT_MAX; + if (main_q_arc_index < total_narcs) { + // Current thread must take care of main_q_arc_index-th arc + // we need to now what's the source of that arc + // ie which token.state in main_q does it start from ? + // We use a binary search in the prefix sum of the token's degree to get + // that information + // + // Example : main_q contains 3 tokens + // - First token is associated to a state which has 3 outgoing arc + // - Second token is associated to a state which has 0 outgoing arc + // - Third token is associated to a state which has 2 outgoing arc + // + // We store the degrees in an array : + // [3, 0, 2] + // + // We then compute the exclusive prefix sum of that array : + // [0, 3, 3, 5] + // + // In total, we have 5 arcs in the main_q. ExpandArc will use 5 threads. + // + // Let's say we are the fifth thread in ExpandArc. + // we have threadIdx.x == 4, and blockIdx.x == 0 + // it gives us main_q_arc_index == 4 + // From there we have no idea what we're supposed to do next, we need to + // have information about the + // arc that we're supposed to traverse + // + // To do that, we look for the maximum index maxle_i in the prefix sum + // array such prefix_sum[i] <= 4 + // + // [0, 3, 3, 5] + // | + // here + // maxle_i = 2 + // it means that our source token is at index 2 in the main_q + // and we are computing the arc at index (main_q_arc_index - + // prefix_sum[maxle_i]) of that token + // ie the arc at index (4-3) = 1, the second arc of the second token in + // main_q + + // Searching for the source of the arc that we will process + // (main_q_arc_index) + // we could preprocess the search in the preprocess kernels - for now + // this kernel is fast enough + const int32 *degrees_prefix_sum = + cst_dev_params.d_main_q_degrees_prefix_sum.channel(ichannel); + main_q_idx = binsearch_maxle(degrees_prefix_sum, main_q_arc_index, + main_q_offset, main_q_end - 1); + + // state_first_arc_idx_in_main_q + // d_main_q_degrees_prefix_sum contains the prefix sum of the + // degrees of all tokens in the main_q + // d_main_q_degrees_prefix_sum[main_q_idx] contains the number of arc + // in the main_q until that token + const int32 state_first_arc_idx_in_main_q = + degrees_prefix_sum[main_q_idx]; + + // arc_offset_start is the offset in the CSR, to find the arcs + // related to the state main_q_state_[main_q_idx] + // it was set by the preprocess kernel + const int32 arc_offset_start = + cst_dev_params.d_main_q_arc_offsets.channel(ichannel)[main_q_idx]; + + // local_arc_index is the arc index for that state + // if local_arc_index == 2, we will process the second arc + // of state main_q_state_[main_q_idx] + const int32 local_arc_index = + main_q_arc_index - state_first_arc_idx_in_main_q; + + // corresponding arc_idx in the FST + arc_idx = arc_offset_start + local_arc_index; + + // Destination of that arc + arc_next_state = cst_dev_params.d_arc_nextstates[arc_idx]; + + // Building the total cost incrementally + // we'll add the acoustic cost and the old token's cost + const CostType arc_fixed_cost = cst_dev_params.d_arc_weights[arc_idx]; + const CostType prev_token_cost = orderedIntToFloat( + cst_dev_params.d_main_q_state_and_cost.channel(ichannel)[main_q_idx] + .y); + CostType total_cost = 
prev_token_cost + arc_fixed_cost; + const int32 prev_state = + cst_dev_params.d_main_q_state_and_cost.channel(ichannel)[main_q_idx] + .x; + if (IS_EMITTING) { + const int32 arc_ilabel = cst_dev_params.d_arc_pdf_ilabels[arc_idx]; + CostType acoustic_cost = -lane_counters->loglikelihoods[arc_ilabel]; + total_cost += acoustic_cost; + } + int_total_cost = floatToOrderedInt(total_cost); + + // If the total_cost is too large compared to our cutoff (beam search) + // then let's drop it + const IntegerCostType int_cutoff = lane_counters->int_cutoff; + if (int_total_cost >= int_cutoff) int_total_cost = INT_MAX; + } + + // If int_total_cost < INT_MAX, it means that : + // - this thread had a valid input (main_q_arc_index < total_narcs) + // - the total_cost of the generated token is < cutoff + // We will then add that new token in the output queue, aux_q + // We need to know where to put that token in the aux_q + // we'll first compute its index inside the CUDA block + // the first valid output token in the CUDA block will have index 0, + // the second index 1... We compute that using a prefix sum + // + // We also need to find the overall min cost in the CUDA block + // a prefix sum is a scan operation, and a min a reduce operation + // we can perform a reduce operation using a scan (using the last value) + // we compute the prefix sum and the min in one scan, using the data + // struct CostTypeAndInt + const int32 has_successor = (int_total_cost < INT_MAX) ? 1 : 0; + + int2 int_cost_and_index = {int_total_cost, has_successor}; + BlockScan(sh_temp_storage_scan) + .InclusiveScan(int_cost_and_index, int_cost_and_index, MinPlus()); + if (KALDI_CUDA_DECODER_IS_LAST_1D_THREAD()) { + // We are in a divergent branch + // This is the last thread. The last value of the inclusive scan is the + // total + const int32 total_successors_in_block = int_cost_and_index.y; + // Requesting a spot of size total_successors_in_block in the aux_q + + // note: using 2 atomics here to avoid adding another kernel + // first request more space + const int aux_q_index_block_offset = atomicAdd( + &lane_counters->aux_q_requested, total_successors_in_block); + + // check for overflow in aux_q + // We try to prevent an overflow from happening using an adaptive beam + // (cf GetAdaptiveBeam) + if (aux_q_index_block_offset + total_successors_in_block < + cst_dev_params.aux_q_capacity) { + // no overflow + + // grab the aux_q offset + sh_aux_q_index_block_offset = + atomicAdd(&lane_counters->aux_q_end, total_successors_in_block); + + // We are not overflowing the queue, updating the global values + IntegerCostType global_min_int_cost = lane_counters->min_int_cost; + IntegerCostType local_min_int_cost = int_cost_and_index.x; + // if we found a lower min_cost, update the global value + if (local_min_int_cost < global_min_int_cost) { + global_min_int_cost = local_min_int_cost; + atomicMin(&lane_counters->min_int_cost, global_min_int_cost); + CostType beam = + orderedIntToFloat(adaptive_int_beam_with_validity_index.x); + IntegerCostType new_int_cutoff = floatToOrderedInt( + orderedIntToFloat(local_min_int_cost) + beam); + atomicMin(&lane_counters->int_cutoff, new_int_cutoff); + } + int32 beam_valid_until_idx = + adaptive_int_beam_with_validity_index.y; + if (aux_q_index_block_offset >= beam_valid_until_idx) { + // This beam is no longer valid. 
Updating it + UpdateAdaptiveBeam( + cst_dev_params, aux_q_index_block_offset, global_min_int_cost, + &adaptive_int_beam_with_validity_index, lane_counters); + } + } else { + // sh_aux_q_index_block_offset is in shared memory + // its value is currently invalid (overflow) + // we set it to a special value and use it as a flag to broadcast + // the fact that we have an overflow and that all threads should exit + sh_aux_q_index_block_offset = cst_dev_params.aux_q_capacity; + + // Setting the flag for the host. It will be used to print a warning + // to stderr + lane_counters->q_overflow |= OVERFLOW_AUX_Q; + + // We do not jump to end_lane now, because only + // the first thread (threadIdx.x == 0) is executing this + // We wait until the end of the divergent branch + } + } + + // Sync'ing for two reasons : + // - Broadcasting sh_aux_q_index_block_offset + // - reusing sh_temp_storage (cf CUB's doc) + __syncthreads(); + // The only case where we can have that condition met, + // is if we detected an overflow if the previous lines + if (sh_aux_q_index_block_offset == cst_dev_params.aux_q_capacity) + goto end_lane; // done for this lane + // + // If we're executing the following lines it means everything + // is valid and we are not overflowing the aux_q + // + int_cost_and_index.y -= has_successor; // we want the exclusive sum now + const int32 aux_q_block_index = int_cost_and_index.y; + const int32 aux_q_index = sh_aux_q_index_block_offset + aux_q_block_index; + if (has_successor) { + // We save the new token to the aux_q + cst_dev_params.d_aux_q_state_and_cost.lane(ilane)[aux_q_index] = { + arc_next_state, int_total_cost}; + // Index of the parent token + // the parent is the token used as input (source of arc) + // that parent is at index main_q_idx in the GPU memory + // However, the main_q is emptied before processing a new frame + // we need to add the offset related to the previous frames index + // we add cst_dev_params.main_q_global_offset + const int32 prev_token = + lane_counters->main_q_global_offset + main_q_idx; + assert(main_q_idx >= 0 && main_q_idx < cst_dev_params.main_q_capacity); + cst_dev_params.d_aux_q_info.lane(ilane)[aux_q_index] = {prev_token, + arc_idx}; + } + } + end_lane:; // ";" is an empty statement + } +} + +// post_expand_kernel +// Called after expand_arcs_kernel +// Takes care of what needs to be done after an expand_arcs_kernel +// execution. Mostly resetting the beam (if adaptive beam was triggered, +// the max_active_ kernels will take care of selecting a good beam), +// resetting the number of arcs in the main_q (we've processed them all), +// etc. +// Threads (1,1,1) +// Blocks (1, nlanes_used, 1) +template +__global__ void post_expand_kernel(DeviceParams cst_dev_params, + KernelParams params) { + const int nlanes = params.nlanes_used; + KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) { + LaneCounters *lane_counters = cst_dev_params.d_lanes_counters.lane(ilane); + LaneCounters *h_lane_counters = cst_dev_params.h_lanes_counters.lane(ilane); + const int prev_main_q_end = lane_counters->main_q_narcs_and_end.y; + const int prev_n_extra_prev_tokens = + lane_counters->main_q_n_extra_prev_tokens; + const int aux_q_end = lane_counters->aux_q_end; + CostType min_cost = orderedIntToFloat(lane_counters->min_int_cost); + // The next step is the contracting step from aux_q to main_q + // It will need the aux_q_end value. 
But it will also empty the aux_q + // We're resetting aux_q_end to 0 now, but we're saving its old value + // in another place + lane_counters->post_expand_aux_q_end = aux_q_end; + h_lane_counters->post_expand_aux_q_end = aux_q_end; // pinned memory + h_lane_counters->q_overflow = lane_counters->q_overflow; // pinned memory + lane_counters->aux_q_end = 0; + lane_counters->aux_q_requested = 0; + // We are done processing those arcs + lane_counters->main_q_narcs_and_end.x = 0; + // Resetting the adaptive beam + lane_counters->adaptive_int_beam_with_validity_index.x = + lane_counters->int_beam; + lane_counters->adaptive_int_beam_with_validity_index.y = + cst_dev_params.adaptive_beam_static_segment; + CostType beam = orderedIntToFloat(lane_counters->int_beam); + lane_counters->int_cutoff = floatToOrderedInt(min_cost + beam); + // If the adaptive beam kicked in, we want to reset the beam + // the max-active process will take care of selecting the right beam + if (IS_EMITTING) { + // the main_q contains the tokens from the previous frame + // after emitting, we won't use them anymore to create new tokens + // we reset the main_q + lane_counters->main_q_narcs_and_end = {0, 0}; + lane_counters->main_q_requested = 0; + // The main_q was flushed - we need to update the global_offset + lane_counters->main_q_global_offset += prev_main_q_end; + if (threadIdx.x == 0 && blockIdx.x == 0) + lane_counters->main_q_extra_prev_tokens_global_offset += + prev_n_extra_prev_tokens; + // Moving local offset. Tokens created by last expand + // will be pruned, and survivals will be moved at the end + // of the main q. Those tokens will be placed after local_offset + lane_counters->main_q_requested = 0; + CostType min_cost = orderedIntToFloat(lane_counters->min_int_cost); + lane_counters->min_histo_cost = min_cost; + lane_counters->max_histo_cost = min_cost + beam; + lane_counters->histo_bin_width = beam / (KALDI_CUDA_DECODER_HISTO_NBINS-1); + } else { + lane_counters->main_q_local_offset = prev_main_q_end; + // reset requested to end of queue + lane_counters->main_q_requested = prev_main_q_end; + } + } +} + +__global__ void post_contract_and_preprocess_kernel(DeviceParams cst_dev_params, + KernelParams params) { + const int nlanes = params.nlanes_used; + KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) { + LaneCounters *lane_counters = cst_dev_params.d_lanes_counters.lane(ilane); + LaneCounters *h_lane_counters = cst_dev_params.h_lanes_counters.lane(ilane); + int2 main_q_narcs_and_end = lane_counters->main_q_narcs_and_end; + h_lane_counters->main_q_narcs_and_end = + main_q_narcs_and_end; // pinned memory + h_lane_counters->q_overflow = lane_counters->q_overflow; // pinned memory + atomicMin(&lane_counters->main_q_n_emitting_tokens, main_q_narcs_and_end.y); + } +} + +// Meta-kernel (merging preprocess and expand) but only works with 1 CUDA block +// Used to avoid calling multiple main kernels (such as expand_arcs_kernel) +// for the tail of non emitting (lots of iterations with small number of arcs) +// +// Code is greatly simplified because we use only one CTA / lane +// +// Repeat until new queue empty: +// 1) Preprocess +// 2) Expand arcs +// +// The preprocess stage is not done on the first iteration, because it was +// already done by the ProcessAndContract kernel. 
We always call +// PruneAndPreprocess before calling FinalizeProcessNonemitting +// +// At the end, this kernel finalize the computation for current frame, +// so that it's ready for next ProcessEmitting +// +// This kernel works, but can be greatly simplified now. +__launch_bounds__(KALDI_CUDA_DECODER_LARGEST_1D_BLOCK, 1) __global__ + void finalize_process_non_emitting_kernel(DeviceParams cst_dev_params, + KernelParams params) { + typedef cub::BlockScan + Int2BlockScan; + typedef cub::BlockScan IntBlockScan; + __shared__ typename IntBlockScan::TempStorage sh_temp_storage_int_scan; + __shared__ typename Int2BlockScan::TempStorage sh_temp_storage_int2_scan; + + const int nlanes = params.nlanes_used; + KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) { + LaneCounters *lane_counters = cst_dev_params.d_lanes_counters.lane(ilane); + const int32 ichannel = lane_counters->channel_to_compute; + ChannelCounters *channel_counters = + cst_dev_params.d_channels_counters.channel(ichannel); + + int2 both = lane_counters->main_q_narcs_and_end; + int32 main_q_narcs = both.x; + int32 main_q_end = both.y; + int32 main_q_local_offset = lane_counters->main_q_local_offset; + const int32 main_q_global_offset = lane_counters->main_q_global_offset; + // aux_q is empty when this kernel is called + int32 aux_q_end = 0; + IntegerCostType int_cutoff = lane_counters->int_cutoff; + while (main_q_narcs > 0) { + // Step 1 : ExpandArcs + KALDI_CUDA_DECODER_1D_BLOCK_OFFSET_KERNEL_LOOP(offset, thread_idx, + main_q_narcs) { + const int32 main_q_arc_idx = offset + thread_idx; + // For details on how this code works, please refer to comments in + // expand_arcs + IntegerCostType total_int_cost = INT_MAX; + int32 arc_idx; + StateId arc_next_state; + int32 main_q_idx; + if (main_q_arc_idx < main_q_narcs) { + main_q_idx = binsearch_maxle( + cst_dev_params.d_main_q_degrees_prefix_sum.channel(ichannel), + main_q_arc_idx, main_q_local_offset, main_q_end - 1); + + const int32 state_first_arc_idx_in_main_q = + cst_dev_params.d_main_q_degrees_prefix_sum.channel( + ichannel)[main_q_idx]; + const int32 arc_offset_start = + cst_dev_params.d_main_q_arc_offsets.channel(ichannel)[main_q_idx]; + arc_idx = arc_offset_start + + (main_q_arc_idx - state_first_arc_idx_in_main_q); + + arc_next_state = cst_dev_params.d_arc_nextstates[arc_idx]; + CostType arc_weight = cst_dev_params.d_arc_weights[arc_idx]; + CostType prev_token_cost = + orderedIntToFloat(cst_dev_params.d_main_q_state_and_cost + .channel(ichannel)[main_q_idx] + .y); + total_int_cost = floatToOrderedInt(arc_weight + prev_token_cost); + if(total_int_cost < lane_counters->min_int_cost) + atomicMin(&lane_counters->min_int_cost, total_int_cost); + if (total_int_cost >= int_cutoff) { + total_int_cost = INT_MAX; // above cutoff + } + } + const int32 has_successor = (total_int_cost < INT_MAX) ? 
1 : 0; + + int32 local_aux_q_idx; + int32 nsuccessors; + IntBlockScan(sh_temp_storage_int_scan) + .ExclusiveSum(has_successor, local_aux_q_idx, + nsuccessors); // aggregate + + // Checking if we are overflowing the aux_q + if ((aux_q_end + nsuccessors) >= cst_dev_params.aux_q_capacity) { + lane_counters->q_overflow |= OVERFLOW_AUX_Q; + // nothing to revert in global memory + goto finalize_lane; + } + + if (has_successor) { + const int32 aux_q_idx = aux_q_end + local_aux_q_idx; + const int32 prev_token_idx = main_q_global_offset + main_q_idx; + cst_dev_params.d_aux_q_state_and_cost.lane(ilane)[aux_q_idx] = { + arc_next_state, total_int_cost}; + cst_dev_params.d_aux_q_info.lane(ilane)[aux_q_idx] = {prev_token_idx, + arc_idx}; + } + aux_q_end += nsuccessors; + // sync: reusing sh_temp_storage_scan_int + __syncthreads(); + } + + // Step 2 : PreprocessAndContract + // Reset for new iteration + main_q_narcs = 0; + main_q_local_offset = main_q_end; + KALDI_CUDA_DECODER_1D_BLOCK_OFFSET_KERNEL_LOOP(offset, thread_idx, + aux_q_end) { + const int32 aux_q_idx = offset + thread_idx; + int32 degree = 0; + int32 start = -1; + StateId token_state; + IntegerCostType token_int_cost; + if (aux_q_idx < aux_q_end) { + int2 both = + cst_dev_params.d_aux_q_state_and_cost.lane(ilane)[aux_q_idx]; + token_state = both.x; + token_int_cost = both.y; + // beam may have changed since generation + // We are non-emitting in this kernel, using ne offsets + start = cst_dev_params.d_arc_ne_offsets[token_state]; + int32 end = cst_dev_params.d_arc_ne_offsets[token_state + 1]; + degree = end - start; + } + int has_valid_nonpruned_token = (start != -1) ? 1 : 0; + int2 narcs_and_ntokens_prefix_sum = {degree, has_valid_nonpruned_token}; + int2 aggregate, zero2 = {0, 0}; + Int2BlockScan(sh_temp_storage_int2_scan) + .ExclusiveScan(narcs_and_ntokens_prefix_sum, + narcs_and_ntokens_prefix_sum, zero2, PlusPlus(), + aggregate); + // Checking if we are not overflowing the main_q + const int32 total_ntokens = aggregate.y; + if ((main_q_end + total_ntokens) >= cst_dev_params.main_q_capacity) { + lane_counters->q_overflow |= OVERFLOW_MAIN_Q; + goto finalize_lane; + } + const int32 degree_prefix_sum = + main_q_narcs + narcs_and_ntokens_prefix_sum.x; + const int32 degree_sum = aggregate.x; + main_q_narcs += degree_sum; + if (has_valid_nonpruned_token) { + const int32 local_main_q_idx = narcs_and_ntokens_prefix_sum.y; + const int32 main_q_idx = main_q_end + local_main_q_idx; + + cst_dev_params.d_main_q_arc_offsets.channel(ichannel)[main_q_idx] = + start; + cst_dev_params.d_main_q_degrees_prefix_sum.channel( + ichannel)[main_q_idx] = degree_prefix_sum; + cst_dev_params.d_main_q_state_and_cost.channel( + ichannel)[main_q_idx] = {token_state, token_int_cost}; + cst_dev_params.d_main_q_info.lane(ilane)[main_q_idx] = + cst_dev_params.d_aux_q_info.lane(ilane)[aux_q_idx]; + cst_dev_params.d_main_q_acoustic_cost.lane(ilane)[main_q_idx] = + 0.0f; // we are always nonemitting in this kernel + } + main_q_end += total_ntokens; + __syncthreads(); + } + aux_q_end = 0; // aux_q is now empty + } + + finalize_lane: + if (threadIdx.x == 0) { + // This main_q is now final for that frame + lane_counters->main_q_narcs_and_end = {0, main_q_end}; + cst_dev_params.h_lanes_counters.lane(ilane)->main_q_narcs_and_end = { + 0, main_q_end}; // pinned memory + } + } +} + +// GetBestCost : +// Finds all tokens with a cost in [min_cost;min_cost+lattice_beam[ +// Add the final_costs if use_final_probs +// Does the computation in two steps +// +// Step 1: Find the value of 
min_cost, i.e. the minimum cost in the last token +// queue +// (the queue generated by the last frame computed) +// We set both channel_counters->min_int_cost_and_arg_without_final +// and channel_counters->min_int_cost_and_arg_with_final +// One add the final_cost[token.state] before looking for the min +__global__ void get_best_cost_step1_kernel(DeviceParams cst_dev_params, + KernelParams params, + bool use_final_probs, + CostType fst_zero) { + const int nlanes = params.nlanes_used; + KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) { + LaneCounters *lane_counters = cst_dev_params.d_lanes_counters.lane(ilane); + const int32 ichannel = lane_counters->channel_to_compute; + ChannelCounters *channel_counters = + cst_dev_params.d_channels_counters.channel(ichannel); + const int32 main_q_end = channel_counters->prev_main_q_narcs_and_end.y; + const int32 global_offset = channel_counters->prev_main_q_global_offset; + KALDI_CUDA_DECODER_1D_KERNEL_LOOP(idx, main_q_end) { + if (idx == 0) + lane_counters->n_within_lattice_beam = + 0; // will be used in the next kernel + const int2 both = + cst_dev_params.d_main_q_state_and_cost.channel(ichannel)[idx]; + const int token_state = both.x; + const int token_int_cost = both.y; + CostType cost = orderedIntToFloat(token_int_cost); + IntegerCostType int_cost = floatToOrderedInt(cost); + int32 global_idx = global_offset + idx; + // We know what is the min cost (without final costs) + // we just need to have the index of one token with that min cost + + if (use_final_probs) { + const CostType final_cost = + cst_dev_params.d_fst_final_costs[token_state]; + IntegerCostType int_cost_with_final = + floatToOrderedInt(cost + final_cost); + if (final_cost != fst_zero) { + int2 min_and_arg = {int_cost_with_final, + global_idx}; // sort by cost, put it first + atomicMinI2(&channel_counters->min_int_cost_and_arg_with_final, + min_and_arg); + } + } + } + } +} + +// Step2: Now that step1 found the min_cost (with and without final cost) +// If at least one final token (token associated with a final fst state) +// exists in the token queue, AND if use_final_probs is true, +// We can detect all tokens with a cost within [min_cost;min_cost+lattice_beam] +// and list them into d_list_final_tokens_in_main_q +__global__ void get_best_cost_step2_kernel(DeviceParams cst_dev_params, + KernelParams params, + bool use_final_probs, + CostType fst_zero) { + const int nlanes = params.nlanes_used; + KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) { + LaneCounters *lane_counters = cst_dev_params.d_lanes_counters.lane(ilane); + const int32 ichannel = lane_counters->channel_to_compute; + const ChannelCounters *channel_counters = + cst_dev_params.d_channels_counters.channel(ichannel); + const int32 main_q_end = channel_counters->prev_main_q_narcs_and_end.y; + const int32 global_offset = channel_counters->prev_main_q_global_offset; + const int2 min_int_cost_and_arg_with_final = + channel_counters->min_int_cost_and_arg_with_final; + const int2 min_int_cost_and_arg_without_final = + channel_counters->min_int_cost_and_arg_without_final; + bool has_reached_final = (min_int_cost_and_arg_with_final.x != INT_MAX); + // Use final if we want to use final (use_final_probs is true) and if we + // found a final state in the token list + bool compute_final = use_final_probs && has_reached_final; + IntegerCostType min_cost_to_use = + compute_final ? 
min_int_cost_and_arg_with_final.x + : min_int_cost_and_arg_without_final.x; + + // if token.cost < lattice_cutoff, that token will belong in the output + // lattice + CostType lattice_cutoff = + orderedIntToFloat(min_cost_to_use) + cst_dev_params.lattice_beam; + IntegerCostType lattice_int_cutoff = floatToOrderedInt(lattice_cutoff); + KALDI_CUDA_DECODER_1D_KERNEL_LOOP(idx, main_q_end) { + // First thread of each lane will move the results into lane counters. + // That's because we never move channel counters back to host, + // so we move those values to the lane counters, and those lane counters + // will be moved to host after this kernel + if (idx == 0) { + // The lane counters will be copied to host + lane_counters->min_int_cost_and_arg = + compute_final ? min_int_cost_and_arg_with_final + : min_int_cost_and_arg_without_final; + lane_counters->has_reached_final = has_reached_final; + } + // Looking for a token with its int_cost < lattice_int_cutoff + const int2 both = + cst_dev_params.d_main_q_state_and_cost.channel(ichannel)[idx]; + const int32 token_state = both.x; + int32 token_int_cost = both.y; + if (compute_final) { + const CostType final_cost = + cst_dev_params.d_fst_final_costs[token_state]; + const CostType token_cost = orderedIntToFloat(token_int_cost); + // final_cost == fst_zero -> this state is not final + token_int_cost = (final_cost != fst_zero) + ? floatToOrderedInt(token_cost + final_cost) + : INT_MAX; + } + if (token_int_cost < lattice_int_cutoff) { + // That token will be included in the lattice (last frame) + // save it + int list_idx = atomicAdd(&lane_counters->n_within_lattice_beam, 1); + cst_dev_params.h_list_final_tokens_in_main_q.lane(ilane)[list_idx] = { + global_offset + idx, token_int_cost}; + } + } + } +} +__global__ void get_best_cost_step3_kernel(DeviceParams cst_dev_params, + KernelParams params) { + const int nlanes = params.nlanes_used; + KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) { + LaneCounters *d_lanes_counters = + cst_dev_params.d_lanes_counters.lane(ilane); + LaneCounters *h_lanes_counters = + cst_dev_params.h_lanes_counters.lane(ilane); + h_lanes_counters->min_int_cost_and_arg = + d_lanes_counters->min_int_cost_and_arg; + h_lanes_counters->has_reached_final = d_lanes_counters->has_reached_final; + h_lanes_counters->n_within_lattice_beam = + d_lanes_counters->n_within_lattice_beam; + } +} +// compute_costs_histogram_kernel +// Used in ApplyMaxActiveAndReduceBeam +// Compute the histogram of the token.cost in the main_q +__global__ void compute_costs_histogram_kernel(DeviceParams cst_dev_params, + KernelParams params, + bool use_aux_q) { + const int nlanes = params.nlanes_used; + typedef cub::BlockHistogram + BlockHistogram; + __shared__ typename BlockHistogram::TempStorage temp_storage; + __shared__ unsigned int smem_histogram[KALDI_CUDA_DECODER_HISTO_NBINS + 1]; + + KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) { + LaneCounters *lane_counters = cst_dev_params.d_lanes_counters.lane(ilane); + const int32 ichannel = lane_counters->channel_to_compute; + const int32 q_end = use_aux_q ? 
lane_counters->post_expand_aux_q_end + : lane_counters->main_q_narcs_and_end.y; + bool compute_max_active = lane_counters->compute_max_active; + if (!compute_max_active) { + if (q_end <= cst_dev_params.max_active) continue; // nothing to do + // Otherwise let's turn max active on for this frame and lane + lane_counters->compute_max_active = true; + } + + // Reset local histogram for this lane + BlockHistogram(temp_storage).InitHistogram(smem_histogram); + CostType min_histo_cost = lane_counters->min_histo_cost; + CostType max_histo_cost = lane_counters->max_histo_cost; + CostType bin_width = lane_counters->histo_bin_width; + + // We have a sync inside the loop, keeping all threads alive + KALDI_CUDA_DECODER_1D_BLOCK_OFFSET_KERNEL_LOOP(block_offset, thread_idx, + q_end) { + const int32 q_idx = block_offset + thread_idx; + // The last bin is for everything we don't want to count: + // cost already above the beam, or non-valid tokens + // It is the default bin + BinId bin_id[1]; + bin_id[0] = KALDI_CUDA_DECODER_HISTO_NBINS; + if (q_idx < q_end) { + IntegerCostType int_cost = + use_aux_q + ? cst_dev_params.d_aux_q_state_and_cost.lane(ilane)[q_idx].y + : cst_dev_params.d_main_q_state_and_cost + .channel(ichannel)[q_idx] + .y; + CostType cost = orderedIntToFloat(int_cost); + CostType extra = cost - min_histo_cost; + if(extra <= 0.0f) + bin_id[0] = 0; + else if (extra < max_histo_cost) { + bin_id[0] = (BinId)__fdiv_rd(extra, bin_width)+1; // +1 because first bin is cost < min_histo_cost + } + } + BlockHistogram(temp_storage).Composite(bin_id, smem_histogram); // sync + __syncthreads(); // reusing temp_storage + } + + // Not using the macros 1D_LOOP because that loop is only within a CTA + for (int32 bin_id_w = threadIdx.x; + bin_id_w < KALDI_CUDA_DECODER_HISTO_NBINS; + bin_id_w += KALDI_CUDA_DECODER_1D_BLOCK) { + // Writing the local histo to global + // We don't care about the last bin (cf above) + int32 s_count = (int32)smem_histogram[bin_id_w]; + atomicAdd(&cst_dev_params.d_histograms.lane(ilane)[bin_id_w], s_count); + } + // Making sure we're done reading from smem + __syncthreads(); + } +} + +// update_beam_using_histogram_kernel +// used in ApplyMaxActiveAndReduceBeam +// uses the histogram computed in compute_costs_histogram_kernel +// to find where to cut (where to set the beam) +// to keep only ~max_active_ tokens. +// Important: use only one CTA per lane +__global__ void update_beam_using_histogram_kernel(DeviceParams cst_dev_params, + KernelParams params, + bool use_aux_q) { + typedef cub::BlockScan BlockScan; + __shared__ typename BlockScan::TempStorage temp_storage; + + const int nlanes = params.nlanes_used; + const int max_active = cst_dev_params.max_active; + KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) { + LaneCounters *lane_counters = cst_dev_params.d_lanes_counters.lane(ilane); + bool compute_max_active = lane_counters->compute_max_active; + if (!compute_max_active) continue; // nothing to do + CostType beam = orderedIntToFloat(lane_counters->int_beam); + CostType min_histo_cost = lane_counters->min_histo_cost; + CostType bin_width = lane_counters->histo_bin_width; + // We now have our histogram of the token costs (computed in the previous + // kernel) + // Each thread i is responsible for a bin i, with that bin containing ni + // tokens. 
+ // We compute the prefix sum of those ni, ending up for each thread with + // s_i = sum[j=1..i](n_j) + // If thread i detects that s_i < max_active_ and s_{i+1} >= max_active_, + // then we will cut the beam at + // the cost of bin [i+1] + // + // Assert: one thread in a CTA is responsible for at most one bin, + // so we will not iterate over bins + assert(KALDI_CUDA_DECODER_HISTO_NBINS < KALDI_CUDA_DECODER_1D_BLOCK); + int bin_id = threadIdx.x; + int val = 0; + if (bin_id < KALDI_CUDA_DECODER_HISTO_NBINS) + val = cst_dev_params.d_histograms.lane(ilane)[bin_id]; + + int prefix_sum; + BlockScan(temp_storage).ExclusiveSum(val, prefix_sum); + + if (prefix_sum < max_active && (prefix_sum + val) >= max_active) { + // We found our new beam relative to min_histo_cost. + // However, the current min_cost could be lower than min_histo_cost, + // so we need to add that difference to the new beam + CostType new_beam_for_histo_min_cost = bin_width * bin_id; + CostType current_min_cost = orderedIntToFloat(lane_counters->min_int_cost); + CostType new_beam = (min_histo_cost - current_min_cost) + new_beam_for_histo_min_cost; + IntegerCostType new_int_beam = floatToOrderedInt(new_beam); + // Saving our new beam for this lane + lane_counters->int_beam = new_int_beam; + lane_counters->adaptive_int_beam_with_validity_index.x = new_int_beam; + lane_counters->int_cutoff = floatToOrderedInt(current_min_cost + new_beam); + } + } +} + +// +// PostProcessingMainQueue kernels. +// All the following kernels are called when postprocessing a frame. +// + +// Filling hashmap values with the tokens that we have in the main queue. +// We do that because multiple tokens associated with the same FST state +// (but with different arc_idx) can exist in the main_q. We need to detect +// that situation, count them, and determine the min_cost for that FST state +// (a host-side sketch of this aggregation follows).
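// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the decoder): a host-side equivalent of the
// per-state aggregation that fill_hashmap_with_main_q_kernel performs with its
// device hashmap. For every FST state present in the main_q we keep
// (count, min cost, argmin index). The names StateAggregate and
// AggregateByState are hypothetical; <unordered_map>, <vector> and <climits>
// are assumed to be available, and int2 comes from the CUDA vector types
// already used throughout this file.
struct StateAggregate {
  int32 count = 0;               // how many tokens share this FST state
  int32 min_int_cost = INT_MAX;  // lowest cost seen for this state
  int32 argmin_main_q_idx = -1;  // main_q index of a token reaching that cost
};

inline std::unordered_map<int32, StateAggregate> AggregateByState(
    const std::vector<int2> &main_q_state_and_cost) {
  std::unordered_map<int32, StateAggregate> per_state;
  for (int32 main_q_idx = 0;
       main_q_idx < static_cast<int32>(main_q_state_and_cost.size());
       ++main_q_idx) {
    const int32 state = main_q_state_and_cost[main_q_idx].x;
    const int32 int_cost = main_q_state_and_cost[main_q_idx].y;
    StateAggregate &agg = per_state[state];  // insert-or-aggregate
    ++agg.count;
    if (int_cost < agg.min_int_cost) {
      agg.min_int_cost = int_cost;
      agg.argmin_main_q_idx = main_q_idx;
    }
  }
  return per_state;
}
// ---------------------------------------------------------------------------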
+// It is done using a hashmap +__global__ void fill_hashmap_with_main_q_kernel(DeviceParams cst_dev_params, + KernelParams params) { + // Operator for the prefix sum inside the CUDA block + const int nlanes = params.nlanes_used; + KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) { + LaneCounters *lane_counters = cst_dev_params.d_lanes_counters.lane(ilane); + const int32 ichannel = lane_counters->channel_to_compute; + ChannelCounters *channel_counters = + cst_dev_params.d_channels_counters.channel(ichannel); + + const int32 main_q_end = lane_counters->main_q_narcs_and_end.y; + int32 min_int_cost = lane_counters->min_int_cost; + CostType min_cost = orderedIntToFloat(min_int_cost); + const int32 global_offset = channel_counters->prev_main_q_global_offset; + KALDI_CUDA_DECODER_1D_KERNEL_LOOP(main_q_idx, main_q_end) { + // Position of considered token in the main_q + if (main_q_idx < main_q_end) { + int2 both = cst_dev_params.d_main_q_state_and_cost.channel( + ichannel)[main_q_idx]; + StateId token_state = both.x; + IntegerCostType token_int_cost = both.y; + if (min_int_cost == token_int_cost) { + // remove offset = min_cost, set it to 0 explicitely + token_int_cost = floatToOrderedInt(0.0f); + channel_counters->min_int_cost_and_arg_without_final = { + token_int_cost, global_offset + main_q_idx}; + lane_counters->prev_arg_min_int_cost = main_q_idx; + } else { + // remove offset = min_cost + CostType token_cost = orderedIntToFloat(token_int_cost) - min_cost; + token_int_cost = floatToOrderedInt(token_cost); + } + int local_idx, hash_idx; + hashmap_insert_or_aggregate(cst_dev_params.d_hashmap_values.lane(ilane), + token_state, token_int_cost, main_q_idx, + cst_dev_params.hashmap_capacity, &local_idx, + &hash_idx); + cst_dev_params.d_main_q_n_extra_prev_tokens_local_idx.lane( + ilane)[main_q_idx] = local_idx; + cst_dev_params.d_main_q_state_and_cost.channel(ichannel)[main_q_idx].y = + token_int_cost; + // If we have the min, saving its index for get best cost and the min + // cost estimate of the next frame + + // Saving where that token.state ended up in the hashmap + // false = this token is not the representative of this state + // We will update representing_state once we know more (in the next + // kernel) + // We first need to add all tokens to the hashmap. Which will be the + // case when + // this kernel returns. + SetFSTStateHashIndex( + hash_idx, false, + &cst_dev_params.d_main_q_state_hash_idx.lane(ilane)[main_q_idx]); + } + + if (main_q_idx == 0) { + lane_counters->int_cutoff = floatToOrderedInt( + orderedIntToFloat(lane_counters->int_cutoff) - min_cost); + } + } + } +} + +// preprocess_and_list_extra_prev_tokens_kernel_step[i] kernels +// Called in PostProcessingMainQueue +// They do two things: +// - do the "emitting preprocessing". I.e. doing the preprocessing necessary for +// the future ExpandArcsEmitting that may be done next (if the current frame is +// not the last one) +// It consists of filling the d_main_q_degrees_prefix_sum of the emitting arc +// degrees of the tokens + setting d_main_q_arc_offsets +// - when we have multiple tokens associated with the same FST state S, we will +// list them in d_main_q_extra_prev_tokens. We need to know where to put them in +// that array, +// so we'll compute a prefix_sum also to compute those indexes. 
We'll then save +// the location of each extra tokens list (its offset and size in +// d_main_q_extra_prev_tokens), +// and save it into d_main_q_info for later lattice processing +// +// First step : Reading the hashmap, detecting which token is representative for +// each FST state, which is decided by fill_hashmap_with_main_q_kernel() +// (we pick one of the best ones, with the best ones being the ones with the +// lowest cost) +// this representative will be responsible for K tokens, with K being the number +// of tokens associated with that FST state. We only considers the cases where K +// > 1, +// because if K == 1, then we will not store that token in the special list +// d_main_q_extra_prev_tokens +// Each representative is also the only token that will propagate emitting arcs +// for that FST state. Because a representative has the min_cost for that FST +// state, it is enough to only propagate +// that one +// Each representative counts the number of emitting arcs it is responsible for, +// and we will compute the prefix sum of the arc degrees +__global__ void emitting_preprocess_and_list_extra_prev_tokens_step1_kernel( + DeviceParams cst_dev_params, KernelParams params) { + // Operator for the prefix sum inside the CUDA block + typedef cub::BlockScan BlockScan; + __shared__ typename BlockScan::TempStorage sh_temp_storage; + const int nlanes = params.nlanes_used; + KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) { + const LaneCounters *lane_counters = + cst_dev_params.d_lanes_counters.lane(ilane); + const int32 main_q_end = lane_counters->main_q_narcs_and_end.y; + // Final cutoff from last ExpandArc execution + // The cutoff can have decreased since moving tokens to the main_q + // min_cost cannot be lower than before (we only did non-emitting phases + // since then) + // but the adaptive beam may have lowered the beam + const IntegerCostType int_cutoff = lane_counters->int_cutoff; + // Keeping all threads in CTA alive + // We'll __syncthreads() + KALDI_CUDA_DECODER_1D_BLOCK_OFFSET_KERNEL_LOOP(block_offset, thread_idx, + main_q_end) { + // We'll take care of the token at index main_q_idx + const int32 main_q_idx = block_offset + thread_idx; + const int32 ichannel = lane_counters->channel_to_compute; + // If that token is the representative of its FST state (token.next_state) + // The representative of a FST state is the token with the lowest + // token.cost for that FST state + // If multiple tokens have token1.cost == token2.cost == + // min_cost_for_that_state, then one is picked (first come first serve, + // was done in fill_hashmap_with_main_q_kernel) + bool representing_state = false; + // Number of emitting arcs for that token + // Only the token representative of that FST state can have degree > 0 + int32 degree = 0; + // If that token is representative of a FST state S, + // and if multiple tokens are associated with that state S, + // then n_extra_prev_token will contain their count + int32 n_extra_prev_token = 0; + if (main_q_idx < main_q_end) { + int2 both = cst_dev_params.d_main_q_state_and_cost.channel( + ichannel)[main_q_idx]; + StateId token_state = both.x; + IntegerCostType token_int_cost = both.y; + // Loading info about token.next_state. Is there multiple tokens for + // that state ? + // How many ? What's the min token.cost for that state ? + int32 hash_idx; // we saved the hash_idx after inserting + bool bool_buffer; // will always be false. 
We just need it to call the + // function + GetFSTStateHashIndex( + cst_dev_params.d_main_q_state_hash_idx.lane(ilane)[main_q_idx], + &hash_idx, &bool_buffer); + HashmapValueT h_val = + cst_dev_params.d_hashmap_values.lane(ilane)[hash_idx]; + // Token index of one of the token which the lowest token.cost for that + // state + uint32_t state_best_int_cost_argmin; + GetArgFromPackedArgminUInt64(h_val.min_and_argmin_int_cost_u64, &state_best_int_cost_argmin); + + // Checking if we're the representative of that state + representing_state = (main_q_idx == state_best_int_cost_argmin); + // Saving the hash_idx of that fst state + if we're responsible for that + // state + SetFSTStateHashIndex( + hash_idx, representing_state, + &cst_dev_params.d_main_q_state_hash_idx.lane(ilane)[main_q_idx]); + + // One of the best token for that state will represent that state in the + // next frame + if (representing_state) { + if (token_int_cost < int_cutoff) { + // Next step is emitting (next frame), using emitting offsets + const int32 start = cst_dev_params.d_arc_e_offsets[token_state]; + const int32 end = cst_dev_params.d_arc_e_offsets[token_state + 1]; + degree = end - start; + // Saving the start offset for the expand kernel + // avoid a new random memory access + cst_dev_params.d_main_q_arc_offsets.channel(ichannel)[main_q_idx] = + start; + } + // If that FST state has only one token associated to it, we store + // that token directly in + // d_main_q_info (its original place) + // We only move it into the d_main_q_extra_prev_tokens list if + // multiple tokens are associated to that state + n_extra_prev_token = (h_val.count > 1) ? (h_val.count) : 0; + } + } + + // Computing a local prefix sum inside that CUDA block + // Others kernels will take care of adding the necessary offset to those + // local prefix sums + int2 zeroi2 = {0, 0}; + int2 vali2 = {degree, n_extra_prev_token}; + int2 aggi2; + BlockScan(sh_temp_storage) + .ExclusiveScan(vali2, aggi2, zeroi2, PlusPlus()); + int32 degree_local_prefix_sum = aggi2.x; + int32 n_extra_prev_token_prefix_sum = aggi2.y; + + if (main_q_idx < main_q_end) { + // This is not the final global prefix sum + // Other kernels will add the necessary offset + cst_dev_params.d_main_q_degrees_prefix_sum.channel( + ichannel)[main_q_idx] = degree_local_prefix_sum; + cst_dev_params.d_main_q_extra_prev_tokens_prefix_sum.lane( + ilane)[main_q_idx] = n_extra_prev_token_prefix_sum; + } + + if (KALDI_CUDA_DECODER_IS_LAST_1D_THREAD()) { + // Saving the local sum of degrees of that CUDA block + // That's necessary to compute the global offset of that CUDA block, + // and that offset is what we need to transform the local prefix sum + // into a global prefix sum + const int local_sum_index = block_offset / KALDI_CUDA_DECODER_1D_BLOCK; + // the prefix sum was exclusive, adding missing value + const int degree_inclusive_sum = degree_local_prefix_sum + degree; + const int n_extra_prev_tokens_inclusive_sum = + n_extra_prev_token_prefix_sum + n_extra_prev_token; + cst_dev_params.d_main_q_block_sums_prefix_sum.lane( + ilane)[local_sum_index] = {degree_inclusive_sum, + n_extra_prev_tokens_inclusive_sum}; + } + + // Synchronization because: + // - we may need to reuse sh_temp_storage if the for loop iterates (cf + // CUB's doc) + __syncthreads(); + } + } +} + +// In step1, we've computed the local (CTA-wide) prefix sums. We also have the +// local sums of each individual CTAs +// In this kernel, we will compute the offset of each CTA in the global prefix +// sum. 
We will then add those offsets in step3. +// Only one CTA / lane +__global__ void emitting_preprocess_and_list_extra_prev_tokens_step2_kernel( + DeviceParams cst_dev_params, KernelParams params) { + typedef cub::BlockScan BlockScan; + __shared__ typename BlockScan::TempStorage sh_temp_storage; + const int nlanes = params.nlanes_used; + KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) { + LaneCounters *lane_counters = cst_dev_params.d_lanes_counters.lane(ilane); + const int main_q_end = lane_counters->main_q_narcs_and_end.y; + const int ntiles = KALDI_CUDA_DECODER_DIV_ROUND_UP( + main_q_end, KALDI_CUDA_DECODER_1D_BLOCK); + // Using block_offset loop to keep entire CTA alive (we're going to use + // __syncthreads in CUB) + int2 sum_so_far = {0, 0}; + KALDI_CUDA_DECODER_1D_BLOCK_OFFSET_KERNEL_LOOP(offset, thread_idx, ntiles) { + const int32 itile = offset + thread_idx; + const int2 zeroi2 = {0, 0}; + const int2 val = + (itile < ntiles) + ? cst_dev_params.d_main_q_block_sums_prefix_sum.lane(ilane)[itile] + : zeroi2; + + int2 prefix_sum, sum; + BlockScan(sh_temp_storage) + .ExclusiveScan(val, prefix_sum, zeroi2, PlusPlus(), sum); + PlusPlus pp; + prefix_sum = pp(prefix_sum, sum_so_far); + sum_so_far = pp(sum_so_far, sum); + if (itile < ntiles) { + cst_dev_params.d_main_q_block_sums_prefix_sum.lane(ilane)[itile] = + prefix_sum; + } + if (itile == (ntiles - 1)) { + const int32 total_narcs = prefix_sum.x + val.x; + const int32 total_n_extra_prev_tokens = prefix_sum.y + val.y; + lane_counters->main_q_narcs_and_end.x = total_narcs; + lane_counters->main_q_n_extra_prev_tokens = total_n_extra_prev_tokens; + assert(total_n_extra_prev_tokens >= 0 && + total_n_extra_prev_tokens <= main_q_end); + } + } + } +} + +// Step3: Uses the CTA offsets computed in step2 to transform the CTA-wide +// prefix sums into global prefix sums. +// The representative of each FST state saves into the hashmap the location of +// the extra_prev_tokens list of that state +// in d_main_q_extra_prev_tokens. That way each extra token knows where to +// write itself in the next kernel. A host-side sketch of this three-step +// prefix-sum pattern follows.
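// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the decoder): a sequential, host-side
// version of the prefix-sum pattern implemented by the step1/step2/step3
// kernels around here. The step1 equivalent computes block-local exclusive
// prefix sums and records each block's total, the step2 equivalent turns
// those totals into per-block offsets, and the step3 equivalent adds each
// block's offset back to its local sums to obtain the global prefix sum.
// The function name is hypothetical; <vector> and <algorithm> are assumed.
inline void BlockedExclusivePrefixSum(std::vector<int32> *values,
                                      int32 block_size) {
  std::vector<int32> &v = *values;
  const int32 n = static_cast<int32>(v.size());
  const int32 nblocks = (n + block_size - 1) / block_size;
  std::vector<int32> block_offsets(nblocks, 0);
  // Step1 equivalent: exclusive prefix sum inside each block; keep the totals.
  for (int32 b = 0; b < nblocks; ++b) {
    int32 sum = 0;
    const int32 end = std::min(n, (b + 1) * block_size);
    for (int32 i = b * block_size; i < end; ++i) {
      const int32 val = v[i];
      v[i] = sum;  // block-local exclusive prefix sum
      sum += val;
    }
    block_offsets[b] = sum;  // this block's total
  }
  // Step2 equivalent: exclusive prefix sum over the block totals.
  int32 offset = 0;
  for (int32 b = 0; b < nblocks; ++b) {
    const int32 total = block_offsets[b];
    block_offsets[b] = offset;
    offset += total;
  }
  // Step3 equivalent: add each block's offset to its local prefix sums.
  for (int32 b = 0; b < nblocks; ++b) {
    const int32 end = std::min(n, (b + 1) * block_size);
    for (int32 i = b * block_size; i < end; ++i) v[i] += block_offsets[b];
  }
}
// ---------------------------------------------------------------------------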
+__global__ void emitting_preprocess_and_list_extra_prev_tokens_step3_kernel( + DeviceParams cst_dev_params, KernelParams params) { + const int nlanes = params.nlanes_used; + KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) { + const LaneCounters *lane_counters = + cst_dev_params.d_lanes_counters.lane(ilane); + const int32 ichannel = lane_counters->channel_to_compute; + const int main_q_end = lane_counters->main_q_narcs_and_end.y; + KALDI_CUDA_DECODER_1D_KERNEL_LOOP(main_q_idx, main_q_end) { + const int32 local_sum_idx = main_q_idx / KALDI_CUDA_DECODER_1D_BLOCK; + const int2 local_sum_offset = + cst_dev_params.d_main_q_block_sums_prefix_sum.lane( + ilane)[local_sum_idx]; + cst_dev_params.d_main_q_degrees_prefix_sum.channel( + ichannel)[main_q_idx] += local_sum_offset.x; + int extra_prev_tokens_offset = + cst_dev_params.d_main_q_extra_prev_tokens_prefix_sum.lane( + ilane)[main_q_idx] + + local_sum_offset.y; + // Loading the hash index associated with token.state + // If representative, store the location of the extra prev tokens list for + // that state in the hashmap + bool is_representative; + int32 hash_idx; + GetFSTStateHashIndex( + cst_dev_params.d_main_q_state_hash_idx.lane(ilane)[main_q_idx], + &hash_idx, &is_representative); + if (is_representative) { + HashmapValueT &val = + cst_dev_params.d_hashmap_values.lane( + ilane)[hash_idx]; + uint32_t min; + GetMinFromPackedArgminUInt64( + val.min_and_argmin_int_cost_u64, &min); + unsigned long long new_pack; + PackArgminInUInt64(min, extra_prev_tokens_offset, + &new_pack); + val.min_and_argmin_int_cost_u64 = new_pack; + } + } + } +} + +// Step4: We now know where to store our extra prev tokens in +// d_main_q_extra_prev_tokens. +// We will now move the tokens that need to be moved (when multiple tokens are +// associated with the same FST state) +// into d_main_q_extra_prev_tokens. In d_main_q_info, we will store the location +// of that list [offset,size] +// so that when backtracking, when we read d_main_q_info[token_idx], we know +// where to find the list +// of same-state tokens. +// It is the last step of the +// emitting_preprocess_and_list_extra_prev_tokens_step[i]_kernel pipeline. +__global__ void emitting_preprocess_and_list_extra_prev_tokens_step4_kernel( + DeviceParams cst_dev_params, KernelParams params) { + const int nlanes = params.nlanes_used; + KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) { + const LaneCounters *lane_counters = + cst_dev_params.d_lanes_counters.lane(ilane); + const int32 ichannel = lane_counters->channel_to_compute; + const int main_q_end = lane_counters->main_q_narcs_and_end.y; + // Previous frames have filled d_main_q_extra_prev_tokens. + // d_main_q_extra_prev_tokens was then flushed to host. We want to set the + // global + // (global in the sense "for all frames") offset at which to read it in + // h_all_tokens_extra_prev_tokens_ on the host,
+ // adding the main_q_extra_prev_tokens_global_offset for that + const int prev_global_idx = + lane_counters->main_q_extra_prev_tokens_global_offset; + KALDI_CUDA_DECODER_1D_KERNEL_LOOP(main_q_idx, main_q_end) { + // We'll take care of token at main_q_idx + // Loading hashmap information about token.state + bool is_representative; + int32 hash_idx; + GetFSTStateHashIndex( + cst_dev_params.d_main_q_state_hash_idx.lane(ilane)[main_q_idx], + &hash_idx, &is_representative); + + HashmapValueT val = cst_dev_params.d_hashmap_values.lane(ilane)[hash_idx]; + // How many tokens are associated with that fst state token.state + int same_count = val.count; + bool must_move_to_extra_prev_tokens = (same_count > 1); + if (must_move_to_extra_prev_tokens) { + // Moving to the extra_prev_tokens list. + // Some of those tokens have an extra cost (compared to the best cost + // for that FST state) + // Generating and saving that extra cost. We will use it when generating + // the lattice. + CostType token_cost = orderedIntToFloat( + cst_dev_params.d_main_q_state_and_cost.channel(ichannel)[main_q_idx] + .y); + uint32_t best_int_cost; + // Where to write this state list in d_main_q_extra_prev_tokens + uint32_t extra_prev_tokens_offset; + unsigned long long pack = val.min_and_argmin_int_cost_u64; + GetMinFromPackedArgminUInt64(pack, &best_int_cost); + GetArgFromPackedArgminUInt64(pack, &extra_prev_tokens_offset); + CostType best_cost = orderedIntToFloat((int)best_int_cost); + CostType extra_cost = token_cost - best_cost; + assert(!is_representative || extra_cost == 0.0f); + // Loading the token to be moved + InfoToken inf_tok = + cst_dev_params.d_main_q_info.lane(ilane)[main_q_idx]; + CostType acoustic_cost = + cst_dev_params.d_main_q_acoustic_cost.lane(ilane)[main_q_idx]; + // Place of that specific token in the extra_prev_tokens sublist of that + // specific FST state + int32 local_idx = + cst_dev_params.d_main_q_n_extra_prev_tokens_local_idx.lane( + ilane)[main_q_idx]; + // Saving the location of the extra prev tokens for that state into that + // InfoToken + SetSameFSTStateTokensList( + prev_global_idx + extra_prev_tokens_offset, same_count, + &cst_dev_params.d_main_q_info.lane(ilane)[main_q_idx]); + // Where to write this token in d_main_q_extra_prev_tokens + int32 list_idx = extra_prev_tokens_offset + local_idx; + // Moving token. 
Also saving extra_cost + cst_dev_params.d_main_q_extra_prev_tokens.lane(ilane)[list_idx] = + inf_tok; + cst_dev_params.d_main_q_extra_and_acoustic_cost.lane( + ilane)[list_idx] = {extra_cost, acoustic_cost}; + assert(inf_tok.prev_token >= (lane_counters->main_q_global_offset - + cst_dev_params.main_q_capacity) && + inf_tok.prev_token <= + (lane_counters->main_q_global_offset + main_q_end)); + } + } + } +} + +// Clear the hashmaps after use +// Each element in the map has a representative in the main_q +// Everyone of those representatives has the responsability to reset their +// corresponding value in the hashmap +// Once this kernel returns, the hashmaps are cleared +__global__ void clear_hashmap_kernel(DeviceParams cst_dev_params, + KernelParams params) { + const int nlanes = params.nlanes_used; + KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) { + LaneCounters *lane_counters = cst_dev_params.d_lanes_counters.lane(ilane); + const int main_q_end = lane_counters->main_q_narcs_and_end.y; + KALDI_CUDA_DECODER_1D_KERNEL_LOOP(main_q_idx, main_q_end) { + bool is_representative; + int32 hash_idx; + GetFSTStateHashIndex( + cst_dev_params.d_main_q_state_hash_idx.lane(ilane)[main_q_idx], + &hash_idx, &is_representative); + // Representative owns a state. Each representative resets its associated + // token.state + // in the hashmap + if (is_representative) { + cst_dev_params.d_hashmap_values.lane(ilane)[hash_idx] = + KALDI_CUDA_DECODER_HASHMAP_NO_VAL; // clear + } + } + } +} + +// Kernels wrappers + +void SaveChannelsStateFromLanesKernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + const DeviceParams &cst_dev_params, + const KernelParams &kernel_params) { + save_channels_state_from_lanes_kernel<<>>(cst_dev_params, + kernel_params); + KALDI_DECODER_CUDA_CHECK_ERROR(); +} + +void LoadChannelsStateInLanesKernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + const DeviceParams &cst_dev_params, + const KernelParams &kernel_params) { + load_channels_state_in_lanes_kernel<<>>(cst_dev_params, + kernel_params); + KALDI_DECODER_CUDA_CHECK_ERROR(); +} + +void InitDecodingOnDeviceKernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + const DeviceParams &cst_dev_params, + const KernelParams &kernel_params) { + init_decoding_on_device_kernel<<>>(cst_dev_params, + kernel_params); + KALDI_DECODER_CUDA_CHECK_ERROR(); +} + +void InitializeInitialLaneKernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + const DeviceParams &cst_dev_params) { + initialize_initial_lane_kernel<<>>(cst_dev_params); + KALDI_DECODER_CUDA_CHECK_ERROR(); +} + +void ResetForFrameAndEstimateCutoffKernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + const DeviceParams &cst_dev_params, + const KernelParams &kernel_params) { + reset_for_frame_and_estimate_cutoff_kernel<<>>( + cst_dev_params, kernel_params); +} + +template +void ExpandArcsKernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + const DeviceParams &cst_dev_params, + const KernelParams &kernel_params) { + expand_arcs_kernel<<>>(cst_dev_params, + kernel_params); + KALDI_DECODER_CUDA_CHECK_ERROR(); +} + +template +void PostExpandKernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + const DeviceParams &cst_dev_params, + const KernelParams &kernel_params) { + post_expand_kernel<<>>(cst_dev_params, + kernel_params); + KALDI_DECODER_CUDA_CHECK_ERROR(); +} + +void PostContractAndPreprocessKernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + 
const DeviceParams &cst_dev_params, + const KernelParams &kernel_params) { + post_contract_and_preprocess_kernel<<>>(cst_dev_params, + kernel_params); + KALDI_DECODER_CUDA_CHECK_ERROR(); +} + +void NonEmittingPreprocessAndContractKernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + const DeviceParams &cst_dev_params, + const KernelParams &kernel_params) { + nonemitting_preprocess_and_contract_kernel<<>>( + cst_dev_params, kernel_params); + KALDI_DECODER_CUDA_CHECK_ERROR(); +} + +void FillHashmapWithMainQKernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + const DeviceParams &cst_dev_params, + const KernelParams &kernel_params) { + fill_hashmap_with_main_q_kernel<<>>(cst_dev_params, + kernel_params); + KALDI_DECODER_CUDA_CHECK_ERROR(); +} + +void EmittingPreprocessAndListExtraPrevTokensStep1Kernel( + const dim3 &grid, const dim3 &block, const cudaStream_t &st, + const DeviceParams &cst_dev_params, const KernelParams &kernel_params) { + emitting_preprocess_and_list_extra_prev_tokens_step1_kernel<<>>( + cst_dev_params, kernel_params); + KALDI_DECODER_CUDA_CHECK_ERROR(); +} + +void EmittingPreprocessAndListExtraPrevTokensStep2Kernel( + const dim3 &grid, const dim3 &block, const cudaStream_t &st, + const DeviceParams &cst_dev_params, const KernelParams &kernel_params) { + emitting_preprocess_and_list_extra_prev_tokens_step2_kernel<<>>( + cst_dev_params, kernel_params); + KALDI_DECODER_CUDA_CHECK_ERROR(); +} + +void EmittingPreprocessAndListExtraPrevTokensStep3Kernel( + const dim3 &grid, const dim3 &block, const cudaStream_t &st, + const DeviceParams &cst_dev_params, const KernelParams &kernel_params) { + emitting_preprocess_and_list_extra_prev_tokens_step3_kernel<<>>( + cst_dev_params, kernel_params); + KALDI_DECODER_CUDA_CHECK_ERROR(); +} + +void EmittingPreprocessAndListExtraPrevTokensStep4Kernel( + const dim3 &grid, const dim3 &block, const cudaStream_t &st, + const DeviceParams &cst_dev_params, const KernelParams &kernel_params) { + emitting_preprocess_and_list_extra_prev_tokens_step4_kernel<<>>( + cst_dev_params, kernel_params); + KALDI_DECODER_CUDA_CHECK_ERROR(); +} + +void ComputeLaneOffsetsKernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + const DeviceParams &cst_dev_params, + const KernelParams &kernel_params) { + compute_lane_offsets_kernel<<>>(cst_dev_params, + kernel_params); + KALDI_DECODER_CUDA_CHECK_ERROR(); +} + +template +void ConcatenateLanesDataKernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + const DeviceParams &cst_dev_params, + const KernelParams &kernel_params, + const LaneMatrixView &src, T *concat, + int32 *lane_offsets) { + concatenate_lanes_data_kernel<<>>( + cst_dev_params, kernel_params, src, concat, lane_offsets); + KALDI_DECODER_CUDA_CHECK_ERROR(); +} + +void InitHashmapKernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + const DeviceParams &cst_dev_params) { + init_hashmap_kernel<<>>(cst_dev_params); + KALDI_DECODER_CUDA_CHECK_ERROR(); +} + +void ClearHashmapKernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + const DeviceParams &cst_dev_params, + const KernelParams &kernel_params) { + clear_hashmap_kernel<<>>(cst_dev_params, kernel_params); + KALDI_DECODER_CUDA_CHECK_ERROR(); +} + +void ComputeCostsHistogramKernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + const DeviceParams &cst_dev_params, + const KernelParams &kernel_params, + bool use_aux_q) { + compute_costs_histogram_kernel<<>>( + cst_dev_params, kernel_params, 
use_aux_q); + KALDI_DECODER_CUDA_CHECK_ERROR(); +} + +void UpdateBeamUsingHistogramKernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + const DeviceParams &cst_dev_params, + const KernelParams &kernel_params, + bool use_aux_q) { + update_beam_using_histogram_kernel<<>>( + cst_dev_params, kernel_params, use_aux_q); + KALDI_DECODER_CUDA_CHECK_ERROR(); +} + +void FinalizeProcessNonEmittingKernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + const DeviceParams &cst_dev_params, + const KernelParams &kernel_params) { + finalize_process_non_emitting_kernel<<>>(cst_dev_params, + kernel_params); + KALDI_DECODER_CUDA_CHECK_ERROR(); +} + +void GetBestCostStep1Kernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + const DeviceParams &cst_dev_params, + const KernelParams &kernel_params, bool isfinal, + CostType fst_zero) { + get_best_cost_step1_kernel<<>>( + cst_dev_params, kernel_params, isfinal, fst_zero); + KALDI_DECODER_CUDA_CHECK_ERROR(); +} + +void GetBestCostStep2Kernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + const DeviceParams &cst_dev_params, + const KernelParams &kernel_params, bool isfinal, + CostType fst_zero) { + get_best_cost_step2_kernel<<>>( + cst_dev_params, kernel_params, isfinal, fst_zero); + KALDI_DECODER_CUDA_CHECK_ERROR(); +} + +void GetBestCostStep3Kernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + const DeviceParams &cst_dev_params, + const KernelParams &kernel_params) { + get_best_cost_step3_kernel<<>>(cst_dev_params, + kernel_params); + KALDI_DECODER_CUDA_CHECK_ERROR(); +} + +template void ExpandArcsKernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + const DeviceParams &cst_dev_params, + const KernelParams ¶ms); +template void ExpandArcsKernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + const DeviceParams &cst_dev_params, + const KernelParams ¶ms); +template void PostExpandKernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + const DeviceParams &cst_dev_params, + const KernelParams ¶ms); +template void PostExpandKernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + const DeviceParams &cst_dev_params, + const KernelParams ¶ms); + +template void ConcatenateLanesDataKernel( + const dim3 &grid, const dim3 &block, const cudaStream_t &st, + const DeviceParams &cst_dev_params, const KernelParams ¶ms, + const LaneMatrixView &src, InfoToken *concat, + int32 *lane_offsets); + +template void ConcatenateLanesDataKernel( + const dim3 &grid, const dim3 &block, const cudaStream_t &st, + const DeviceParams &cst_dev_params, const KernelParams ¶ms, + const LaneMatrixView &src, CostType *concat, int32 *lane_offsets); + +template void ConcatenateLanesDataKernel( + const dim3 &grid, const dim3 &block, const cudaStream_t &st, + const DeviceParams &cst_dev_params, const KernelParams ¶ms, + const LaneMatrixView &src, float2 *concat, int32 *lane_offsets); + +template void ConcatenateLanesDataKernel( + const dim3 &grid, const dim3 &block, const cudaStream_t &st, + const DeviceParams &cst_dev_params, const KernelParams ¶ms, + const LaneMatrixView &src, int32 *concat, int32 *lane_offsets); + +} // end namespace cuda_decoder +} // end namespace kaldi diff --git a/src/cudadecoder/cuda-decoder-kernels.h b/src/cudadecoder/cuda-decoder-kernels.h new file mode 100644 index 00000000000..c137a98da74 --- /dev/null +++ b/src/cudadecoder/cuda-decoder-kernels.h @@ -0,0 +1,206 @@ +// +// Copyright (c) 2019, NVIDIA CORPORATION. 
All rights reserved. +// Hugo Braun, Justin Luitjens, Ryan Leary +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_CUDA_DECODER_CUDA_DECODER_KERNELS_H_ +#define KALDI_CUDA_DECODER_CUDA_DECODER_KERNELS_H_ + +#include "cudadecoder/cuda-decoder-common.h" +#include "util/stl-utils.h" + +namespace kaldi { +namespace cuda_decoder { + +// DeviceParams contains all top-level const data used by the kernels +// i.e. the data that won't change between kernel calls (such as memory pointers +// to the main_q) +struct DeviceParams { + ChannelMatrixView d_channels_counters; + LaneMatrixView d_lanes_counters; + LaneMatrixView h_lanes_counters; + + ChannelMatrixView d_main_q_state_and_cost; + ChannelMatrixView d_main_q_degrees_prefix_sum; + ChannelMatrixView d_main_q_arc_offsets; + LaneMatrixView d_main_q_acoustic_cost; + LaneMatrixView d_main_q_info; + LaneMatrixView d_aux_q_state_and_cost; + LaneMatrixView d_aux_q_info; + LaneMatrixView d_hashmap_values; + LaneMatrixView h_list_final_tokens_in_main_q; + LaneMatrixView d_main_q_extra_and_acoustic_cost; + LaneMatrixView d_histograms; + LaneMatrixView d_main_q_block_sums_prefix_sum; + LaneMatrixView d_main_q_state_hash_idx; + LaneMatrixView d_main_q_extra_prev_tokens_prefix_sum; + LaneMatrixView d_main_q_n_extra_prev_tokens_local_idx; + LaneMatrixView d_main_q_extra_prev_tokens; + + int32 max_nlanes; + int32 main_q_capacity, aux_q_capacity; + CostType *d_arc_weights; + int32 *d_arc_nextstates; + int32 *d_arc_pdf_ilabels; + uint32 *d_arc_e_offsets; + uint32 *d_arc_ne_offsets; + CostType *d_fst_final_costs; + int32 nstates; + CostType default_beam; + CostType lattice_beam; + int32 init_channel_id; + StateId init_state; + CostType init_cost; + int32 hashmap_capacity; + int32 max_active; + int32 adaptive_beam_static_segment; + int32 adaptive_beam_bin_width; +}; + +// KernelParams contains all the kernels arguments that change between kernel +// calls +struct KernelParams { + int32 nlanes_used; +}; + +// Kernel wrappers +void SaveChannelsStateFromLanesKernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + const DeviceParams &cst_dev_params, + const KernelParams &kernel_params); + +void LoadChannelsStateInLanesKernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + const DeviceParams &cst_dev_params, + const KernelParams &kernel_params); + +void InitDecodingOnDeviceKernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + const DeviceParams &cst_dev_params, + const KernelParams &kernel_params); + +void InitializeInitialLaneKernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + const DeviceParams &cst_dev_params); + +void ResetForFrameAndEstimateCutoffKernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + const DeviceParams &cst_dev_params, + const KernelParams &kernel_params); + +template +void ExpandArcsKernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + const DeviceParams &cst_dev_params, + const KernelParams &kernel_params); + +template 
+void PostExpandKernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + const DeviceParams &cst_dev_params, + const KernelParams &kernel_params); + +void PostContractAndPreprocessKernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + const DeviceParams &cst_dev_params, + const KernelParams &kernel_params); + +void NonEmittingPreprocessAndContractKernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + const DeviceParams &cst_dev_params, + const KernelParams &kernel_params); + +void FillHashmapWithMainQKernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + const DeviceParams &cst_dev_params, + const KernelParams &kernel_params); + +void EmittingPreprocessAndListExtraPrevTokensStep1Kernel( + const dim3 &grid, const dim3 &block, const cudaStream_t &st, + const DeviceParams &cst_dev_params, const KernelParams &kernel_params); + +void EmittingPreprocessAndListExtraPrevTokensStep2Kernel( + const dim3 &grid, const dim3 &block, const cudaStream_t &st, + const DeviceParams &cst_dev_params, const KernelParams &kernel_params); + +void EmittingPreprocessAndListExtraPrevTokensStep3Kernel( + const dim3 &grid, const dim3 &block, const cudaStream_t &st, + const DeviceParams &cst_dev_params, const KernelParams &kernel_params); + +void EmittingPreprocessAndListExtraPrevTokensStep4Kernel( + const dim3 &grid, const dim3 &block, const cudaStream_t &st, + const DeviceParams &cst_dev_params, const KernelParams &kernel_params); + +void ComputeLaneOffsetsKernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + const DeviceParams &cst_dev_params, + const KernelParams &kernel_params); + +template +void ConcatenateLanesDataKernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + const DeviceParams &cst_dev_params, + const KernelParams &kernel_params, + const LaneMatrixView &src, T *concat, + int32 *lane_offsets); + +void InitHashmapKernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + const DeviceParams &cst_dev_params); + +void ClearHashmapKernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + const DeviceParams &cst_dev_params, + const KernelParams &kernel_params); + +void ComputeCostsHistogramKernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + const DeviceParams &cst_dev_params, + const KernelParams &kernel_params, + bool use_aux_q); + +void UpdateBeamUsingHistogramKernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + const DeviceParams &cst_dev_params, + const KernelParams &kernel_params, + bool use_aux_q); + +void FinalizeProcessNonEmittingKernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + const DeviceParams &cst_dev_params, + const KernelParams &kernel_params); + +void GetBestCostStep1Kernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + const DeviceParams &cst_dev_params, + const KernelParams &kernel_params, bool isfinal, + CostType fst_zero); + +void GetBestCostStep2Kernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + const DeviceParams &cst_dev_params, + const KernelParams &kernel_params, bool isfinal, + CostType fst_zero); + +void GetBestCostStep3Kernel(const dim3 &grid, const dim3 &block, + const cudaStream_t &st, + const DeviceParams &cst_dev_params, + const KernelParams &kernel_params); + +typedef unsigned char BinId; + +} // namespace kaldi +} // namespace cuda_decoder + +#endif // KALDI_CUDA_DECODER_CUDA_DECODER_KERNELS_H_ diff --git a/src/cudadecoder/cuda-decoder.cc 
b/src/cudadecoder/cuda-decoder.cc new file mode 100644 index 00000000000..b0bcb100ab8 --- /dev/null +++ b/src/cudadecoder/cuda-decoder.cc @@ -0,0 +1,1814 @@ +// cudadecoder/cuda-decoder.cc +// +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// Hugo Braun, Justin Luitjens, Ryan Leary +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#if HAVE_CUDA == 1 + +#include "cuda-decoder.h" +#include "cuda-decoder-kernels.h" + +#include +#include +#include +#include +#include +#include + +namespace kaldi { +namespace cuda_decoder { +CudaDecoder::CudaDecoder(const CudaFst &fst, const CudaDecoderConfig &config, + int32 nlanes, int32 nchannels) + : fst_(fst), + nlanes_(nlanes), + nchannels_(nchannels), + channel_lock_(nchannels + 1), + extra_cost_min_delta_(0.0f), + thread_pool_(NULL), + n_threads_used_(0), + n_h2h_task_not_done_(0), + n_init_decoding_h2h_task_not_done_(0), + h2h_threads_running_(true) { + ReadConfig(config); + // Static asserts on constants + CheckStaticAsserts(); + // Runtime asserts + KALDI_ASSERT(nlanes > 0); + KALDI_ASSERT(nchannels > 0); + KALDI_ASSERT(nlanes_ <= nchannels_); + // All GPU work in decoder will be sent to compute_st_ + cudaStreamCreate(&compute_st_); + // Copies D2H of tokens for storage on host are done on + // copy_st_, in parallel with compute_st_ + cudaStreamCreate(©_st_); + // For all the allocating/initializing process + // We create a special channel + // containing the exact state a channel should have when starting a new decode + // It contains fst.Start(), the non-emitting tokens created by fst.Start(), + // and all the data used by the decoder. 
+ // When calling InitDecoding() on a new channel, we simply clone this special + // channel into that new channel + ++nchannels_; // adding the special initial channel + init_channel_id_ = nchannels_ - 1; // Using last one as init_channel_params + AllocateHostData(); + AllocateDeviceData(); + AllocateDeviceKernelParams(); + + InitDeviceParams(); + InitHostData(); + InitDeviceData(); + + KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaEventCreate(&nnet3_done_evt_)); + KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaEventCreate(&d2h_copy_acoustic_evt_)); + KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaEventCreate(&d2h_copy_infotoken_evt_)); + KALDI_DECODER_CUDA_API_CHECK_ERROR( + cudaEventCreate(&d2h_copy_extra_prev_tokens_evt_)); + KALDI_DECODER_CUDA_API_CHECK_ERROR( + cudaEventCreate(&concatenated_data_ready_evt_)); + KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaEventCreate(&lane_offsets_ready_evt_)); + + ComputeInitialChannel(); + --nchannels_; // removing the special initial channel from the count + + // Making sure that everything is ready to use + cudaStreamSynchronize(compute_st_); + KALDI_DECODER_CUDA_CHECK_ERROR(); +} + +void CudaDecoder::ReadConfig(const CudaDecoderConfig &cst_config) { + CudaDecoderConfig config = cst_config; // deep copy + // Sets the missing values using other values + config.ComputeConfig(); + default_beam_ = config.default_beam; + lattice_beam_ = config.lattice_beam; + ntokens_pre_allocated_ = config.ntokens_pre_allocated; + max_active_ = config.max_active; + aux_q_capacity_ = config.aux_q_capacity; + main_q_capacity_ = config.main_q_capacity; + + KALDI_ASSERT(default_beam_ >= 0.0f); + KALDI_ASSERT(lattice_beam_ >= 0.0f); + KALDI_ASSERT(ntokens_pre_allocated_ >= 0); + KALDI_ASSERT(max_active_ > 0); + KALDI_ASSERT(main_q_capacity_ > 0); + KALDI_ASSERT(aux_q_capacity_ >= main_q_capacity_); +} + +void CudaDecoder::AllocateDeviceData() { + hashmap_capacity_ = + KALDI_CUDA_DECODER_HASHMAP_CAPACITY_FACTOR * main_q_capacity_; + d_channels_counters_.Resize(nchannels_, 1); + d_lanes_counters_.Resize( + nlanes_ + 1, + 1); // +1 because we sometimes need last+1 value (for offsets) + d_main_q_state_and_cost_.Resize(nchannels_, main_q_capacity_); + d_main_q_info_.Resize(nlanes_, main_q_capacity_); + d_aux_q_state_and_cost_.Resize(nlanes_, aux_q_capacity_); + d_aux_q_info_.Resize(nlanes_, aux_q_capacity_); + d_main_q_degrees_prefix_sum_.Resize(nchannels_, main_q_capacity_); + d_histograms_.Resize(nlanes_, KALDI_CUDA_DECODER_HISTO_NBINS); + d_main_q_extra_prev_tokens_prefix_sum_.Resize(nlanes_, main_q_capacity_); + d_main_q_n_extra_prev_tokens_local_idx_.Resize(nlanes_, main_q_capacity_); + + d_main_q_state_hash_idx_.Resize(nlanes_, main_q_capacity_); + d_main_q_extra_prev_tokens_.Resize(nlanes_, main_q_capacity_); + d_main_q_extra_and_acoustic_cost_.Resize(nlanes_, main_q_capacity_); + d_main_q_block_sums_prefix_sum_.Resize( + nlanes_, KALDI_CUDA_DECODER_DIV_ROUND_UP(main_q_capacity_, + KALDI_CUDA_DECODER_1D_BLOCK) + + 1); + d_main_q_arc_offsets_.Resize(nchannels_, main_q_capacity_); + d_hashmap_values_.Resize(nlanes_, hashmap_capacity_); + d_main_q_acoustic_cost_.Resize(nlanes_, main_q_capacity_); + d_extra_and_acoustic_cost_concat_matrix_.Resize(nlanes_, main_q_capacity_); + d_acoustic_cost_concat_matrix_.Resize(nlanes_, main_q_capacity_); + d_infotoken_concat_matrix_.Resize(nlanes_, main_q_capacity_); + d_extra_prev_tokens_concat_matrix_.Resize(nlanes_, main_q_capacity_); + // Reusing data from aux_q. 
Those two are never used at the same time + // d_list_final_tokens_in_main_q_ is used in GetBestPath. + // the aux_q is used in AdvanceDecoding + h_list_final_tokens_in_main_q_.Resize(nlanes_, main_q_capacity_); + d_extra_prev_tokens_concat_ = d_extra_prev_tokens_concat_matrix_.lane(0); + d_extra_and_acoustic_cost_concat_ = + d_extra_and_acoustic_cost_concat_matrix_.lane(0); + d_acoustic_cost_concat_ = d_acoustic_cost_concat_matrix_.lane(0); + d_infotoken_concat_ = d_infotoken_concat_matrix_.lane(0); +} + +void CudaDecoder::AllocateHostData() { + channel_to_compute_.resize(nlanes_); + KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( + &h_extra_and_acoustic_cost_concat_, + nlanes_ * main_q_capacity_ * sizeof(*h_extra_and_acoustic_cost_concat_))); + KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( + &h_acoustic_cost_concat_, + nlanes_ * main_q_capacity_ * sizeof(*h_acoustic_cost_concat_))); + KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( + &h_extra_prev_tokens_concat_, + nlanes_ * main_q_capacity_ * sizeof(*h_extra_prev_tokens_concat_))); + KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( + &h_infotoken_concat_, + nlanes_ * main_q_capacity_ * sizeof(*h_infotoken_concat_))); + KALDI_DECODER_CUDA_API_CHECK_ERROR( + cudaMallocHost(&h_extra_and_acoustic_cost_concat_tmp_, + nlanes_ * main_q_capacity_ * + sizeof(*h_extra_and_acoustic_cost_concat_tmp_))); + KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( + &h_acoustic_cost_concat_tmp_, + nlanes_ * main_q_capacity_ * sizeof(*h_acoustic_cost_concat_tmp_))); + KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( + &h_extra_prev_tokens_concat_tmp_, + nlanes_ * main_q_capacity_ * sizeof(*h_extra_prev_tokens_concat_tmp_))); + KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( + &h_infotoken_concat_tmp_, + nlanes_ * main_q_capacity_ * sizeof(*h_infotoken_concat_tmp_))); + h_lanes_counters_.Resize( + nlanes_ + 1, + 1); // +1 because we sometimes need last+1 value (for offsets) + KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( + &h_channels_counters_, nchannels_ * sizeof(*h_channels_counters_))); + + h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_.resize(nchannels_); + h_all_tokens_acoustic_cost_.resize(nchannels_); + h_all_tokens_extra_prev_tokens_.resize(nchannels_); + h_all_tokens_info_.resize(nchannels_); + for (int32 ichannel = 0; ichannel < nchannels_; ++ichannel) { + h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_[ichannel].reserve( + ntokens_pre_allocated_); + h_all_tokens_acoustic_cost_[ichannel].reserve(ntokens_pre_allocated_); + h_all_tokens_info_[ichannel].reserve(ntokens_pre_allocated_); + } + h_main_q_end_lane_offsets_.resize(nlanes_ + 1); + h_emitting_main_q_end_lane_offsets_.resize(nlanes_ + 1); + h_n_extra_prev_tokens_lane_offsets_.resize(nlanes_ + 1); + frame_offsets_.resize(nchannels_); + num_frames_decoded_.resize(nchannels_, -1); + lanes2channels_todo_.reserve(nlanes_); + + h_all_argmin_cost_.resize(nchannels_, {-1, 0.0f}); + h_all_final_tokens_list_.resize(nchannels_); + h_all_has_reached_final_.resize(nchannels_); +} + +void CudaDecoder::InitDeviceData() { + KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMemsetAsync( + d_channels_counters_.MutableData(), 0, + nchannels_ * sizeof(*d_channels_counters_.MutableData()), compute_st_)); + KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMemsetAsync( + d_lanes_counters_.MutableData(), 0, + nlanes_ * sizeof(*d_lanes_counters_.MutableData()), compute_st_)); + InitHashmapKernel(KaldiCudaDecoderNumBlocks(hashmap_capacity_, nlanes_), + KALDI_CUDA_DECODER_1D_BLOCK, compute_st_, + 
*h_device_params_); + KALDI_DECODER_CUDA_CHECK_ERROR(); +} + +void CudaDecoder::InitHostData() {} + +void CudaDecoder::AllocateDeviceKernelParams() { + h_device_params_ = new DeviceParams(); + h_kernel_params_ = new KernelParams(); +} + +void CudaDecoder::InitDeviceParams() { + // Setting Kernel Params + // Sent to cuda kernels by copy + // Making sure we'll be able to send it to the kernels + KALDI_ASSERT((sizeof(KernelParams) + sizeof(DeviceParams)) < + KALDI_CUDA_DECODER_MAX_KERNEL_ARGUMENTS_BYTE_SIZE); + + h_device_params_->d_channels_counters = d_channels_counters_.GetView(); + h_device_params_->d_lanes_counters = d_lanes_counters_.GetView(); + h_device_params_->h_lanes_counters = h_lanes_counters_.GetView(); + h_device_params_->d_main_q_state_and_cost = + d_main_q_state_and_cost_.GetView(); + h_device_params_->d_main_q_info = d_main_q_info_.GetView(); + h_device_params_->d_aux_q_state_and_cost = d_aux_q_state_and_cost_.GetView(); + h_device_params_->d_main_q_extra_and_acoustic_cost = + d_main_q_extra_and_acoustic_cost_.GetView(); + h_device_params_->d_main_q_acoustic_cost = d_main_q_acoustic_cost_.GetView(); + h_device_params_->d_aux_q_info = d_aux_q_info_.GetView(); + h_device_params_->d_main_q_degrees_prefix_sum = + d_main_q_degrees_prefix_sum_.GetView(); + h_device_params_->d_main_q_block_sums_prefix_sum = + d_main_q_block_sums_prefix_sum_.GetView(); + h_device_params_->d_main_q_state_hash_idx = + d_main_q_state_hash_idx_.GetView(); + h_device_params_->d_main_q_extra_prev_tokens_prefix_sum = + d_main_q_extra_prev_tokens_prefix_sum_.GetView(); + h_device_params_->d_main_q_n_extra_prev_tokens_local_idx = + d_main_q_n_extra_prev_tokens_local_idx_.GetView(); + h_device_params_->d_main_q_extra_prev_tokens = + d_main_q_extra_prev_tokens_.GetView(); + h_device_params_->d_main_q_arc_offsets = d_main_q_arc_offsets_.GetView(); + h_device_params_->d_hashmap_values = d_hashmap_values_.GetView(); + h_device_params_->d_histograms = d_histograms_.GetView(); + h_device_params_->d_arc_e_offsets = fst_.d_e_offsets_; + h_device_params_->d_arc_ne_offsets = fst_.d_ne_offsets_; + h_device_params_->d_arc_pdf_ilabels = fst_.d_arc_pdf_ilabels_; + h_device_params_->d_arc_weights = fst_.d_arc_weights_; + h_device_params_->d_arc_nextstates = fst_.d_arc_nextstates_; + h_device_params_->d_fst_final_costs = fst_.d_final_; + h_device_params_->default_beam = default_beam_; + h_device_params_->lattice_beam = lattice_beam_; + h_device_params_->main_q_capacity = main_q_capacity_; + h_device_params_->aux_q_capacity = aux_q_capacity_; + h_device_params_->init_channel_id = init_channel_id_; + h_device_params_->max_nlanes = nlanes_; + h_device_params_->nstates = fst_.num_states_; + h_device_params_->init_state = fst_.Start(); + KALDI_ASSERT(h_device_params_->init_state != fst::kNoStateId); + h_device_params_->init_cost = StdWeight::One().Value(); + h_device_params_->hashmap_capacity = hashmap_capacity_; + h_device_params_->max_active = max_active_; + // For the first static_beam_q_length elements of the queue, we will keep the + // beam static + adaptive_beam_static_segment_ = + aux_q_capacity_ / KALDI_CUDA_DECODER_ADAPTIVE_BEAM_STATIC_SEGMENT; + // For the last adaptive_beam_q_length elements of the queue, we will decrease + // the beam, segment by segment + // For more information, please refer to the definition of GetAdaptiveBeam in + // cuda-decoder-kernels.cu + int32 adaptive_beam_q_length = + (aux_q_capacity_ - adaptive_beam_static_segment_); + int32 adaptive_beam_bin_width = + adaptive_beam_q_length / 
KALDI_CUDA_DECODER_ADAPTIVE_BEAM_NSTEPS; + h_device_params_->adaptive_beam_static_segment = + adaptive_beam_static_segment_; + h_device_params_->adaptive_beam_bin_width = adaptive_beam_bin_width; + + // Reusing aux_q memory to list final states in GetLattice + // Those cannot be used at the same time + h_device_params_->h_list_final_tokens_in_main_q = + h_list_final_tokens_in_main_q_.GetView(); +} + +CudaDecoder::~CudaDecoder() { + // Stopping h2h tasks + h2h_threads_running_ = false; + n_h2h_main_task_todo_cv_.notify_all(); + for (std::thread &thread : cpu_dedicated_threads_) thread.join(); + cudaStreamDestroy(compute_st_); + cudaStreamDestroy(copy_st_); + + KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaFreeHost(h_channels_counters_)); + KALDI_DECODER_CUDA_API_CHECK_ERROR( + cudaFreeHost(h_extra_and_acoustic_cost_concat_)); + KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaFreeHost(h_acoustic_cost_concat_)); + KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaFreeHost(h_extra_prev_tokens_concat_)); + KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaFreeHost(h_infotoken_concat_)); + KALDI_DECODER_CUDA_API_CHECK_ERROR( + cudaFreeHost(h_extra_and_acoustic_cost_concat_tmp_)); + KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaFreeHost(h_acoustic_cost_concat_tmp_)); + KALDI_DECODER_CUDA_API_CHECK_ERROR( + cudaFreeHost(h_extra_prev_tokens_concat_tmp_)); + KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaFreeHost(h_infotoken_concat_tmp_)); + // Will call the cudaFrees inside destructors + KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaEventDestroy(nnet3_done_evt_)); + KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaEventDestroy(d2h_copy_acoustic_evt_)); + KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaEventDestroy(d2h_copy_infotoken_evt_)); + KALDI_DECODER_CUDA_API_CHECK_ERROR( + cudaEventDestroy(d2h_copy_extra_prev_tokens_evt_)); + KALDI_DECODER_CUDA_API_CHECK_ERROR( + cudaEventDestroy(concatenated_data_ready_evt_)); + KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaEventDestroy(lane_offsets_ready_evt_)); + + delete h_kernel_params_; + delete h_device_params_; +} + +void CudaDecoder::ComputeInitialChannel() { + KALDI_ASSERT(nlanes_ > 0); + const int32 ilane = 0; + KALDI_ASSERT(ilane == 0); + // Following kernels working channel_id + std::vector channels = {init_channel_id_}; + SetChannelsInKernelParams(channels); // not calling LoadChannelsStateToLanes, + // init_channel_id_ is a special case + h_lanes_counters_.lane(ilane)->channel_to_compute = init_channel_id_; + + cudaMemcpyAsync(d_lanes_counters_.MutableData(), h_lanes_counters_.lane(0), + 1 * sizeof(*h_lanes_counters_.lane(0)), + cudaMemcpyHostToDevice, compute_st_); + h_lanes_counters_.lane(ilane)->main_q_narcs_and_end.y = 0; + + // Adding the start state to the initial token queue + InitializeInitialLaneKernel(KaldiCudaDecoderNumBlocks(1, 1), + KALDI_CUDA_DECODER_ONE_THREAD_BLOCK, compute_st_, + *h_device_params_); + + h_lanes_counters_.lane(ilane)->post_expand_aux_q_end = 1; + + PruneAndPreprocess(); + FinalizeProcessNonEmittingKernel( + KaldiCudaDecoderNumBlocks(1, 1), KALDI_CUDA_DECODER_LARGEST_1D_BLOCK, + compute_st_, *h_device_params_, *h_kernel_params_); + + CopyLaneCountersToHostSync(); + PostProcessingMainQueue(); + CopyLaneCountersToHostSync(); + + const int32 main_q_end = + h_lanes_counters_.lane(ilane)->main_q_narcs_and_end.y; + KALDI_ASSERT(main_q_end > 0); + + // Moving all data linked to init_channel_id_ to host + // that data will be cloned to other channels when calling InitDecoding + CopyMainQueueDataToHost(); + SaveChannelsStateFromLanes(); + + KALDI_ASSERT( + 
h_channels_counters_[init_channel_id_].prev_main_q_narcs_and_end.x > 0); + KALDI_ASSERT( + h_channels_counters_[init_channel_id_].prev_main_q_narcs_and_end.y > 0); +} + +void CudaDecoder::InitDecoding(const std::vector &channels) { + // Cloning the init_channel_id_ channel into all channels in the channels vec + const int nlanes_used = channels.size(); + // Getting *h_kernel_params ready to use + LoadChannelsStateToLanes(channels); + cudaMemcpyAsync(d_lanes_counters_.MutableData(), h_lanes_counters_.lane(0), + nlanes_used_ * sizeof(*h_lanes_counters_.lane(0)), + cudaMemcpyHostToDevice, compute_st_); + + // Size of the initial main_q + ChannelCounters &init_channel_counters = + h_channels_counters_[init_channel_id_]; + const int32 init_main_q_size = + init_channel_counters.prev_main_q_narcs_and_end.y; + + KALDI_ASSERT(init_main_q_size > 0); + // Getting the channels ready to compute new utterances + InitDecodingOnDeviceKernel( + KaldiCudaDecoderNumBlocks(init_main_q_size, nlanes_used), + KALDI_CUDA_DECODER_1D_BLOCK, compute_st_, *h_device_params_, + *h_kernel_params_); + + { + std::lock_guard n_h2h_not_done_lk( + n_init_decoding_h2h_task_not_done_mutex_); + n_init_decoding_h2h_task_not_done_ += channels.size(); + } + for (ChannelId ichannel : channels) { + ChannelCounters &channel_counters = h_channels_counters_[ichannel]; + channel_counters.prev_main_q_narcs_and_end = + init_channel_counters.prev_main_q_narcs_and_end; + channel_counters.prev_main_q_n_extra_prev_tokens = + init_channel_counters.prev_main_q_n_extra_prev_tokens; + channel_counters.prev_main_q_global_offset = 0; + channel_counters.prev_main_q_extra_prev_tokens_global_offset = 0; + channel_counters.prev_beam = default_beam_; + + int32 n_initial_tokens = h_all_tokens_info_[init_channel_id_].size(); + num_frames_decoded_[ichannel] = 0; + h_channels_counters_[ichannel] = h_channels_counters_[init_channel_id_]; + h_all_argmin_cost_[ichannel] = {-1, 0.0f}; + frame_offsets_[ichannel].clear(); + frame_offsets_[ichannel].push_back(n_initial_tokens); + if (thread_pool_) + thread_pool_->enqueue(THREAD_POOL_HIGH_PRIORITY, + &CudaDecoder::InitDecodingH2HCopies, this, + ichannel); + else + InitDecodingH2HCopies(ichannel); + } +} + +void CudaDecoder::InitDecodingH2HCopies(ChannelId ichannel) { + // Tokens from initial main_q needed on host + std::unique_lock channel_lk(channel_lock_[ichannel]); + // Deep copy + h_all_tokens_info_[ichannel] = h_all_tokens_info_[init_channel_id_]; + h_all_tokens_acoustic_cost_[ichannel] = + h_all_tokens_acoustic_cost_[init_channel_id_]; + h_all_tokens_extra_prev_tokens_[ichannel] = + h_all_tokens_extra_prev_tokens_[init_channel_id_]; + h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_[ichannel] = + h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_[init_channel_id_]; + + bool all_done; + { + std::lock_guard lk_not_done( + n_init_decoding_h2h_task_not_done_mutex_); + all_done = (--n_init_decoding_h2h_task_not_done_ == 0); + } + if (all_done) { + init_decoding_h2h_done_.notify_all(); + } +} + +void CudaDecoder::LoadChannelsStateToLanes( + const std::vector &channels) { + // Setting that channels configuration in kernel_params + SetChannelsInKernelParams(channels); + KALDI_ASSERT(nlanes_used_ > 0); + for (LaneId ilane = 0; ilane < nlanes_used_; ++ilane) { + const ChannelId ichannel = channel_to_compute_[ilane]; + ChannelCounters &channel_counters = h_channels_counters_[ichannel]; + LaneCounters &lane_counters = *h_lanes_counters_.lane(ilane); + lane_counters.channel_to_compute = ichannel; + 
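+    // Context load: restoring the per-channel state saved the last time this
+    // channel went through SaveChannelsStateFromLanes (or set by
+    // InitDecoding): previous queue size/narcs, global offsets, beam, and
+    // best-cost argmin. The kernels can then resume decoding this channel
+    // exactly where it left off.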
lane_counters.main_q_narcs_and_end = + channel_counters.prev_main_q_narcs_and_end; + lane_counters.main_q_n_extra_prev_tokens = + channel_counters.prev_main_q_n_extra_prev_tokens; + int32 int_beam = floatToOrderedIntHost(channel_counters.prev_beam); + lane_counters.int_beam = int_beam; + lane_counters.adaptive_int_beam_with_validity_index.x = int_beam; + lane_counters.adaptive_int_beam_with_validity_index.y = + adaptive_beam_static_segment_; + lane_counters.main_q_global_offset = + channel_counters.prev_main_q_global_offset; + lane_counters.main_q_extra_prev_tokens_global_offset = + channel_counters.prev_main_q_extra_prev_tokens_global_offset; + + lane_counters.min_int_cost = + channel_counters.min_int_cost_and_arg_without_final.x; + lane_counters.prev_arg_min_int_cost = + channel_counters.min_int_cost_and_arg_without_final.y; + } +} + +void CudaDecoder::SaveChannelsStateFromLanes() { + KALDI_ASSERT(nlanes_used_ > 0); + for (LaneId ilane = 0; ilane < nlanes_used_; ++ilane) { + const ChannelId ichannel = channel_to_compute_[ilane]; + ChannelCounters &channel_counters = h_channels_counters_[ichannel]; + LaneCounters &lane_counters = *h_lanes_counters_.lane(ilane); + channel_counters.prev_main_q_narcs_and_end = + lane_counters.main_q_narcs_and_end; + channel_counters.prev_main_q_extra_prev_tokens_global_offset = + lane_counters.main_q_extra_prev_tokens_global_offset; + channel_counters.prev_main_q_global_offset = + lane_counters.main_q_global_offset; + channel_counters.prev_main_q_n_extra_prev_tokens = + lane_counters.main_q_n_extra_prev_tokens; + channel_counters.prev_beam = orderedIntToFloatHost(lane_counters.int_beam); + channel_counters.min_int_cost_and_arg_without_final = { + lane_counters.min_int_cost, lane_counters.prev_arg_min_int_cost}; + } + SaveChannelsStateFromLanesKernel(KaldiCudaDecoderNumBlocks(1, nlanes_used_), + KALDI_CUDA_DECODER_ONE_THREAD_BLOCK, + compute_st_, *h_device_params_, + *h_kernel_params_); + + ResetChannelsInKernelParams(); +} + +int32 CudaDecoder::GetMaxForAllLanes( + std::function func) { + int32 max_val = 0; + for (LaneId ilane = 0; ilane < nlanes_used_; ++ilane) { + const int32 val = func(*h_lanes_counters_.lane(ilane)); + max_val = std::max(max_val, val); + } + return max_val; +} + +void CudaDecoder::CopyLaneCountersToHostAsync() { + cudaMemcpyAsync(h_lanes_counters_.lane(0), d_lanes_counters_.MutableData(), + nlanes_used_ * sizeof(*h_lanes_counters_.lane(0)), + cudaMemcpyDeviceToHost, compute_st_); +} + +void CudaDecoder::CopyLaneCountersToHostSync() { + CopyLaneCountersToHostAsync(); + cudaStreamSynchronize(compute_st_); +} + +// One sync has to happen between PerformConcatenatedCopy and +// MoveConcatenatedCopyToVector +template +void CudaDecoder::MoveConcatenatedCopyToVector( + const int32 ilane, const int32 ichannel, + const std::vector &lanes_offsets, T *h_concat, + std::vector> *vecvec) { + // Unpacking the concatenated vector into individual channel storage + int32 beg = lanes_offsets[ilane]; + int32 end = lanes_offsets[ilane + 1]; + auto &vec = (*vecvec)[ichannel]; + vec.insert(vec.end(), h_concat + beg, h_concat + end); +} + +void CudaDecoder::ApplyMaxActiveAndReduceBeam(enum QUEUE_ID queue_id) { + // Checking if we should activate max active for the current frame + // once it is active, it is active for the whole frame (for all non emitting + // iterations) + // If at least one lane queue is bigger than max_active, + // we'll apply a topk on that queue (k=max_active_) + bool use_aux_q = (queue_id == AUX_Q); + 
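+  // Two kernels below implement this: ComputeCostsHistogramKernel builds a
+  // per-lane histogram of the token costs in that queue, and
+  // UpdateBeamUsingHistogramKernel then uses it to lower the lane's beam so
+  // that (approximately) at most max_active_ tokens survive. The details of
+  // the histogram walk live in cuda-decoder-kernels.cu.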
ComputeCostsHistogramKernel(KaldiCudaDecoderNumBlocks(nlanes_used_), + KALDI_CUDA_DECODER_1D_BLOCK, compute_st_, + *h_device_params_, *h_kernel_params_, use_aux_q); + + UpdateBeamUsingHistogramKernel( + KaldiCudaDecoderNumBlocks(1, nlanes_used_), KALDI_CUDA_DECODER_1D_BLOCK, + compute_st_, *h_device_params_, *h_kernel_params_, use_aux_q); +} + +int32 CudaDecoder::NumFramesToDecode( + const std::vector &channels, + std::vector &decodables, int32 max_num_frames) { + int32 nframes_to_decode = INT_MAX; + // std::vector debug_ntokens; + // std::vector debug_narcs; + for (int32 ilane = 0; ilane < nlanes_used_; ++ilane) { + const ChannelId ichannel = channels[ilane]; + const int32 num_frames_decoded = num_frames_decoded_[ichannel]; + KALDI_ASSERT(num_frames_decoded >= 0 && + "You must call InitDecoding() before AdvanceDecoding()"); + int32 num_frames_ready = decodables[ilane]->NumFramesReady(); + // num_frames_ready must be >= num_frames_decoded, or else + // the number of frames ready must have decreased (which doesn't + // make sense) or the decodable object changed between calls + // (which isn't allowed). + KALDI_ASSERT(num_frames_ready >= num_frames_decoded); + int32 channel_nframes_to_decode = num_frames_ready - num_frames_decoded; + nframes_to_decode = std::min(nframes_to_decode, channel_nframes_to_decode); + } + if (max_num_frames >= 0) + nframes_to_decode = std::min(nframes_to_decode, max_num_frames); + + return nframes_to_decode; +} + +void CudaDecoder::ExpandArcsEmitting() { + ExpandArcsKernel(KaldiCudaDecoderNumBlocks(nlanes_used_), + KALDI_CUDA_DECODER_1D_BLOCK, compute_st_, + *h_device_params_, *h_kernel_params_); + + // Updating a few counters, like resetting aux_q_end to 0... + // true is for IS_EMITTING + PostExpandKernel(KaldiCudaDecoderNumBlocks(1, nlanes_used_), + KALDI_CUDA_DECODER_ONE_THREAD_BLOCK, compute_st_, + *h_device_params_, *h_kernel_params_); +} + +void CudaDecoder::ExpandArcsNonEmitting() { + // false is for non emitting + ExpandArcsKernel(KaldiCudaDecoderNumBlocks(nlanes_used_), + KALDI_CUDA_DECODER_1D_BLOCK, compute_st_, + *h_device_params_, *h_kernel_params_); + + // false is for non emitting + PostExpandKernel(KaldiCudaDecoderNumBlocks(1, nlanes_used_), + KALDI_CUDA_DECODER_ONE_THREAD_BLOCK, compute_st_, + *h_device_params_, *h_kernel_params_); +} + +void CudaDecoder::PruneAndPreprocess() { + NonEmittingPreprocessAndContractKernel( + KaldiCudaDecoderNumBlocks(nlanes_used_), KALDI_CUDA_DECODER_1D_BLOCK, + compute_st_, *h_device_params_, *h_kernel_params_); + PostContractAndPreprocessKernel(KaldiCudaDecoderNumBlocks(1, nlanes_used_), + KALDI_CUDA_DECODER_ONE_THREAD_BLOCK, + compute_st_, *h_device_params_, + *h_kernel_params_); +} + +void CudaDecoder::PostProcessingMainQueue() { + ApplyMaxActiveAndReduceBeam(MAIN_Q); + + FillHashmapWithMainQKernel(KaldiCudaDecoderNumBlocks(nlanes_used_), + KALDI_CUDA_DECODER_1D_BLOCK, compute_st_, + *h_device_params_, *h_kernel_params_); + + EmittingPreprocessAndListExtraPrevTokensStep1Kernel( + KaldiCudaDecoderNumBlocks(nlanes_used_), KALDI_CUDA_DECODER_1D_BLOCK, + compute_st_, *h_device_params_, *h_kernel_params_); + + EmittingPreprocessAndListExtraPrevTokensStep2Kernel( + KaldiCudaDecoderNumBlocks(nlanes_used_), KALDI_CUDA_DECODER_1D_BLOCK, + compute_st_, *h_device_params_, *h_kernel_params_); + + // Step2 wrote main_q_n_extra_prev_tokens + // it was the last value missing to compute the lanes offsets + // doing it now + ComputeLaneOffsetsKernel(KaldiCudaDecoderNumBlocks(1, 1), // One CTA + KALDI_CUDA_DECODER_1D_BLOCK, 
compute_st_, + *h_device_params_, *h_kernel_params_); + cudaEventRecord(lane_offsets_ready_evt_, compute_st_); + + EmittingPreprocessAndListExtraPrevTokensStep3Kernel( + KaldiCudaDecoderNumBlocks(nlanes_used_), KALDI_CUDA_DECODER_1D_BLOCK, + compute_st_, *h_device_params_, *h_kernel_params_); + + EmittingPreprocessAndListExtraPrevTokensStep4Kernel( + KaldiCudaDecoderNumBlocks(nlanes_used_), KALDI_CUDA_DECODER_1D_BLOCK, + compute_st_, *h_device_params_, *h_kernel_params_); + + ClearHashmapKernel(KaldiCudaDecoderNumBlocks(nlanes_used_), + KALDI_CUDA_DECODER_1D_BLOCK, compute_st_, + *h_device_params_, *h_kernel_params_); +} + +void CudaDecoder::CopyMainQueueDataToHost() { + cudaEventRecord(concatenated_data_ready_evt_, compute_st_); + cudaStreamWaitEvent(copy_st_, concatenated_data_ready_evt_, + 0); // the copies on copy_st will wait on compute_st_ + cudaEventSynchronize( + lane_offsets_ready_evt_); // we need the total size of each segments + LaunchD2HCopies(); + + // Making sure the previous H2H copies are done + WaitForInitDecodingH2HCopies(); + WaitForH2HCopies(); + + std::swap(h_extra_and_acoustic_cost_concat_tmp_, + h_extra_and_acoustic_cost_concat_); + std::swap(h_infotoken_concat_tmp_, h_infotoken_concat_); + std::swap(h_acoustic_cost_concat_tmp_, h_acoustic_cost_concat_); + std::swap(h_extra_prev_tokens_concat_tmp_, h_extra_prev_tokens_concat_); + // Saving the offsets computed previously + lanes2channels_todo_.clear(); + for (int32 ilane = 0; ilane < (nlanes_used_ + 1); ++ilane) { + h_emitting_main_q_end_lane_offsets_[ilane] = + h_lanes_counters_.lane(ilane)->main_q_n_emitting_tokens_lane_offset; + h_main_q_end_lane_offsets_[ilane] = + h_lanes_counters_.lane(ilane)->main_q_end_lane_offset; + h_n_extra_prev_tokens_lane_offsets_[ilane] = + h_lanes_counters_.lane(ilane)->main_q_n_extra_prev_tokens_lane_offset; + lanes2channels_todo_.push_back(channel_to_compute_[ilane]); + } + + LaunchH2HCopies(); +} + +void CudaDecoder::LaunchD2HCopies() { + // Last offset = total + int32 nelements_acoustic_costs = h_lanes_counters_.lane(nlanes_used_) + ->main_q_n_emitting_tokens_lane_offset; + // Moving the d_concat to h_concat (host), async + if (nelements_acoustic_costs > 0) { + KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMemcpyAsync( + h_acoustic_cost_concat_tmp_, d_acoustic_cost_concat_, + nelements_acoustic_costs * sizeof(*d_acoustic_cost_concat_), + cudaMemcpyDeviceToHost, copy_st_)); + } + cudaEventRecord(d2h_copy_acoustic_evt_, copy_st_); + + int32 nelements_infotoken = + h_lanes_counters_.lane(nlanes_used_)->main_q_end_lane_offset; + if (nelements_infotoken > 0) { + KALDI_DECODER_CUDA_API_CHECK_ERROR( + cudaMemcpyAsync(h_infotoken_concat_tmp_, d_infotoken_concat_, + nelements_infotoken * sizeof(*d_infotoken_concat_), + cudaMemcpyDeviceToHost, copy_st_)); + } + cudaEventRecord(d2h_copy_infotoken_evt_, copy_st_); + int32 nelements_extra_prev_tokens = + h_lanes_counters_.lane(nlanes_used_) + ->main_q_n_extra_prev_tokens_lane_offset; + if (nelements_extra_prev_tokens > 0) { + KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMemcpyAsync( + h_extra_prev_tokens_concat_tmp_, d_extra_prev_tokens_concat_, + nelements_extra_prev_tokens * sizeof(*d_extra_prev_tokens_concat_), + cudaMemcpyDeviceToHost, copy_st_)); + KALDI_DECODER_CUDA_API_CHECK_ERROR( + cudaMemcpyAsync(h_extra_and_acoustic_cost_concat_tmp_, + d_extra_and_acoustic_cost_concat_, + nelements_extra_prev_tokens * + sizeof(*d_extra_and_acoustic_cost_concat_), + cudaMemcpyDeviceToHost, copy_st_)); + } + cudaEventRecord(d2h_copy_extra_prev_tokens_evt_, 
copy_st_); +} + +void CudaDecoder::ConcatenateData() { + ConcatenateLanesDataKernel( + KaldiCudaDecoderNumBlocks(nlanes_used_), KALDI_CUDA_DECODER_1D_BLOCK, + compute_st_, *h_device_params_, *h_kernel_params_, + h_device_params_->d_main_q_acoustic_cost, d_acoustic_cost_concat_, + &d_lanes_counters_.lane(0)->main_q_n_emitting_tokens_lane_offset); + ConcatenateLanesDataKernel( + KaldiCudaDecoderNumBlocks(nlanes_used_), KALDI_CUDA_DECODER_1D_BLOCK, + compute_st_, *h_device_params_, *h_kernel_params_, + h_device_params_->d_main_q_info, d_infotoken_concat_, + &d_lanes_counters_.lane(0)->main_q_end_lane_offset); + ConcatenateLanesDataKernel( + KaldiCudaDecoderNumBlocks(nlanes_used_), KALDI_CUDA_DECODER_1D_BLOCK, + compute_st_, *h_device_params_, *h_kernel_params_, + h_device_params_->d_main_q_extra_prev_tokens, d_extra_prev_tokens_concat_, + &d_lanes_counters_.lane(0)->main_q_n_extra_prev_tokens_lane_offset); + ConcatenateLanesDataKernel( + KaldiCudaDecoderNumBlocks(nlanes_used_), KALDI_CUDA_DECODER_1D_BLOCK, + compute_st_, *h_device_params_, *h_kernel_params_, + h_device_params_->d_main_q_extra_and_acoustic_cost, + d_extra_and_acoustic_cost_concat_, + &d_lanes_counters_.lane(0)->main_q_n_extra_prev_tokens_lane_offset); +} + +void CudaDecoder::AdvanceDecoding( + const std::vector &channels, + std::vector &decodables, int32 max_num_frames) { + if (channels.size() == 0) return; // nothing to do + // Context switch : Loading the channels state in lanes + LoadChannelsStateToLanes(channels); + KALDI_ASSERT(nlanes_used_ > 0); + + // We'll decode nframes_to_decode, such as all channels have at least that + // number + // of frames available + int32 nframes_to_decode = + NumFramesToDecode(channels, decodables, max_num_frames); + + // Looping over the frames that we will compute + for (int32 iframe = 0; iframe < nframes_to_decode; ++iframe) { + // Loglikelihoods from the acoustic model + // Setting the loglikelihoods pointers for that frame + for (LaneId ilane = 0; ilane < nlanes_used_; ++ilane) { + ChannelId ichannel = channel_to_compute_[ilane]; + int32 frame = num_frames_decoded_[ichannel]; + h_lanes_counters_.lane(ilane)->loglikelihoods = + decodables[ilane]->GetLogLikelihoodsCudaPointer(frame); + } + cudaMemcpyAsync(d_lanes_counters_.MutableData(), h_lanes_counters_.lane(0), + nlanes_used_ * sizeof(*h_lanes_counters_.lane(0)), + cudaMemcpyHostToDevice, compute_st_); + // compute_st_ will wait for nnet3 to complete + cudaEventRecord(nnet3_done_evt_, cudaStreamPerThread); + cudaStreamWaitEvent(compute_st_, nnet3_done_evt_, 0); + + // Estimating cutoff using argmin from last frame + ResetForFrameAndEstimateCutoffKernel( + KaldiCudaDecoderNumBlocks(1, nlanes_used_), KALDI_CUDA_DECODER_1D_BLOCK, + compute_st_, *h_device_params_, *h_kernel_params_); + // Reset max active status. If necessary, ApplyMaxActiveAndReduceBeam will + // switch it back on + compute_max_active_ = false; + + // Processing emitting arcs. We've done the preprocess stage at the end of + // the previous frame + ExpandArcsEmitting(); + // We'll loop until we have a small enough number of non-emitting arcs + // in the token queue. We'll then break the loop + for (int i = 0; i < KALDI_CUDA_DECODER_N_NON_EMITTING_MAIN_ITERATIONS; + ++i) { + // If one of the aux_q contains more than max_active_ tokens, + // we'll reduce the beam to only keep max_active_ tokens + ApplyMaxActiveAndReduceBeam(AUX_Q); + // Prune the aux_q. 
Apply the latest beam (using the one from + // ApplyMaxActiveAndReduceBeam if triggered) + // move the survival tokens to the main queue + // and do the preprocessing necessary for the next ExpandArcs + PruneAndPreprocess(); + + // "heavy duty" kernel for non-emitting. The long tail of small + // non-emitting iterations will be done in + // FinalizeProcessNonEmittingKernel + ExpandArcsNonEmitting(); + } + ApplyMaxActiveAndReduceBeam(AUX_Q); + PruneAndPreprocess(); + // Finalizing process non emitting. Takes care of the long tail, + // the final iterations with a small numbers of arcs. Do the work inside a + // single CTA (per lane), + FinalizeProcessNonEmittingKernel(KaldiCudaDecoderNumBlocks(1, nlanes_used_), + KALDI_CUDA_DECODER_LARGEST_1D_BLOCK, + compute_st_, *h_device_params_, + *h_kernel_params_); + + // We now have our final token main queues for that frame + + // Post processing the tokens for that frame + // - do the preprocess necessary for the next emitting expand (will happen + // with next frame) + // - if a state S has more than one token associated to it, generate the + // list of those tokens + // It allows to backtrack efficiently in GetRawLattice + // - compute the extra costs + PostProcessingMainQueue(); + + // Waiting on previous d2h before writing on same device memory + cudaStreamWaitEvent(compute_st_, d2h_copy_extra_prev_tokens_evt_, 0); + // Concatenating the data that will be moved to host into large arrays + ConcatenateData(); + // Copying the final lane counters for that frame + CopyLaneCountersToHostSync(); + CheckOverflow(); + + // Moving the data necessary for GetRawLattice/GetBestPath back to host for + // storage + CopyMainQueueDataToHost(); + + for (LaneId ilane = 0; ilane < nlanes_used_; ++ilane) { + const ChannelId ichannel = channel_to_compute_[ilane]; + // We're done processing that frame + ++num_frames_decoded_[ichannel]; + const int32 main_q_end = + h_lanes_counters_.lane(ilane)->main_q_narcs_and_end.y; + // Saving frame offsets for GetRawLattice + frame_offsets_[ichannel].push_back(frame_offsets_[ichannel].back() + + main_q_end); + } + } + + SaveChannelsStateFromLanes(); +} + +void CudaDecoder::CheckOverflow() { + for (LaneId ilane = 0; ilane < nlanes_used_; ++ilane) { + LaneCounters *lane_counters = h_lanes_counters_.lane(ilane); + bool q_overflow = lane_counters->q_overflow; + if (q_overflow != OVERFLOW_NONE) { + // An overflow was prevented in a kernel + // The algorithm can still go on but quality of the result can be reduced + // (less tokens were generated) + + if ((q_overflow & OVERFLOW_MAIN_Q) == OVERFLOW_MAIN_Q) { + // overflowed main_q + KALDI_WARN + << "Preventing overflow of main_q. Continuing " + << "execution but the quality of the output may be decreased. " + << "To prevent this from happening, please increase the parameter " + "--main-q-capacity" + << " and/or decrease --max-active"; + } + if ((q_overflow & OVERFLOW_AUX_Q) == OVERFLOW_AUX_Q) { + // overflowed aux_q + KALDI_WARN + << "Preventing overflow of aux_q. Continuing " + << "execution but the quality of the output may be decreased. 
" + << "To prevent this from happening, please increase the parameter " + "--aux-q-capacity" + << " and/or decrease --beam"; + } + + KALDI_ASSERT(lane_counters->main_q_narcs_and_end.y < main_q_capacity_); + KALDI_ASSERT(lane_counters->main_q_narcs_and_end.x >= 0); + KALDI_ASSERT(lane_counters->main_q_narcs_and_end.y >= 0); + KALDI_ASSERT(lane_counters->post_expand_aux_q_end < aux_q_capacity_); + KALDI_ASSERT(lane_counters->post_expand_aux_q_end >= 0); + KALDI_ASSERT(lane_counters->aux_q_end < aux_q_capacity_); + KALDI_ASSERT(lane_counters->aux_q_end >= 0); + } + } +} + +// GetBestCost +// returns the minimum cost among all tokens cost in the current frame +// also returns the index of one token with that min cost +// +// Only called at the end of the computation of one audio file +// not optimized +void CudaDecoder::GetBestCost(const std::vector &channels, + bool use_final_costs, + std::vector> *argmins, + std::vector>> + *list_finals_token_idx_and_cost, + std::vector *has_reached_final) { + if (channels.size() == 0) return; + // Getting the lanes ready to be used with those channels + LoadChannelsStateToLanes(channels); + cudaMemcpyAsync(d_lanes_counters_.MutableData(), h_lanes_counters_.lane(0), + nlanes_used_ * sizeof(*h_lanes_counters_.lane(0)), + cudaMemcpyHostToDevice, compute_st_); + + auto func_main_q_end = [](const LaneCounters &c) { + return c.main_q_narcs_and_end.y; + }; + int32 max_main_q_end = GetMaxForAllLanes(func_main_q_end); + + // Step1 : Finding the best cost in the last token queue, with and without + // final costs. + // Also saving the indexes of those min. + GetBestCostStep1Kernel( + KaldiCudaDecoderNumBlocks(max_main_q_end, nlanes_used_), + KALDI_CUDA_DECODER_1D_BLOCK, compute_st_, *h_device_params_, + *h_kernel_params_, use_final_costs, StdWeight::Zero().Value()); + + // Step2: Now that we now what the minimum cost is, we list all tokens within + // [min_cost; min_cost+lattice_beam] + // min_cost takes into account the final costs if use_final_costs is true, + // AND if a final state is is present in the last token queue + GetBestCostStep2Kernel( + KaldiCudaDecoderNumBlocks(max_main_q_end, nlanes_used_), + KALDI_CUDA_DECODER_1D_BLOCK, compute_st_, *h_device_params_, + *h_kernel_params_, use_final_costs, StdWeight::Zero().Value()); + + // Step3 : Moves some data to host. We are moving the data that couldn't be + // moved + // directly in step 2, e.g. 
results of atomics (we don't know which one is + // last) + GetBestCostStep3Kernel( + KaldiCudaDecoderNumBlocks(max_main_q_end, nlanes_used_), + KALDI_CUDA_DECODER_1D_BLOCK, compute_st_, *h_device_params_, + *h_kernel_params_); + + // Resetting the datastructures + argmins->clear(); + has_reached_final->clear(); + list_finals_token_idx_and_cost->clear(); + // list_finals_token_idx_and_cost is a vector> + // Each channel will have its own list of tokens within [best; + // best+lattice_beam] + list_finals_token_idx_and_cost->resize(nlanes_used_); + // Waiting for the copy + cudaStreamSynchronize(compute_st_); + for (int32 ilane = 0; ilane < nlanes_used_; ++ilane) { + int2 minarg = h_lanes_counters_.lane(ilane)->min_int_cost_and_arg; + // Min cost in that channel last token queue + CostType min_cost = orderedIntToFloatHost(minarg.x); + // index of that min cost + int32 arg = minarg.y; + // Saving both in output + argmins->push_back({arg, min_cost}); + // Whether or not the last token queue contains at least one token + // associated with a final FST state + has_reached_final->push_back( + h_lanes_counters_.lane(ilane)->has_reached_final); + // Number of tokens within [min_cost; min_cost+lattice_beam] + int n_within_lattice_beam = + h_lanes_counters_.lane(ilane)->n_within_lattice_beam; + // Loading those tokens + (*list_finals_token_idx_and_cost)[ilane].resize(n_within_lattice_beam); + // Moving to output + int2float conversion + for (int i = 0; i < n_within_lattice_beam; ++i) { + int global_idx = h_list_final_tokens_in_main_q_.lane(ilane)[i].x; + float cost_with_final = orderedIntToFloatHost( + h_list_final_tokens_in_main_q_.lane(ilane)[i].y); + (*list_finals_token_idx_and_cost)[ilane][i].first = global_idx; + (*list_finals_token_idx_and_cost)[ilane][i].second = cost_with_final; + } + } +} + +void CudaDecoder::GetBestPath(const std::vector &channels, + std::vector &fst_out_vec, + bool use_final_probs) { + KALDI_ASSERT(channels.size() == fst_out_vec.size()); + nvtxRangePushA("GetBestPath"); + GetBestCost(channels, use_final_probs, &argmins_, + &list_finals_token_idx_and_cost_, &has_reached_final_); + + std::vector reversed_path; + for (int32 ilane = 0; ilane < channels.size(); ++ilane) { + const ChannelId ichannel = channels[ilane]; + const int32 token_with_best_cost = argmins_[ilane].first; + std::unique_lock channel_lk(channel_lock_[ichannel]); + // If that token in that frame f is available, then all tokens in that frame + // f are available + WaitForH2HCopies(); + const bool isfinal = has_reached_final_[ilane]; + TokenId token_idx = token_with_best_cost; + + // Backtracking + // Going all the way from the token with best cost + // to the beginning (StartState) + reversed_path.clear(); + + // The first token was inserted at the beginning of the queue + // it always has index 0 + // We backtrack until that first token + while (token_idx != 0) { + InfoToken token = h_all_tokens_info_[ichannel][token_idx]; + // We want an arc with extra_cost == 0 + int32 arc_idx; + TokenId prev_token_idx; + if (token.IsUniqueTokenForStateAndFrame()) { + // If we have only one, it is an arc with extra_cost == 0 + arc_idx = token.arc_idx; + prev_token_idx = token.prev_token; + } else { + // Using the first arc with extra_cost == 0 + int32 offset, size; + std::tie(offset, size) = token.GetSameFSTStateTokensList(); + bool found_best = false; + for (auto i = 0; i < size; ++i) { + CostType arc_extra_cost = + h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_[ichannel] + [offset + + i].x; + // Picking one arc on 
the best path (extra_cost == 0) + if (arc_extra_cost == 0.0f) { + InfoToken list_token = + h_all_tokens_extra_prev_tokens_[ichannel][offset + i]; + arc_idx = list_token.arc_idx; + prev_token_idx = list_token.prev_token; + found_best = true; + break; + } + } + KALDI_ASSERT(found_best); + } + reversed_path.push_back(arc_idx); + token_idx = prev_token_idx; + } + + Lattice *fst_out = fst_out_vec[ilane]; + fst_out->DeleteStates(); + // Building the output Lattice + OutputLatticeState curr_state = fst_out->AddState(); + fst_out->SetStart(curr_state); + + for (int32 i = reversed_path.size() - 1; i >= 1; i--) { + int32 arc_idx = reversed_path[i]; + + LatticeArc arc(fst_.h_arc_id_ilabels_[arc_idx], + fst_.h_arc_olabels_[arc_idx], + LatticeWeight(fst_.h_arc_weights_[arc_idx], 0), + fst_.h_arc_nextstate_[arc_idx]); + + arc.nextstate = fst_out->AddState(); + fst_out->AddArc(curr_state, arc); + curr_state = arc.nextstate; + } + + // Adding final cost to final state + if (isfinal && use_final_probs) + fst_out->SetFinal( + curr_state, + LatticeWeight(fst_.h_final_[fst_.h_arc_nextstate_[reversed_path[0]]], + 0.0)); + else + fst_out->SetFinal(curr_state, LatticeWeight::One()); + + fst::RemoveEpsLocal(fst_out); + } + nvtxRangePop(); +} + +void CudaDecoder::DebugValidateLattice() { +#if 0 + //validate lattice consistency + for(int frame=0;frame0) ? frame_offsets_[ichannel][frame-1] : 0; + int cur_frame_offset=token_start; + int next_frame_offset=token_end; + + bool found_zero = false; + //for each token in frame + for(int i=token_start;i=0); + + if(token.IsUniqueTokenForStateAndFrame()) { + //previous token must be lower than the next frame start + KALDI_ASSERT(token.prev_token=prev_frame_offset); + } else { + int32 offset, size; + std::tie(offset,size) = token.GetNextStateTokensList(); + KALDI_ASSERT(size>0); + KALDI_ASSERT(offset>=0 && offset=prev_frame_offset); + } + } + } + } +#endif +} + +CudaDecoder::LatticeStateInternalId CudaDecoder::GetLatticeStateInternalId( + int32 total_ntokens, TokenId token_idx, InfoToken token) { + // If we have a unique token for this (frame,fst_state) + // Then its ID is a unique ID for (frame,fst_state) + if (token.IsUniqueTokenForStateAndFrame()) return token_idx; + + // If we have multiple tokens for this (frame,fst_state), + // let's use the "extra_prev_tokens" offset, which is unique for + // (frame,fst_state) in that case + + // Adding the total_ntokens offset to avoid collisions with the previous + // case + return (total_ntokens + token.prev_token); +} + +void CudaDecoder::AddFinalTokensToLattice( + ChannelId ichannel, + std::vector> *q_curr_frame_todo, + std::unordered_map + *curr_f_raw_lattice_state, + Lattice *fst_out) { + // Total number of tokens for that utterance. Used in + // GetLatticeStateInternalId + const int32 total_ntokens = h_all_tokens_info_[ichannel].size(); + // Reading the overall best_cost for that utterance's last frame. 
Was set by + // GetBestCost + const CostType best_cost = h_all_argmin_cost_[ichannel].second; + // Iterating through tokens associated with a final state in the last frame + for (auto &p : h_all_final_tokens_list_[ichannel]) { + // This final token has a final cost of final_token_cost + CostType final_token_cost = p.second; + // This token has possibly an extra cost compared to the best + CostType extra_cost = final_token_cost - best_cost; + // We only want to keep paths that have a cost within [best; + // best+lattice_beam] + if (extra_cost > lattice_beam_) { + continue; + } + + const TokenId final_token_idx = p.first; + InfoToken final_token = h_all_tokens_info_[ichannel][final_token_idx]; + + // Internal ID for our lattice_state=(iframe, fst_state) + LatticeStateInternalId state_internal_id = + GetLatticeStateInternalId(total_ntokens, final_token_idx, final_token); + decltype(curr_f_raw_lattice_state->end()) map_it; + bool inserted; + + // We need to create the fst_lattice_state linked to our internal id in the + // lattice if it doesn't already exists + // Inserts only if the key doesn't exist in the map + std::tie(map_it, inserted) = curr_f_raw_lattice_state->insert( + {state_internal_id, {FLT_MAX, -1, false}}); + + // If we've inserted the element, it means that that state didn't exist in + // the map + // Because this is a final state, we need to do a bit of extra work to add + // the final_cost to it + if (inserted) { + // We want to figure out which FST state this token is associated to + // We don't have that info anymore, it wasn't transfered from the GPU + // We still need it for final tokens, because we need to know which + // final cost to add in the lattice. + // To find that original FST state, we need the id of an arc going to + // that state, + // then we'll look in the graph and figure out next_state[arc_idx] + // we just need a valid arc_idx + int32 arc_idx; + if (final_token.IsUniqueTokenForStateAndFrame()) { + // If unique, we can directly use this arc_idx + arc_idx = final_token.arc_idx; + } else { + // If we have multiple tokens associated to that fst state, just pick + // the first one + // from the list + int32 offset, size; + std::tie(offset, size) = final_token.GetSameFSTStateTokensList(); + InfoToken prev_token = + h_all_tokens_extra_prev_tokens_[ichannel][offset]; + arc_idx = prev_token.arc_idx; + } + // Creating the state associated with our internal id in the lattice + OutputLatticeState fst_lattice_final_state = fst_out->AddState(); + map_it->second.fst_lattice_state = fst_lattice_final_state; + q_curr_frame_todo->push_back({final_token_idx, final_token}); + + if (h_all_has_reached_final_[ichannel]) { + // If we have reached final states, adding the final cost + // We now have a valid arc_idx. 
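+        // (fst_.h_arc_nextstate_[arc_idx] gives the destination FST state of
+        // that arc, which is the state whose final cost we read below.)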
We can read the FST state + StateId fst_next_state = fst_.h_arc_nextstate_[arc_idx]; + + fst_out->SetFinal(fst_lattice_final_state, + LatticeWeight(fst_.h_final_[fst_next_state], 0.0)); + } else { + fst_out->SetFinal(fst_lattice_final_state, LatticeWeight::One()); + } + } + + map_it->second.token_extra_cost = + std::min(map_it->second.token_extra_cost, extra_cost); + } +} + +void CudaDecoder::AddArcToLattice( + int32 list_arc_idx, TokenId list_prev_token_idx, InfoToken list_prev_token, + int32 curr_frame_offset, CostType acoustic_cost, + CostType this_arc_prev_token_extra_cost, + LatticeStateInternalId src_state_internal_id, + OutputLatticeState fst_lattice_start, + OutputLatticeState to_fst_lattice_state, + std::vector> *q_curr_frame_todo, + std::vector> *q_prev_frame_todo, + std::unordered_map + *curr_f_raw_lattice_state, + std::unordered_map + *prev_f_raw_lattice_state, + std::unordered_set *f_arc_idx_added, Lattice *fst_out, + bool *must_replay_frame) { + // We will now add this arc to the output lattice + // We know the destination state of the arc (to_fst_lattice_state) + // We need to figure out its source + // And propagate the extra cost from the destination to the source of that arc + // (we go backward) + OutputLatticeState from_fst_lattice_state; + // Having the predecessor in the previous frame + // <=> that token is associated to an emiting arc + bool emitting = (list_prev_token_idx < curr_frame_offset); + // Checking if the source of that arc is the start state (original state at + // the beginning of the decode) + if (list_prev_token_idx != 0) { + // Selecting the right map + // - emitting arc -> previous frame map + // - non emitting arc -> same frame map + auto *extra_cost_map = + emitting ? prev_f_raw_lattice_state : curr_f_raw_lattice_state; + decltype(extra_cost_map->end()) from_map_it; + bool inserted; + // Attempting to insert the state in the map + std::tie(from_map_it, inserted) = + extra_cost_map->insert({src_state_internal_id, {FLT_MAX, -1, false}}); + // If it was inserted, its the first time we insert that key in + // the map + // we need to put that state in the todo list to be considered + // next + if (inserted) { + auto *todo_list = emitting ? q_prev_frame_todo : q_curr_frame_todo; + todo_list->push_back({list_prev_token_idx, list_prev_token}); + from_map_it->second.fst_lattice_state = fst_out->AddState(); + } + + // Updating the source extra cost using that arc + // for an arc a->b + // extra_cost(a) = min(extra_cost(a), + // extra_cost(b) + arc_extra_cost(a->b)) + CostType prev_token_extra_cost = from_map_it->second.token_extra_cost; + if (this_arc_prev_token_extra_cost < prev_token_extra_cost) { + // We found a new min + CostType diff = (prev_token_extra_cost - this_arc_prev_token_extra_cost); + // If the change is large enough, + // and if the state that we're writing to was already closed, + // then we need to replay that frame. + // if the source state is already closed it means we've + // read its extra_cost value. Now we're writing again to it. 
+ // We have to do the first read again, to get the updated + // value + // that's why we're replaying that frame + // (between frames everything is in topological order) + if (diff > extra_cost_min_delta_ && from_map_it->second.is_state_closed) { + *must_replay_frame = true; + } + prev_token_extra_cost = this_arc_prev_token_extra_cost; + from_map_it->second.token_extra_cost = prev_token_extra_cost; + } + + // Reading the OutputLatticeState of the source state in the output lattice + from_fst_lattice_state = from_map_it->second.fst_lattice_state; + } else { + from_fst_lattice_state = + fst_lattice_start; // we simply link it to the source + } + + // Checking if it's the first time we insert an arc with that + // arc_idx for that frame. + // If we're replaying that frame, we don't want duplicates + bool is_this_arc_new = f_arc_idx_added->insert(list_arc_idx).second; + if (is_this_arc_new) { + // The following reads will most likely end up in cache misses + // we could load everything sooner + LatticeArc arc( + fst_.h_arc_id_ilabels_[list_arc_idx], fst_.h_arc_olabels_[list_arc_idx], + LatticeWeight(fst_.h_arc_weights_[list_arc_idx], acoustic_cost), + to_fst_lattice_state); + fst_out->AddArc(from_fst_lattice_state, arc); + } +} + +void CudaDecoder::GetTokenRawLatticeData( + TokenId token_idx, InfoToken token, int32 total_ntokens, + std::unordered_map + *curr_f_raw_lattice_state, + CostType *token_extra_cost, OutputLatticeState *to_fst_lattice_state) { + LatticeStateInternalId next_state_internal_id = + GetLatticeStateInternalId(total_ntokens, token_idx, token); + auto to_map_it = curr_f_raw_lattice_state->find(next_state_internal_id); + // We know this token exists in the output lattice (because it's in + // q_curr_frame_todo_) + KALDI_ASSERT(to_map_it != curr_f_raw_lattice_state->end()); + + *token_extra_cost = to_map_it->second.token_extra_cost; + *to_fst_lattice_state = to_map_it->second.fst_lattice_state; + + // We read the extra cost from lattice_next_state + // We are now closing the state. If we write to it again, we will have + // to replay that frame + // (so that the latest extra_cost value is read) + to_map_it->second.is_state_closed = true; +} + +void CudaDecoder::GetSameFSTStateTokenList( + ChannelId ichannel, InfoToken token, InfoToken **tok_beg, + float2 **extra_extra_and_acoustic_cost_beg, int32 *nsame) { + // We now need to consider all tokens related to that (iframe, + // fst_state) + // with fst_state being the state this current token is linked to + // There's two possibilies: + // a) only one token is associated with that fst_state in that frame. + // The necessary information + // is then stored directly in the token (arc_idx, prev_token) + // b) multiple tokens are associated with that fst_state in that + // frame. The token that we have right now + // only contains information on where to find the list of those + // tokens. 
It contains (offset, size) + // + // In any cases we consider the list of tokens to process as an array + // of InfoToken, which will + // be of size 1 in case a), of size > 1 in case b) + if (token.IsUniqueTokenForStateAndFrame()) { + *tok_beg = &token; + // if we've got only one, extra_cost == 0.0 + *extra_extra_and_acoustic_cost_beg = NULL; + *nsame = 1; + } else { + int32 offset, size; + std::tie(offset, size) = token.GetSameFSTStateTokensList(); + *tok_beg = &h_all_tokens_extra_prev_tokens_[ichannel][offset]; + *extra_extra_and_acoustic_cost_beg = + &h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_[ichannel] + [offset]; + *nsame = size; + } +} + +void CudaDecoder::ConsiderTokenForLattice( + ChannelId ichannel, int32 iprev, int32 total_ntokens, TokenId token_idx, + OutputLatticeState fst_lattice_start, InfoToken *tok_beg, + float2 *extra_extra_and_acoustic_cost_beg, CostType token_extra_cost, + TokenId list_prev_token_idx, int32 list_arc_idx, InfoToken *list_prev_token, + CostType *this_arc_prev_token_extra_cost, CostType *acoustic_cost, + OutputLatticeState *lattice_src_state, bool *keep_arc, + bool *dbg_found_zero) { + CostType arc_extra_cost; + if (extra_extra_and_acoustic_cost_beg) { + float2 both = extra_extra_and_acoustic_cost_beg[iprev]; + arc_extra_cost = both.x; + *acoustic_cost = both.y; + } else { + // If we have only one token for that (iframe,fst_state), + // Its arc has an extra_cost of zero (it's the only way to + // get to that state, so it's the best) + arc_extra_cost = 0.0f; + *acoustic_cost = h_all_tokens_acoustic_cost_[ichannel][token_idx]; + } + // If we use that arc to go to prev_token, prev_token will have the + // following extra cost + *this_arc_prev_token_extra_cost = token_extra_cost + arc_extra_cost; + // We need at least one arc_extra_cost of zero for each (iframe, + // fst_state) + // The only use for that boolean is in a KALDI_ASSERT, + // because if something went wrong in the kernels it's not likely + // that this property will be verified out of luck + *dbg_found_zero |= (arc_extra_cost == 0.0f); + *list_prev_token = h_all_tokens_info_[ichannel][list_prev_token_idx]; + // Source of the arc currently considered + *lattice_src_state = + (list_prev_token_idx != 0) + ? 
GetLatticeStateInternalId(total_ntokens, list_prev_token_idx, + *list_prev_token) + : fst_lattice_start; + + // We only keep the arc if, when using that arc, we can end up + // at the last frame with a cost not worse than (best+lattice_beam) + // this_arc_prev_token_extra_cost contains the accumulated sums + // of extra costs (through the cheapest possible way) to the last + // frame + *keep_arc = (*this_arc_prev_token_extra_cost < lattice_beam_); +} + +void CudaDecoder::SwapPrevAndCurrLatticeMap( + int32 iframe, bool dbg_found_best_path, + std::vector> *q_curr_frame_todo, + std::vector> *q_prev_frame_todo, + std::unordered_map + *curr_f_raw_lattice_state, + std::unordered_map + *prev_f_raw_lattice_state, + std::unordered_set *f_arc_idx_added) { + q_prev_frame_todo->swap(*q_curr_frame_todo); + q_prev_frame_todo->clear(); + prev_f_raw_lattice_state->swap(*curr_f_raw_lattice_state); + prev_f_raw_lattice_state->clear(); + f_arc_idx_added->clear(); + + KALDI_ASSERT(q_prev_frame_todo->empty()); + if (iframe > 0) { + KALDI_ASSERT(!q_curr_frame_todo->empty()); + if (!dbg_found_best_path) { + KALDI_WARN << "Warning didn't find exact best path in GetRawLattice"; + } + } +} + +void CudaDecoder::WaitForH2HCopies() { + std::unique_lock lk(n_h2h_task_not_done_mutex_); + h2h_done_.wait(lk, [this] { return (n_h2h_task_not_done_ == 0); }); +} + +void CudaDecoder::WaitForInitDecodingH2HCopies() { + std::unique_lock lk(n_init_decoding_h2h_task_not_done_mutex_); + init_decoding_h2h_done_.wait( + lk, [this] { return (n_init_decoding_h2h_task_not_done_ == 0); }); +} + +void CudaDecoder::PrepareForGetRawLattice( + const std::vector &channels, bool use_final_probs) { + GetBestCost(channels, use_final_probs, &argmins_, + &list_finals_token_idx_and_cost_, &has_reached_final_); + for (LaneId ilane = 0; ilane < channels.size(); ++ilane) { + ChannelId ichannel = channels[ilane]; + std::lock_guard channel_lk(channel_lock_[ichannel]); + h_all_argmin_cost_[ichannel] = argmins_[ilane]; + h_all_final_tokens_list_[ichannel].swap( + list_finals_token_idx_and_cost_[ilane]); + h_all_has_reached_final_[ichannel] = has_reached_final_[ilane]; + } +} + +void CudaDecoder::ConcurrentGetRawLatticeSingleChannel(const ChannelId ichannel, + Lattice *fst_out) { + nvtxRangePushA("GetRawLatticeOneChannel"); + // Allocating the datastructures that we need + + // [prev|curr]_f_raw_lattice_state + // Used to get information about a lattice state (i.e. a (iframe, fst_state) + // pair) + // using its LatticeStateInternalId (its ID inside of the decoder) + // It gives us the OutputLatticeState (its ID in the output lattice) + // alongside with the extra_cost of that state in the lattice + // Those maps are used to build the external lattice using what we know + // internally + // Using one map per frame. We always know to which frame a token belongs. + // Using one big map slows everything down + std::unordered_map + prev_f_raw_lattice_state, curr_f_raw_lattice_state; + // We want the unicity of each arc_idx for one frame. 
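+  // ("unicity" here: a given arc_idx must end up in the output lattice at
+  // most once for this frame.)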
Important because we + // can replay a frame (and possibly add multiple time the same arc) + std::unordered_set f_arc_idx_added; + // When backtracking, we read tokens in the current frame (in + // q_curr_frame_todo_), + // we backtrack the associated arc, and we add the predecessor either to + // q_curr_frame_todo_ (non-emitting arc, same frame) + // or q_prev_frame_todo_ (emitting arc, source in previous frame) + std::vector> q_curr_frame_todo; + std::vector> q_prev_frame_todo; + + // We need to lock the channel to read argmin + TokenId best_cost_idx; + { + std::lock_guard channel_lk(channel_lock_[ichannel]); + h_all_tokens_info_.shrink_to_fit(); + h_all_tokens_acoustic_cost_.shrink_to_fit(); + h_all_tokens_extra_prev_tokens_.shrink_to_fit(); + h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_.shrink_to_fit(); + best_cost_idx = h_all_argmin_cost_[ichannel].first; + } + KALDI_ASSERT( + "You need to call PrepareForGetRawLattice before " + "ConcurrentGetRawLatticeSingleChannel" && + best_cost_idx >= 0); + const int32 nframes = NumFramesDecoded(ichannel); + // Making sure that this token is available for this channel. + // We're going to read storage data from this channel. Locking it + // If that token in that frame f is available, then all tokens in that frame + // f are available + WaitForH2HCopies(); + std::unique_lock channel_lk(channel_lock_[ichannel]); + // Total number of tokens generated by the utterance on channel ichannel + const int32 total_ntokens = h_all_tokens_info_[ichannel].size(); + + // Preparing output lattice + // The start state has to be 0 (cf some asserts somewhere else in Kaldi) + // Adding it now + fst_out->DeleteStates(); + OutputLatticeState fst_lattice_start = fst_out->AddState(); + fst_out->SetStart(fst_lattice_start); + + // Adding the best tokens returned by GetBestCost to the lattice + // We also add them to q_curr_frame_todo, and we'll backtrack from there + AddFinalTokensToLattice(ichannel, &q_curr_frame_todo, + &curr_f_raw_lattice_state, fst_out); + + // We're now going to backtrack frame by frame + // For each frame we're going to process tokens that need to be inserted + // into the output lattice + // and add their predecessors to the todo list + // iframe == -1 contains the start state and the first non emitting tokens. + // It is not linked to a real frame + for (int32 iframe = nframes - 1; iframe >= -1; --iframe) { + // Tokens for the current frame were inserted after this offset in the + // token list + const int32 curr_frame_offset = + (iframe >= 0) ? frame_offsets_[ichannel][iframe] : 0; + + // bool must_replay_frame + // In some cases we can update an extra_cost that has already been used + // For instance we process arcs in that order : + // 1) a -> b, which updates extra_cost[b] using extra_cost[a] + // 2) c -> a, which updates extra-cost[a] (using extra_cost[c]) + // because the arcs were not considered in topological order, we need to + // run + // again the step 1, + // to get the correct extra_cost[b] (using the latest extra_cost[a]) + // However, we only re-run the step 1 if the value extra_cost[a] has + // changed more than extra_cost_min_delta_ + bool must_replay_frame; + + // dbg_found_best_path is used in an useful assert, making sure the best + // path is still there for each frame + // if something went wrong in the kernels, it's not likely we respect that + // property out of luck + bool dbg_found_best_path = false; + do { + must_replay_frame = false; + // Reading something to do. 
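+      // (This do/while replays the whole frame whenever must_replay_frame is
+      // set, i.e. whenever the extra_cost of an already-closed state was
+      // later improved by more than extra_cost_min_delta_.)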
We are pushing stuff back in + // q_curr_frame_todo while reading it, + // so it's important to always read q_curr_frame_todo_.size() directly + // not using a queue, because we may need to recompute the frame (if + // must_replay_frame is true) + for (int32 u = 0; u < q_curr_frame_todo.size(); ++u) { + TokenId token_idx; + InfoToken token; + std::tie(token_idx, token) = q_curr_frame_todo[u]; + KALDI_ASSERT(token_idx >= curr_frame_offset); + CostType token_extra_cost; + StateId to_fst_lattice_state; + // Loading the current extra_cost of that token + // + its associated state in the lattice + GetTokenRawLatticeData(token_idx, token, total_ntokens, + &curr_f_raw_lattice_state, &token_extra_cost, + &to_fst_lattice_state); + dbg_found_best_path |= (token_extra_cost == 0.0f); + + InfoToken *tok_beg; + float2 *extra_extra_and_acoustic_cost_beg; + int32 nsamestate; + // Getting the list of the tokens linked to the same FST state, in the + // same frame + // In the GPU decoder a token is linked to a single arc, but we can + // generate + // multiple token for a same fst_nextstate in the same frame. + // In the CPU decoder we would use the forward_links list to store + // everything in the same metatoken + // GetSameFSTStateTokenList returns the list of tokens linked to the + // same FST state than token + // (in the current frame) + GetSameFSTStateTokenList(ichannel, token, &tok_beg, + &extra_extra_and_acoustic_cost_beg, + &nsamestate); + + // dbg_found_zero used for debugging. For each FST state, we have a + // token with the + // best cost for that FST state + // that token has an extra_cost of 0.0f. This is a sanity check + bool dbg_found_zero = false; + for (int32 iprev = 0; iprev < nsamestate; ++iprev) { + InfoToken list_prev_token; + CostType acoustic_cost, this_arc_prev_token_extra_cost; + bool keep_arc; + LatticeStateInternalId src_state_internal_id; + InfoToken list_token = tok_beg[iprev]; + int32 list_prev_token_idx = list_token.prev_token; + int32 list_arc_idx = list_token.arc_idx; + + ConsiderTokenForLattice( + ichannel, iprev, total_ntokens, token_idx, fst_lattice_start, + tok_beg, extra_extra_and_acoustic_cost_beg, token_extra_cost, + list_prev_token_idx, list_arc_idx, &list_prev_token, + &this_arc_prev_token_extra_cost, &acoustic_cost, + &src_state_internal_id, &keep_arc, &dbg_found_zero); + + if (keep_arc) + AddArcToLattice(list_arc_idx, list_prev_token_idx, list_prev_token, + curr_frame_offset, acoustic_cost, + this_arc_prev_token_extra_cost, + src_state_internal_id, fst_lattice_start, + to_fst_lattice_state, &q_curr_frame_todo, + &q_prev_frame_todo, &curr_f_raw_lattice_state, + &prev_f_raw_lattice_state, &f_arc_idx_added, + fst_out, &must_replay_frame); + } + KALDI_ASSERT(dbg_found_zero); + } + + if (must_replay_frame) { + // We need to replay the frame. Because all states will be read again, + // we can reopen them (and they will be closed again when being read + // from again) + for (auto it = curr_f_raw_lattice_state.begin(); + it != curr_f_raw_lattice_state.end(); ++it) { + it->second.is_state_closed = false; + } + } + } while (must_replay_frame); + + // Done processing this frame. 
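+    // (After the swap, the tokens that were queued in q_prev_frame_todo
+    // while processing this frame become the q_curr_frame_todo of the
+    // previous frame, which is the next one we backtrack through.)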
Swap the datastructures, move on to + // previous frame (we go --iframe) + SwapPrevAndCurrLatticeMap(iframe, dbg_found_best_path, &q_curr_frame_todo, + &q_prev_frame_todo, &curr_f_raw_lattice_state, + &prev_f_raw_lattice_state, &f_arc_idx_added); + } + nvtxRangePop(); +} + +void CudaDecoder::GetRawLattice(const std::vector &channels, + std::vector &fst_out_vec, + bool use_final_probs) { + KALDI_ASSERT(channels.size() == fst_out_vec.size()); + // Getting the list of the best costs in the lastest token queue. + // all costs within [best;best+lattice_beam] + PrepareForGetRawLattice(channels, use_final_probs); + for (int32 ilane = 0; ilane < channels.size(); ++ilane) { + const ChannelId ichannel = channels[ilane]; + Lattice *fst_out = fst_out_vec[ilane]; + ConcurrentGetRawLatticeSingleChannel(ichannel, fst_out); + } +} + +void CudaDecoder::SetChannelsInKernelParams( + const std::vector &channels) { + KALDI_ASSERT(channels.size() <= nchannels_); + KALDI_ASSERT(channels.size() <= nlanes_); + for (LaneId lane_id = 0; lane_id < channels.size(); ++lane_id) + channel_to_compute_[lane_id] = channels[lane_id]; + h_kernel_params_->nlanes_used = channels.size(); + nlanes_used_ = channels.size(); +} + +void CudaDecoder::ResetChannelsInKernelParams() { + h_kernel_params_->nlanes_used = 0; + nlanes_used_ = 0; +} + +int32 CudaDecoder::NumFramesDecoded(ChannelId ichannel) const { + KALDI_ASSERT(ichannel < nchannels_); + return num_frames_decoded_[ichannel]; +} + +void CudaDecoder::CheckStaticAsserts() { + // Checking if all constants look ok + + // We need that because we need to be able to do the scan in one pass in the + // kernel + // update_beam_using_histogram_kernel + KALDI_ASSERT(KALDI_CUDA_DECODER_HISTO_NBINS < KALDI_CUDA_DECODER_1D_BLOCK); + KALDI_ASSERT(KALDI_CUDA_DECODER_NONEM_LT_MAX_NARCS > 0); +} + +void CudaDecoder::LaunchH2HCopies() { + // Each H2H copy counter + n_acoustic_h2h_copies_todo_.store(nlanes_used_ - 1); + n_infotoken_h2h_copies_todo_.store(nlanes_used_ - 1); + n_extra_prev_tokens_h2h_copies_todo_.store(nlanes_used_ - 1); + + { + std::lock_guard n_h2h_not_done_lk(n_h2h_task_not_done_mutex_); + n_h2h_task_not_done_ += thread_pool_ ? n_threads_used_ : 1; + } + { + std::lock_guard n_h2h_todo_lk(n_h2h_main_task_todo_mutex_); + n_h2h_main_task_todo_ = thread_pool_ ? n_threads_used_ : 1; + } + + // Either do the copy locally or send it to the threadpool + if (thread_pool_) { + n_h2h_main_task_todo_cv_.notify_all(); + } else { + ComputeH2HCopies(); + } +} + +void CudaDecoder::ComputeH2HCopiesCPUWorker() { + // Run by a dedicated CPU thread + while (h2h_threads_running_) { + ComputeH2HCopies(); + } +} + +void CudaDecoder::ComputeH2HCopies() { + // Waiting for either something to do or the instruction to stop the threads + { + std::unique_lock n_h2h_lk(n_h2h_main_task_todo_mutex_); + n_h2h_main_task_todo_cv_.wait(n_h2h_lk, [this] { + return !h2h_threads_running_ || (n_h2h_main_task_todo_ > 0); + }); + --n_h2h_main_task_todo_; + } + // If we are done, stop the wait and return now. ComputeH2HCopiesCPUWorker + // will also return, + // stopping the thread + if (!h2h_threads_running_) return; + // Waiting for the D2H copies. 
This is threadsafe + // Step 1: acoustic costs + cudaEventSynchronize(d2h_copy_acoustic_evt_); + int32 ilane; + while ((ilane = n_acoustic_h2h_copies_todo_.fetch_sub(1)) >= 0) { + int32 ichannel = lanes2channels_todo_[ilane]; + // Lock Channel + std::lock_guard channel_lk(channel_lock_[ichannel]); + MoveConcatenatedCopyToVector( + ilane, ichannel, h_emitting_main_q_end_lane_offsets_, + h_acoustic_cost_concat_, &h_all_tokens_acoustic_cost_); + // Adding 0.0f acoustic_costs for non-emittings + int32 main_q_end = h_main_q_end_lane_offsets_[ilane + 1] - + h_main_q_end_lane_offsets_[ilane]; + int32 ntokens_emitting = h_emitting_main_q_end_lane_offsets_[ilane + 1] - + h_emitting_main_q_end_lane_offsets_[ilane]; + int32 ntokens_nonemitting = main_q_end - ntokens_emitting; + auto &vec = h_all_tokens_acoustic_cost_[ichannel]; + vec.insert(vec.end(), ntokens_nonemitting, 0.0f); + } + + // Step 2: infotoken + cudaEventSynchronize(d2h_copy_infotoken_evt_); + while ((ilane = n_infotoken_h2h_copies_todo_.fetch_sub(1)) >= 0) { + int32 ichannel = lanes2channels_todo_[ilane]; + // Lock Channel + std::lock_guard channel_lk(channel_lock_[ichannel]); + MoveConcatenatedCopyToVector(ilane, ichannel, h_main_q_end_lane_offsets_, + h_infotoken_concat_, &h_all_tokens_info_); + } + + // Step 3: extra prev tokens + cudaEventSynchronize(d2h_copy_extra_prev_tokens_evt_); + while ((ilane = n_extra_prev_tokens_h2h_copies_todo_.fetch_sub(1)) >= 0) { + int32 ichannel = lanes2channels_todo_[ilane]; + // Lock Channel + std::lock_guard channel_lk(channel_lock_[ichannel]); + MoveConcatenatedCopyToVector( + ilane, ichannel, h_n_extra_prev_tokens_lane_offsets_, + h_extra_prev_tokens_concat_, &h_all_tokens_extra_prev_tokens_); + MoveConcatenatedCopyToVector( + ilane, ichannel, h_n_extra_prev_tokens_lane_offsets_, + h_extra_and_acoustic_cost_concat_, + &h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_); + } + + // If we're the last cpu thread to complete the current tasks, notify the main + // thread + bool all_done; + { + std::lock_guard lk_not_done(n_h2h_task_not_done_mutex_); + all_done = (--n_h2h_task_not_done_ == 0); + } + if (all_done) { + h2h_done_.notify_all(); + } +} + +void CudaDecoder::SetThreadPoolAndStartCPUWorkers(ThreadPool *thread_pool, + int32 nworkers) { + KALDI_ASSERT(nworkers > 0); + n_threads_used_ = nworkers; + thread_pool_ = thread_pool; + for (int32 i = 0; i < nworkers; ++i) + cpu_dedicated_threads_.emplace_back(&CudaDecoder::ComputeH2HCopiesCPUWorker, + this); +} + +} // end namespace cuda_decoder +} // end namespace kaldi + +#endif // HAVE_CUDA == 1 diff --git a/src/cudadecoder/cuda-decoder.h b/src/cudadecoder/cuda-decoder.h new file mode 100644 index 00000000000..4db4424853e --- /dev/null +++ b/src/cudadecoder/cuda-decoder.h @@ -0,0 +1,851 @@ +// cudadecoder/cuda-decoder.h +// +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// Hugo Braun, Justin Luitjens, Ryan Leary +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
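// ---------------------------------------------------------------------------
// Illustrative sketch (editorial aside, not part of the Kaldi sources):
// LaunchH2HCopies/ComputeH2HCopies above distribute the per-lane host-side
// unpacking work across a fixed set of dedicated CPU threads. Each worker is
// woken through a condition variable, then claims lane indices with an atomic
// fetch_sub until the counter goes negative; the last worker to finish
// notifies the thread blocked in WaitForH2HCopies. The standalone program
// below reproduces only that claiming pattern, with hypothetical names
// (kNumLanes, kNumWorkers, ProcessLane); it is a sketch of the idea under
// those assumptions, not the decoder's actual implementation.

#include <atomic>
#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <thread>
#include <vector>

namespace {
constexpr int kNumLanes = 8;    // lanes to unpack this frame (assumption)
constexpr int kNumWorkers = 3;  // dedicated CPU threads (assumption)

std::atomic<int> lanes_todo{kNumLanes - 1};  // next lane index to claim
std::mutex done_mutex;
std::condition_variable all_done_cv;
int workers_not_done = kNumWorkers;

void ProcessLane(int ilane) {
  // Placeholder for the real per-lane unpacking (MoveConcatenatedCopyToVector
  // under the channel lock, in the decoder above).
  std::printf("unpacking lane %d\n", ilane);
}

void Worker() {
  int ilane;
  // fetch_sub returns the previous value; lanes are claimed until it is < 0,
  // so each lane index is processed by exactly one worker.
  while ((ilane = lanes_todo.fetch_sub(1)) >= 0) ProcessLane(ilane);
  bool last;
  {
    std::lock_guard<std::mutex> lk(done_mutex);
    last = (--workers_not_done == 0);
  }
  // The last worker to complete wakes up the waiting "main" thread.
  if (last) all_done_cv.notify_all();
}
}  // namespace

int main() {
  std::vector<std::thread> workers;
  for (int i = 0; i < kNumWorkers; ++i) workers.emplace_back(Worker);
  {
    // Equivalent of WaitForH2HCopies(): block until every worker is done.
    std::unique_lock<std::mutex> lk(done_mutex);
    all_done_cv.wait(lk, [] { return workers_not_done == 0; });
  }
  for (auto &t : workers) t.join();
  return 0;
}
// ---------------------------------------------------------------------------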
+ +#ifndef KALDI_CUDA_DECODER_CUDA_DECODER_H_ +#define KALDI_CUDA_DECODER_CUDA_DECODER_H_ + +#include "cudadecoder/cuda-decodable-itf.h" +#include "cudadecoder/cuda-decoder-common.h" +#include "cudadecoder/cuda-fst.h" +#include "nnet3/decodable-online-looped.h" +#include "thread-pool.h" + +#include +#include +#include +#include +namespace kaldi { +namespace cuda_decoder { + +struct CudaDecoderConfig { + BaseFloat default_beam; + BaseFloat lattice_beam; + int32 ntokens_pre_allocated; + int32 main_q_capacity, aux_q_capacity; + int32 max_active; + + CudaDecoderConfig() + : default_beam(15.0), + lattice_beam(10.0), + ntokens_pre_allocated(2000000), + main_q_capacity(-1), + aux_q_capacity(-1), + max_active(10000) {} + + void Register(OptionsItf *opts) { + opts->Register("beam", &default_beam, + "Decoding beam. Larger->slower, more accurate. If " + "aux-q-capacity is too small, we may decrease the beam " + "dynamically to avoid overflow (adaptive beam, see " + "aux-q-capacity parameter)"); + opts->Register("lattice-beam", &lattice_beam, + "The width of the lattice beam"); + opts->Register("max-active", &max_active, + "At the end of each frame computation, we keep only its " + "best max-active tokens. One token is the instantiation of " + "a single arc. Typical values are within the 5k-10k range."); + opts->Register("ntokens-pre-allocated", &ntokens_pre_allocated, + "Advanced - Number of tokens pre-allocated in host buffers. " + "If this size is exceeded the buffer will reallocate, " + "reducing performance."); + std::ostringstream main_q_capacity_desc; + main_q_capacity_desc + << "Advanced - Capacity of the main queue : Maximum number of " + "tokens that can be stored *after* pruning for each frame. " + "Lower -> less memory usage, Higher -> More accurate. " + "Tokens stored in the main queue were already selected " + "through a max-active pre-selection. It means that for each " + "emitting/non-emitting iteration, we can add at most " + "~max-active tokens to the main queue. Typically only the " + "emitting iteration creates a large number of tokens. Using " + "main-q-capacity=k*max-active with k=4..10 should be safe. " + "If main-q-capacity is too small, we will print a warning " + "but prevent the overflow. The computation can safely " + "continue, but the quality of the output may decrease " + "(-1 = set to " + << KALDI_CUDA_DECODER_MAX_ACTIVE_MAIN_Q_CAPACITY_FACTOR + << "*max-active)."; + opts->Register("main-q-capacity", &main_q_capacity, + main_q_capacity_desc.str()); + std::ostringstream aux_q_capacity_desc; + aux_q_capacity_desc + << "Advanced - Capacity of the auxiliary queue : Maximum " + "number of raw tokens that can be stored *before* pruning " + "for each frame. Lower -> less memory usage, Higher -> More " + "accurate. During the tokens generation, if we detect that " + "we are getting close to saturating that capacity, we will " + "reduce the beam dynamically (adaptive beam) to keep only " + "the best tokens in the remaining space. If the aux queue " + "is still too small, we will print an overflow warning, but " + "prevent the overflow. The computation can safely continue, " + "but the quality of the output may decrease. 
We strongly " + "recommend keeping aux-q-capacity large (>400k), to avoid " + "triggering the adaptive beam and/or the overflow " + "(-1 = set to " + << KALDI_CUDA_DECODER_AUX_Q_MAIN_Q_CAPACITIES_FACTOR + << "*main-q-capacity)."; + opts->Register("aux-q-capacity", &aux_q_capacity, + aux_q_capacity_desc.str()); + } + + void Check() const { + KALDI_ASSERT(default_beam > 0.0 && ntokens_pre_allocated >= 0 && + lattice_beam >= 0.0f && max_active > 0); + } + + void ComputeConfig() { + if (main_q_capacity == -1) + main_q_capacity = + max_active * KALDI_CUDA_DECODER_MAX_ACTIVE_MAIN_Q_CAPACITY_FACTOR; + if (aux_q_capacity == -1) + aux_q_capacity = + main_q_capacity * KALDI_CUDA_DECODER_AUX_Q_MAIN_Q_CAPACITIES_FACTOR; + } +}; + +// Forward declaration. +// Those contains CUDA code. We don't want to include their definition +// in this header +class DeviceParams; +class KernelParams; + +class CudaDecoder { + public: + // Creating a new CudaDecoder, associated to the FST fst + // nlanes and nchannels are defined as follow + + // A decoder channel is linked to one utterance. + // When we need to perform decoding on an utterance, + // we pick an available channel, call InitDecoding on that channel + // (with that ChannelId in the channels vector in the arguments) + // then call AdvanceDecoding whenever frames are ready for the decoder + // for that utterance (also passing the same ChannelId to AdvanceDecoding) + // + // A decoder lane is where the computation actually happens + // a decoder lane is channel, and perform the actual decoding + // of that channel. + // If we have 200 lanes, we can compute 200 utterances (channels) + // at the same time. We need many lanes in parallel to saturate the big GPUs + // + // An analogy would be lane -> a CPU core, channel -> a software thread + // A channel saves the current state of the decoding for a given utterance. + // It can be kept idle until more frames are ready to be processed + // + // We will use as many lanes as necessary to saturate the GPU, but not more. + // A lane has an higher memory usage than a channel. If you just want to be + // able to + // keep more audio channels open at the same time (when I/O is the bottleneck + // for instance, + // typically in the context of online decoding), you should instead use more + // channels. + // + // A channel is typically way smaller in term of memory usage, and can be used + // to oversubsribe lanes in the context of online decoding + // For instance, we could choose nlanes=200 because it gives us good + // performance + // on a given GPU. It gives us an end-to-end performance of 3000 XRTF. We are + // doing online, + // so we only get audio at realtime speed for a given utterance/channel. + // We then decide to receive audio from 2500 audio channels at the same time + // (each at realtime speed), + // and as soon as we have frames ready for nlanes=200 channels, we call + // AdvanceDecoding on those channels + // In that configuration, we have nlanes=200 (for performance), and + // nchannels=2500 (to have enough audio + // available at a given time). + // Using nlanes=2500 in that configuration would first not be possible (out of + // memory), but also not necessary. + // Increasing the number of lanes is only useful if it increases performance. 
+ // If the GPU is saturated at nlanes=200, + // you should not increase that number + CudaDecoder(const CudaFst &fst, const CudaDecoderConfig &config, int32 nlanes, + int32 nchannels); + + // Reads the config from config + void ReadConfig(const CudaDecoderConfig &config); + // Special constructor for nlanes = nchannels. Here for the non-advanced user + // Here we can consider nchannels = batch size. If we want to decode 10 + // utterances at a time, + // we can use nchannels = 10 + CudaDecoder(const CudaFst &fst, const CudaDecoderConfig &config, + int32 nchannels) + : CudaDecoder(fst, config, nchannels, nchannels) {} + ~CudaDecoder(); + + // InitDecoding initializes the decoding, and should only be used if you + // intend to call AdvanceDecoding() on the channels listed in channels + void InitDecoding(const std::vector &channels); + // Computes the heavy H2H copies of InitDecoding. Usually launched on the + // threadpool + void InitDecodingH2HCopies(ChannelId ichannel); + // AdvanceDecoding on a given batch + // a batch is defined by the channels vector + // We can compute N channels at the same time (in the same batch) + // where N = number of lanes, as defined in the constructor + // AdvanceDecoding will compute as many frames as possible while running the + // full batch + // when at least one channel has no more frames ready to be computed, + // AdvanceDecoding returns + // The user then decides what to do, i.e.: + // + // 1) either remove the empty channel from the channels list + // and call again AdvanceDecoding + // 2) or swap the empty channel with another one that has frames ready + // and call again AdvanceDecoding + // + // Solution 2) should be preferred because we need to run full, big batches to + // saturate the GPU + // + // If max_num_frames is >= 0 it will decode no more than + // that many frames. + void AdvanceDecoding(const std::vector &channels, + std::vector &decodables, + int32 max_num_frames = -1); + + // Returns the number of frames already decoded in a given channel + int32 NumFramesDecoded(ChannelId ichannel) const; + // GetBestPath gets the one-best decoding traceback. If "use_final_probs" is + // true + // AND we reached a final state, it limits itself to final states; + // otherwise it gets the most likely token not taking into account + // final-probs. + void GetBestPath(const std::vector &channels, + std::vector &fst_out_vec, + bool use_final_probs = true); + // It is possible to use a threadsafe version of GetRawLattice, which is + // ConcurrentGetRawLatticeSingleChannel() + // Which will do the heavy CPU work associated with GetRawLattice + // It is necessary to first call PrepareForGetRawLattice *on the main thread* + // on the channels. + // The main thread is the one we use to call all other functions, like + // InitDecoding or AdvanceDecoding + // We usually call it "cuda control thread", but it is a CPU thread + // For example: + // on main cpu thread : Call PrepareForGetRawLattice on channel 8,6,3 + // then: + // on some cpu thread : Call ConcurrentGetRawLatticeSingleChannel on channel 3 + // on some cpu thread : Call ConcurrentGetRawLatticeSingleChannel on channel 8 + // on some cpu thread : Call ConcurrentGetRawLatticeSingleChannel on channel 6 + void PrepareForGetRawLattice(const std::vector &channels, + bool use_final_probs); + void ConcurrentGetRawLatticeSingleChannel(ChannelId ichannel, + Lattice *fst_out); + + // GetRawLattice gets the lattice decoding traceback (using the lattice-beam + // in the CudaConfig parameters). 
+ // If "use_final_probs" is true + // AND we reached a final state, it limits itself to final states; + // otherwise it gets the most likely token not taking into account + // final-probs. + void GetRawLattice(const std::vector &channels, + std::vector &fst_out_vec, bool use_final_probs); + // GetBestCost finds the best cost in the last tokens queue + // for each channel in channels. If isfinal is true, + // we also add the final cost to the token costs before + // finding the minimum cost + // We list all tokens that have a cost within [best; best+lattice_beam] + // in list_lattice_tokens. + // We alsos set has_reached_final[ichannel] to true if token associated to a + // final state + // exists in the last token queue of that channel + void GetBestCost( + const std::vector &channels, bool isfinal, + std::vector> *argmins, + std::vector>> *list_lattice_tokens, + std::vector *has_reached_final); + // (optional) Giving the decoder access to the cpu thread pool + // We will use it to compute specific CPU work, such as InitDecodingH2HCopies + // For recurrent CPU work, such as ComputeH2HCopies, we will use dedicated CPU + // threads + // We will launch nworkers of those threads + void SetThreadPoolAndStartCPUWorkers(ThreadPool *thread_pool, int32 nworkers); + + private: + // Data allocation. Called in constructor + void AllocateDeviceData(); + void AllocateHostData(); + void AllocateDeviceKernelParams(); + // Data initialization. Called in constructor + void InitDeviceData(); + void InitHostData(); + void InitDeviceParams(); + // Computes the initial channel + // The initial channel is used to initialize a channel + // when a new utterance starts (we clone it into the given channel) + void ComputeInitialChannel(); + // Updates *h_kernel_params using channels + void SetChannelsInKernelParams(const std::vector &channels); + void ResetChannelsInKernelParams(); + // Context-switch functions + // Used to perform the context-switch of load/saving the state of a channels + // into a lane. When a channel will be executed on a lane, we load that + // channel into that lane (same idea than when we load a software threads into + // the registers of a CPU) + void LoadChannelsStateToLanes(const std::vector &channels); + void SaveChannelsStateFromLanes(); + // We compute the decodes by batch. Each decodable in the batch has a + // different number of frames ready + // We compute the min number of frames ready (so that the full batch is + // executing). If max_num_frames + // is > 0, we apply that ceiling to the NumFramesToDecode. + int32 NumFramesToDecode(const std::vector &channels, + std::vector &decodables, + int32 max_num_frames); + // Expand the arcs, emitting stage. Must be called after + // a preprocess_in_place, which happens in PostProcessingMainQueue. + // ExpandArcsEmitting is called first when decoding a frame, + // using the preprocessing that happened at the end of the previous frame, + // in PostProcessingMainQueue + void ExpandArcsEmitting(); + // ExpandArcs, non-emitting stage. Must be called after PruneAndPreprocess. + void ExpandArcsNonEmitting(); + // If we have more than max_active_ tokens in the queue (either after an + // expand, or at the end of the frame) + // we will compute a new beam that will only keep a number of tokens as close + // as possible to max_active_ tokens + // (that number is >= max_active_) (soft topk) + // All ApplyMaxActiveAndReduceBeam is find the right beam for that topk and + // set it. 
+ // We need to then call PruneAndPreprocess (explicitly pruning tokens with + // cost > beam) + // Or PostProcessingMainQueue (ignoring tokens with cost > beam in the next + // frame) + void ApplyMaxActiveAndReduceBeam(enum QUEUE_ID queue_id); + // Called after an ExpandArcs. Prune the aux_q (output of the ExpandArcs), + // move the survival tokens to the main_q, do the preprocessing at the same + // time + // We don't need it after the last ExpandArcsNonEmitting. + void PruneAndPreprocess(); + // Once the non-emitting is done, the main_q is final for that frame. + // We now generate all the data associated with that main_q, such as listing + // the different tokens sharing the same token.next_state + // we also preprocess for the ExpandArcsEmitting of the next frame + // Once PostProcessingMainQueue, all working data is back to its original + // state, to make sure we're ready for the next context switch + void PostProcessingMainQueue(); + // Moving the relevant data to host, ie the data that will be needed in + // GetBestPath/GetRawLattice. + // Happens when PostProcessingMainQueue is done generating that data + void CopyMainQueueDataToHost(); + // CheckOverflow + // If a kernel sets the flag h_q_overflow, we send a warning to stderr + // Overflows are detected and prevented on the device. It only means + // that we've discarded the tokens that were created after the queue was full + // That's why we only send a warning. It is not a fatal error + void CheckOverflow(); + // Evaluates the function func for each lane, returning the max of all return + // values + // (func returns int32) + // Used for instance to ge the max number of arcs for all lanes + // func is called with h_lanes_counters_[ilane] for each lane. + // h_lanes_counters_ + // must be ready to be used when calling GetMaxForAllLanes (you might want to + // call + // CopyLaneCountersToHost[A|]sync to make sure everything is ready first) + int32 GetMaxForAllLanes(std::function func); + // Copy the lane counters back to host, async or sync + // The lanes counters contain all the information such as main_q_end (number + // of tokens in the main_q) + // main_q_narcs (number of arcs) during the computation. That's why we + // frequently copy it back to host + // to know what to do next + void CopyLaneCountersToHostAsync(); + void CopyLaneCountersToHostSync(); + // The selected tokens for each frame will be copied back to host. We will + // store them on host memory, and we wil use them to create the final lattice + // once we've reached the last frame + // We will also copy information on those tokens that we've generated on the + // device, such as which tokens are associated to the same FST state in the + // same frame, or their extra cost. + // We cannot call individuals Device2Host copies for each channel, because it + // would lead to a lot of small copies, reducing performance. 
Instead we + // concatenate all channels data into a single + // continuous array, copy that array to host, then unpack it to the individual + // channel vectors + // The first step (pack then copy to host, async) is done in + // ConcatenateData + // The second step is done in LaunchD2H and sLaunchH2HCopies + // A sync on cudaStream st has to happen between the two functions to make + // sure that the copy is done + // + // Each lane contains X elements to be copied, where X = func(ilane) + // That data is contained in the array (pointer, X), with pointer = src[ilane] + // It will be concatenated in d_concat on device, then copied async into + // h_concat + // That copy is launched on stream st + // The offset of the data of each lane in the concatenate array is saved in + // *lanes_offsets_ptr + // it will be used for unpacking in MoveConcatenatedCopyToVector + // + // func is called with h_lanes_counters_[ilane] for each lane. + // h_lanes_counters_ + // must be ready to be used when calling GetMaxForAllLanes (you might want to + // call + // CopyLaneCountersToHost[A|]sync to make sure everything is ready first) + // Concatenate data on device before calling the D2H copies + void ConcatenateData(); + // Start the D2H copies used to send data back to host at the end of each + // frames + void LaunchD2HCopies(); + // ComputeH2HCopies + // At the end of each frame, we copy data back to host + // That data was concatenated into a single continous array + // We then have to unpack it and move it inside host memory + // This is done by ComputeH2HCopies + void ComputeH2HCopies(); + // Takes care of preparing the data for ComputeH2HCopies + // and check whether we can use the threadpool or we have to do the work on + // the current thread + void LaunchH2HCopies(); + // Function called by the CPU worker threads + // Calls ComputeH2HCopies when triggered + void ComputeH2HCopiesCPUWorker(); + + template + void MoveConcatenatedCopyToVector(const LaneId ilane, + const ChannelId ichannel, + const std::vector &lanes_offsets, + T *h_concat, + std::vector> *vecvec); + void WaitForH2HCopies(); + void WaitForInitDecodingH2HCopies(); + // Computes a set of static asserts on the static values + // In theory we should do them at compile time + void CheckStaticAsserts(); + // Can be called in GetRawLattice to do a bunch of deep asserts on the data + // Slow, so disabled by default + void DebugValidateLattice(); + + // + // Data members + // + + // The CudaFst data structure contains the FST graph + // in the CSR format, on both the GPU and CPU memory + const CudaFst fst_; + // Counters used by a decoder lane + // Contains all the single values generated during computation, + // such as the current size of the main_q, the number of arcs currently in + // that queue + // We load data from the channel state during context-switch (for instance the + // size of the last token queue for that channel) + HostLaneMatrix h_lanes_counters_; + // Counters of channels + // Contains all the single values saved to remember the state of a channel + // not used during computation. Those values are loaded/saved into/from a lane + // during context switching + ChannelCounters *h_channels_counters_; + // Contain the various counters used by lanes/channels, such as main_q_end, + // main_q_narcs. 
On device memory (equivalent of h_channels_counters on + // device) + DeviceChannelMatrix d_channels_counters_; + DeviceLaneMatrix d_lanes_counters_; + // Number of lanes and channels, as defined in the constructor arguments + int32 nlanes_, nchannels_; + + // We will now define the data used on the GPU + // The data is mainly linked to two token queues + // - the main queue + // - the auxiliary queue + // + // The auxiliary queue is used to store the raw output of ExpandArcs. + // We then prune that aux queue (and apply max-active) and move the survival + // tokens in the main queue. + // Tokens stored in the main q can then be used to generate new tokens (using + // ExpandArcs) + // We also generate more information about what's in the main_q at the end of + // a frame (in PostProcessingMainQueue) + // + // As a reminder, here's the data structure of a token : + // + // struct Token { state, cost, prev_token, arc_idx } + // + // Please keep in mind that this structure is also used in the context + // of lattice decoding. We are not storing a list of forward links like in the + // CPU decoder. A token stays an instanciation of an single arc. + // + // For performance reasons, we split the tokens in three parts : + // { state } , { cost }, { prev_token, arc_idx } + // Each part has its associated queue + // For instance, d_main_q_state[i], d_main_q_cost[i], d_main_q_info[i] + // all refer to the same token (at index i) + // The data structure InfoToken contains { prev_token, arc_idx } + // We also store the acoustic costs independently in d_main_q_acoustic_cost_ + // + // The data is eiher linked to a channel, or to a lane. + // + // Channel data (DeviceChannelMatrix): + // + // The data linked with a channel contains the data of frame i we need to + // remember + // to compute frame i+1. It is the list of tokens from frame i, with some + // additional info + // (ie the prefix sum of the emitting arcs degrees from those tokens). + // We are only storing d_main_q_state_and_cost_ as channel data because that's + // all we need in a token to compute + // frame i+1. We don't need token.arc_idx or token.prev_token. + // The reason why we also store that prefix sum is because we do the emitting + // preprocessing + // at the end of frame i. The reason for that is that we need infos from the + // hashmap to do that preprocessing. + // The hashmap is always cleared at the end of a frame. So we need to do the + // preprocessing at the end of frame i, + // and then save d_main_q_degrees_prefix_sum_. d_main_q_arc_offsets is + // generated also during preprocessing. + // + // Lane data (DeviceLaneMatrix): + // + // The lane data is everything we use during computation, but which we reset + // at the end of each frame. + // For instance we use a hashmap at some point during the computation, but at + // the end of each frame we reset it. That + // way that hashmap is able to compute whichever channel the next time + // AdvanceDecoding is called. The reasons why we do that is : + // + // - We use context switching. Before and after every frames, we can do a + // context switching. Which means that a lane cannot save a channel's state + // in any way once AdvanceDecoding returns. e.g., during a call of + // AdvanceDecoding, ilane=2 may compute 5 frames from channel=57 (as defined + // in the std::vector channels). + // In the next call, the same ilane=2 may compute 10 frames from channel=231. + // A lane data has to be reset to its original state at the end of each + // AdvanceDecoding call. 
+ // If somehow some data has to be saved, it needs to be declared as channel + // data. + // + // - The reason why we make the distinction between lane and channel data (in + // theory everything could be consider channel data), is because + // a lane uses more memory than a channel. In the context of online decoding, + // we need to create a lot channels, and we need them to be as small as + // possible in memory. + // Everything that can be reused between channels is stored as lane data. + + // + // Channel data members: + // + + DeviceChannelMatrix d_main_q_state_and_cost_; + // Prefix sum of the arc's degrees in the main_q. Used by ExpandArcs, + // set in the preprocess stages (either PruneAndPreprocess or + // preprocess_in_place in PostProcessingMainQueue) + DeviceChannelMatrix d_main_q_degrees_prefix_sum_; + // d_main_q_arc_offsets[i] = fst_.arc_offsets[d_main_q_state[i]] + // we pay the price for the random memory accesses of fst_.arc_offsets in the + // preprocess kernel + // we cache the results in d_main_q_arc_offsets which will be read in a + // coalesced fashion in expand + DeviceChannelMatrix d_main_q_arc_offsets_; + + // + // Lane data members: + // + + // InfoToken + // Usually contains {prev_token, arc_idx} + // If more than one token is associated to a fst_state, + // it will contain where to find the list of those tokens in + // d_main_q_extra_prev_tokens + // ie {offset,size} in that list. We differentiate the two situations by + // calling InfoToken.IsUniqueTokenForStateAndFrame() + DeviceLaneMatrix d_main_q_info_; + // Acoustic cost of a given token + DeviceLaneMatrix d_main_q_acoustic_cost_; + // At the end of a frame, we use a hashmap to detect the tokens that are + // associated with the same FST state S + // We do it that the very end, to only use the hashmap on post-prune, post-max + // active tokens + DeviceLaneMatrix d_hashmap_values_; + // Reminder: in the GPU lattice decoder, a token is always associated + // to a single arc. Which means that multiple tokens in the same frame + // can be associated with the same FST state. + // + // We are NOT listing those duplicates as ForwardLinks in an unique meta-token + // like in the CPU lattice decoder + // + // When more than one token is associated to a single FST state, + // we will list those tokens into another list : d_main_q_extra_prev_tokens + // we will also save data useful in such a case, such as the extra_cost of a + // token compared to the best for that state + DeviceLaneMatrix d_main_q_extra_prev_tokens_; + DeviceLaneMatrix d_main_q_extra_and_acoustic_cost_; + // Histogram. Used to perform the histogram of the token costs + // in the main_q. Used to perform a soft topk of the main_q (max-active) + DeviceLaneMatrix d_histograms_; + // When filling the hashmap in PostProcessingMainQueue, we create a hashmap + // value for each FST state + // presents in the main_q (if at least one token is associated with that + // state) + // d_main_q_state_hash_idx_[token_idx] is the index of the state token.state + // in the hashmap + // Stored into a FSTStateHashIndex, which is actually a int32. + // FSTStateHashIndex should only + // be accessed through [Get|Set]FSTStateHashIndex, because it uses the bit + // sign to also remember if that token is the representative of that state. 
+ // If only one token is associated with S, its representative will be itself + DeviceLaneMatrix d_main_q_state_hash_idx_; + // local_idx of the extra cost list for a state + // For a given state S, first token associated with S will have local_idx=0 + // the second one local_idx=1, etc. The order of the local_idxs is random + DeviceLaneMatrix d_main_q_n_extra_prev_tokens_local_idx_; + // Where to write the extra_prev_tokens in the d_main_q_extra_prev_tokens_ + // queue + DeviceLaneMatrix d_main_q_extra_prev_tokens_prefix_sum_; + // Used when computing the prefix_sums in preprocess_in_place. Stores + // the local_sums per CTA + DeviceLaneMatrix d_main_q_block_sums_prefix_sum_; + // Defining the aux_q. Filled by ExpandArcs. + // The tokens are moved to the main_q by PruneAndPreprocess + DeviceLaneMatrix d_aux_q_state_and_cost_; + DeviceLaneMatrix d_aux_q_info_; + // Dedicated space for the concat of extra_cost. We should reuse memory + DeviceLaneMatrix d_extra_and_acoustic_cost_concat_matrix_; + DeviceLaneMatrix d_extra_prev_tokens_concat_matrix_; + DeviceLaneMatrix d_acoustic_cost_concat_matrix_; + DeviceLaneMatrix d_infotoken_concat_matrix_; + // We will list in d_list_final_tokens_in_main_q all tokens within [min_cost; + // min_cost+lattice_beam] + // It is used when calling GetBestCost + // We only use an interface here because we will actually reuse data from + // d_aux_q_state_and_cost + // We are done using the aux_q when GetBestCost is called, so we can reuse + // that memory + HostLaneMatrix h_list_final_tokens_in_main_q_; + // Parameters used by the kernels + // DeviceParams contains all the parameters that won't change + // i.e. memory address of the main_q for instance + // KernelParams contains information that can change. + // For instance which channel is executing on which lane + DeviceParams *h_device_params_; + KernelParams *h_kernel_params_; + std::vector channel_to_compute_; + int32 nlanes_used_; // number of lanes used in h_kernel_params_ + // Initial lane + // When starting a new utterance, + // init_channel_id is used to initialize a channel + int32 init_channel_id_; + // CUDA streams used by the decoder + cudaStream_t compute_st_, copy_st_; + // Parameters extracted from CudaDecoderConfig + // Those are defined in CudaDecoderConfig + CostType default_beam_; + CostType lattice_beam_; + int32 ntokens_pre_allocated_; + int32 max_active_; // Target value from the parameters + int32 aux_q_capacity_; + int32 main_q_capacity_; + // Hashmap capacity. Multiple of max_tokens_per_frame + int32 hashmap_capacity_; + // Static segment of the adaptive beam. Cf InitDeviceParams + int32 adaptive_beam_static_segment_; + // The first index of all the following vectors (or vector) + // is the ChannelId. e.g., to get the number of frames decoded in channel 2, + // look into num_frames_decoded_[2]. + + // Keep track of the number of frames decoded in the current file. + std::vector num_frames_decoded_; + // Offsets of each frame in h_all_tokens_info_ + std::vector> frame_offsets_; + // Data storage. 
We store on host what we will need in + // GetRawLattice/GetBestPath + std::vector> h_all_tokens_info_; + std::vector> h_all_tokens_acoustic_cost_; + std::vector> h_all_tokens_extra_prev_tokens_; + std::vector> + h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_; + std::vector channel_lock_; // at some point we should switch to a + // shared_lock (to be able to compute + // partial lattices while still + // streaming new data for this + // channel) + bool worker_threads_running_; + // For each channel, set by PrepareForGetRawLattice + // argmin cost, list of the tokens within [best_cost;best_cost+lattice_beam] + // and if we've reached a final token. Set by PrepareForGetRawLattice. + std::vector> h_all_argmin_cost_; + std::vector>> h_all_final_tokens_list_; + std::vector h_all_has_reached_final_; + + // Pinned memory arrays. Used for the DeviceToHost copies + float2 *h_extra_and_acoustic_cost_concat_, *d_extra_and_acoustic_cost_concat_; + InfoToken *h_infotoken_concat_, *d_infotoken_concat_; + CostType *h_acoustic_cost_concat_, *d_acoustic_cost_concat_; + InfoToken *h_extra_prev_tokens_concat_, *d_extra_prev_tokens_concat_; + // second memory space used for double buffering + float2 *h_extra_and_acoustic_cost_concat_tmp_; + InfoToken *h_infotoken_concat_tmp_; + CostType *h_acoustic_cost_concat_tmp_; + InfoToken *h_extra_prev_tokens_concat_tmp_; + // Offsets used in MoveConcatenatedCopyToVector + std::vector h_main_q_end_lane_offsets_; + std::vector h_emitting_main_q_end_lane_offsets_; + std::vector h_n_extra_prev_tokens_lane_offsets_; + // Used when calling GetBestCost + std::vector> argmins_; + std::vector has_reached_final_; + std::vector>> + list_finals_token_idx_and_cost_; + bool compute_max_active_; + cudaEvent_t nnet3_done_evt_; + cudaEvent_t d2h_copy_acoustic_evt_; + cudaEvent_t d2h_copy_infotoken_evt_; + cudaEvent_t d2h_copy_extra_prev_tokens_evt_; + cudaEvent_t concatenated_data_ready_evt_; + cudaEvent_t lane_offsets_ready_evt_; + // GetRawLattice helper + // Data used when building the lattice in GetRawLattice + + // few typedef to make GetRawLattice easier to understand + // Returns a unique id for each (iframe, fst_state) pair + // We need to be able to quickly identity a (iframe, fst_state) ID + // + // A lattice state is defined by the pair (iframe, fst_state) + // A token is associated to a lattice state (iframe, token.next_state) + // Multiple token in the same frame can be associated to the same lattice + // state + // (they all go to the same token.next_state) + // We need to quickly identify what is the lattice state of a token. + // We are able to do that through GetLatticeStateInternalId(token), + // which returns the internal unique ID for each lattice state for a token + // + // When we build the output lattice, we a get new lattice state + // output_lattice_state = fst_out->AddState() + // We call this one OutputLatticeState + // The conversion between the two is done through maps + // [curr|prev]_f_raw_lattice_state_ + typedef int32 LatticeStateInternalId; + typedef StateId OutputLatticeState; + typedef int32 TokenId; + LatticeStateInternalId GetLatticeStateInternalId(int32 total_ntokens, + TokenId token_idx, + InfoToken token); + // Keeping track of a variety of info about states in the lattice + // - token_extra_cost. A path going from the current lattice_state to the + // end has an extra cost + // compared to the best path (which has an extra cost of 0). 
+ // token_extra_cost is the minimum of the extra_cost of all paths going from + // the current lattice_state + // to the final frame. + // - fst_lattice_state is the StateId of the lattice_state in fst_out (in + // the output lattice). lattice_state is an internal state used in + // GetRawLattice. + // - is_state_closed is true if the token_extra_cost has been read by + // another token. It means that the + // token_extra_cost value has been used, and if we modify token_extra_cost + // again, we may need to recompute the current frame (so that everyone uses + // the latest + // token_extra_cost value) + struct RawLatticeState { + CostType token_extra_cost; + OutputLatticeState fst_lattice_state; + bool is_state_closed; + }; + // extra_cost_min_delta_ used in the must_replay_frame situation. Please read + // comments + // associated with must_replay_frame in GetRawLattice to understand what it + // does + CostType extra_cost_min_delta_; + ThreadPool *thread_pool_; + std::vector cpu_dedicated_threads_; + int32 n_threads_used_; + std::vector lanes2channels_todo_; + std::atomic n_acoustic_h2h_copies_todo_; + std::atomic n_extra_prev_tokens_h2h_copies_todo_; + std::atomic n_d2h_copies_ready_; + std::atomic n_infotoken_h2h_copies_todo_; + int32 n_h2h_task_not_done_; + int32 n_init_decoding_h2h_task_not_done_; + std::atomic n_h2h_main_task_todo_; + std::mutex n_h2h_task_not_done_mutex_; + std::mutex n_init_decoding_h2h_task_not_done_mutex_; + std::mutex n_h2h_main_task_todo_mutex_; + std::condition_variable n_h2h_main_task_todo_cv_; + std::condition_variable h2h_done_; + std::condition_variable init_decoding_h2h_done_; + std::atomic active_wait_; + bool h2h_threads_running_; + // Using the output from GetBestPath, we add the best tokens (as selected in + // GetBestCost) + // from the final frame to the output lattice. We also fill the data + // structures + // (such as q_curr_frame_todo_, or curr_f_raw_lattice_state_) accordingly + void AddFinalTokensToLattice( + ChannelId ichannel, + std::vector> *q_curr_frame_todo, + std::unordered_map + *curr_f_raw_lattice_state, + Lattice *fst_out); + // Check if a token should be added to the lattice. If it should, then + // keep_arc will be true + void ConsiderTokenForLattice( + ChannelId ichannel, int32 iprev, int32 total_ntokens, TokenId token_idx, + OutputLatticeState fst_lattice_start, InfoToken *tok_beg, + float2 *arc_extra_cost_beg, CostType token_extra_cost, + TokenId list_prev_token_idx, int32 list_arc_idx, + InfoToken *list_prev_token, CostType *this_arc_prev_token_extra_cost, + CostType *acoustic_cost, OutputLatticeState *lattice_src_state, + bool *keep_arc, bool *dbg_found_zero); + // Add the arc to the lattice. Also updates what needs to be updated in the + // GetRawLattice datastructures. 
+ void AddArcToLattice( + int32 list_arc_idx, TokenId list_prev_token_idx, + InfoToken list_prev_token, int32 curr_frame_offset, + CostType acoustic_cost, CostType this_arc_prev_token_extra_cost, + LatticeStateInternalId src_state_internal_id, + OutputLatticeState fst_lattice_start, + OutputLatticeState to_fst_lattice_state, + std::vector> *q_curr_frame_todo, + std::vector> *q_prev_frame_todo, + std::unordered_map + *curr_f_raw_lattice_state, + std::unordered_map + *prev_f_raw_lattice_state, + std::unordered_set *f_arc_idx_added, Lattice *fst_out, + bool *must_replay_frame); + // Read a token information + void GetTokenRawLatticeData( + TokenId token_idx, InfoToken token, int32 total_ntokens, + std::unordered_map + *curr_f_raw_lattice_state, + CostType *token_extra_cost, OutputLatticeState *to_fst_lattice_state); + + // A token is an instance of an arc. It goes to a FST state (token.next_state) + // Multiple token in the same frame can go to the same FST state. + // GetSameFSTStateTokenList + // returns that list + void GetSameFSTStateTokenList(ChannelId ichannel, InfoToken token, + InfoToken **tok_beg, + float2 **arc_extra_cost_beg, int32 *nprevs); + + // Swap datastructures at the end of a frame. prev becomes curr (we go + // backward) + // + void SwapPrevAndCurrLatticeMap( + int32 iframe, bool dbg_found_best_path, + std::vector> *q_curr_frame_todo, + std::vector> *q_prev_frame_todo, + std::unordered_map + *curr_f_raw_lattice_state, + std::unordered_map + *prev_f_raw_lattice_state, + std::unordered_set *f_arc_idx_added); + KALDI_DISALLOW_COPY_AND_ASSIGN(CudaDecoder); +}; + +} // end namespace cuda_decoder +} // end namespace kaldi + +#endif // KALDI_CUDA_DECODER_CUDA_DECODER_H_ diff --git a/src/cudadecoder/cuda-fst.cc b/src/cudadecoder/cuda-fst.cc new file mode 100644 index 00000000000..6f899d87321 --- /dev/null +++ b/src/cudadecoder/cuda-fst.cc @@ -0,0 +1,209 @@ +// cudadecoder/cuda-fst.cc +// +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// Hugo Braun, Justin Luitjens, Ryan Leary +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
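// ---------------------------------------------------------------------------
// Illustrative sketch (editorial aside, not part of the Kaldi sources): the
// CudaDecoder comments above describe lanes as compute resources and channels
// as per-utterance state (lane ~ CPU core, channel ~ software thread), with
// AdvanceDecoding advancing a batch by the number of frames the *whole* batch
// has ready. The standalone example below mimics only that scheduling idea:
// nchannels utterances receive audio at their own pace, and we repeatedly
// build a batch of at most nlanes channels that have frames ready. All names
// here (kNLanes, kNChannels, frames_ready, AdvanceBatch) are hypothetical; a
// real driver would call CudaDecoder::InitDecoding / AdvanceDecoding on the
// selected channels instead of AdvanceBatch.

#include <algorithm>
#include <cstdio>
#include <vector>

namespace {
constexpr int kNLanes = 2;     // batch size that saturates the GPU (assumption)
constexpr int kNChannels = 5;  // audio channels kept open (assumption)

// Frames already received but not yet decoded, per channel (toy numbers).
std::vector<int> frames_ready = {3, 0, 5, 2, 4};

// Stand-in for AdvanceDecoding on a batch of channels: decode as many frames
// as every member of the batch has ready (the min over the batch), then
// return control to the caller.
void AdvanceBatch(const std::vector<int> &channels) {
  int nframes = frames_ready[channels[0]];
  for (int ch : channels) nframes = std::min(nframes, frames_ready[ch]);
  for (int ch : channels) frames_ready[ch] -= nframes;
  std::printf("advanced %d frame(s) on a batch of %zu channel(s)\n", nframes,
              channels.size());
}
}  // namespace

int main() {
  bool work_left = true;
  while (work_left) {
    // Keep the batch as full as possible with channels that have frames ready
    // (solution 2 in the AdvanceDecoding comment above).
    std::vector<int> batch;
    for (int ch = 0; ch < kNChannels && (int)batch.size() < kNLanes; ++ch)
      if (frames_ready[ch] > 0) batch.push_back(ch);
    work_left = !batch.empty();
    if (work_left) AdvanceBatch(batch);
  }
  return 0;
}
// ---------------------------------------------------------------------------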
+ +#if HAVE_CUDA == 1 + +#include "cudadecoder/cuda-fst.h" + +#include +#include + +namespace kaldi { +namespace cuda_decoder { + +void CudaFst::ComputeOffsets(const fst::Fst &fst) { + // count states since Fst doesn't provide this functionality + num_states_ = 0; + for (fst::StateIterator > iter(fst); !iter.Done(); + iter.Next()) + ++num_states_; + + // allocate and initialize offset arrays + h_final_.resize(num_states_); + h_e_offsets_.resize(num_states_ + 1); + h_ne_offsets_.resize(num_states_ + 1); + + // iterate through states and arcs and count number of arcs per state + e_count_ = 0; + ne_count_ = 0; + + // Init first offsets + h_ne_offsets_[0] = 0; + h_e_offsets_[0] = 0; + for (int i = 0; i < num_states_; i++) { + h_final_[i] = fst.Final(i).Value(); + // count emiting and non_emitting arcs + for (fst::ArcIterator > aiter(fst, i); !aiter.Done(); + aiter.Next()) { + StdArc arc = aiter.Value(); + int32 ilabel = arc.ilabel; + if (ilabel != 0) { // emitting + e_count_++; + } else { // non-emitting + ne_count_++; + } + } + h_ne_offsets_[i + 1] = ne_count_; + h_e_offsets_[i + 1] = e_count_; + } + + // We put the emitting arcs before the nonemitting arcs in the arc list + // adding offset to the non emitting arcs + // we go to num_states_+1 to take into account the last offset + for (int i = 0; i < num_states_ + 1; i++) + h_ne_offsets_[i] += e_count_; // e_arcs before + + arc_count_ = e_count_ + ne_count_; +} + +void CudaFst::AllocateData(const fst::Fst &fst) { + d_e_offsets_ = static_cast(CuDevice::Instantiate().Malloc( + (num_states_ + 1) * sizeof(*d_e_offsets_))); + d_ne_offsets_ = static_cast(CuDevice::Instantiate().Malloc( + (num_states_ + 1) * sizeof(*d_ne_offsets_))); + d_final_ = static_cast( + CuDevice::Instantiate().Malloc((num_states_) * sizeof(*d_final_))); + + h_arc_weights_.resize(arc_count_); + h_arc_nextstate_.resize(arc_count_); + // ilabels (id indexing) + h_arc_id_ilabels_.resize(arc_count_); + h_arc_olabels_.resize(arc_count_); + + d_arc_weights_ = static_cast( + CuDevice::Instantiate().Malloc(arc_count_ * sizeof(*d_arc_weights_))); + d_arc_nextstates_ = static_cast( + CuDevice::Instantiate().Malloc(arc_count_ * sizeof(*d_arc_nextstates_))); + + // Only the ilabels for the e_arc are needed on the device + d_arc_pdf_ilabels_ = static_cast( + CuDevice::Instantiate().Malloc(e_count_ * sizeof(*d_arc_pdf_ilabels_))); +} + +void CudaFst::PopulateArcs(const fst::Fst &fst) { + // now populate arc data + int e_idx = 0; + int ne_idx = e_count_; // starts where e_offsets_ ends + for (int i = 0; i < num_states_; i++) { + for (fst::ArcIterator > aiter(fst, i); !aiter.Done(); + aiter.Next()) { + StdArc arc = aiter.Value(); + int idx; + if (arc.ilabel != 0) { // emitting + idx = e_idx++; + } else { + idx = ne_idx++; + } + h_arc_weights_[idx] = arc.weight.Value(); + h_arc_nextstate_[idx] = arc.nextstate; + h_arc_id_ilabels_[idx] = arc.ilabel; + // For now we consider id indexing == pdf indexing + // If the two are differents, we'll call ApplyTransModelOnIlabels with a + // TransitionModel + h_arc_pdf_ilabels_[idx] = arc.ilabel; + h_arc_olabels_[idx] = arc.olabel; + } + } +} + +void CudaFst::ApplyTransitionModelOnIlabels( + const TransitionModel &trans_model) { + // Converting ilabel here, to avoid reindexing when reading nnet3 output + // We only need to convert the emitting arcs + // The emitting arcs are the first e_count_ arcs + for (int iarc = 0; iarc < e_count_; ++iarc) + h_arc_pdf_ilabels_[iarc] = + trans_model.TransitionIdToPdf(h_arc_id_ilabels_[iarc]); +} + +void 
CudaFst::CopyDataToDevice() { + KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMemcpy( + d_e_offsets_, &h_e_offsets_[0], (num_states_ + 1) * sizeof(*d_e_offsets_), + cudaMemcpyHostToDevice)); + KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMemcpy( + d_ne_offsets_, &h_ne_offsets_[0], + (num_states_ + 1) * sizeof(*d_ne_offsets_), cudaMemcpyHostToDevice)); + KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMemcpy(d_final_, &h_final_[0], + num_states_ * sizeof(*d_final_), + cudaMemcpyHostToDevice)); + + KALDI_DECODER_CUDA_API_CHECK_ERROR( + cudaMemcpy(d_arc_weights_, &h_arc_weights_[0], + arc_count_ * sizeof(*d_arc_weights_), cudaMemcpyHostToDevice)); + KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMemcpy( + d_arc_nextstates_, &h_arc_nextstate_[0], + arc_count_ * sizeof(*d_arc_nextstates_), cudaMemcpyHostToDevice)); + KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMemcpy( + d_arc_pdf_ilabels_, &h_arc_pdf_ilabels_[0], + e_count_ * sizeof(*d_arc_pdf_ilabels_), cudaMemcpyHostToDevice)); +} + +void CudaFst::Initialize(const fst::Fst &fst, + const TransitionModel *trans_model) { + nvtxRangePushA("CudaFst constructor"); + start_ = fst.Start(); + + ComputeOffsets(fst); + AllocateData(fst); + // Temporarily allocating data for this vector + // We just need it during CSR generation. We will clear it + // at the end of Initialize + h_arc_pdf_ilabels_.resize(arc_count_); + PopulateArcs(fst); + if (trans_model) ApplyTransitionModelOnIlabels(*trans_model); + + KALDI_ASSERT(d_e_offsets_); + KALDI_ASSERT(d_ne_offsets_); + KALDI_ASSERT(d_final_); + KALDI_ASSERT(d_arc_weights_); + KALDI_ASSERT(d_arc_nextstates_); + KALDI_ASSERT(d_arc_pdf_ilabels_); + + CopyDataToDevice(); + + // Making sure the graph is ready + cudaDeviceSynchronize(); + KALDI_DECODER_CUDA_CHECK_ERROR(); + h_arc_pdf_ilabels_.clear(); // we don't need those on host + nvtxRangePop(); +} + +void CudaFst::Finalize() { + nvtxRangePushA("CudaFst destructor"); + + // Making sure that Initialize was called before Finalize + KALDI_ASSERT(d_e_offsets_ && + "Please call CudaFst::Initialize() before calling Finalize()"); + KALDI_ASSERT(d_ne_offsets_); + KALDI_ASSERT(d_final_); + KALDI_ASSERT(d_arc_weights_); + KALDI_ASSERT(d_arc_nextstates_); + KALDI_ASSERT(d_arc_pdf_ilabels_); + + CuDevice::Instantiate().Free(d_e_offsets_); + CuDevice::Instantiate().Free(d_ne_offsets_); + CuDevice::Instantiate().Free(d_final_); + CuDevice::Instantiate().Free(d_arc_weights_); + CuDevice::Instantiate().Free(d_arc_nextstates_); + CuDevice::Instantiate().Free(d_arc_pdf_ilabels_); + nvtxRangePop(); +} + +} // end namespace cuda_decoder +} // end namespace kaldi + +#endif // HAVE_CUDA == 1 diff --git a/src/cudadecoder/cuda-fst.h b/src/cudadecoder/cuda-fst.h new file mode 100644 index 00000000000..1dac627755b --- /dev/null +++ b/src/cudadecoder/cuda-fst.h @@ -0,0 +1,122 @@ +// cudadecoder/cuda-fst.h +// +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// Hugo Braun, Justin Luitjens, Ryan Leary +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
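// ---------------------------------------------------------------------------
// Illustrative sketch (editorial aside, not part of the Kaldi sources):
// CudaFst::ComputeOffsets above builds one CSR-style offset array for
// emitting arcs and one for non-emitting arcs, with the non-emitting offsets
// shifted by e_count_ so that both index into a single concatenated arc list
// (emitting arcs first). The toy program below reproduces that layout for a
// hard-coded 3-state graph; all names (ilabels, num_states, ...) are local to
// the example and the graph itself is made up.

#include <cstdio>
#include <vector>

int main() {
  // Toy graph: for each state, the ilabels of its outgoing arcs
  // (ilabel != 0 -> emitting, ilabel == 0 -> non-emitting / epsilon).
  const std::vector<std::vector<int>> ilabels = {{7, 3, 0}, {5}, {}};
  const int num_states = (int)ilabels.size();

  std::vector<int> e_offsets(num_states + 1, 0), ne_offsets(num_states + 1, 0);
  int e_count = 0, ne_count = 0;
  for (int s = 0; s < num_states; ++s) {
    for (int ilabel : ilabels[s]) {
      if (ilabel != 0)
        ++e_count;
      else
        ++ne_count;
    }
    e_offsets[s + 1] = e_count;
    ne_offsets[s + 1] = ne_count;
  }
  // Emitting arcs are stored first in the concatenated arc list, so the
  // non-emitting offsets are shifted by the total number of emitting arcs.
  for (int s = 0; s <= num_states; ++s) ne_offsets[s] += e_count;

  for (int s = 0; s < num_states; ++s)
    std::printf("state %d: emitting arcs in [%d,%d), non-emitting in [%d,%d)\n",
                s, e_offsets[s], e_offsets[s + 1], ne_offsets[s],
                ne_offsets[s + 1]);
  // Expected output for this toy graph:
  //   state 0: emitting arcs in [0,2), non-emitting in [3,4)
  //   state 1: emitting arcs in [2,3), non-emitting in [4,4)
  //   state 2: emitting arcs in [3,3), non-emitting in [4,4)
  return 0;
}
// ---------------------------------------------------------------------------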
+ +#ifndef KALDI_CUDA_DECODER_CUDA_FST_H_ +#define KALDI_CUDA_DECODER_CUDA_FST_H_ +#include "cudadecoder/cuda-decoder-common.h" +#include "cudamatrix/cu-device.h" +#include "lat/kaldi-lattice.h" +#include "nnet3/decodable-online-looped.h" // TransitionModel + +namespace kaldi { +namespace cuda_decoder { + +typedef fst::StdArc StdArc; +typedef StdArc::Weight StdWeight; +typedef StdArc::Label Label; + +// FST in both device and host memory +// Converting the OpenFst format to the CSR Compressed Sparse Row (CSR) Matrix +// format. +// https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format) +// Where states = rows and arcs = columns. +// This format allows us to store the FST in a compact form, and leads to clean +// memory accesses +// For instance, when loading the arcs from a given source, we can load all arc +// informations (destination, weight, etc.) with coalesced reads +// Emitting arcs and non-emitting arcs are stored as separate matrices for +// efficiency +// We then copy the FST to the device (while keeping its original copy on host) +class CudaFst { + public: + CudaFst() + : d_e_offsets_(nullptr), + d_ne_offsets_(nullptr), + d_arc_weights_(nullptr), + d_arc_nextstates_(nullptr), + d_arc_pdf_ilabels_(nullptr), + d_final_(nullptr){}; + // Creates a CSR representation of the FST, + // then copies it to the GPU + // If a TransitionModel is passed, we'll use it to convert the ilabels id + // indexes into pdf indexes + // If no TransitionModel is passed, we'll assume TransitionModel == identity + // Important: The CudaDecodable won't apply the TransitionModel. If you use a + // TransitionModel, you need to apply it now + void Initialize(const fst::Fst &fst, + const TransitionModel *trans_model = NULL); + void Finalize(); + + inline uint32_t NumStates() const { return num_states_; } + inline StateId Start() const { return start_; } + + private: + friend class CudaDecoder; + // Counts arcs and computes offsets of the fst passed in + void ComputeOffsets(const fst::Fst &fst); + // Allocates memory to store FST + void AllocateData(const fst::Fst &fst); + // Populate the arcs data (arc.destination, arc.weights, etc.) + void PopulateArcs(const fst::Fst &fst); + // Converting the id ilabels into pdf ilabels using the transition model + // It allows the CudaDecoder to read the acoustic model loglikelihoods at the + // right indexes + void ApplyTransitionModelOnIlabels(const TransitionModel &trans_model); + // Copies fst to device into the pre-allocated datastructures + void CopyDataToDevice(); + // Total number of states + unsigned int num_states_; + // Starting state of the FST + // Computation should start from state start_ + StateId start_; + // Number of emitting, non-emitting, and total number of arcs + unsigned int e_count_, ne_count_, arc_count_; + // This data structure is similar to a CSR matrix format + // with 2 offsets matrices (one emitting one non-emitting). + // Offset arrays are num_states_+1 in size (last state needs + // its +1 arc_offset) + // Arc values for state i are stored in the range of [offset[i],offset[i+1][ + unsigned int *d_e_offsets_; // Emitting offset arrays + std::vector h_e_offsets_; + unsigned int *d_ne_offsets_; // Non-emitting offset arrays + std::vector h_ne_offsets_; + // These are the values for each arc. 
+ // Arcs belonging to state i are found in the range of [offsets[i], + // offsets[i+1][ + // Use e_offsets or ne_offsets depending on what you need + // (emitting/nonemitting) + // The ilabels arrays are of size e_count_, not arc_count_ + std::vector h_arc_weights_; + CostType *d_arc_weights_; + std::vector h_arc_nextstate_; + StateId *d_arc_nextstates_; + std::vector h_arc_id_ilabels_; + int32 *d_arc_pdf_ilabels_; + std::vector h_arc_olabels_; + // Final costs + // final cost of state i is h_final_[i] + std::vector h_final_; + CostType *d_final_; + + // ilabels (pdf indexing) + // only populate during CSR generation, cleared after (not needed on host) + std::vector h_arc_pdf_ilabels_; +}; + +} // end namespace cuda_decoder +} // end namespace kaldi +#endif // KALDI_CUDA_DECODER_CUDA_FST_H_ diff --git a/src/cudadecoder/decodable-cumatrix.cc b/src/cudadecoder/decodable-cumatrix.cc new file mode 100644 index 00000000000..d7c1d0359a5 --- /dev/null +++ b/src/cudadecoder/decodable-cumatrix.cc @@ -0,0 +1,62 @@ +// cudadecoder/decodable-cumatrix.cc +/* + * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. + * Authors: Hugo Braun, Justin Luitjens, Ryan Leary + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#if HAVE_CUDA == 1 + +#include "decodable-cumatrix.h" + +namespace kaldi { +namespace cuda_decoder { + +DecodableCuMatrixMapped::DecodableCuMatrixMapped( + const TransitionModel &tm, const CuMatrixBase &likes, + int32 frame_offset) + : trans_model_(tm), likes_(&likes), frame_offset_(frame_offset) { + if (likes.NumCols() != tm.NumPdfs()) + KALDI_ERR << "Mismatch, matrix has " << likes.NumCols() + << " rows but transition-model has " << tm.NumPdfs() + << " pdf-ids."; +} + +int32 DecodableCuMatrixMapped::NumFramesReady() const { + return frame_offset_ + likes_->NumRows(); +} + +bool DecodableCuMatrixMapped::IsLastFrame(int32 frame) const { + KALDI_ASSERT(frame < NumFramesReady()); + return (frame == NumFramesReady() - 1); +} + +// Indices are one-based! This is for compatibility with OpenFst. +int32 DecodableCuMatrixMapped::NumIndices() const { + return trans_model_.NumTransitionIds(); +} + +// returns cuda pointer to nnet3 output +BaseFloat * +DecodableCuMatrixMapped::GetLogLikelihoodsCudaPointer(int32 subsampled_frame) { + BaseFloat *frame_nnet3_out = + (BaseFloat *)likes_->Data() + + (subsampled_frame - frame_offset_) * likes_->Stride(); + return frame_nnet3_out; +}; + +} // end namespace cuda_decoder +} // end namespace kaldi + +#endif // HAVE_CUDA == 1 diff --git a/src/cudadecoder/decodable-cumatrix.h b/src/cudadecoder/decodable-cumatrix.h new file mode 100644 index 00000000000..d34079cc9c7 --- /dev/null +++ b/src/cudadecoder/decodable-cumatrix.h @@ -0,0 +1,71 @@ +// cudadecoder/decodable-cumatrix.h +/* + * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. + * Authors: Hugo Braun, Justin Luitjens, Ryan Leary + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef KALDI_CUDA_DECODER_DECODABLE_CUMATRIX_H_
+#define KALDI_CUDA_DECODER_DECODABLE_CUMATRIX_H_
+
+#include "cudadecoder/cuda-decodable-itf.h"
+#include "cudamatrix/cu-matrix.h"
+#include "decoder/decodable-matrix.h"
+
+namespace kaldi {
+namespace cuda_decoder {
+
+/**
+   Cuda Decodable matrix. Takes a transition model and posteriors and provides
+   an interface similar to the Decodable interface.
+ */
+class DecodableCuMatrixMapped : public CudaDecodableInterface {
+public:
+  // This constructor creates an object that will not delete "likes" when done.
+  // frame_offset is the frame that row 0 of 'likes' corresponds to; it is
+  // greater than zero if this is not the first chunk of likelihoods.
+  DecodableCuMatrixMapped(const TransitionModel &tm,
+                          const CuMatrixBase<BaseFloat> &likes,
+                          int32 frame_offset = 0);
+
+  virtual int32 NumFramesReady() const;
+
+  virtual bool IsLastFrame(int32 frame) const;
+
+  virtual BaseFloat LogLikelihood(int32 frame, int32 tid) {
+    KALDI_ASSERT(false);
+    return 0.0f;  // never executed, compiler requests a return
+  };
+
+  // Note: these indices are 1-based.
+  virtual int32 NumIndices() const;
+
+  virtual ~DecodableCuMatrixMapped(){};
+
+  // returns cuda pointer to nnet3 output
+  virtual BaseFloat *GetLogLikelihoodsCudaPointer(int32 subsampled_frame);
+
+private:
+  const TransitionModel &trans_model_;  // for tid to pdf mapping
+  const CuMatrixBase<BaseFloat> *likes_;
+
+  int32 frame_offset_;
+
+  KALDI_DISALLOW_COPY_AND_ASSIGN(DecodableCuMatrixMapped);
+};
+
+} // end namespace cuda_decoder
+} // end namespace kaldi.
+
+#endif // KALDI_CUDA_DECODER_DECODABLE_CUMATRIX_H_
diff --git a/src/cudadecoder/thread-pool.h b/src/cudadecoder/thread-pool.h
new file mode 100644
index 00000000000..12cd27da462
--- /dev/null
+++ b/src/cudadecoder/thread-pool.h
@@ -0,0 +1,161 @@
+// cudadecoder/thread-pool.h
+// Source: https://github.com/progschj/ThreadPool
+// Modified to add a priority queue
+// Obtained under this license:
+/*
+Copyright (c) 2012 Jakob Progsch, Václav Zeman
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+   1. The origin of this software must not be misrepresented; you must not
+   claim that you wrote the original software. If you use this software
+   in a product, an acknowledgment in the product documentation would be
+   appreciated but is not required.
+
+   2. Altered source versions must be plainly marked as such, and must not be
+   misrepresented as being the original software.
+
+   3. This notice may not be removed or altered from any source
+   distribution.
+*/
+
+#ifndef KALDI_CUDA_DECODER_THREAD_POOL_H_
+#define KALDI_CUDA_DECODER_THREAD_POOL_H_
+
+#include <vector>
+#include <queue>
+#include <memory>
+#include <thread>
+#include <mutex>
+#include <condition_variable>
+#include <future>
+#include <functional>
+#include <stdexcept>
+
+namespace kaldi {
+namespace cuda_decoder {
+
+// C++ indexes enum 0,1,2...
+enum ThreadPoolPriority { THREAD_POOL_LOW_PRIORITY, THREAD_POOL_NORMAL_PRIORITY, THREAD_POOL_HIGH_PRIORITY }; + +class ThreadPool { +public: + ThreadPool(size_t); + template + auto enqueue(ThreadPoolPriority priority, F &&f, Args &&... args) + -> std::future::type>; + template + auto enqueue(F &&f, Args &&... args) + -> std::future::type>; + ~ThreadPool(); + + private: + // need to keep track of threads so we can join them + std::vector workers; + // the task queue + struct Task { + std::function func; + // Ordered first by priority, then FIFO order + // tasks created first will have a higher priority_with_fifo.second + std::pair priority_with_fifo; + }; + friend bool operator<(const ThreadPool::Task &lhs, + const ThreadPool::Task &rhs); + + std::priority_queue tasks; + long long task_counter; + + // synchronization + std::mutex queue_mutex; + std::condition_variable condition; + + bool stop; +}; + +inline bool operator<(const ThreadPool::Task &lhs, + const ThreadPool::Task &rhs) { + return lhs.priority_with_fifo < rhs.priority_with_fifo; +} + +// the constructor just launches some amount of workers +inline ThreadPool::ThreadPool(size_t threads) + : task_counter(LONG_MAX), stop(false) { + for (size_t i = 0; i < threads; ++i) + workers.emplace_back([this] { + for (;;) { + Task task; + + { + std::unique_lock lock(this->queue_mutex); + this->condition.wait( + lock, [this] { return this->stop || !this->tasks.empty(); }); + if (this->stop && this->tasks.empty()) return; + if (!tasks.empty()) { + task = std::move(this->tasks.top()); + this->tasks.pop(); + } + } + task.func(); + } + }); +} + +// add new work item to the pool : normal priority +template +auto ThreadPool::enqueue(F &&f, Args &&... args) + -> std::future::type> { + return enqueue(THREAD_POOL_NORMAL_PRIORITY, std::forward(f), std::forward(args)...); +} + +// add new work item to the pool +template +auto ThreadPool::enqueue(ThreadPoolPriority priority, F &&f, Args &&... 
args) + -> std::future::type> { + using return_type = typename std::result_of::type; + + auto func = std::make_shared>( + std::bind(std::forward(f), std::forward(args)...)); + + std::future res = func->get_future(); + { + std::unique_lock lock(queue_mutex); + + // don't allow enqueueing after stopping the pool + if (stop) + throw std::runtime_error("enqueue on stopped ThreadPool"); + Task task; + task.func = [func]() { (*func)(); }; + long long task_fifo_id = task_counter--; + // The following if will temporarly break the FIFO order + // (leading to a perf drop for a few seconds) + // But it should trigger in ~50 million years + if (task_counter == 0) task_counter = LONG_MAX; + task.priority_with_fifo = {priority, task_fifo_id}; + tasks.push(std::move(task)); + } + condition.notify_one(); + return res; +} + +// the destructor joins all threads +inline ThreadPool::~ThreadPool() { + { + std::unique_lock lock(queue_mutex); + stop = true; + } + condition.notify_all(); + for (std::thread &worker : workers) + worker.join(); +} + +} // end namespace cuda_decoder +} // end namespace kaldi + + +#endif // KALDI_CUDA_DECODER_THREAD_POOL_H_ diff --git a/src/cudadecoderbin/Makefile b/src/cudadecoderbin/Makefile new file mode 100644 index 00000000000..6a31a52ceca --- /dev/null +++ b/src/cudadecoderbin/Makefile @@ -0,0 +1,27 @@ +all: + +include ../kaldi.mk + +ifeq ($(CUDA), true) + +LDFLAGS += $(CUDA_LDFLAGS) +LDLIBS += $(CUDA_LDLIBS) + +BINFILES = batched-wav-nnet3-cuda + +OBJFILES = + +TESTFILES = + +ADDLIBS = ../cudadecoder/kaldi-cudadecoder.a ../cudafeat/kaldi-cudafeat.a \ +../online2/kaldi-online2.a ../ivector/kaldi-ivector.a \ +../nnet3/kaldi-nnet3.a ../chain/kaldi-chain.a ../nnet2/kaldi-nnet2.a \ +../cudamatrix/kaldi-cudamatrix.a ../decoder/kaldi-decoder.a \ +../lat/kaldi-lat.a ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \ +../feat/kaldi-feat.a ../transform/kaldi-transform.a \ +../gmm/kaldi-gmm.a ../tree/kaldi-tree.a ../util/kaldi-util.a \ +../matrix/kaldi-matrix.a ../base/kaldi-base.a + +endif + +include ../makefiles/default_rules.mk diff --git a/src/cudadecoderbin/batched-wav-nnet3-cuda.cc b/src/cudadecoderbin/batched-wav-nnet3-cuda.cc new file mode 100644 index 00000000000..f15861050d0 --- /dev/null +++ b/src/cudadecoderbin/batched-wav-nnet3-cuda.cc @@ -0,0 +1,334 @@ +// cudadecoderbin/batched-wav-nnet3-cuda.cc +// +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// Hugo Braun, Justin Luitjens, Ryan Leary +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
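// Editorial sketch (not part of this patch): intended use of the priority
// ThreadPool declared in cudadecoder/thread-pool.h above. Tasks are ordered by
// std::pair<priority, fifo_id>, and fifo_id counts down from LONG_MAX, so a
// task enqueued earlier has the larger fifo_id and, within the same priority,
// runs first (priority first, then FIFO). The pool size and lambdas below are
// illustrative only.
#include <iostream>
#include "cudadecoder/thread-pool.h"

int main() {
  using namespace kaldi::cuda_decoder;
  ThreadPool pool(4);  // 4 worker threads
  // Default overload enqueues at THREAD_POOL_NORMAL_PRIORITY.
  auto normal = pool.enqueue([] { return 1 + 1; });
  // High-priority work jumps ahead of any queued normal/low-priority tasks.
  auto urgent =
      pool.enqueue(THREAD_POOL_HIGH_PRIORITY, [](int x) { return x * x; }, 3);
  std::cout << normal.get() << " " << urgent.get() << std::endl;
  return 0;  // ~ThreadPool() joins the workers
}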
+ +#if HAVE_CUDA == 1 + +#include +#include +#include +#include +#include "cudadecoder/batched-threaded-nnet3-cuda-pipeline.h" +#include "cudamatrix/cu-allocator.h" +#include "fstext/fstext-lib.h" +#include "lat/lattice-functions.h" +#include "nnet3/am-nnet-simple.h" +#include "nnet3/nnet-utils.h" +#include "util/kaldi-thread.h" + +using namespace kaldi; +using namespace cuda_decoder; + +// When the pipeline is full, wait for +// KALDI_CUDA_DECODER_BIN_PIPELINE_FULL_SLEEP +// Not using a semaphore because it is usually not necessary to wait +#define KALDI_CUDA_DECODER_BIN_PIPELINE_FULL_SLEEP ((double)1 / 1e5) + +void GetDiagnosticsAndPrintOutput(const std::string &utt, + const fst::SymbolTable *word_syms, + const CompactLattice &clat, + std::mutex *stdout_mutex, + int64 *tot_num_frames, double *tot_like) { + if (clat.NumStates() == 0) { + KALDI_WARN << "Empty lattice."; + return; + } + CompactLattice best_path_clat; + CompactLatticeShortestPath(clat, &best_path_clat); + + Lattice best_path_lat; + ConvertLattice(best_path_clat, &best_path_lat); + + double likelihood; + LatticeWeight weight; + int32 num_frames; + std::vector alignment; + std::vector words; + GetLinearSymbolSequence(best_path_lat, &alignment, &words, &weight); + num_frames = alignment.size(); + likelihood = -(weight.Value1() + weight.Value2()); + *tot_num_frames += num_frames; + *tot_like += likelihood; + { + std::lock_guard lk(*stdout_mutex); + KALDI_VLOG(2) << "Likelihood per frame for utterance " << utt << " is " + << (likelihood / num_frames) << " over " << num_frames + << " frames."; + + if (word_syms != NULL) { + std::ostringstream oss_warn; + oss_warn << utt << " "; + for (size_t i = 0; i < words.size(); i++) { + std::string s = word_syms->Find(words[i]); + if (s == "") + oss_warn << "Word-id " << words[i] << " not in symbol table."; + oss_warn << s << " "; + } + KALDI_WARN << oss_warn.str(); + } + } +} + +// Called when a task is complete. Will be called by different threads +// concurrently, +// so it must be threadsafe +void FinishOneDecode( + const std::string &utt, const std::string &key, + const BatchedThreadedNnet3CudaPipelineConfig &batched_decoder_config, + const fst::SymbolTable *word_syms, const bool write_lattice, + BatchedThreadedNnet3CudaPipeline *cuda_pipeline, int64 *num_frames, + double *tot_like, CompactLatticeWriter *clat_writer, + std::mutex *clat_writer_mutex, std::mutex *stdout_mutex, + CompactLattice &clat) { + nvtxRangePushA("FinishOneDecode"); + GetDiagnosticsAndPrintOutput(utt, word_syms, clat, stdout_mutex, num_frames, + tot_like); + if (write_lattice) { + std::lock_guard lk(*clat_writer_mutex); + clat_writer->Write(utt, clat); + } + + nvtxRangePop(); +} + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace fst; + + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "Reads in wav file(s) and simulates online decoding with neural nets\n" + "(nnet3 setup), with optional iVector-based speaker adaptation and\n" + "optional endpointing. 
Note: some configuration values and inputs " + "are\n" + "set via config files whose filenames are passed as options\n" + "\n" + "Usage: batched-wav-nnet3-cuda [options] " + " \n"; + + std::string word_syms_rxfilename; + + bool write_lattice = true; + int num_todo = -1; + int iterations = 1; + ParseOptions po(usage); + std::mutex stdout_mutex, clat_writer_mutex; + int pipeline_length = 4000; // length of pipeline of outstanding requests, + // this is independent of queue lengths in + // decoder + + po.Register("write-lattice", &write_lattice, + "Output lattice to a file. Setting to false is useful when " + "benchmarking"); + po.Register("word-symbol-table", &word_syms_rxfilename, + "Symbol table for words [for debug output]"); + po.Register("file-limit", &num_todo, + "Limits the number of files that are processed by this driver. " + "After N files are processed the remaining files are ignored. " + "Useful for profiling"); + po.Register("iterations", &iterations, + "Number of times to decode the corpus. Output will be written " + "only once."); + + // Multi-threaded CPU and batched GPU decoder + BatchedThreadedNnet3CudaPipelineConfig batched_decoder_config; + + CuDevice::RegisterDeviceOptions(&po); + RegisterCuAllocatorOptions(&po); + batched_decoder_config.Register(&po); + + po.Read(argc, argv); + + if (po.NumArgs() != 4) { + po.PrintUsage(); + return 1; + } + + g_cuda_allocator.SetOptions(g_allocator_options); + CuDevice::Instantiate().SelectGpuId("yes"); + CuDevice::Instantiate().AllowMultithreading(); + + BatchedThreadedNnet3CudaPipeline cuda_pipeline(batched_decoder_config); + + std::string nnet3_rxfilename = po.GetArg(1), fst_rxfilename = po.GetArg(2), + wav_rspecifier = po.GetArg(3), clat_wspecifier = po.GetArg(4); + + TransitionModel trans_model; + nnet3::AmNnetSimple am_nnet; + + // read transition model and nnet + bool binary; + Input ki(nnet3_rxfilename, &binary); + trans_model.Read(ki.Stream(), binary); + am_nnet.Read(ki.Stream(), binary); + SetBatchnormTestMode(true, &(am_nnet.GetNnet())); + SetDropoutTestMode(true, &(am_nnet.GetNnet())); + nnet3::CollapseModel(nnet3::CollapseModelConfig(), &(am_nnet.GetNnet())); + + CompactLatticeWriter clat_writer(clat_wspecifier); + + fst::Fst *decode_fst = + fst::ReadFstKaldiGeneric(fst_rxfilename); + + cuda_pipeline.Initialize(*decode_fst, am_nnet, trans_model); + + delete decode_fst; + + fst::SymbolTable *word_syms = NULL; + if (word_syms_rxfilename != "") + if (!(word_syms = fst::SymbolTable::ReadText(word_syms_rxfilename))) + KALDI_ERR << "Could not read symbol table from file " + << word_syms_rxfilename; + + int32 num_task_submitted = 0, num_err = 0; + double tot_like = 0.0; + int64 num_frames = 0; + double total_audio = 0; + + nvtxRangePush("Global Timer"); + + // starting timer here so we + // can measure throughput + // without allocation + // overheads + // using kaldi timer, which starts counting in the constructor + Timer timer; + std::vector iteration_timer; + for (int iter = 0; iter < iterations; iter++) { + std::string task_group = std::to_string(iter); + num_task_submitted = 0; + SequentialTableReader wav_reader(wav_rspecifier); + if (iter > 0) + write_lattice = + false; // write the lattices only on the first iteration + for (; !wav_reader.Done(); wav_reader.Next()) { + nvtxRangePushA("Utterance Iteration"); + + while (cuda_pipeline.GetNumberOfTasksPending() >= pipeline_length) { + kaldi::Sleep(KALDI_CUDA_DECODER_BIN_PIPELINE_FULL_SLEEP); + } + + std::string utt = wav_reader.Key(); + std::string key = utt; + if (iter > 0) { 
+ // make key unique for subsequent iterations + key = key + "-" + std::to_string(iter); + } + const WaveData &wave_data = wav_reader.Value(); + + if (iter == 0) { + // calculating number of utterances per iteration + // calculating total audio time per iteration + total_audio += wave_data.Duration(); + } + + // Creating a function alias for the callback function of that utterance + auto finish_one_decode_lamba = [ + // Capturing the arguments that will change by copy + utt, key, write_lattice, + // Capturing the const/global args by reference + &word_syms, &batched_decoder_config, &cuda_pipeline, + &clat_writer_mutex, &stdout_mutex, &clat_writer, &num_frames, + &tot_like] + // The callback function receive the compact lattice as argument + // if determinize_lattice is true, it is a determinized lattice + // otherwise, it is a raw lattice converted to compact format + // through ConvertLattice + (CompactLattice & clat_in) { + // Content of our callback function. Calling the general + // FinishOneDecode function with the proper arguments + FinishOneDecode( + // Captured arguments used to specialize FinishOneDecode for + // this task + utt, key, batched_decoder_config, word_syms, write_lattice, + &cuda_pipeline, &num_frames, &tot_like, &clat_writer, + &clat_writer_mutex, &stdout_mutex, + // Generated lattice that will be passed once the task is + // complete + clat_in); + }; + // Adding a new task. Once the output lattice is ready, it will call + // finish_one_decode_lamba + // Important : finish_one_decode_lamba is called in the threadpool. We + // need it to be threadsafe + // (use locks around relevant parts, like writing to I/O) + cuda_pipeline.OpenDecodeHandle(key, wave_data, task_group, + finish_one_decode_lamba); + num_task_submitted++; + std::string group_done; + // Non-blocking way to check if a group is done + // returns false if zero groups are ready + if (cuda_pipeline.IsAnyGroupCompleted(&group_done)) { + cuda_pipeline.CloseAllDecodeHandlesForGroup(group_done); + double total_time = timer.Elapsed(); + int32 iter = std::atoi(group_done.c_str()); + KALDI_LOG << "~Group " << group_done << " completed" + << " Aggregate Total Time: " << total_time + << " Audio: " << total_audio * (iter + 1) + << " RealTimeX: " << total_audio * (iter + 1) / total_time; + } + + nvtxRangePop(); + if (num_todo != -1 && num_task_submitted >= num_todo) break; + } // end utterance loop + } // end iterations loop + + // We've submitted all tasks. Now waiting for them to complete + // We could also have called WaitForAllTasks and CloseAllDecodeHandles + while (cuda_pipeline.GetNumberOfTasksPending()) { + // WaitForAnyGroup is blocking. 
It will hold until one group is ready + std::string group_done = cuda_pipeline.WaitForAnyGroup(); + cuda_pipeline.CloseAllDecodeHandlesForGroup(group_done); + double total_time = timer.Elapsed(); + int32 iter = std::atoi(group_done.c_str()); + KALDI_LOG << "~Group " << group_done << " completed" + << " Aggregate Total Time: " << total_time + << " Audio: " << total_audio * (iter + 1) + << " RealTimeX: " << total_audio * (iter + 1) / total_time; + } + + // number of seconds elapsed since the creation of timer + double total_time = timer.Elapsed(); + nvtxRangePop(); + + KALDI_LOG << "Decoded " << num_task_submitted << " utterances, " << num_err + << " with errors."; + KALDI_LOG << "Overall likelihood per frame was " << (tot_like / num_frames) + << " per frame over " << num_frames << " frames."; + + KALDI_LOG << "Overall: " + << " Aggregate Total Time: " << total_time + << " Total Audio: " << total_audio * iterations + << " RealTimeX: " << total_audio * iterations / total_time; + + delete word_syms; // will delete if non-NULL. + + clat_writer.Close(); + + cuda_pipeline.Finalize(); + cudaDeviceSynchronize(); + + return 0; + } catch (const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} // main() + +#endif // if HAVE_CUDA == 1 diff --git a/src/cudafeat/Makefile b/src/cudafeat/Makefile new file mode 100644 index 00000000000..dff0dd63174 --- /dev/null +++ b/src/cudafeat/Makefile @@ -0,0 +1,30 @@ + + +all: + +include ../kaldi.mk +ifeq ($(CUDA), true) + +TESTFILES = + +ifeq ($(CUDA), true) + OBJFILES += feature-window-cuda.o feature-mfcc-cuda.o feature-online-cmvn-cuda.o \ + online-ivector-feature-cuda-kernels.o online-ivector-feature-cuda.o \ + online-cuda-feature-pipeline.o +endif + +LIBNAME = kaldi-cudafeat + +ADDLIBS = ../feat/kaldi-feat.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \ + ../base/kaldi-base.a ../cudamatrix/kaldi-cudamatrix.a \ + ../gmm/kaldi-gmm.a ../ivector/kaldi-ivector.a ../online2/kaldi-online2.a + +LDFLAGS += $(CUDA_LDFLAGS) +LDLIBS += $(CUDA_LDLIBS) + + +%.o : %.cu + $(CUDATKDIR)/bin/nvcc -c -g $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ -I$(OPENFSTINC) +endif + +include ../makefiles/default_rules.mk diff --git a/src/cudafeat/feature-mfcc-cuda.cu b/src/cudafeat/feature-mfcc-cuda.cu new file mode 100644 index 00000000000..78879d89db1 --- /dev/null +++ b/src/cudafeat/feature-mfcc-cuda.cu @@ -0,0 +1,543 @@ +// cudafeature/feature-mfcc-cuda.cu +// +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// Justin Luitjens +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#if HAVE_CUDA == 1 +#include +#include +#endif + +#include "cudafeat/feature-mfcc-cuda.h" +#include "cudamatrix/cu-rand.h" + +// Each thread block processes a unique frame +// threads in the same threadblock collaborate to +// compute the frame together. 
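// Editorial note (not part of this patch): in practice this means a kernel in
// this file working on `num_frames` frames of `num_cols` values is launched
// roughly as  some_kernel<<<num_frames, CU1DBLOCK>>>(...);  blockIdx.x picks
// the frame and each thread strides over the columns:
//   for (int c = threadIdx.x; c < num_cols; c += CU1DBLOCK) { ... }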
+__global__ void apply_lifter_and_floor_energy( + int num_frames, int num_cols, float cepstral_lifter, bool use_energy, + float energy_floor, float *log_energy, float *lifter_coeffs, + float *features, int32_t ldf) { + int thread_id = threadIdx.x; + int frame = blockIdx.x; + + float *feats = features + frame * ldf; + + // apply lifter coefficients + if (cepstral_lifter != 0.0f) { + for (int c = thread_id; c < num_cols; c += CU1DBLOCK) { + float lift = lifter_coeffs[c]; + float f = feats[c]; + feats[c] = f * lift; + } + } + + // Thread 0 for each frame will apply energy + if (use_energy && thread_id == 0) { + float energy = log_energy[frame]; + float log_energy_floor = log(energy_floor); + + if (energy_floor > 0.0f && energy < log_energy_floor) { + energy = log_energy_floor; + } + feats[0] = energy; + } +} + +// Each threadblock computes a different row of the matrix. +// Threads in the same block compute the row collaboratively. +// This kernel must be called out of place (A_in!=A_out). +__global__ void power_spectrum_kernel(int row_length, float *A_in, int32_t ldi, + float *A_out, int32_t ldo) { + int thread_id = threadIdx.x; + int block_id = blockIdx.x; + float *Ar = A_in + block_id * ldi; + float *Aw = A_out + block_id * ldo; + + int half_length = row_length / 2; + for (int idx = thread_id; idx < half_length; idx += CU1DBLOCK) { + // ignore special case + if (idx == 0) continue; + + float2 val = reinterpret_cast(Ar)[idx]; + float ret = val.x * val.x + val.y * val.y; + Aw[idx] = ret; + } + + // handle special case + if (threadIdx.x == 0) { + float real = Ar[0]; + // cufft puts this at the end, this is different than kaldi does with its + // own + // internal implementation + float im = Ar[row_length]; + + Aw[0] = real * real; + Aw[half_length] = im * im; + } +} + +// Expects to be called with 32x8 sized thread block. 
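// Editorial note (not part of this patch): "32x8" means blockDim = (32, 8).
// blockIdx.x selects the mel bin and blockIdx.y a batch of 8 frames; each of
// the 8 warps (threadIdx.y) handles one frame, the 32 lanes stride over that
// bin's weights, and cub::WarpReduce produces the dot product. The matching
// launch in ComputeFinalFeatures below is
//   dim3 mel_threads(32, 8);
//   dim3 mel_blocks(num_bins, (num_frames + mel_threads.y - 1) / mel_threads.y);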
+__global__ void mel_banks_compute_kernel(int32_t num_frames, float energy_floor, + int32 *offsets, int32 *sizes, + float **vecs, const float *feats, + int32_t ldf, float *mels, + int32_t ldm) { + // Specialize WarpReduce for type float + typedef cub::WarpReduce WarpReduce; + // Allocate WarpReduce shared memory for 8 warps + __shared__ typename WarpReduce::TempStorage temp_storage[8]; + + // warp will work together to compute sum + int tid = threadIdx.x; + int wid = threadIdx.y; + // blocks in the x dimension take different bins + int bin = blockIdx.x; + // frame is a combination of blocks in the y dimension and threads in the y + // dimension + int frame = blockIdx.y * blockDim.y + threadIdx.y; + + if (frame >= num_frames) return; + + int offset = offsets[bin]; + int size = sizes[bin]; + const float *v = vecs[bin]; + const float *w = feats + frame * ldf + offset; + + // perfom local sum + float sum = 0; + for (int idx = tid; idx < size; idx += 32) { + sum += v[idx] * w[idx]; + } + + // Sum in cub + sum = WarpReduce(temp_storage[wid]).Sum(sum); + if (tid == 0) { + // avoid log of zero + if (sum < energy_floor) sum = energy_floor; + float val = logf(sum); + mels[frame * ldm + bin] = val; + } +} + +__global__ void process_window_kernel( + int frame_length, float dither, float energy_floor, bool remove_dc_offset, + float preemph_coeff, bool need_raw_log_energy, float *log_energy_pre_window, + const float *windowing, float *tmp_windows, int32_t ldt, float *windows, + int32_t ldw) { + // Specialize WarpReduce for type float + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + int thread_id = threadIdx.x; + int row = blockIdx.x; + float *tmp_window = tmp_windows + row * ldt; + float *window = windows + row * ldw; + + __shared__ float ssum; + + float sum = 0; + float wdot = 0; + + for (int idx = thread_id; idx < frame_length; idx += CU1DBLOCK) { + // tmp_window contains optional dither. Apply that on read. + float wval = window[idx]; + if (dither != 0.0f) { + wval += tmp_window[idx] * dither; + } + // compute local sum for removing dc offset + sum += wval; + // compute dot product for log energy + wdot += wval * wval; + + float windowing_mul = 1; + if (remove_dc_offset == false && preemph_coeff == 0.0f) { + // we are done here so set windowing multiplication on write. 
+ windowing_mul = windowing[idx]; + } + + // write dithered output + window[idx] = wval * windowing_mul; + } + __syncthreads(); + if (remove_dc_offset) { + // we will recompute this below + wdot = 0.0f; + // use cub to reduce + sum = BlockReduce(temp_storage).Sum(sum); + + // broadcast sum to entire block + if (thread_id == 0) ssum = sum; + __syncthreads(); + + sum = -ssum / frame_length; + for (int idx = thread_id; idx < frame_length; idx += CU1DBLOCK) { + float windowing_mul = 1; + float *out = window; + if (preemph_coeff == 0.0f) { + // we are done here so apply windowing + windowing_mul = windowing[idx]; + } else { + // write to temp window as we will copy back into window + // when doing pre-emphasis + out = tmp_window; + } + // updated window value + float wval = window[idx] + sum; + + // compute new dot product with dc offset removed + wdot += wval * wval; + + // write output + out[idx] = wval * windowing_mul; + } + } + __syncthreads(); + + // if pointer is not NULL we will set energy to either + // the computed energy or 0 depending on need_raw_log_energy + if (log_energy_pre_window != NULL) { + float energy = 0.0f; + + if (need_raw_log_energy) { + // must sync to use retemp_storage + if (remove_dc_offset) __syncthreads(); + // use cub to reduce + wdot = BlockReduce(temp_storage).Sum(wdot); + + energy = max(wdot, energy_floor); + } + + if (thread_id == 0) { + log_energy_pre_window[row] = log(energy); + } + } + + // TODO this could be more efficient using shared memory instead of + // tmp_window. + if (preemph_coeff != 0.0f) { + // wait for tmp_window to be computed + __threadfence(); + __syncthreads(); + // starting thread idx at 0 to keep writes aligned. + // unaligned reads are less painful then unaligned writes + for (int idx = thread_id; idx < frame_length; idx += CU1DBLOCK) { + float wval = tmp_window[idx]; + float prev_window = wval; + if (idx > 0) { + prev_window = tmp_window[idx - 1]; + } + // use __fmul_rn to match CPU + // window[idx] = (wval - preemph_coeff*prev_window) * windowing[idx]; + window[idx] = + (wval - __fmul_rn(preemph_coeff, prev_window)) * windowing[idx]; + } + } +} + +__device__ inline int32 FirstSampleOfFrame(int32 frame, int32 frame_shift, + int32 window_size, bool snip_edges) { + if (snip_edges) { + return frame * frame_shift; + } else { + int32 midpoint_of_frame = frame_shift * frame + frame_shift / 2, + beginning_of_frame = midpoint_of_frame - window_size / 2; + return beginning_of_frame; + } +} + +__global__ void extract_window_kernel( + int32 frame_shift, int32 frame_length, int32 frame_length_padded, + int32 window_size, bool snip_edges, int32_t sample_offset, + const BaseFloat __restrict__ *wave, int32 wave_dim, + BaseFloat *__restrict__ windows, int32_t wlda) { + int frame = blockIdx.x; + int tidx = threadIdx.x; + + int32 start_sample = + FirstSampleOfFrame(frame, frame_shift, window_size, snip_edges); + + // wave_start and wave_end are start and end indexes into 'wave', for the + // piece of wave that we're trying to extract. + int32 wave_start = int32(start_sample - sample_offset), + wave_end = wave_start + frame_length; + + BaseFloat *window = windows + frame * wlda; + if (wave_start >= 0 && wave_end <= wave_dim) { + // the normal case-- no edge effects to consider. + for (int i = tidx; i < frame_length; i += blockDim.x) { + window[i] = wave[wave_start + i]; + } + } else { + // Deal with any end effects by reflection, if needed. 
This code will only + // be reached for about two frames per utterance, so we don't concern + // ourselves excessively with efficiency. + for (int s = tidx; s < frame_length; s += blockDim.x) { + int32 s_in_wave = s + wave_start; + while (s_in_wave < 0 || s_in_wave >= wave_dim) { + // reflect around the beginning or end of the wave. + // e.g. -1 -> 0, -2 -> 1. + // dim -> dim - 1, dim + 1 -> dim - 2. + // the code supports repeated reflections, although this + // would only be needed in pathological cases. + if (s_in_wave < 0) + s_in_wave = -s_in_wave - 1; + else + s_in_wave = 2 * wave_dim - 1 - s_in_wave; + } + window[s] = wave[s_in_wave]; + } + } + + if (frame_length_padded > frame_length) { + for (int i = frame_length + tidx; i < frame_length_padded; + i += blockDim.x) { + window[i] = 0.0f; + } + } +} + +// For each frame +// compute logf(dot(signal_frame, signal_frame)) +__global__ void dot_log_kernel(int32_t num_frames, int32_t frame_length, + float *signal_frame, int32_t lds, + float *signal_log_energy) { + // Specialize WarpReduce for type float + typedef cub::BlockReduce BlockReduce; + // Allocate WarpReduce shared memory for 8 warps + __shared__ typename BlockReduce::TempStorage temp_storage; + + int32_t frame = blockIdx.x; + int32_t tid = threadIdx.x; + + float *in = signal_frame + frame * lds; + float sum = 0; + + // preform local dot product + for (int32_t i = tid; i < frame_length; i += blockDim.x) { + float val = in[i]; + sum += val * val; + } + + // reduce using cub + sum = BlockReduce(temp_storage).Sum(sum); + + if (threadIdx.x == 0) { + signal_log_energy[frame] = logf(sum); + } +} + +namespace kaldi { + +CudaMfcc::CudaMfcc(const MfccOptions &opts) + : MfccComputer(opts), + cu_lifter_coeffs_(lifter_coeffs_), + cu_dct_matrix_(dct_matrix_), + window_function_(opts.frame_opts) { + const MelBanks *mel_banks = GetMelBanks(1.0); + const std::vector>> &bins = + mel_banks->GetBins(); + int size = bins.size(); + bin_size_ = size; + std::vector offsets(size), sizes(size); + std::vector vecs(size); + cu_vecs_ = new CuVector[size]; + for (int i = 0; i < bins.size(); i++) { + cu_vecs_[i].Resize(bins[i].second.Dim(), kUndefined); + cu_vecs_[i].CopyFromVec(bins[i].second); + vecs[i] = cu_vecs_[i].Data(); + sizes[i] = cu_vecs_[i].Dim(); + offsets[i] = bins[i].first; + } + offsets_ = static_cast( + CuDevice::Instantiate().Malloc(size * sizeof(int32))); + sizes_ = static_cast( + CuDevice::Instantiate().Malloc(size * sizeof(int32))); + vecs_ = static_cast( + CuDevice::Instantiate().Malloc(size * sizeof(float *))); + + CU_SAFE_CALL(cudaMemcpyAsync(vecs_, &vecs[0], size * sizeof(float *), + cudaMemcpyHostToDevice, cudaStreamPerThread)); + CU_SAFE_CALL(cudaMemcpyAsync(offsets_, &offsets[0], size * sizeof(int32), + cudaMemcpyHostToDevice, cudaStreamPerThread)); + CU_SAFE_CALL(cudaMemcpyAsync(sizes_, &sizes[0], size * sizeof(int32), + cudaMemcpyHostToDevice, cudaStreamPerThread)); + CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread)); + + frame_length_ = opts.frame_opts.WindowSize(); + padded_length_ = opts.frame_opts.PaddedWindowSize(); + fft_length_ = padded_length_ / 2; // + 1; + fft_size_ = 800; + + // place holders to get strides for cufft. these will be resized correctly + // later. The +2 for cufft/fftw requirements of an extra element at the end. 
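  // (Editorial clarification: a real-to-complex FFT of length N produces
  //  N / 2 + 1 complex bins, i.e. N + 2 floats per row for even N, which is
  //  why tmp_window_ is allocated with padded_length_ + 2 columns.)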
+ // turning off stride because cufft seems buggy with a stride + cu_windows_.Resize(fft_size_, padded_length_, kUndefined, + kStrideEqualNumCols); + tmp_window_.Resize(fft_size_, padded_length_ + 2, kUndefined, + kStrideEqualNumCols); + + stride_ = cu_windows_.Stride(); + tmp_stride_ = tmp_window_.Stride(); + + cufftPlanMany(&plan_, 1, &padded_length_, NULL, 1, stride_, NULL, 1, + tmp_stride_ / 2, CUFFT_R2C, fft_size_); + cufftSetStream(plan_, cudaStreamPerThread); +} + +// ExtractWindow extracts a windowed frame of waveform with a power-of-two, +// padded size. It does mean subtraction, pre-emphasis and dithering as +// requested. +void CudaMfcc::ExtractWindows(int32_t num_frames, int64 sample_offset, + const CuVectorBase &wave, + const FrameExtractionOptions &opts) { + KALDI_ASSERT(sample_offset >= 0 && wave.Dim() != 0); + int32 frame_length = opts.WindowSize(), + frame_length_padded = opts.PaddedWindowSize(); + int64 num_samples = sample_offset + wave.Dim(); + + extract_window_kernel<<>>( + opts.WindowShift(), frame_length, frame_length_padded, opts.WindowSize(), + opts.snip_edges, sample_offset, wave.Data(), wave.Dim(), + cu_windows_.Data(), cu_windows_.Stride()); + CU_SAFE_CALL(cudaGetLastError()); +} + +void CudaMfcc::ProcessWindows(int num_frames, + const FrameExtractionOptions &opts, + CuVectorBase *log_energy_pre_window) { + if (num_frames == 0) return; + + int fft_num_frames = cu_windows_.NumRows(); + KALDI_ASSERT(fft_num_frames % fft_size_ == 0); + + process_window_kernel<<>>( + frame_length_, opts.dither, std::numeric_limits::epsilon(), + opts.remove_dc_offset, opts.preemph_coeff, NeedRawLogEnergy(), + log_energy_pre_window->Data(), window_function_.cu_window.Data(), + tmp_window_.Data(), tmp_window_.Stride(), cu_windows_.Data(), + cu_windows_.Stride()); + + CU_SAFE_CALL(cudaGetLastError()); +} + +void CudaMfcc::ComputeFinalFeatures(int num_frames, BaseFloat vtln_wrap, + CuVector *cu_signal_log_energy, + CuMatrix *cu_features) { + Vector tmp; + assert(opts_.htk_compat == false); + + if (num_frames == 0) return; + + if (opts_.use_energy && !opts_.raw_energy) { + dot_log_kernel<<>>( + num_frames, cu_windows_.NumCols(), cu_windows_.Data(), + cu_windows_.Stride(), cu_signal_log_energy->Data()); + CU_SAFE_CALL(cudaGetLastError()); + } + + // make sure a reallocation hasn't changed these + KALDI_ASSERT(cu_windows_.Stride() == stride_); + KALDI_ASSERT(tmp_window_.Stride() == tmp_stride_); + + // Perform FFTs in batches of fft_size. 
This reduces memory requirements + for (int idx = 0; idx < num_frames; idx += fft_size_) { + CUFFT_SAFE_CALL(cufftExecR2C( + plan_, cu_windows_.Data() + cu_windows_.Stride() * idx, + (cufftComplex *)(tmp_window_.Data() + tmp_window_.Stride() * idx))); + } + + // Compute Power spectrum + CuMatrix power_spectrum(tmp_window_.NumRows(), + padded_length_ / 2 + 1, kUndefined); + + power_spectrum_kernel<<>>( + padded_length_, tmp_window_.Data(), tmp_window_.Stride(), + power_spectrum.Data(), power_spectrum.Stride()); + CU_SAFE_CALL(cudaGetLastError()); + + // mel banks + int num_bins = bin_size_; + cu_mel_energies_.Resize(num_frames, num_bins, kUndefined); + dim3 mel_threads(32, 8); + dim3 mel_blocks(num_bins, (num_frames + mel_threads.y - 1) / mel_threads.y); + mel_banks_compute_kernel<<>>( + num_frames, std::numeric_limits::epsilon(), offsets_, sizes_, + vecs_, power_spectrum.Data(), power_spectrum.Stride(), + cu_mel_energies_.Data(), cu_mel_energies_.Stride()); + CU_SAFE_CALL(cudaGetLastError()); + + // dct transform + cu_features->AddMatMat(1.0, cu_mel_energies_, kNoTrans, cu_dct_matrix_, + kTrans, 0.0); + + apply_lifter_and_floor_energy<<>>( + cu_features->NumRows(), cu_features->NumCols(), opts_.cepstral_lifter, + opts_.use_energy, opts_.energy_floor, cu_signal_log_energy->Data(), + cu_lifter_coeffs_.Data(), cu_features->Data(), cu_features->Stride()); + CU_SAFE_CALL(cudaGetLastError()); +} + +void CudaMfcc::ComputeFeatures(const CuVectorBase &cu_wave, + BaseFloat sample_freq, BaseFloat vtln_warp, + CuMatrix *cu_features) { + nvtxRangePushA("CudaMfcc::ComputeFeatures"); + const FrameExtractionOptions &frame_opts = GetFrameOptions(); + int num_frames = NumFrames(cu_wave.Dim(), frame_opts, true); + // compute fft frames by rounding up to a multiple of fft_size_ + int fft_num_frames = num_frames + (fft_size_ - num_frames % fft_size_); + int feature_dim = Dim(); + bool use_raw_log_energy = NeedRawLogEnergy(); + + CuVector raw_log_energies; + raw_log_energies.Resize(num_frames, kUndefined); + + cu_windows_.Resize(fft_num_frames, padded_length_, kUndefined, + kStrideEqualNumCols); + cu_features->Resize(num_frames, feature_dim, kUndefined); + //+1 matches cufft/fftw requirements + tmp_window_.Resize(fft_num_frames, padded_length_ + 2, kUndefined, + kStrideEqualNumCols); + + if (frame_opts.dither != 0.0f) { + // Calling cu-rand directly + // CuRand class works on CuMatrixBase which must + // assume that the matrix is part of a larger matrix + // Doing this directly avoids unecessary memory copies + CURAND_SAFE_CALL( + curandGenerateNormal(GetCurandHandle(), tmp_window_.Data(), + tmp_window_.NumRows() * tmp_window_.Stride(), + 0.0 /*mean*/, 1.0 /*stddev*/)); + } + + // Extract Windows + ExtractWindows(num_frames, 0, cu_wave, frame_opts); + + // Process Windows + ProcessWindows(num_frames, frame_opts, &raw_log_energies); + + // Compute Features + ComputeFinalFeatures(num_frames, 1.0, &raw_log_energies, cu_features); + + nvtxRangePop(); +} +CudaMfcc::~CudaMfcc() { + delete[] cu_vecs_; + CuDevice::Instantiate().Free(vecs_); + CuDevice::Instantiate().Free(offsets_); + CuDevice::Instantiate().Free(sizes_); + cufftDestroy(plan_); +} +} // namespace kaldi diff --git a/src/cudafeat/feature-mfcc-cuda.h b/src/cudafeat/feature-mfcc-cuda.h new file mode 100644 index 00000000000..5fabc4c8fe4 --- /dev/null +++ b/src/cudafeat/feature-mfcc-cuda.h @@ -0,0 +1,74 @@ +// cudafeat/feature-mfcc-cuda.h +// +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
+// Justin Luitjens +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_CUDAFEAT_FEATURE_MFCC_CUDA_H_ +#define KALDI_CUDAFEAT_FEATURE_MFCC_CUDA_H_ + +#if HAVE_CUDA == 1 +#include +#endif + +#include "cudafeat/feature-window-cuda.h" +#include "cudamatrix/cu-matrix.h" +#include "cudamatrix/cu-vector.h" +#include "feat/feature-mfcc.h" + +namespace kaldi { +// This class implements MFCC computation in CUDA. +// It takes input from device memory and outputs to +// device memory. It also does no synchronization. +class CudaMfcc : public MfccComputer { + public: + void ComputeFeatures(const CuVectorBase &cu_wave, + BaseFloat sample_freq, BaseFloat vtln_warp, + CuMatrix *cu_features); + + CudaMfcc(const MfccOptions &opts); + ~CudaMfcc(); + + private: + void ExtractWindows(int32 num_frames, int64 sample_offset, + const CuVectorBase &wave, + const FrameExtractionOptions &opts); + + void ProcessWindows(int num_frames, const FrameExtractionOptions &opts, + CuVectorBase *log_energy_pre_window); + + void ComputeFinalFeatures(int num_frames, BaseFloat vtln_wrap, + CuVector *cu_signal_log_energy, + CuMatrix *cu_features); + + CuMatrix cu_windows_; + CuMatrix tmp_window_, cu_mel_energies_; + CuMatrix cu_dct_matrix_; + CuVector cu_lifter_coeffs_; + + int frame_length_, padded_length_, fft_length_, fft_size_; + cufftHandle plan_; + CudaFeatureWindowFunction window_function_; + + int bin_size_; + int32 *offsets_, *sizes_; + CuVector *cu_vecs_; + float **vecs_; + + // for sanity checking cufft + int32_t stride_, tmp_stride_; +}; +} + +#endif diff --git a/src/cudafeat/feature-online-cmvn-cuda.cu b/src/cudafeat/feature-online-cmvn-cuda.cu new file mode 100644 index 00000000000..e6ed8aef8a0 --- /dev/null +++ b/src/cudafeat/feature-online-cmvn-cuda.cu @@ -0,0 +1,203 @@ +// cudafeat/feature-online-cmvn-cuda.cu +// +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// Justin Luitjens +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
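// Editorial sketch (not part of this patch): minimal host-side driver for the
// CudaMfcc computer declared in feature-mfcc-cuda.h above. The default
// MfccOptions and the helper name are illustrative assumptions; everything
// after the waveform copy stays in device memory and, as the header notes,
// CudaMfcc does not synchronize the stream itself.
#include "cudafeat/feature-mfcc-cuda.h"
#include "cudamatrix/cu-vector.h"
#include "matrix/kaldi-vector.h"

void ComputeMfccOnDevice(const kaldi::VectorBase<kaldi::BaseFloat> &wave,
                         kaldi::CuMatrix<kaldi::BaseFloat> *feats) {
  using namespace kaldi;
  MfccOptions mfcc_opts;              // default MFCC configuration (13 cepstra)
  CudaMfcc mfcc(mfcc_opts);
  CuVector<BaseFloat> cu_wave(wave);  // host -> device copy of the samples
  mfcc.ComputeFeatures(cu_wave, mfcc_opts.frame_opts.samp_freq,
                       1.0f /* vtln_warp */, feats);
}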
+ +#include +#include "cudafeat/feature-online-cmvn-cuda.h" +#include "cudamatrix/cu-matrix.h" +#include "cudamatrix/cu-vector.h" + +__host__ __device__ inline float2 operator-(const float2 &a, const float2 &b) { + float2 retval; + retval.x = a.x - b.x; + retval.y = a.y - b.y; + return retval; +} +__host__ __device__ inline float2 operator+(const float2 &a, const float2 &b) { + float2 retval; + retval.x = a.x + b.x; + retval.y = a.y + b.y; + return retval; +} + +#if __CUDA_ARCH__ == 750 +__launch_bounds__ (1024, 1) +#else +__launch_bounds__ (1024, 2) +#endif +__global__ void compute_cmvn_stats_kernel(const float *data, int32_t ldd, + int32_t num_frames, int32_t feat_dim, + float *stats, int32_t lds) { + typedef cub::BlockScan BlockScan; + __shared__ typename BlockScan::TempStorage temp_storage; + + int32_t feat = blockIdx.x; + + float2 running_sum = {0.0f, 0.0f}; + // for each frame, keep threads alive for cub + for (int32_t r = 0; r < num_frames; r += blockDim.x) { + int32_t rid = r + threadIdx.x; + + float val = 0.0f; + + if (rid < num_frames) { + // uncoalesced, could transpose data or do some shared memory swizzling... + val = data[rid * ldd + feat]; + } + + float2 sum = {val, val * val}; // this elements value and value squared + + float2 psum; // row prefix sum + float2 total; // total count + BlockScan(temp_storage).InclusiveSum(sum, psum, total); + + // offset by running sum + psum = psum + running_sum; + // increase running sum by new total + running_sum = running_sum + total; + + // un-coalesced + if (rid < num_frames) { + reinterpret_cast(&stats[rid * lds])[feat] = psum; + } + } +} + +__global__ void apply_cmvn_kernel( + int32_t cmvn_window, bool var_norm, bool mean_norm, const float *feat_in, + int32_t ldi, int32_t num_rows, int32_t num_cols, + const float *__restrict__ stats, int32_t lds, + const float *__restrict__ global_stats, int32_t ldg, int32_t global_frames, + const float *__restrict__ speaker_stats, int32_t ldss, + int32_t speaker_frames, float *feat_out, int32_t ldo) { + int32_t r = blockIdx.x; + + for (int c = threadIdx.x; c < num_cols; c += blockDim.x) { + float2 frame_stats = + reinterpret_cast(&stats[r * lds])[c]; + + float val = feat_in[r * ldi + c]; + + float window_length = min(r + 1, cmvn_window); + + // we have to subtract row r-cmvn_window stats + if (r >= cmvn_window) { + // window starting row + int32_t o = r - cmvn_window; + + // stats at the start row of the window that must be removed + float2 ostats = + reinterpret_cast(&stats[o * lds])[c]; + + // remove start of the window stats + frame_stats = frame_stats - ostats; + } + + // Smooth stats by speaker frames if necessary + float smooth_frames = cmvn_window - window_length; + if (smooth_frames > 0 && speaker_frames > 0) { + float count_from_speaker = min(smooth_frames, (float)speaker_frames); + float speaker_count = speaker_stats[num_cols]; + + if (count_from_speaker > 0.0) { + float alpha = count_from_speaker / speaker_count; + + frame_stats.x += alpha * speaker_stats[c]; // update mean + frame_stats.y += alpha * speaker_stats[ldss + c]; // update variance + window_length += alpha * speaker_count; // update window length + + // recompute smooth frames now that we have speaker stats + smooth_frames = cmvn_window - window_length; + } + } + + // Smooth stats by global frames if necessary + if (smooth_frames > 0 && global_frames > 0) { + float count_from_global = min(smooth_frames, (float)global_frames); + float global_count = global_stats[num_cols]; + + if (count_from_global > 0.0) { + float alpha = 
count_from_global / global_count; + + frame_stats.x += alpha * global_stats[c]; // update mean + frame_stats.y += alpha * global_stats[ldg + c]; // update variance + window_length += alpha * global_count; // update window length + } + } + + float mean = frame_stats.x / window_length; + float var = frame_stats.y / window_length - mean * mean; + + float floor = 1e-20; + if (var < floor) // avoid dividing by zero + var = floor; + + if (!var_norm) { + // skip variance normalization + var = 1.0f; + } + if (!mean_norm) { + assert(false); + // skip mean normalization + mean = 0.0f; + } + + // shift by mean and scale by variance + feat_out[r * ldo + c] = (val - mean) / sqrtf(var); + } +} + +namespace kaldi { + +void CudaOnlineCmvn::ComputeFeatures(const CuMatrixBase &feats_in, + CuMatrix *feats_out) { + int32_t num_frames = feats_in.NumRows(); + int32_t feat_dim = feats_in.NumCols(); + feats_out->Resize(num_frames, feat_dim, kUndefined); + + CuMatrix stats(num_frames, feat_dim * 2, kUndefined); + + int threads = 1024; + int blocks = feat_dim; + + // compute windowed sum/sum2 prefix sum along column of feats + compute_cmvn_stats_kernel<<>>( + feats_in.Data(), feats_in.Stride(), num_frames, feat_dim, stats.Data(), + stats.Stride()); + CU_SAFE_CALL(cudaGetLastError()); + + threads = (feat_dim + 31) / 32 * 32; // round up to 32 threads + if (threads > 1024) threads = 1024; + + const CuMatrix &gstats = cmvn_state_.global_cmvn_stats; + const CuMatrix &sstats = cmvn_state_.speaker_cmvn_stats; + + int global_frames = opts_.global_frames; + int speaker_frames = opts_.speaker_frames; + + if (gstats.NumRows() == 0) global_frames = 0; + if (sstats.NumRows() == 0) speaker_frames = 0; + + // apply cmvn + apply_cmvn_kernel<<>>( + opts_.cmn_window, opts_.normalize_variance, opts_.normalize_mean, + feats_in.Data(), feats_in.Stride(), num_frames, feat_dim, stats.Data(), + stats.Stride(), gstats.Data(), gstats.Stride(), global_frames, + sstats.Data(), sstats.Stride(), speaker_frames, feats_out->Data(), + feats_out->Stride()); + CU_SAFE_CALL(cudaGetLastError()); +} +} diff --git a/src/cudafeat/feature-online-cmvn-cuda.h b/src/cudafeat/feature-online-cmvn-cuda.h new file mode 100644 index 00000000000..729467a7a88 --- /dev/null +++ b/src/cudafeat/feature-online-cmvn-cuda.h @@ -0,0 +1,59 @@ +// cudafeat/feature-online-cmvn-cuda.h +// +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// Justin Luitjens +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
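// Editorial sketch (not part of this patch): a scalar reference for what the
// two kernels above compute for one feature dimension, with the speaker/global
// smoothing path left out. The prefix-scan kernel effectively provides
// (sum, sum-of-squares) over frames [0, t]; apply_cmvn_kernel subtracts the
// prefix at t - cmvn_window to get the sliding-window statistics used here.
#include <algorithm>
#include <cmath>
#include <vector>

float OnlineCmvnOneDim(const std::vector<float> &x, int t, int cmvn_window,
                       bool var_norm) {
  int window_begin = std::max(0, t - cmvn_window + 1);
  float sum = 0.0f, sumsq = 0.0f;
  for (int i = window_begin; i <= t; ++i) {
    sum += x[i];
    sumsq += x[i] * x[i];
  }
  float window_length = std::min(t + 1, cmvn_window);
  float mean = sum / window_length;
  float var = sumsq / window_length - mean * mean;
  var = std::max(var, 1e-20f);  // same floor as apply_cmvn_kernel
  if (!var_norm) var = 1.0f;    // mean normalization only
  return (x[t] - mean) / std::sqrt(var);
}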
+ +#ifndef KALDI_CUDAFEAT_FEATURE_ONLINE_CMVN_CUDA_H_ +#define KALDI_CUDAFEAT_FEATURE_ONLINE_CMVN_CUDA_H_ + +#include "cudamatrix/cu-matrix.h" +#include "cudamatrix/cu-vector.h" +#include "feat/online-feature.h" + +namespace kaldi { + +struct CudaOnlineCmvnState { + // The following is the global CMVN stats, in the usual + // format, of dimension 2 x (dim+1), as [ sum-stats count + // sum-sqared-stats 0 ] + CuMatrix global_cmvn_stats; + CuMatrix speaker_cmvn_stats; + + CudaOnlineCmvnState(){}; + CudaOnlineCmvnState(const OnlineCmvnState &cmvn_state) + : global_cmvn_stats(cmvn_state.global_cmvn_stats), + speaker_cmvn_stats(cmvn_state.speaker_cmvn_stats) {} + + CudaOnlineCmvnState(const CudaOnlineCmvnState &cmvn_state) + : global_cmvn_stats(cmvn_state.global_cmvn_stats), + speaker_cmvn_stats(cmvn_state.speaker_cmvn_stats) {} +}; + +class CudaOnlineCmvn { + public: + CudaOnlineCmvn(const OnlineCmvnOptions &opts, const CudaOnlineCmvnState &cmvn_state) + : opts_(opts), cmvn_state_(cmvn_state){}; + ~CudaOnlineCmvn(){}; + + void ComputeFeatures(const CuMatrixBase &feats_in, + CuMatrix *feats_out); + + private: + const OnlineCmvnOptions &opts_; + const CudaOnlineCmvnState &cmvn_state_; +}; +} + +#endif diff --git a/src/cudafeat/feature-window-cuda.cu b/src/cudafeat/feature-window-cuda.cu new file mode 100644 index 00000000000..7ce7d798ca2 --- /dev/null +++ b/src/cudafeat/feature-window-cuda.cu @@ -0,0 +1,39 @@ +// cudafeat/feature-window-cuda.cu +// +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// Justin Luitjens +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#if HAVE_CUDA == 1 +#include +#endif +#include "cudafeat/feature-window-cuda.h" +#include "matrix/matrix-functions.h" + +namespace kaldi { + +CudaFeatureWindowFunction::CudaFeatureWindowFunction( + const FrameExtractionOptions &opts) { + nvtxRangePushA("CudaFeatureWindowFunction::CudaFeatureWindowFunction"); + int32 frame_length = opts.WindowSize(); + + // Create CPU feature window + FeatureWindowFunction feature_window(opts); + + // Copy into GPU memory + cu_window.Resize(frame_length, kUndefined); + cu_window.CopyFromVec(feature_window.window); + nvtxRangePop(); +} +} // namespace kaldi diff --git a/src/cudafeat/feature-window-cuda.h b/src/cudafeat/feature-window-cuda.h new file mode 100644 index 00000000000..ff749a855b9 --- /dev/null +++ b/src/cudafeat/feature-window-cuda.h @@ -0,0 +1,38 @@ +// cudafeat/feature-window-cuda.h +// +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// Justin Luitjens +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_CUDAFEAT_FEATURE_WINDOW_CUDA_H_
+#define KALDI_CUDAFEAT_FEATURE_WINDOW_CUDA_H_
+
+#include "cudamatrix/cu-matrix.h"
+#include "cudamatrix/cu-vector.h"
+#include "feat/feature-window.h"
+
+namespace kaldi {
+
+// This struct stores a feature window on the device.
+// Behind the scenes it just computes a feature window on
+// the host and then copies it into device memory.
+struct CudaFeatureWindowFunction {
+  CudaFeatureWindowFunction() {}
+  explicit CudaFeatureWindowFunction(const FrameExtractionOptions &opts);
+  CuVector<float> cu_window;
+};
+
+} // namespace kaldi
+
+#endif // KALDI_CUDAFEAT_FEATURE_WINDOW_CUDA_H_
diff --git a/src/cudafeat/online-cuda-feature-pipeline.cc b/src/cudafeat/online-cuda-feature-pipeline.cc
new file mode 100644
index 00000000000..4fd092b4f05
--- /dev/null
+++ b/src/cudafeat/online-cuda-feature-pipeline.cc
@@ -0,0 +1,70 @@
+// cudafeat/online-cuda-feature-pipeline.cc
+
+// Copyright 2013 Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+ +#include "cudafeat/online-cuda-feature-pipeline.h" + +namespace kaldi { + +OnlineCudaFeaturePipeline::OnlineCudaFeaturePipeline( + const OnlineNnet2FeaturePipelineConfig &config) + : info_(config), mfcc(NULL), ivector(NULL) { + if (info_.feature_type == "mfcc") { + mfcc = new CudaMfcc(info_.mfcc_opts); + } + + if (info_.use_ivectors) { + OnlineIvectorExtractionConfig ivector_extraction_opts; + ReadConfigFromFile(config.ivector_extraction_config, + &ivector_extraction_opts); + info_.ivector_extractor_info.Init(ivector_extraction_opts); + + // Only these ivector options are currently supported + ivector_extraction_opts.use_most_recent_ivector = true; + ivector_extraction_opts.greedy_ivector_extractor = true; + + ivector = new IvectorExtractorFastCuda(ivector_extraction_opts); + } +} + +OnlineCudaFeaturePipeline::~OnlineCudaFeaturePipeline() { + if (mfcc != NULL) delete mfcc; + if (ivector != NULL) delete ivector; +} + +void OnlineCudaFeaturePipeline::ComputeFeatures( + const CuVectorBase &cu_wave, BaseFloat sample_freq, + CuMatrix *input_features, + CuVector *ivector_features) { + if (info_.feature_type == "mfcc") { + // MFCC + float vtln_warp = 1.0; + mfcc->ComputeFeatures(cu_wave, sample_freq, vtln_warp, input_features); + } else { + KALDI_ASSERT(false); + } + + // Ivector + if (info_.use_ivectors && ivector_features != NULL) { + ivector->GetIvector(*input_features, ivector_features); + } else { + KALDI_ASSERT(false); + } +} + +} // namespace kaldi diff --git a/src/cudafeat/online-cuda-feature-pipeline.h b/src/cudafeat/online-cuda-feature-pipeline.h new file mode 100644 index 00000000000..5c71d37b395 --- /dev/null +++ b/src/cudafeat/online-cuda-feature-pipeline.h @@ -0,0 +1,55 @@ +// cudafeat/online-cuda-feature-pipeline.h + +// Copyright 2013-2014 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef KALDI_CUDAFEAT_ONLINE_CUDA_FEATURE_PIPELINE_H_ +#define KALDI_CUDAFEAT_ONLINE_CUDA_FEATURE_PIPELINE_H_ + +#include +#include +#include + +#include "base/kaldi-error.h" +#include "cudafeat/feature-mfcc-cuda.h" +#include "cudafeat/online-ivector-feature-cuda.h" +#include "matrix/matrix-lib.h" +#include "online2/online-nnet2-feature-pipeline.h" +#include "util/common-utils.h" + +namespace kaldi { + +class OnlineCudaFeaturePipeline { + public: + explicit OnlineCudaFeaturePipeline( + const OnlineNnet2FeaturePipelineConfig &config); + + void ComputeFeatures(const CuVectorBase &cu_wave, + BaseFloat sample_freq, + CuMatrix *input_features, + CuVector *ivector_features); + + ~OnlineCudaFeaturePipeline(); + + private: + OnlineNnet2FeaturePipelineInfo info_; + CudaMfcc *mfcc; + IvectorExtractorFastCuda *ivector; +}; +} // namespace kaldi + +#endif // KALDI_CUDAFEAT_ONLINE_CUDA_FEATURE_EXTRACTOR_H_ diff --git a/src/cudafeat/online-ivector-feature-cuda-kernels.cu b/src/cudafeat/online-ivector-feature-cuda-kernels.cu new file mode 100644 index 00000000000..227f49deb63 --- /dev/null +++ b/src/cudafeat/online-ivector-feature-cuda-kernels.cu @@ -0,0 +1,239 @@ +// cudafeat/online-ivector-feature-cuda-kernels.cu +// +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// Justin Luitjens +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include "cudafeat/online-ivector-feature-cuda-kernels.h" +#include "cudamatrix/cu-common.h" +namespace kaldi { + +// Meant to be called with blockDim= 32x32 +__global__ void batched_gemv_reduce_kernel(int rows, int cols, + const float* __restrict__ A, int lda, + const float* __restrict__ X, int ldx, + float* C) { + // Specialize WarpReduce for type float + typedef cub::WarpReduce WarpReduce; + // Allocate WarpReduce shared memory for 32 warps + __shared__ typename WarpReduce::TempStorage temp_storage[32]; + + __shared__ float s_A[32][32 + 1]; //+1 to avoid bank conflicts on transpose + + int bid = blockIdx.x; // batch id + int tid = threadIdx.x; // thread id + int wid = threadIdx.y; // warp id + + // Offset to input matrix to starting row for batch + const float* __restrict__ A_in = A + bid * rows * lda; + // Offset to input vector to starting column for batch + const float* __restrict__ X_in = X + bid * ldx; + + for (int i = 0; i < cols; i += 32) { // threadIdx.x, keep all threads present + int c = i + tid; + + float sum = 0.0f; + // Perform dot product + for (int j = 0; j < rows; + j += 32) { // threadIdx.y, keep all threads present + int r = j + wid; + + float val = 0.0f; + if (c < cols && r < rows) { + // coalesced reads + val = A_in[r * lda + c] * X_in[r]; + } + + // write to shared memory + __syncthreads(); // wait for shared memory to be written + s_A[wid][tid] = val; + __syncthreads(); // wait for shared memory to be consumed + + // transpose read from shared memory and collect sum + sum += s_A[tid][wid]; + } + // reduce sum in cub + sum = WarpReduce(temp_storage[wid]).Sum(sum); + + // update c now that we are trasnposed + c = i + wid; + if (tid == 0 && c < cols) { + // Add contribution to final sum. + // Atomic necessary due to different batches updating this + atomicAdd(&C[c], sum); + } + } +} + +// computes feats^2. This works in place and out of place. +__global__ void square_matrix_kernel(int32_t num_rows, int32_t num_cols, + const float* feats, int32_t ldf, + float* feats_sq, int32_t lds) { + for (int i = blockIdx.y * blockDim.y + threadIdx.y; i < num_rows; + i += blockDim.y * gridDim.y) { + for (int j = blockIdx.x * blockDim.x + threadIdx.x; j < num_cols; + j += blockDim.x * gridDim.x) { + float f = feats[i * ldf + j]; + feats_sq[i * lds + j] = f * f; + } + } +} + +// takes features in feat and writes them into sfeats while applying +// the splicing algorithm for the left and right context. +// input features that are out of range are clamped. +__global__ void splice_features_kernel(int32_t num_frames, int32_t feat_dim, + int32_t left, int32_t size, + const float* __restrict__ feats, + int32_t ldf, float* __restrict__ sfeats, + int32_t lds) { + int32_t frame = blockIdx.x; + int32_t tid = threadIdx.x; + + // offset feature output to process frame + float* feat_out = sfeats + lds * frame; + + // for each splice of input + for (int i = 0; i < size; i++) { + int r = frame + i + left; + // clamp input row + if (r < 0) r = 0; + if (r > num_frames - 1) r = num_frames - 1; + + // for each column of input in parallel + for (int c = tid; c < feat_dim; c += blockDim.x) { + // read feature from input row offset by column + float val = feats[r * ldf + c]; + + // write feature to output offset by splice index and column + feat_out[i * feat_dim + c] = val; + } + } +} + +// Computes the sum of all terms in a matrix. +// The kernel double buffers the output such that the +// output is written to retval[b] where b is 0 or 1. +// The output element of retval is written as zero. 
+// Double buffering eliminates a call to cudaMemset +__global__ void get_matrix_sum_double_buffer_kernel(int32_t b, int32_t num_rows, + int32_t num_cols, float* A, + int32_t lda, float scale, + float* retval) { + // Specialize WarpReduce for type float + typedef cub::BlockReduce + BlockReduce; + // Allocate WarpReduce shared memory for 32 warps + __shared__ typename BlockReduce::TempStorage temp_storage; + + float sum = 0.0f; + + // compute local sums in parallel + for (int i = blockIdx.y * blockDim.y + threadIdx.y; i < num_rows; + i += blockDim.y * gridDim.y) { + for (int j = blockIdx.x * blockDim.x + threadIdx.x; j < num_cols; + j += blockDim.x * gridDim.x) { + sum += A[i * lda + j]; + } + } + + sum = BlockReduce(temp_storage).Sum(sum); + + if (threadIdx.x == 0 && threadIdx.y == 0) { + atomicAdd(&retval[b], sum * scale); + int next_b = (b + 1) % 2; + retval[next_b] = 0.0f; + } +} + +// This kernel updates the linear and quadradic terms. +// It does not support a previous weight coming in and would need to be updated +// for online decoding. +__global__ void update_linear_and_quadratic_terms_kernel( + int32_t n, float prior_offset, float* cur_tot_weight, int32_t max_count, + float* quadratic, float* linear) { + float val = 1.0f; + float cur_weight = *cur_tot_weight; + + if (max_count > 0.0f) { + float new_scale = max((float)cur_weight, (float)max_count) / max_count; + + float prior_scale_change = new_scale - 1.0f; + val += prior_scale_change; + } + + for (int32_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; + i += blockDim.x * gridDim.x) { + int32_t diag_idx = ((i + 1) * (i + 2) / 2) - 1; + quadratic[diag_idx] += val; + } + + if (threadIdx.x == 0) { + linear[0] += val * prior_offset; + } +} + +void batched_gemv_reduce(int batch_size, int rows, int cols, int A_stride, + const float* AT, int B_stride, const float* B, + const float* y, float* C) { + batched_gemv_reduce_kernel<<>>( + rows, cols, AT, A_stride, B, B_stride, C); + CU_SAFE_CALL(cudaGetLastError()); +} + +void splice_features(int32_t num_frames, int32_t feat_dim, int32_t left, + int32_t size, const float* feats, int32_t ldf, + float* sfeats, int32_t lds) { + int threads = (feat_dim + 31) / 32 * 32; // round up to the nearest warp size + if (threads > 1024) threads = 1024; // Max block size is 1024 threads + + splice_features_kernel<<>>( + num_frames, feat_dim, left, size, feats, ldf, sfeats, lds); + CU_SAFE_CALL(cudaGetLastError()); +} + +void update_linear_and_quadratic_terms(int32_t n, float prior_offset, + float* cur_tot_weight, int32_t max_count, + float* quadratic, float* linear) { + // Only using 1 CTA here for now as the updates are tiny and this lets us + // use syncthreads as a global barrier. 
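// Host-side reference of what update_linear_and_quadratic_terms_kernel
// (defined above) computes, before the launch just below. Illustrative plain
// C++ only; UpdateLinearAndQuadraticTermsRef is a hypothetical name, and the
// total weight is passed by value rather than as a device pointer.
// With 'quadratic' stored packed lower-triangular (matching diag_idx in the
// kernel): if max_count > 0 then val = max(tot_weight, max_count) / max_count,
// otherwise val = 1; every diagonal element of quadratic gets += val and
// linear[0] gets += val * prior_offset.
#include <algorithm>
#include <vector>
static void UpdateLinearAndQuadraticTermsRef(int n, float prior_offset,
                                             float tot_weight, int max_count,
                                             std::vector<float> *quadratic_packed,
                                             std::vector<float> *linear) {
  float val = 1.0f;
  if (max_count > 0) {
    // Scale the prior contribution up once the accumulated posterior
    // mass exceeds max_count.
    float new_scale = std::max(tot_weight, (float)max_count) / max_count;
    val += new_scale - 1.0f;
  }
  for (int i = 0; i < n; i++) {
    int diag_idx = ((i + 1) * (i + 2)) / 2 - 1;  // diagonal entry of row i
    (*quadratic_packed)[diag_idx] += val;
  }
  (*linear)[0] += val * prior_offset;
}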
+ update_linear_and_quadratic_terms_kernel<<<1, 1024>>>( + n, prior_offset, cur_tot_weight, max_count, quadratic, linear); + CU_SAFE_CALL(cudaGetLastError()); +} + +void get_matrix_sum_double_buffer(int32_t b, int32_t num_rows, int32_t num_cols, + float* A, int32_t lda, float scale, + float* sum) { + dim3 threads(32, 32); + dim3 blocks((num_cols + threads.x - 1) / threads.x, + (num_rows + threads.y - 1) / threads.y); + + get_matrix_sum_double_buffer_kernel<<>>( + b, num_rows, num_cols, A, lda, scale, sum); + CU_SAFE_CALL(cudaGetLastError()); +} + +void square_matrix(int32_t num_rows, int32_t num_cols, const float* feats, + int32_t ldf, float* feats_sq, int32_t lds) { + dim3 threads(32, 32); + dim3 blocks((num_cols + threads.x - 1) / threads.x, + (num_rows + threads.y - 1) / threads.y); + + square_matrix_kernel<<>>(num_rows, num_cols, feats, ldf, + feats_sq, lds); + CU_SAFE_CALL(cudaGetLastError()); +} +} diff --git a/src/cudafeat/online-ivector-feature-cuda-kernels.h b/src/cudafeat/online-ivector-feature-cuda-kernels.h new file mode 100644 index 00000000000..62407b77b2b --- /dev/null +++ b/src/cudafeat/online-ivector-feature-cuda-kernels.h @@ -0,0 +1,40 @@ +// cudafeat/online-ivector-feature-cuda-kernels.h +// +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// Justin Luitjens +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#ifndef CUDAFEAT_ONLINE_IVECTOR_FEATURE_CUDA_KERNELS +#define CUDAFEAT_ONLINE_IVECTOR_FEATURE_CUDA_KERNELS + +namespace kaldi { +void batched_gemv_reduce(int batch_size, int rows, int cols, int A_stride, + const float *AT, int B_stride, const float *B, + const float *y, float *C); + +void splice_features(int32_t num_frames, int32_t feat_dim, int32_t left, + int32_t size, const float *feats, int32_t ldf, + float *sfeats, int32_t lds); + +void update_linear_and_quadratic_terms(int32_t n, float prior_offset_, + float *cur_tot_weight, int32_t max_count, + float *quadratic, float *linear); + +void get_matrix_sum_double_buffer(int32_t b, int32_t num_rows, int32_t num_cols, + float *A, int32_t lda, float scale, + float *sum); + +void square_matrix(int32_t num_rows, int32_t num_cols, const float *feats, + int32_t ldf, float *feats_sq, int32_t lds); +} +#endif diff --git a/src/cudafeat/online-ivector-feature-cuda.cc b/src/cudafeat/online-ivector-feature-cuda.cc new file mode 100644 index 00000000000..2410fe10ef8 --- /dev/null +++ b/src/cudafeat/online-ivector-feature-cuda.cc @@ -0,0 +1,282 @@ +// cudafeat/online-ivector-feature-cuda.cc +// +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// Justin Luitjens +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#if HAVE_CUDA == 1 +#include +#endif +#include + +#include "base/io-funcs.h" +#include "base/kaldi-common.h" +#include "base/timer.h" +#include "cudafeat/feature-online-cmvn-cuda.h" +#include "cudafeat/online-ivector-feature-cuda-kernels.h" +#include "cudafeat/online-ivector-feature-cuda.h" +#include "cudamatrix/cu-device.h" +#include "cudamatrix/cu-sp-matrix.h" +#include "gmm/diag-gmm.h" +#include "util/kaldi-io.h" +#include "util/table-types.h" +namespace kaldi { + +void IvectorExtractorFastCuda::GetIvector(const CuMatrixBase &feats, + CuVector *ivector) { + nvtxRangePushA("GetIvector"); + CuMatrix posteriors, X; + CuVector gamma; + + // normalized pipeline + CuMatrix lda_feats_normalized(feats.NumRows(), feats.NumCols(), + kUndefined); + { + CudaOnlineCmvn cmvn(info_.cmvn_opts, naive_cmvn_state_); + CuMatrix cmvn_feats(feats.NumRows(), feats.NumCols(), + kUndefined); + CuMatrix spliced_feats_normalized; + + // Normalize + cmvn.ComputeFeatures(feats, &cmvn_feats); + + // Splice + SpliceFeats(cmvn_feats, &spliced_feats_normalized); + + // Transform by LDA matrix + lda_feats_normalized.AddMatMat(1.0, spliced_feats_normalized, kNoTrans, + cu_lda_, kTrans, 0.0); + } + + // non-normalized pipeline + CuMatrix lda_feats(feats.NumRows(), feats.NumCols(), kUndefined); + { + CuMatrix spliced_feats; + + // Splice feats + SpliceFeats(feats, &spliced_feats); + + // Transform by LDA matrix + lda_feats.AddMatMat(1.0, spliced_feats, kNoTrans, cu_lda_, kTrans, 0.0); + } + + // based on normalized feats + ComputePosteriors(lda_feats_normalized, &posteriors); + + // based on non-normalized feats + ComputeIvectorStats(lda_feats, posteriors, &gamma, &X); + + ComputeIvectorFromStats(gamma, X, ivector); + + nvtxRangePop(); +} + +void IvectorExtractorFastCuda::Read( + const kaldi::OnlineIvectorExtractionConfig &config) { + // read ubm + DiagGmm gmm; + ReadKaldiObject(config.diag_ubm_rxfilename, &gmm); + ubm_gconsts_.Resize(gmm.NumGauss()); + ubm_gconsts_.CopyFromVec(gmm.gconsts()); + ubm_means_inv_vars_.Resize(gmm.NumGauss(), gmm.Dim()); + ubm_means_inv_vars_.CopyFromMat(gmm.means_invvars()); + ubm_inv_vars_.Resize(gmm.NumGauss(), gmm.Dim()); + ubm_inv_vars_.CopyFromMat(gmm.inv_vars()); + num_gauss_ = gmm.NumGauss(); + + // read extractor (copied from ivector/ivector-extractor.cc) + bool binary; + Input input(config.ivector_extractor_rxfilename, &binary); + Matrix w; + Vector w_vec; + std::vector > ie_M; + std::vector > ie_Sigma_inv; + + ExpectToken(input.Stream(), binary, ""); + ExpectToken(input.Stream(), binary, ""); + w.Read(input.Stream(), binary); + ExpectToken(input.Stream(), binary, ""); + w_vec.Read(input.Stream(), binary); + ExpectToken(input.Stream(), binary, ""); + int32 size; + ReadBasicType(input.Stream(), binary, &size); + KALDI_ASSERT(size > 0); + ie_M.resize(size); + for (int32 i = 0; i < size; i++) { + ie_M[i].Read(input.Stream(), binary); + } + ExpectToken(input.Stream(), binary, ""); + ie_Sigma_inv.resize(size); + for (int32 i = 0; i < size; i++) { + ie_Sigma_inv[i].Read(input.Stream(), binary); + } + ExpectToken(input.Stream(), binary, ""); + 
ReadBasicType(input.Stream(), binary, &prior_offset_); + ExpectToken(input.Stream(), binary, ""); + + // compute derived variables + ivector_dim_ = ie_M[0].NumCols(); + feat_dim_ = ie_M[0].NumRows(); + + ie_Sigma_inv_M_f_.Resize(num_gauss_ * feat_dim_, ivector_dim_); + + ie_U_.Resize(num_gauss_, ivector_dim_ * (ivector_dim_ + 1) / 2); + + SpMatrix tmp_sub_U(ivector_dim_); + Matrix tmp_Sigma_inv_M(feat_dim_, ivector_dim_); + for (int32 i = 0; i < num_gauss_; i++) { + // compute matrix ie_Sigma_inv_M[i[ + tmp_sub_U.AddMat2Sp(1, ie_M[i], kTrans, ie_Sigma_inv[i], 0); + SubVector tmp_U_vec(tmp_sub_U.Data(), + ivector_dim_ * (ivector_dim_ + 1) / 2); + ie_U_.Row(i).CopyFromVec(tmp_U_vec); + + tmp_Sigma_inv_M.AddSpMat(1, ie_Sigma_inv[i], ie_M[i], kNoTrans, 0); + + // copy into global matrix + CuSubMatrix window(ie_Sigma_inv_M_f_, i * feat_dim_, feat_dim_, 0, + ivector_dim_); + window.CopyFromMat(tmp_Sigma_inv_M); + } +} + +void IvectorExtractorFastCuda::SpliceFeats(const CuMatrixBase &feats, + CuMatrix *spliced_feats) { + int left = -info_.splice_opts.left_context; + int right = info_.splice_opts.right_context; + int size = right - left + 1; + spliced_feats->Resize(feats.NumRows(), feats.NumCols() * size, kUndefined); + + splice_features(feats.NumRows(), feats.NumCols(), left, size, feats.Data(), + feats.Stride(), spliced_feats->Data(), + spliced_feats->Stride()); +} + +void IvectorExtractorFastCuda::ComputePosteriors( + const CuMatrixBase &feats, CuMatrix *posteriors) { + int num_frames = feats.NumRows(); + + posteriors->Resize(num_frames, num_gauss_, kUndefined); + + posteriors->CopyRowsFromVec(ubm_gconsts_); + + CuMatrix feats_sq(feats.NumRows(), feats.NumCols(), kUndefined); + + // using our own kernel here to avoid an extra memcpy. + // ApplyPow unfortunately only works in place. + square_matrix(feats.NumRows(), feats.NumCols(), feats.Data(), feats.Stride(), + feats_sq.Data(), feats_sq.Stride()); + + posteriors->AddMatMat(1.0, feats, kNoTrans, ubm_means_inv_vars_, kTrans, 1.0); + posteriors->AddMatMat(-0.5, feats_sq, kNoTrans, ubm_inv_vars_, kTrans, 1.0); + + // apply scaling factor + posteriors->ApplySoftMaxPerRow(); + + if (info_.max_count > 0) { + // when max count > 0 we need to know the total posterior sum to adjust + // the prior offset. So calculate that here. 
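// CPU reference for the per-frame posterior computation above (illustrative
// plain C++; DiagGmmPosteriorsRef is a hypothetical helper name). For a frame
// x, the diagonal-GMM log-likelihood of Gaussian g is
//   gconsts[g] + means_invvars[g] . x - 0.5 * inv_vars[g] . (x*x),
// followed by a softmax over g, matching the two AddMatMat calls and
// ApplySoftMaxPerRow on the GPU.
#include <algorithm>
#include <cmath>
#include <limits>
#include <vector>
static void DiagGmmPosteriorsRef(int num_gauss, int dim,
                                 const std::vector<float> &gconsts,        // [num_gauss]
                                 const std::vector<float> &means_invvars,  // [num_gauss * dim]
                                 const std::vector<float> &inv_vars,       // [num_gauss * dim]
                                 const std::vector<float> &frame,          // [dim]
                                 std::vector<float> *post) {               // [num_gauss]
  post->resize(num_gauss);
  float max_loglike = -std::numeric_limits<float>::infinity();
  for (int g = 0; g < num_gauss; g++) {
    float loglike = gconsts[g];
    for (int d = 0; d < dim; d++) {
      float x = frame[d];
      loglike += means_invvars[g * dim + d] * x - 0.5f * inv_vars[g * dim + d] * x * x;
    }
    (*post)[g] = loglike;
    max_loglike = std::max(max_loglike, loglike);
  }
  // Numerically stable softmax over the Gaussians.
  float sum = 0.0f;
  for (int g = 0; g < num_gauss; g++) {
    (*post)[g] = std::exp((*post)[g] - max_loglike);
    sum += (*post)[g];
  }
  for (int g = 0; g < num_gauss; g++) (*post)[g] /= sum;
}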
+ get_matrix_sum_double_buffer( + b_, posteriors->NumRows(), posteriors->NumCols(), posteriors->Data(), + posteriors->Stride(), info_.posterior_scale, tot_post_.Data()); + } +} + +void IvectorExtractorFastCuda::ComputeIvectorStats( + const CuMatrixBase &feats, const CuMatrixBase &posteriors, + CuVector *gamma, CuMatrix *X) { + gamma->Resize(num_gauss_, kUndefined); + X->Resize(num_gauss_, feat_dim_, kUndefined); + + gamma->AddRowSumMat(info_.posterior_scale, posteriors, 0.0f); + X->AddMatMat(info_.posterior_scale, posteriors, kTrans, feats, kNoTrans, + 0.0f); +} + +void IvectorExtractorFastCuda::ComputeIvectorFromStats( + const CuVector &gamma, const CuMatrix &X, + CuVector *ivector) { + CuVector &linear = *ivector; + linear.Resize(ivector_dim_, kUndefined); + // Initialize to zero as batched kernel is += + linear.SetZero(); + + CuSpMatrix quadratic(ivector_dim_, kUndefined); + + batched_gemv_reduce(num_gauss_, feat_dim_, ivector_dim_, + ie_Sigma_inv_M_f_.Stride(), ie_Sigma_inv_M_f_.Data(), + X.Stride(), X.Data(), gamma.Data(), linear.Data()); + + CuSubVector q_vec(quadratic.Data(), + ivector_dim_ * (ivector_dim_ + 1) / 2); + q_vec.AddMatVec(1.0f, ie_U_, kTrans, gamma, 0.0f); + + // compute and apply prior offset to linear and quadraditic terms + // offset tot_post_ by correct buffer + update_linear_and_quadratic_terms(quadratic.NumRows(), prior_offset_, + tot_post_.Data() + b_, info_.max_count, + quadratic.Data(), linear.Data()); + // advance double buffer + b_ = (b_ + 1) % 2; + + // We are computing a solution to this linear system: + // x = quadratic^-1 * linear + // ivector+=x + + // Inverting the matrix is unneccessary. We are only solving a single + // linear system. So just use choleskey's to solve for a single ivector + // Equation being solved: quadratic * ivector = linear + + int nrhs = 1; + + // Forming new non-SP matrix for cusolver. + CuMatrix A(quadratic); + +#if CUDA_VERSION >= 9010 + // This is the cusolver return code. Checking it would require + // synchronization. + // So we do not check it. + int *d_info = NULL; + + // query temp buffer size + int L_work; + CUSOLVER_SAFE_CALL( + cusolverDnSpotrf_bufferSize(GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, + ivector_dim_, A.Data(), A.Stride(), &L_work)); + + // allocate temp buffer + float *workspace = + static_cast(CuDevice::Instantiate().Malloc(L_work)); + + // perform factorization + CUSOLVER_SAFE_CALL(cusolverDnSpotrf( + GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, A.Data(), + A.Stride(), workspace, L_work, d_info)); + + // solve for rhs + CUSOLVER_SAFE_CALL(cusolverDnSpotrs( + GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, nrhs, + A.Data(), A.Stride(), ivector->Data(), ivector_dim_, d_info)); + + CuDevice::Instantiate().Free(workspace); +#else + KALDI_ERR << "Online Ivectors in CUDA is not supported by your CUDA version. " + << "Upgrade to CUDA 9.1 or later"; +#endif + // remove prior + CuSubVector ivector0(*ivector, 0, 1); + ivector0.Add(-prior_offset_); +} + +}; // namespace kaldi diff --git a/src/cudafeat/online-ivector-feature-cuda.h b/src/cudafeat/online-ivector-feature-cuda.h new file mode 100644 index 00000000000..b661521f782 --- /dev/null +++ b/src/cudafeat/online-ivector-feature-cuda.h @@ -0,0 +1,123 @@ +// cudafeat/online-ivector-feature-cuda.h +// +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// Justin Luitjens +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
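// CPU reference for the solve step in ComputeIvectorFromStats() above
// (illustrative plain C++, not the cuSOLVER path): solve
//   quadratic * ivector = linear
// with a Cholesky factorization instead of forming quadratic^{-1}.
// 'A' is a dense, row-major, symmetric positive-definite matrix and 'b'
// holds 'linear' on input and the solution on output; CholeskySolveRef is
// a hypothetical name.
#include <cmath>
#include <vector>
static void CholeskySolveRef(int n, std::vector<float> A,
                             std::vector<float> *b) {
  // In-place lower-triangular Cholesky factorization: A = L * L^T.
  for (int i = 0; i < n; i++) {
    for (int j = 0; j <= i; j++) {
      float sum = A[i * n + j];
      for (int k = 0; k < j; k++) sum -= A[i * n + k] * A[j * n + k];
      A[i * n + j] = (i == j) ? std::sqrt(sum) : sum / A[j * n + j];
    }
  }
  // Forward substitution: L * y = b.
  for (int i = 0; i < n; i++) {
    float sum = (*b)[i];
    for (int k = 0; k < i; k++) sum -= A[i * n + k] * (*b)[k];
    (*b)[i] = sum / A[i * n + i];
  }
  // Back substitution: L^T * x = y.
  for (int i = n - 1; i >= 0; i--) {
    float sum = (*b)[i];
    for (int k = i + 1; k < n; k++) sum -= A[k * n + i] * (*b)[k];
    (*b)[i] = sum / A[i * n + i];
  }
}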
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#ifndef CUDAFEAT_ONLINE_IVECTOR_FEATURE_CUDA_H_ +#define CUDAFEAT_ONLINE_IVECTOR_FEATURE_CUDA_H_ + +#include +#include + +#include "base/kaldi-error.h" +#include "cudafeat/feature-online-cmvn-cuda.h" +#include "cudamatrix/cu-matrix.h" +#include "online2/online-ivector-feature.h" + +namespace kaldi { + +class IvectorExtractorFastCuda { + public: + IvectorExtractorFastCuda(const OnlineIvectorExtractionConfig &config) + : b_(0), tot_post_(2) { + if (config.use_most_recent_ivector == false) { + KALDI_WARN + << "IvectorExractorFastCuda: Ignoring use_most_recent_ivector=false."; + } + if (config.greedy_ivector_extractor == false) { + KALDI_WARN << "IvectorExractorFastCuda: Ignoring " + "greedy_ivector_extractor=false."; + } + + info_.Init(config); + naive_cmvn_state_ = OnlineCmvnState(info_.global_cmvn_stats); + Read(config); + cu_lda_.Resize(info_.lda_mat.NumRows(), info_.lda_mat.NumCols()); + cu_lda_.CopyFromMat(info_.lda_mat); + } + ~IvectorExtractorFastCuda() {} + + // This function goes directly from features to an i-vector + // which makes the computation easier to port to GPU + // and make it run more efficiently + // + // It is roughly the replacement for the following in kaldi: + // + // DiagGmm.LogLikelihoods(), VectorToPosteriorEntry() + // IvectorExtractorUtteranceStats.AccStats() + // IvectorExtractor.GetIvectorDistribution() + // + // Also note we only do single precision (float) + // which will *NOT* give same results as kaldi + // i-vector extractor which is float precision + // however, in practice, the differences do *NOT* + // affect overall accuracy + // + // This function is thread safe as all class variables + // are read-only + // + void GetIvector(const CuMatrixBase &feats, CuVector *ivector); + + int32 FeatDim() const { return feat_dim_; } + int32 IvectorDim() const { return ivector_dim_; } + int32 NumGauss() const { return num_gauss_; } + + private: + OnlineIvectorExtractionInfo info_; + + IvectorExtractorFastCuda(IvectorExtractorFastCuda const &); + IvectorExtractorFastCuda &operator=(IvectorExtractorFastCuda const &); + + void Read(const kaldi::OnlineIvectorExtractionConfig &config); + + void SpliceFeats(const CuMatrixBase &feats, + CuMatrix *spliced_feats); + + void ComputePosteriors(const CuMatrixBase &feats, + CuMatrix *posteriors); + + void ComputeIvectorStats(const CuMatrixBase &feats, + const CuMatrixBase &posteriors, + CuVector *gamma, CuMatrix *X); + + void ComputeIvectorFromStats(const CuVector &gamma, + const CuMatrix &X, + CuVector *ivector); + + CudaOnlineCmvnState naive_cmvn_state_; + + int32 feat_dim_; + int32 ivector_dim_; + int32 num_gauss_; + + // ubm variables + CuVector ubm_gconsts_; + CuMatrix ubm_means_inv_vars_; + CuMatrix ubm_inv_vars_; + CuMatrix cu_lda_; + // extractor variables + CuMatrix ie_U_; + + // Batched matrix which sotres this: + CuMatrix ie_Sigma_inv_M_f_; + + // double buffer to store total posteriors. 
+ // double buffering avoids extra calls to intitialize buffer + int b_; + CuVector tot_post_; + float prior_offset_; +}; +} // namespace kaldi + +#endif // IVECTOR_IVECTOR_EXTRACTOR_FAST_CUDA_H_ diff --git a/src/cudafeatbin/Makefile b/src/cudafeatbin/Makefile new file mode 100644 index 00000000000..983923622da --- /dev/null +++ b/src/cudafeatbin/Makefile @@ -0,0 +1,27 @@ + +all: +EXTRA_CXXFLAGS = -Wno-sign-compare +include ../kaldi.mk + +LDFLAGS += $(CUDA_LDFLAGS) +LDLIBS += $(CUDA_LDLIBS) + +BINFILES = + +ifeq ($(CUDA), true) + BINFILES += compute-mfcc-feats-cuda apply-cmvn-online-cuda compute-online-feats-cuda +endif + +OBJFILES = + +TESTFILES = + +ADDLIBS = ../cudafeat/kaldi-cudafeat.a ../online2/kaldi-online2.a \ + ../ivector/kaldi-ivector.a ../decoder/kaldi-decoder.a \ + ../lat/kaldi-lat.a ../cudamatrix/kaldi-cudamatrix.a \ + ../hmm/kaldi-hmm.a ../feat/kaldi-feat.a \ + ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \ + ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \ + ../base/kaldi-base.a + +include ../makefiles/default_rules.mk diff --git a/src/cudafeatbin/apply-cmvn-online-cuda.cc b/src/cudafeatbin/apply-cmvn-online-cuda.cc new file mode 100644 index 00000000000..6dc18fdf2ab --- /dev/null +++ b/src/cudafeatbin/apply-cmvn-online-cuda.cc @@ -0,0 +1,107 @@ +// online2bin/apply-cmvn-online.cc + +// Copyright 2014 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "feat/online-feature.h" +#include "cudafeat/feature-online-cmvn-cuda.h" + +int main(int argc, char *argv[]) { + try { + typedef kaldi::int32 int32; + using namespace kaldi; + const char *usage = + "Apply online cepstral mean (and possibly variance) computation online,\n" + "using the same code as used for online decoding in the 'new' setup in\n" + "online2/ and online2bin/.'\n" + "The computation is done on the device in serial. " + "spk2utt is not supported.\n" + "\n" + "Usage: apply-cmvn-online-cuda [options] " + "\n" + "e.g. apply-cmvn-online-cuda 'matrix-sum scp:data/train/cmvn.scp -|' data/train/split8/1/feats.scp ark:-\n"; + + ParseOptions po(usage); + + OnlineCmvnOptions cmvn_opts; + + std::string spk2utt_rspecifier; + cmvn_opts.Register(&po); + + po.Read(argc, argv); + + if (po.NumArgs() != 3) { + po.PrintUsage(); + exit(1); + } + + g_cuda_allocator.SetOptions(g_allocator_options); + CuDevice::Instantiate().SelectGpuId("yes"); + CuDevice::Instantiate().AllowMultithreading(); + + std::string global_stats_rxfilename = po.GetArg(1), + feature_rspecifier = po.GetArg(2), + feature_wspecifier = po.GetArg(3); + + // global_cmvn_stats helps us initialize to online CMVN to + // reasonable values at the beginning of the utterance. 
+ Matrix global_cmvn_stats; + ReadKaldiObject(global_stats_rxfilename, &global_cmvn_stats); + + BaseFloatMatrixWriter feature_writer(feature_wspecifier); + int32 num_done = 0; + int64 tot_t = 0; + + OnlineCmvnState cmvn_state(global_cmvn_stats); + CudaOnlineCmvnState cu_cmvn_state(cmvn_state); + CudaOnlineCmvn cuda_cmvn(cmvn_opts, cu_cmvn_state); + + SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); + for (; !feature_reader.Done(); feature_reader.Next()) { + std::string utt = feature_reader.Key(); + const Matrix &feats = feature_reader.Value(); + int32_t numRows = feats.NumRows(); + int32_t numCols = feats.NumCols(); + + CuMatrix cu_feats_in(feats); + CuMatrix cu_feats_out(numRows, numCols, kUndefined); + Matrix normalized_feats(numRows, numCols, kUndefined); + + cuda_cmvn.ComputeFeatures(cu_feats_in, &cu_feats_out); + + normalized_feats.CopyFromMat(cu_feats_out); + + num_done++; + tot_t += feats.NumRows(); + feature_writer.Write(utt, normalized_feats); + + num_done++; + } + + KALDI_LOG << "Applied online CMVN to " << num_done << " files, or " + << tot_t << " frames."; + return (num_done != 0 ? 0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} + diff --git a/src/cudafeatbin/compute-mfcc-feats-cuda.cc b/src/cudafeatbin/compute-mfcc-feats-cuda.cc new file mode 100644 index 00000000000..db5307c558f --- /dev/null +++ b/src/cudafeatbin/compute-mfcc-feats-cuda.cc @@ -0,0 +1,192 @@ +// cudafeatbin/compute-mfcc-feats-cuda.cc +// +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// Justin Luitjens +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "cudafeat/feature-mfcc-cuda.h" +#include "feat/wave-reader.h" +#include "cudamatrix/cu-matrix.h" +#include "cudamatrix/cu-vector.h" +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + const char *usage = + "Create MFCC feature files.\n" + "Usage: compute-mfcc-feats [options...] \n"; + + // construct all the global objects + ParseOptions po(usage); + MfccOptions mfcc_opts; + bool subtract_mean = false; + BaseFloat vtln_warp = 1.0; + std::string vtln_map_rspecifier; + std::string utt2spk_rspecifier; + int32 channel = -1; + BaseFloat min_duration = 0.0; + // Define defaults for gobal options + std::string output_format = "kaldi"; + + // Register the MFCC option struct + mfcc_opts.Register(&po); + + // Register the options + po.Register("output-format", &output_format, "Format of the output " + "files [kaldi, htk]"); + po.Register("subtract-mean", &subtract_mean, "Subtract mean of each " + "feature file [CMS]; not recommended to do it this way. 
"); + po.Register("vtln-warp", &vtln_warp, "Vtln warp factor (only applicable " + "if vtln-map not specified)"); + po.Register("vtln-map", &vtln_map_rspecifier, "Map from utterance or " + "speaker-id to vtln warp factor (rspecifier)"); + po.Register("utt2spk", &utt2spk_rspecifier, "Utterance to speaker-id map " + "rspecifier (if doing VTLN and you have warps per speaker)"); + po.Register("channel", &channel, "Channel to extract (-1 -> expect mono, " + "0 -> left, 1 -> right)"); + po.Register("min-duration", &min_duration, "Minimum duration of segments " + "to process (in seconds)."); + + po.Read(argc, argv); + + if (po.NumArgs() != 2) { + po.PrintUsage(); + exit(1); + } + + g_cuda_allocator.SetOptions(g_allocator_options); + CuDevice::Instantiate().SelectGpuId("yes"); + CuDevice::Instantiate().AllowMultithreading(); + + + std::string wav_rspecifier = po.GetArg(1); + + std::string output_wspecifier = po.GetArg(2); + + CudaMfcc mfcc(mfcc_opts); + + SequentialTableReader reader(wav_rspecifier); + BaseFloatMatrixWriter kaldi_writer; // typedef to TableWriter. + TableWriter htk_writer; + + if (utt2spk_rspecifier != "") + KALDI_ASSERT(vtln_map_rspecifier != "" && "the utt2spk option is only " + "needed if the vtln-map option is used."); + RandomAccessBaseFloatReaderMapped vtln_map_reader(vtln_map_rspecifier, + utt2spk_rspecifier); + + if (output_format == "kaldi") { + if (!kaldi_writer.Open(output_wspecifier)) + KALDI_ERR << "Could not initialize output with wspecifier " + << output_wspecifier; + } else if (output_format == "htk") { + if (!htk_writer.Open(output_wspecifier)) + KALDI_ERR << "Could not initialize output with wspecifier " + << output_wspecifier; + } else { + KALDI_ERR << "Invalid output_format string " << output_format; + } + + int32 num_utts = 0, num_success = 0; + for (; !reader.Done(); reader.Next()) { + num_utts++; + std::string utt = reader.Key(); + const WaveData &wave_data = reader.Value(); + if (wave_data.Duration() < min_duration) { + KALDI_WARN << "File: " << utt << " is too short (" + << wave_data.Duration() << " sec): producing no output."; + continue; + } + int32 num_chan = wave_data.Data().NumRows(), this_chan = channel; + { // This block works out the channel (0=left, 1=right...) + KALDI_ASSERT(num_chan > 0); // should have been caught in + // reading code if no channels. + if (channel == -1) { + this_chan = 0; + if (num_chan != 1) + KALDI_WARN << "Channel not specified but you have data with " + << num_chan << " channels; defaulting to zero"; + } else { + if (this_chan >= num_chan) { + KALDI_WARN << "File with id " << utt << " has " + << num_chan << " channels but you specified channel " + << channel << ", producing no output."; + continue; + } + } + } + BaseFloat vtln_warp_local; // Work out VTLN warp factor. + if (vtln_map_rspecifier != "") { + if (!vtln_map_reader.HasKey(utt)) { + KALDI_WARN << "No vtln-map entry for utterance-id (or speaker-id) " + << utt; + continue; + } + vtln_warp_local = vtln_map_reader.Value(utt); + } else { + vtln_warp_local = vtln_warp; + } + + SubVector waveform(wave_data.Data(), this_chan); + Matrix features; + try { + CuVector cu_waveform(waveform); + CuMatrix cu_features; + mfcc.ComputeFeatures(cu_waveform, wave_data.SampFreq(), vtln_warp_local, &cu_features); + features.Resize(cu_features.NumRows(), cu_features.NumCols()); + features.CopyFromMat(cu_features); + } catch (...) 
{ + KALDI_WARN << "Failed to compute features for utterance " + << utt; + continue; + } + if (subtract_mean) { + Vector mean(features.NumCols()); + mean.AddRowSumMat(1.0, features); + mean.Scale(1.0 / features.NumRows()); + for (int32 i = 0; i < features.NumRows(); i++) + features.Row(i).AddVec(-1.0, mean); + } + if (output_format == "kaldi") { + kaldi_writer.Write(utt, features); + } else { + std::pair, HtkHeader> p; + p.first.Resize(features.NumRows(), features.NumCols()); + p.first.CopyFromMat(features); + HtkHeader header = { + features.NumRows(), + 100000, // 10ms shift + static_cast(sizeof(float)*(features.NumCols())), + static_cast( 006 | // MFCC + (mfcc_opts.use_energy ? 0100 : 020000)) // energy; otherwise c0 + }; + p.second = header; + htk_writer.Write(utt, p); + } + if (num_utts % 10 == 0) + KALDI_LOG << "Processed " << num_utts << " utterances"; + KALDI_VLOG(2) << "Processed features for key " << utt; + num_success++; + } + KALDI_LOG << " Done " << num_success << " out of " << num_utts + << " utterances."; + return (num_success != 0 ? 0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} + diff --git a/src/cudafeatbin/compute-online-feats-cuda.cc b/src/cudafeatbin/compute-online-feats-cuda.cc new file mode 100644 index 00000000000..b9135c3cee6 --- /dev/null +++ b/src/cudafeatbin/compute-online-feats-cuda.cc @@ -0,0 +1,123 @@ +// cudafeatbin/compute-online-feats-cuda.cc +// +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// Justin Luitjens +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#if HAVE_CUDA == 1 +#include +#endif +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "cudafeat/online-cuda-feature-pipeline.h" +#include "feat/wave-reader.h" +#include "cudamatrix/cu-matrix.h" +#include "cudamatrix/cu-vector.h" + +int main(int argc, char *argv[]) { + using namespace kaldi; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + try { + const char *usage = + "Extract features and ivectors for utterances using the cuda online\n" + "feature pipeline. 
This class models the online feature pipeline.\n" + "\n" + "Usage: compute-online-feats-cuda [options] " + " \n" + "e.g.: \n" + " ./compute-online-feats-cuda --config=feature_config wav.scp " + "ark,scp:ivector.ark,ivector.scp ark,scp:feat.ark,feat.scp\n"; + + ParseOptions po(usage); + // Use online feature config as that is the flow we are trying to model + OnlineNnet2FeaturePipelineConfig feature_opts; + + feature_opts.Register(&po); + + po.Read(argc, argv); + + if (po.NumArgs() != 3) { + po.PrintUsage(); + exit(1); + } + + g_cuda_allocator.SetOptions(g_allocator_options); + CuDevice::Instantiate().SelectGpuId("yes"); + CuDevice::Instantiate().AllowMultithreading(); + + std::string wav_rspecifier = po.GetArg(1), + ivector_wspecifier = po.GetArg(2), + feature_wspecifier = po.GetArg(3); + + OnlineCudaFeaturePipeline feature_pipeline(feature_opts); + + SequentialTableReader reader(wav_rspecifier); + BaseFloatVectorWriter ivector_writer; + BaseFloatMatrixWriter feature_writer; + + if (!ivector_writer.Open(ivector_wspecifier)) { + KALDI_ERR << "Could not initialize ivector_writer with wspecifier " + << ivector_wspecifier; + } + if (!feature_writer.Open(feature_wspecifier)) { + KALDI_ERR << "Could not initialize feature_writer with wspecifier " + << feature_wspecifier; + } + + int32 num_utts = 0, num_success = 0; + for (; !reader.Done(); reader.Next()) { + num_utts++; + std::string utt = reader.Key(); + KALDI_LOG << "Processing Utterance " << utt; + try + { + const WaveData &wave_data = reader.Value(); + SubVector waveform(wave_data.Data(), 0); + CuVector cu_wave(waveform); + CuMatrix cu_features; + CuVector cu_ivector; + + nvtxRangePushA("Feature Extract"); + feature_pipeline.ComputeFeatures(cu_wave, wave_data.SampFreq(), + &cu_features, &cu_ivector); + cudaDeviceSynchronize(); + nvtxRangePop(); + + Matrix features(cu_features.NumRows(), cu_features.NumCols()); + Vector ivector(cu_ivector.Dim()); + + features.CopyFromMat(cu_features); + ivector.CopyFromVec(cu_ivector); + + feature_writer.Write(utt, features); + ivector_writer.Write(utt, ivector); + + num_success++; + } catch (...) { + KALDI_WARN << "Failed to compute features for utterance " + << utt; + continue; + } + } + KALDI_LOG << "Processed " << num_utts << " utterances with " + << num_utts - num_success << " failures."; + return (num_success != 0 ? 0 : 1); + + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } + +} diff --git a/src/cudamatrix/cu-allocator.cc b/src/cudamatrix/cu-allocator.cc index 80380bdb92c..025f4d2651f 100644 --- a/src/cudamatrix/cu-allocator.cc +++ b/src/cudamatrix/cu-allocator.cc @@ -521,11 +521,13 @@ void CuMemoryAllocator::AllocateNewRegion(size_t size) { << "switching the GPUs to exclusive mode (nvidia-smi -c 3) and using " << "the option --use-gpu=wait to scripts like " << "steps/nnet3/chain/train.py. Memory info: " - << mem_info; + << mem_info + << " CUDA error: '" << cudaGetErrorString(e) << "'"; } else { KALDI_ERR << "Failed to allocate a memory region of " << region_size << " bytes. Possibly smaller minibatch size would help. 
" - << "Memory info: " << mem_info; + << "Memory info: " << mem_info + << " CUDA error: '" << cudaGetErrorString(e) << "'"; } } // this_num_subregions would be approximately 'opts_.num_subregions' if diff --git a/src/cudamatrix/cu-array-inl.h b/src/cudamatrix/cu-array-inl.h index 23b20501d4c..53de59fe4fc 100644 --- a/src/cudamatrix/cu-array-inl.h +++ b/src/cudamatrix/cu-array-inl.h @@ -105,8 +105,9 @@ void CuArrayBase::CopyFromVec(const std::vector &src) { if (CuDevice::Instantiate().Enabled()) { CuTimer tim; CU_SAFE_CALL( - cudaMemcpy(data_, &src.front(), src.size() * sizeof(T), - cudaMemcpyHostToDevice)); + cudaMemcpyAsync(data_, &src.front(), src.size() * sizeof(T), + cudaMemcpyHostToDevice, cudaStreamPerThread)); + CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif @@ -122,7 +123,9 @@ void CuArray::CopyFromVec(const std::vector &src) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; - CU_SAFE_CALL(cudaMemcpy(this->data_, &src.front(), src.size()*sizeof(T), cudaMemcpyHostToDevice)); + CU_SAFE_CALL(cudaMemcpyAsync(this->data_, &src.front(), + src.size()*sizeof(T), cudaMemcpyHostToDevice, cudaStreamPerThread)); + CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif @@ -179,7 +182,9 @@ void CuArrayBase::CopyToVec(std::vector *dst) const { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; - CU_SAFE_CALL(cudaMemcpy(&dst->front(), Data(), this->dim_ * sizeof(T), cudaMemcpyDeviceToHost)); + CU_SAFE_CALL(cudaMemcpyAsync(&dst->front(), Data(), this->dim_ * sizeof(T), + cudaMemcpyDeviceToHost, cudaStreamPerThread)); + CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile("CuArray::CopyToVecD2H", tim); } else #endif @@ -196,7 +201,9 @@ void CuArrayBase::CopyToHost(T *dst) const { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; - CU_SAFE_CALL(cudaMemcpy(dst, Data(), this->dim_ * sizeof(T), cudaMemcpyDeviceToHost)); + CU_SAFE_CALL(cudaMemcpyAsync(dst, Data(), this->dim_ * sizeof(T), + cudaMemcpyDeviceToHost, cudaStreamPerThread)); + CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile("CuArray::CopyToVecD2H", tim); } else #endif @@ -212,7 +219,8 @@ void CuArrayBase::SetZero() { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; - CU_SAFE_CALL(cudaMemset(this->data_, 0, this->dim_ * sizeof(T))); + CU_SAFE_CALL(cudaMemsetAsync(this->data_, 0, this->dim_ * sizeof(T), + cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile("CuArray::SetZero", tim); } else #endif diff --git a/src/cudamatrix/cu-block-matrix.cc b/src/cudamatrix/cu-block-matrix.cc index fc8f4b7ce72..e0c64912207 100644 --- a/src/cudamatrix/cu-block-matrix.cc +++ b/src/cudamatrix/cu-block-matrix.cc @@ -140,7 +140,9 @@ void CuBlockMatrix::SetCudaData() { size_t size = NumBlocks() * sizeof(CuBlockMatrixData); cu_data_ = static_cast( CuDevice::Instantiate().Malloc(size)); - CU_SAFE_CALL(cudaMemcpy(cu_data_, &(tmp_cu_data[0]), size, cudaMemcpyHostToDevice)); + CU_SAFE_CALL(cudaMemcpyAsync(cu_data_, &(tmp_cu_data[0]), size, + cudaMemcpyHostToDevice, cudaStreamPerThread)); + CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile(__func__, tim); } #endif diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h index 7446a76bf93..3d9d7e52939 100644 --- a/src/cudamatrix/cu-common.h 
+++ b/src/cudamatrix/cu-common.h @@ -43,6 +43,14 @@ } \ } +#define CUFFT_SAFE_CALL(fun) \ +{ \ + int32 ret; \ + if ((ret = (fun)) != CUFFT_SUCCESS) { \ + KALDI_ERR << "cublasResult " << ret << " returned from '" << #fun << "'"; \ + } \ +} + #define CUBLAS_SAFE_CALL(fun) \ { \ int32 ret; \ @@ -51,6 +59,15 @@ } \ } +#define CUSOLVER_SAFE_CALL(fun) \ +{ \ + int32 ret; \ + if ((ret = (fun)) != 0) { \ + KALDI_ERR << "cusolverStatus_t " << ret << " : \"" << ret << "\" returned from '" << #fun << "'"; \ + } \ +} + + #define CUSPARSE_SAFE_CALL(fun) \ { \ int32 ret; \ diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 49c179b3673..a41ebccd51e 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -110,9 +110,34 @@ void CuDevice::Initialize() { // Initialize CUBLAS. CUBLAS_SAFE_CALL(cublasCreate(&cublas_handle_)); CUBLAS_SAFE_CALL(cublasSetStream(cublas_handle_, cudaStreamPerThread)); + +#if CUDA_VERSION >= 9010 + CUSOLVER_SAFE_CALL(cusolverDnCreate(&cusolverdn_handle_)); + CUSOLVER_SAFE_CALL(cusolverDnSetStream(cusolverdn_handle_, + cudaStreamPerThread)); +#endif + +#if CUDA_VERSION >= 9000 + if (device_options_.use_tensor_cores) { + // Enable tensor cores in CUBLAS + // Note if the device does not support tensor cores this will fall back to normal math mode + CUBLAS_SAFE_CALL(cublasSetMathMode(cublas_handle_, + CUBLAS_TENSOR_OP_MATH)); + } +#endif + // Initialize the cuSPARSE library CUSPARSE_SAFE_CALL(cusparseCreate(&cusparse_handle_)); CUSPARSE_SAFE_CALL(cusparseSetStream(cusparse_handle_, cudaStreamPerThread)); + + // Initialize the generator, + CURAND_SAFE_CALL(curandCreateGenerator( + &curand_handle_, CURAND_RNG_PSEUDO_DEFAULT)); + // To get same random sequence, call srand() before the constructor is invoked, + CURAND_SAFE_CALL(curandSetGeneratorOrdering( + curand_handle_, CURAND_ORDERING_PSEUDO_DEFAULT)); + CURAND_SAFE_CALL(curandSetStream(curand_handle_, cudaStreamPerThread)); + SeedGpu(); } } @@ -245,9 +270,34 @@ void CuDevice::FinalizeActiveGpu() { // Initialize CUBLAS. CUBLAS_SAFE_CALL(cublasCreate(&cublas_handle_)); CUBLAS_SAFE_CALL(cublasSetStream(cublas_handle_, cudaStreamPerThread)); + +#if CUDA_VERSION >= 9010 + CUSOLVER_SAFE_CALL(cusolverDnCreate(&cusolverdn_handle_)); + CUSOLVER_SAFE_CALL(cusolverDnSetStream(cusolverdn_handle_, + cudaStreamPerThread)); +#endif + +#if CUDA_VERSION >= 9000 + if (device_options_.use_tensor_cores) { + // Enable tensor cores in CUBLAS + // Note if the device does not support tensor cores this will fall back to normal math mode + CUBLAS_SAFE_CALL(cublasSetMathMode(cublas_handle_, + CUBLAS_TENSOR_OP_MATH)); + } +#endif + + // Initialize the cuSPARSE library CUSPARSE_SAFE_CALL(cusparseCreate(&cusparse_handle_)); CUSPARSE_SAFE_CALL(cusparseSetStream(cusparse_handle_, cudaStreamPerThread)); + + // Initialize the generator, + CURAND_SAFE_CALL(curandCreateGenerator( + &curand_handle_, CURAND_RNG_PSEUDO_DEFAULT)); + // To get same random sequence, call srand() before the constructor is invoked, + CURAND_SAFE_CALL(curandSetGeneratorOrdering( + curand_handle_, CURAND_ORDERING_PSEUDO_DEFAULT)); + SeedGpu(); // Notify the user which GPU is being userd. char name[128]; @@ -417,7 +467,7 @@ void CuDevice::AccuProfile(const char *function_name, // per-thread default stream. Since we compile with // -DCUDA_API_PER_THREAD_DEFAULT_STREAM, this equates to a per-thread // stream. 
- cudaStreamSynchronize(0); + CU_SAFE_CALL(cudaStreamSynchronize(0)); double elapsed = timer.Elapsed(); if (profile_map_.find(key) == profile_map_.end()) profile_map_[key] = elapsed; @@ -511,7 +561,8 @@ CuDevice::CuDevice(): initialized_(false), device_id_copy_(-1), cublas_handle_(NULL), - cusparse_handle_(NULL) { + cusparse_handle_(NULL), + cusolverdn_handle_(NULL) { } CuDevice::~CuDevice() { @@ -519,12 +570,22 @@ CuDevice::~CuDevice() { CUBLAS_SAFE_CALL(cublasDestroy(cublas_handle_)); if (cusparse_handle_) CUSPARSE_SAFE_CALL(cusparseDestroy(cusparse_handle_)); + if (curand_handle_) { + CURAND_SAFE_CALL(curandDestroyGenerator(curand_handle_)); + } +#if CUDA_VERSION >= 9010 + if (cusolverdn_handle_) { + CUSOLVER_SAFE_CALL(cusolverDnDestroy(cusolverdn_handle_)); + } +#endif } // Each thread has its own copy of the CuDevice object. // Note: this was declared "static". thread_local CuDevice CuDevice::this_thread_device_; + +CuDevice::CuDeviceOptions CuDevice::device_options_; // define and initialize the static members of the CuDevice object. int32 CuDevice::device_id_ = -1; diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index dc3df7e347d..9341f180069 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -26,6 +26,7 @@ #if HAVE_CUDA == 1 #include #include +#include #include #include #include @@ -34,6 +35,17 @@ #include "base/kaldi-common.h" #include "base/timer.h" #include "cudamatrix/cu-allocator.h" +#include "cudamatrix/cu-common.h" + +#if CUDA_VERSION >= 9010 +#include +#else +// cusolver not supported. +// Setting a few types to minimize compiler guards. +// If a user tries to use cusovler it will throw an error. +typedef void* cusolverDnHandle_t; +typedef int cusolverStatus_t; +#endif namespace kaldi { @@ -80,7 +92,23 @@ class CuDevice { inline cublasHandle_t GetCublasHandle() { return cublas_handle_; } inline cusparseHandle_t GetCusparseHandle() { return cusparse_handle_; } + inline curandGenerator_t GetCurandHandle() { return curand_handle_; } + inline cusolverDnHandle_t GetCusolverDnHandle() { +#if CUDA_VERSION < 9010 + KALDI_ERR << "CUDA VERSION '" << CUDA_VERSION << "' not new enough to support " + << "cusolver. Upgrade to at least 9.1"; +#endif + return cusolverdn_handle_; + } + inline void SeedGpu() { + if (CuDevice::Instantiate().Enabled()) { + // To get same random sequence, call srand() before the method is invoked, + CURAND_SAFE_CALL(curandSetPseudoRandomGeneratorSeed( + curand_handle_, RandInt(128, RAND_MAX))); + CURAND_SAFE_CALL(curandSetGeneratorOffset(curand_handle_, 0)); + } + } // We provide functions Malloc(), MallocPitch() and Free() which replace // cudaMalloc(), cudaMallocPitch() and cudaFree(). Their function is to cache // the results of previous allocations to avoid the very large overhead that @@ -184,8 +212,31 @@ class CuDevice { /// (i.e. from outside the class), call this only if Enabled() returns true. bool IsComputeExclusive(); + // Register command line options for CUDA device. + // This must be done before calling CuDevice::Initialize() + // Example: + // CuDevice::RegisterDeviceOptions(&po); + // po.Read(argc, argv); + // CuDevice::Initialize(); + static void RegisterDeviceOptions(OptionsItf *po) { + CuDevice::device_options_.Register(po); + } ~CuDevice(); private: + + struct CuDeviceOptions { + bool use_tensor_cores; // Enable tensor cores + CuDeviceOptions () : use_tensor_cores(false) {}; + void Register(OptionsItf *po) { + po->Register("cuda-use-tensor-cores", &use_tensor_cores, + "Enable FP16 tensor math. 
" + "This is higher performance but less accuracy. " + "This is only recommended for inference."); + } + }; + + static CuDeviceOptions device_options_; + // Default constructor used to initialize this_thread_device_ CuDevice(); CuDevice(CuDevice&); // Disallow. @@ -268,9 +319,9 @@ class CuDevice { int32 device_id_copy_; cublasHandle_t cublas_handle_; - cusparseHandle_t cusparse_handle_; - + curandGenerator_t curand_handle_; + cusolverDnHandle_t cusolverdn_handle_; }; // class CuDevice @@ -285,9 +336,22 @@ class CuTimer: public Timer { // This function is declared as a more convenient way to get the CUDA device handle for use // in the CUBLAS v2 API, since we so frequently need to access it. -inline cublasHandle_t GetCublasHandle() { return CuDevice::Instantiate().GetCublasHandle(); } +inline cublasHandle_t GetCublasHandle() { + return CuDevice::Instantiate().GetCublasHandle(); +} + +inline cusolverDnHandle_t GetCusolverDnHandle() { + return CuDevice::Instantiate().GetCusolverDnHandle(); +} + // A more convenient way to get the handle to use cuSPARSE APIs. -inline cusparseHandle_t GetCusparseHandle() { return CuDevice::Instantiate().GetCusparseHandle(); } +inline cusparseHandle_t GetCusparseHandle() { + return CuDevice::Instantiate().GetCusparseHandle(); +} + +inline curandGenerator_t GetCurandHandle() { + return CuDevice::Instantiate().GetCurandHandle(); +} } // namespace kaldi diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index a61bb601e8e..2b99d09f0e4 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -6,6 +6,7 @@ // 2013 Xiaohui Zhang // 2013-2015 Guoguo Chen // 2016-2018 Shiyin Kang +// 2019 Yiwen Shao // See ../../COPYING for clarification regarding multiple authors // @@ -39,6 +40,12 @@ typedef float BaseFloat; #endif +void cudaD_add_row_sum_mat(int Gr, int Bl, double* result, const double* mat, + const MatrixDim d, const double alpha, + const double beta); +void cudaF_add_row_sum_mat(int Gr, int Bl, float* result, const float* mat, + const MatrixDim d, const float alpha, + const float beta); void cudaD_add_col_sum_mat(int Gr, int Bl, double* result, const double* mat, const MatrixDim d, const double alpha, const double beta); @@ -195,34 +202,6 @@ void cudaD_add_vec_vec(int Gr, int Bl, double alpha, double* v, const double* x, const double* y, double beta, int dim); void cudaF_add_vec_vec(int Gr, int Bl, float alpha, float* v, const float* x, const float* y, float beta, int dim); -void cudaD_apply_ceiling(dim3 Gr, dim3 Bl, double* mat, double ceiling_val, - MatrixDim d); -void cudaF_apply_ceiling(dim3 Gr, dim3 Bl, float* mat, float ceiling_val, - MatrixDim d); -void cudaD_apply_exp(dim3 Gr, dim3 Bl, double* mat, MatrixDim d); -void cudaF_apply_exp(dim3 Gr, dim3 Bl, float* mat, MatrixDim d); -void cudaD_apply_exp_limited(dim3 Gr, dim3 Bl, double* mat, MatrixDim d, - double lower_limit, double upper_limit); -void cudaF_apply_exp_limited(dim3 Gr, dim3 Bl, float* mat, MatrixDim d, - float lower_limit, float upper_limit); -void cudaD_apply_exp_special(dim3 Gr, dim3 Bl, double* out, MatrixDim out_dim, - const double* in, int in_stride); -void cudaF_apply_exp_special(dim3 Gr, dim3 Bl, float* out, MatrixDim out_dim, - const float* in, int in_stride); -void cudaD_apply_floor(dim3 Gr, dim3 Bl, double* mat, double floor_val, - MatrixDim d); -void cudaF_apply_floor(dim3 Gr, dim3 Bl, float* mat, float floor_val, - MatrixDim d); -void cudaD_apply_heaviside(dim3 Gr, dim3 Bl, double* mat, MatrixDim d); -void 
cudaF_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim d); -void cudaD_apply_log(dim3 Gr, dim3 Bl, double *mat, MatrixDim d); -void cudaF_apply_log(dim3 Gr, dim3 Bl, float *mat, MatrixDim d); -void cudaD_apply_pow_abs(dim3 Gr, dim3 Bl, double* mat, double power, - bool include_sign, MatrixDim d); -void cudaF_apply_pow_abs(dim3 Gr, dim3 Bl, float* mat, float power, - bool include_sign, MatrixDim d); -void cudaD_apply_pow(dim3 Gr, dim3 Bl, double* mat, double power, MatrixDim d); -void cudaF_apply_pow(dim3 Gr, dim3 Bl, float* mat, float power, MatrixDim d); void cudaD_block_add_mat_mat(dim3 Gr, dim3 Bl, CuBlockMatrixData *B_cu_data, int num_blocks, const double *C_data, int C_num_cols, int C_row_stride, int C_col_stride, @@ -500,6 +479,36 @@ void cudaD_heaviside(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride); void cudaF_heaviside(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride); +void cudaD_exp(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, + int src_stride); +void cudaF_exp(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, + int src_stride); +void cudaD_pow(dim3 Gr, dim3 Bl, double *y, const double *x, double power, MatrixDim d, + int src_stride); +void cudaF_pow(dim3 Gr, dim3 Bl, float *y, const float *x, float power, MatrixDim d, + int src_stride); +void cudaD_ceiling(dim3 Gr, dim3 Bl, double* y, const double* x, double ceiling_val, + MatrixDim dim, int src_stride); +void cudaF_ceiling(dim3 Gr, dim3 Bl, float* y, const float* x, float ceiling_val, + MatrixDim dim, int src_stride); +void cudaD_floor(dim3 Gr, dim3 Bl, double* y, const double* x, double floor_val, + MatrixDim dim, int src_stride); +void cudaF_floor(dim3 Gr, dim3 Bl, float* y, const float* x, float floor_val, + MatrixDim dim, int src_stride); +void cudaD_exp_limited(dim3 Gr, dim3 Bl, double* y, const double* x, + double lower_limit, double upper_limit, MatrixDim d, int src_stride); +void cudaF_exp_limited(dim3 Gr, dim3 Bl, float* y, const float* x, + float lower_limit, float upper_limit, MatrixDim d, int src_stride); +void cudaD_exp_special(dim3 Gr, dim3 Bl, double* y, const double* x, + MatrixDim d, int src_stride); +void cudaF_exp_special(dim3 Gr, dim3 Bl, float* y, const float* x, + MatrixDim d, int src_stride); +void cudaD_log(dim3 Gr, dim3 Bl, double* y, const double* x, MatrixDim d, int src_stride); +void cudaF_log(dim3 Gr, dim3 Bl, float* y, const float* x, MatrixDim d, int src_stride); +void cudaD_pow_abs(dim3 Gr, dim3 Bl, double* y, const double* x, double power, + bool include_sign, MatrixDim dim, int src_stride); +void cudaF_pow_abs(dim3 Gr, dim3 Bl, float* y, const float* x, float power, + bool include_sign, MatrixDim dim, int src_stride); void cuda_int32_add(dim3 Gr, dim3 Bl, int32_cuda *mat, int32_cuda value, MatrixDim d); void cuda_int32_set_const(dim3 Gr, dim3 Bl, int32_cuda *mat, int32_cuda value, @@ -790,6 +799,30 @@ void cuda_uncompress_uint8(dim3 Gr, dim3 Bl, BaseFloat *dest, MatrixDim dim, const uint8_t *src, int src_stride, float scale); +// copies the sub matrix in src[range_start, range_end] to the matrix in dst +// if src row is outside of the clamped range it will clamp to the specified +// rows. src and dst cannot overlap. 
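// Host-side reference for cudaF_mat_copy_range_clamped, declared just below
// (an illustrative sketch based on the comment above; the conventions that
// dst starts at row 0 and that row_end is exclusive are assumptions here).
static void MatCopyRangeClampedRef(int row_start, int row_end, int num_cols,
                                   const float *src, int lds,
                                   int clamp_low, int clamp_high,
                                   float *dst, int ldd) {
  for (int r = row_start; r < row_end; r++) {
    // Clamp the source row index into [clamp_low, clamp_high].
    int r_src = r;
    if (r_src < clamp_low) r_src = clamp_low;
    if (r_src > clamp_high) r_src = clamp_high;
    const float *in = src + r_src * lds;
    float *out = dst + (r - row_start) * ldd;
    for (int c = 0; c < num_cols; c++) out[c] = in[c];
  }
}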
+void cudaF_mat_copy_range_clamped( + int32_t row_start, int32_t row_end, int32_t num_cols, + const float *src, int32_t lds, + int32_t clamp_low, int32_t clamp_high, + float *dst, int32_t ldd); +void cudaD_mat_copy_range_clamped( + int32_t row_start, int32_t row_end, int32_t num_cols, + const double *src, int32_t lds, + int32_t clamp_low, int32_t clamp_high, + double *dst, int32_t ldd); + +// for i=[0,num_mats) perform the matrix copy outputs[i] = inputs[i] where +// the matrices are of size num_rows[i] x num_cols[i] and have a leading +// dimension of ldo[i] for the output and ldi[i] for the input. +void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows, + int32_t *num_cols, const float **inputs, int32_t *ldi, float **outputs, + int32_t *ldo); +void cudaD_batched_copy_mats(int32_t num_mats, int32_t *num_rows, + int32_t *num_cols, const double **inputs, int32_t *ldi, double **outputs, + int32_t *ldo); + // Launches a kernel that does nothing, explicitly using the legacy default stream; // this will synchronize all CUDA streams (except for non-blocking streams) on the // device. diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 5a5307b9f87..21468ca9f63 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -8,6 +8,7 @@ // 2013-2015 Guoguo Chen // 2016-2018 Shiyin Kang // 2017 Hossein Hadian, Daniel Galvez +// 2019 Yiwen Shao // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -28,7 +29,7 @@ #include #include #include "cudamatrix/cu-kernels-ansi.h" - +#include /*********************************************************************** @@ -293,25 +294,6 @@ static void _add_smat_trans(Real* mat, MatrixDim mat_dim, Real alpha, } } -/// For each element x of the matrix, set it to -/// (x < 0 ? exp(x) : x + 1). -/// Use block/grid sizes for simple matrix ops -template -__global__ -static void _apply_exp_special(T* out, MatrixDim out_dim, const T* in, - int in_stride) { - const int i = blockIdx.x * blockDim.x + threadIdx.x; - const int j = blockIdx.y * blockDim.y + threadIdx.y; - if (i < out_dim.rows && j < out_dim.cols) { - T x = in[i * in_stride + j]; - if (x < T(0)) { - out[i * out_dim.stride + j] = exp(x); - } else { - out[i * out_dim.stride + j] = x + T(1); - } - } -} - /// Fill the array 'data' with the sequence [base ... base + length) /// Use 1D block and 1D grid template @@ -389,37 +371,6 @@ static void _trace_mat_smat(const Real* mat, MatrixDim mat_dim, } } -template -__global__ -static void _apply_exp(Real* mat, MatrixDim d) { - int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; - int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; - int32_cuda index = i + j * d.stride; - if (i < d.cols && j < d.rows) { - mat[index] = exp(mat[index]); - } -} - -template -__global__ -static void _apply_exp_limited(Real* mat, MatrixDim d, - Real lower_limit, Real upper_limit) { - int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; - int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; - int32_cuda index = i + j * d.stride; - if (i < d.cols && j < d.rows) { - Real x = mat[index]; - // I'm writing !(x >= lower_limit) instead of (x < lower_limit) so that - // nan's will be set to the lower-limit. 
- if (!(x >= lower_limit)) - x = lower_limit; - else if (x > upper_limit) - x = upper_limit; - mat[index] = exp(x); - } -} - - template __global__ static void _scale_diag_packed(Real* mat, Real value, int dim) { @@ -500,16 +451,6 @@ static void _scale(Real* mat, Real value, MatrixDim d) { mat[index] = mat[index] * value; } -template -__global__ -static void _apply_log(Real* mat, MatrixDim d) { - int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; - int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; - int32_cuda index = i + j * d.stride; - if (i < d.cols && j < d.rows) - mat[index] = log(mat[index]); -} - template __global__ static void _mul_elements(Real* mat, const Real* A, MatrixDim dst_d, @@ -958,6 +899,7 @@ static void _trace_mat_mat(const Real* A, const Real* B, MatrixDim dA, Real trans[TileDim][TileDim + 1]; Real sum[CU1DBLOCK]; } smem; + // linear thread id; const int32_cuda tid = threadIdx.y * blockDim.x + threadIdx.x; const int32_cuda grid_height = gridDim.y * TileDim; @@ -1021,6 +963,7 @@ static void _trace_mat_mat(const Real* A, const Real* B, MatrixDim dA, if (tid == 0) { value[blockIdx.y * gridDim.x + blockIdx.x] = smem.sum[0]; } + } // _trace_mat_mat_trans reduce the partial sum to @@ -1030,6 +973,7 @@ __global__ static void _trace_mat_mat_trans(const Real* A, const Real* B, MatrixDim dA, int B_stride, Real* value) { __shared__ Real ssum[CU1DBLOCK]; + // linear thread id; const int32_cuda tid = threadIdx.y * blockDim.x + threadIdx.x; const int32_cuda j = blockIdx.x * blockDim.x + threadIdx.x; @@ -1046,7 +990,7 @@ static void _trace_mat_mat_trans(const Real* A, const Real* B, MatrixDim dA, } ssum[tid] = tsum; __syncthreads(); - + // Block reduce # pragma unroll for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { @@ -1697,6 +1641,48 @@ static void _vec_transform_reduce( result[blockIdx.x] = op.PostReduce(sdata[0], result[blockIdx.x]); } +// Reduce a matrix 'mat' to a row vector 'result' +template +__global__ +static void _transform_reduce_mat_rows( + Real *result, const Real *mat, const MatrixDim d, + const TransReduceOp op) { + + __shared__ Real sdata[CU1DBLOCK]; + const int tid = threadIdx.x; + const int j = blockIdx.x; + + Real tdata = op.InitValue(); + for (int i = tid; i < d.rows; i += CU1DBLOCK) { + //Note the loads of mat are uncoalesced. We could eliminate these + //with shared memory but at the matrix sizes we are currently looking + //at it probably would not help much and would add a lot of complexity. + //Alternatively we could look at something like trov to help loads. + tdata = op.Reduce(tdata, op.Transform(mat[i * d.stride + j])); + } + sdata[tid] = tdata; + __syncthreads(); + + // Tree reduce +# pragma unroll + for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { + if (tid < shift) + sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]); + __syncthreads(); + } + + // Reduce last warp. Threads implicitly synchronized within a warp. + if (tid < warpSize) { + for (int shift = warpSize; shift > 0; shift >>= 1) + sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]); + } + + // Output to vector result. 
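// Only thread 0 holds the reduced value; op.PostReduce folds it into the
// existing entry, which is how the cudaF/D_add_row_sum_mat wrappers further
// down apply their alpha/beta scaling (effectively
// result[j] = alpha * row_sum + beta * result[j]).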
+ if (tid == 0) { + result[j] = op.PostReduce(sdata[0], result[j]); + } +} + // Reduce a matrix 'mat' to a column vector 'result' template __global__ @@ -1834,83 +1820,6 @@ static void _vec_apply_ceiling(Real *v, Real ceiling_val, float *count, } } -template -__global__ -static void _apply_pow(Real* mat, Real power, MatrixDim d) { - int i = blockIdx.x * blockDim.x + threadIdx.x; // col index - int j = blockIdx.y * blockDim.y + threadIdx.y; // row index - int index = i + j * d.stride; - if (i < d.cols && j < d.rows) { - if (power == 1.0) - return; - if (power == 2.0) { - mat[index] = mat[index] * mat[index]; - } else if (power == 0.5) { - if (!(mat[index] >= 0.0)) - return; - mat[index] = sqrt(mat[index]); - } else { - mat[index] = pow(mat[index], power); - } - } -} - -template -__global__ -static void _apply_pow_abs(Real* mat, Real power, bool include_sign, - MatrixDim d) { - int i = blockIdx.x * blockDim.x + threadIdx.x; // col index - int j = blockIdx.y * blockDim.y + threadIdx.y; // row index - int index = i + j * d.stride; - if (i < d.cols && j < d.rows) { - if (include_sign == true && mat[index] < 0) { - if (power == 1.0) - mat[index] = -std::abs(mat[index]); - if (power == 2.0) { - mat[index] = -mat[index] * mat[index]; - } else if (power == 0.5) { - mat[index] = -sqrt(std::abs(mat[index])); - } else { - mat[index] = -pow(std::abs(mat[index]), power); - } - } else { - if (power == 1.0) - mat[index] = std::abs(mat[index]); - if (power == 2.0) { - mat[index] = mat[index] * mat[index]; - } else if (power == 0.5) { - mat[index] = sqrt(std::abs(mat[index])); - } else if (power < 0.0 && mat[index] == 0.0) { - mat[index] = 0.0; - } else { - mat[index] = pow(std::abs(mat[index]), power); - } - } - } -} - -template -__global__ -static void _apply_heaviside(Real* mat, MatrixDim d) { - int i = blockIdx.x * blockDim.x + threadIdx.x; // col index - int j = blockIdx.y * blockDim.y + threadIdx.y; // row index - int index = i + j * d.stride; - if (i < d.cols && j < d.rows) - mat[index] = (mat[index] > 0.0 ? 
1.0 : 0.0); -} - -template -__global__ -static void _apply_floor(Real* mat, Real floor_val, MatrixDim d) { - int i = blockIdx.x * blockDim.x + threadIdx.x; // col index - int j = blockIdx.y * blockDim.y + threadIdx.y; // row index - int index = i + j * d.stride; - - if (i < d.cols && j < d.rows) { - mat[index] = max(mat[index], floor_val); - } -} - template __global__ static void _copy_cols(Real* dst, const Real *src, @@ -2072,18 +1981,6 @@ static void _add_to_rows(Real alpha, Real* const * dst, const Real *src, } } -template -__global__ -static void _apply_ceiling(Real* mat, Real ceiling_val, MatrixDim d) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - int j = blockIdx.y * blockDim.y + threadIdx.y; - int index = i + j * d.stride; - - if (i < d.cols && j < d.rows) { - mat[index] = min(mat[index], ceiling_val); - } -} - template __global__ static void _invert_elements(Real* data, MatrixDim d) { @@ -2471,7 +2368,7 @@ static void _diff_parametric_relu(Real* eout, const Real* e, const Real* y, template __global__ -static void _heaviside(Real*y, const Real*x, MatrixDim d, int src_stride) { +static void _heaviside(Real* y, const Real* x, MatrixDim d, int src_stride) { int i = blockIdx.x * blockDim.x + threadIdx.x; int j = blockIdx.y * blockDim.y + threadIdx.y; int dst_index = i + j * d.stride, src_index = i + j * src_stride; @@ -2481,10 +2378,126 @@ static void _heaviside(Real*y, const Real*x, MatrixDim d, int src_stride) { } } +template +__global__ +static void _exp(Real* y, const Real* x, MatrixDim d, int src_stride) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int dst_index = i + j * d.stride, src_index = i + j * src_stride; + if (i < d.cols && j < d.rows) { + Real res = exp(x[src_index]); + y[dst_index] = res; + } +} + +template +__global__ +static void _pow(Real* y, const Real* x, Real power, MatrixDim d, int src_stride) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int dst_index = i + j * d.stride, src_index = i + j * src_stride; + if (i < d.cols && j < d.rows) { + y[dst_index] = pow(x[src_index], power); + } +} + +template +__global__ +static void _ceiling(Real* y, const Real* x, Real ceiling_val, MatrixDim d, int src_stride) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int dst_index = i + j * d.stride, src_index = i + j * src_stride; + + if (i < d.cols && j < d.rows) { + y[dst_index] = min(x[src_index], ceiling_val); + } +} + +template +__global__ +static void _floor(Real* y, const Real* x, Real floor_val, MatrixDim d, int src_stride) { + int i = blockIdx.x * blockDim.x + threadIdx.x; // col index + int j = blockIdx.y * blockDim.y + threadIdx.y; // row index + int dst_index = i + j * d.stride, src_index = i + j * src_stride; + + if (i < d.cols && j < d.rows) { + y[dst_index] = max(x[src_index], floor_val); + } +} + +template +__global__ +static void _exp_limited(Real* y, const Real* x, Real lower_limit, Real upper_limit, + MatrixDim d, int src_stride) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int dst_index = i + j * d.stride, src_index = i + j * src_stride; + if (i < d.cols && j < d.rows) { + const Real x_i = x[src_index]; + // I'm writing !(x >= lower_limit) instead of (x < lower_limit) so that + // nan's will be set to the lower-limit. 
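// (Every comparison against NaN is false, so for x_i = NaN the test
// !(x_i >= lower_limit) succeeds and the output becomes exp(lower_limit)
// rather than propagating the NaN.)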
+ if (!(x_i >= lower_limit)) + y[dst_index] = exp(lower_limit); + else if (x_i > upper_limit) + y[dst_index] = exp(upper_limit); + else + y[dst_index] = exp(x_i); + } +} + +/// For each element x of the matrix, set it to +/// (x < 0 ? exp(x) : x + 1). +/// Use block/grid sizes for simple matrix ops +template +__global__ +static void _exp_special(Real* y, const Real* x, MatrixDim d, + int src_stride) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int dst_index = i + j * d.stride, src_index = i + j * src_stride; + if (i < d.cols && j < d.rows) { + const Real in = x[src_index]; + if (in < Real(0)) { + y[dst_index] = exp(in); + } else { + y[dst_index] = in + Real(1); + } + } +} + +template +__global__ +static void _log(Real* y, const Real* x, MatrixDim d, int src_stride) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int dst_index = i + j * d.stride, src_index = i + j * src_stride; + if (i < d.cols && j < d.rows) + y[dst_index] = log(x[src_index]); +} + +template +__global__ +static void _pow_abs(Real* y, const Real* x, Real power, bool include_sign, + MatrixDim d, int src_stride) { + int i = blockIdx.x * blockDim.x + threadIdx.x; // col index + int j = blockIdx.y * blockDim.y + threadIdx.y; // row index + int dst_index = i + j * d.stride, src_index = i + j * src_stride; + if (i < d.cols && j < d.rows) { + if (include_sign == true && x[src_index] < 0) { + y[dst_index] = -pow(std::abs(x[src_index]), power); + } + else { + y[dst_index] = pow(std::abs(x[src_index]), power); + } + } +} + template __global__ static void _softmax_reduce(Real*y, const Real*x, MatrixDim d, int src_stride) { - __shared__ Real smem[CU1DBLOCK]; + __shared__ Real smem; + typedef cub::BlockReduce BlockReduceT; + __shared__ typename BlockReduceT::TempStorage temp_storage; const int i = blockIdx.x; const int x_start = i * src_stride; const int y_start = i * d.stride; @@ -2496,29 +2509,14 @@ static void _softmax_reduce(Real*y, const Real*x, MatrixDim d, int src_stride) { for (int j = tid; j < d.cols; j += CU1DBLOCK) { tmax = fmax(tmax, x[x_start + j]); } - smem[tid] = tmax; - __syncthreads(); - - // reduce to 2x warpSize elements per row -# pragma unroll - for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { - if (tid < shift) { - smem[tid] = fmax(smem[tid], smem[tid + shift]); - } - __syncthreads(); - } - - // reduce to 1 element per row - if (tid < warpSize) { -# pragma unroll - for (int shift = warpSize; shift > 0; shift >>= 1) { - smem[tid] = fmax(smem[tid], smem[tid + shift]); - } - } + tmax = BlockReduceT(temp_storage).Reduce(tmax, cub::Max()); // broadcast max to all threads + if (tid == 0) { + smem = tmax; + } __syncthreads(); - Real max = smem[0]; + Real max = smem; // sum_j(exp(x(i,j)-max)) // reduce to CU1DBLOCK elements per row. 
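These hunks replace the hand-rolled shared-memory tree reduction with cub::BlockReduce, keeping only a single shared scalar to broadcast the per-row result. A minimal standalone sketch of that pattern, with an illustrative kernel name, block size and launch that are not part of this patch:

```c++
#include <cub/cub.cuh>

// One CTA per row computes the row maximum, then thread 0 broadcasts it
// through a single shared variable, as in _softmax_reduce above.
template <typename Real, int kBlockSize>
__global__ static void row_max_sketch(const Real *x, int cols, int stride,
                                      Real *row_max) {
  typedef cub::BlockReduce<Real, kBlockSize> BlockReduceT;
  __shared__ typename BlockReduceT::TempStorage temp_storage;
  __shared__ Real smem;  // one-element broadcast slot, replaces smem[CU1DBLOCK]

  Real tmax = x[blockIdx.x * stride];  // assumes cols >= 1
  for (int j = threadIdx.x; j < cols; j += kBlockSize)
    tmax = fmax(tmax, x[blockIdx.x * stride + j]);

  tmax = BlockReduceT(temp_storage).Reduce(tmax, cub::Max());  // valid in tid 0
  if (threadIdx.x == 0) smem = tmax;
  __syncthreads();  // now every thread in the block could read the maximum
  if (threadIdx.x == 0) row_max[blockIdx.x] = smem;
}

// e.g. row_max_sketch<float, 256><<<num_rows, 256>>>(x, cols, stride, out);
```

The TempStorage plus the single broadcast variable are the only shared memory this pattern needs, which is what lets these hunks drop the CU1DBLOCK-sized arrays and the explicit warp-level reduction loops.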
@@ -2526,29 +2524,14 @@ static void _softmax_reduce(Real*y, const Real*x, MatrixDim d, int src_stride) { for (int j = tid; j < d.cols; j += CU1DBLOCK) { tsum += exp(x[x_start + j] - max); } - smem[tid] = tsum; - __syncthreads(); - - // reduce to 2x warpSize elements per row -# pragma unroll - for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { - if (tid < shift) { - smem[tid] += smem[tid + shift]; - } - __syncthreads(); - } - - // reduce to 1 element per row - if (tid < warpSize) { -# pragma unroll - for (int shift = warpSize; shift > 0; shift >>= 1) { - smem[tid] += smem[tid + shift]; - } - } + tsum = BlockReduceT(temp_storage).Sum(tsum); // broadcast sum to all threads + if (tid == 0) { + smem = tsum; + } __syncthreads(); - Real inv_sum = Real(1) / smem[0]; + Real inv_sum = Real(1) / smem; // normalize the row for (int j = tid; j < d.cols; j += CU1DBLOCK) { @@ -2577,43 +2560,27 @@ static void _normalize_per_row(Real *y, int y_stride, const Real *x, const int i = blockIdx.x; const int tid = threadIdx.x; const Real* x_row = x + i * x_d.stride; - __shared__ Real ssum[CU1DBLOCK]; + + typedef cub::BlockReduce BlockReduceT; + __shared__ typename BlockReduceT::TempStorage temp_storage; + + __shared__ Real stddev_div_target_rms; + __shared__ Real scale; // Reduce x_j^2 to CU1DBLOCK elements per row Real tsum = Real(0); for (int j = tid; j < x_d.cols; j += CU1DBLOCK) { tsum += x_row[j] * x_row[j]; } - ssum[tid] = tsum; - __syncthreads(); - - // Tree reduce to 2x warpSize elements per row -# pragma unroll - for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { - if (tid < shift) - ssum[tid] += ssum[tid + shift]; - __syncthreads(); - } + tsum = BlockReduceT(temp_storage).Sum(tsum); - // Reduce last warp to 1 element per row. - // Threads implicitly synchronized within a warp. - if (tid < warpSize) { -# pragma unroll - for (int shift = warpSize; shift > 0; shift >>= 1) { - ssum[tid] += ssum[tid + shift]; - } - } - - const Real kSquaredNormFloor = 1.3552527156068805425e-20; // 2^-66 if (tid == 0) { - ssum[0] = sqrt( - fmax(ssum[0] / (target_rms * target_rms * x_d.cols), kSquaredNormFloor)); + const Real kSquaredNormFloor = 1.3552527156068805425e-20; // 2^-66 + stddev_div_target_rms = sqrt( + fmax(tsum / (target_rms * target_rms * x_d.cols), kSquaredNormFloor)); + scale = Real(1) / stddev_div_target_rms; } - - // Broadcast floored stddev to all threads. 
__syncthreads(); - const Real stddev_div_target_rms = ssum[0]; - const Real scale = Real(1) / stddev_div_target_rms; // Store normalized input to output Real* y_row = y + i * y_stride; @@ -2626,7 +2593,6 @@ static void _normalize_per_row(Real *y, int y_stride, const Real *x, } } - template __global__ static void _diff_normalize_per_row(Real *id, int id_stride, const Real *iv, @@ -2721,7 +2687,9 @@ template __global__ static void _log_softmax_reduce(Real* y, const Real* x, MatrixDim y_dim, int x_stride) { - __shared__ Real smem[CU1DBLOCK]; + __shared__ Real smem; + typedef cub::BlockReduce BlockReduceT; + __shared__ typename BlockReduceT::TempStorage temp_storage; const int i = blockIdx.x; const int x_start = i * x_stride; const int y_start = i * y_dim.stride; @@ -2733,28 +2701,14 @@ static void _log_softmax_reduce(Real* y, const Real* x, MatrixDim y_dim, for (int j = tid; j < y_dim.cols; j += CU1DBLOCK) { tmax = fmax(tmax, x[x_start + j]); } - smem[tid] = tmax; - __syncthreads(); - - // reduce to 2x warpSize elements per row -# pragma unroll - for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { - if (tid < shift) { - smem[tid] = fmax(smem[tid], smem[tid + shift]); - } - __syncthreads(); - } - - // reduce to 1 element per row - if (tid < warpSize) { - for (int shift = warpSize; shift > 0; shift >>= 1) { - smem[tid] = fmax(smem[tid], smem[tid + shift]); - } - } + tmax = BlockReduceT(temp_storage).Reduce(tmax, cub::Max()); // broadcast max to all threads + if (tid == 0) { + smem = tmax; + } __syncthreads(); - Real max = smem[0]; + Real max = smem; // sum_j(exp(x(i,j)-max)) // reduce to CU1DBLOCK elements per row. @@ -2762,28 +2716,14 @@ static void _log_softmax_reduce(Real* y, const Real* x, MatrixDim y_dim, for (int j = tid; j < y_dim.cols; j += CU1DBLOCK) { tsum += exp(x[x_start + j] - max); } - smem[tid] = tsum; - __syncthreads(); - - // reduce to 2x warpSize elements per row -# pragma unroll - for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { - if (tid < shift) { - smem[tid] += smem[tid + shift]; - } - __syncthreads(); - } - - // reduce to 1 element per row - if (tid < warpSize) { - for (int shift = warpSize; shift > 0; shift >>= 1) { - smem[tid] += smem[tid + shift]; - } - } + tsum = BlockReduceT(temp_storage).Sum(tsum); // broadcast sum to all threads + if (tid == 0) { + smem = tsum; + } __syncthreads(); - Real log_sum = log(smem[0]); + Real log_sum = log(smem); // normalize the row for (int j = tid; j < y_dim.cols; j += CU1DBLOCK) { @@ -3023,7 +2963,10 @@ __global__ static void _diff_softmax(Real* x, const MatrixDim dim, const Real* value, const int value_stride, const Real* diff, const int diff_stride) { - __shared__ Real ssum[CU1DBLOCK]; + __shared__ Real ssum; + typedef cub::BlockReduce BlockReduceT; + __shared__ typename BlockReduceT::TempStorage temp_storage; + const int tid = threadIdx.x; const int i = blockIdx.x; const int value_start = i * value_stride; @@ -3035,29 +2978,14 @@ static void _diff_softmax(Real* x, const MatrixDim dim, const Real* value, for (int j = tid; j < dim.cols; j += CU1DBLOCK) { tsum += value[value_start + j] * diff[diff_start + j]; } - ssum[tid] = tsum; - __syncthreads(); - - // Tree reduce to 2x warpSize elements. -# pragma unroll - for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { - if (tid < shift) { - ssum[tid] += ssum[tid + shift]; - } - __syncthreads(); - } - - // Warp reduce to 1 element. Threads implicitly synchronized within a warp. 
- if (tid < warpSize) { -# pragma unroll - for (int shift = warpSize; shift > 0; shift >>= 1) { - ssum[tid] += ssum[tid + shift]; - } - } + tsum = BlockReduceT(temp_storage).Sum(tsum); // Broadcast result to all threads + if (tid == 0) { + ssum = tsum; + } __syncthreads(); - const Real pe = ssum[0]; + const Real pe = ssum; // Apply element-wise x = value * (diff - pe) for (int j = tid; j < dim.cols; j += CU1DBLOCK) { @@ -3077,7 +3005,9 @@ static void _diff_log_softmax(const MatrixDim in_deriv_dim, const Real* out_deriv, const int out_deriv_stride, Real* in_deriv) { - __shared__ Real ssum[CU1DBLOCK]; + __shared__ Real ssum; + typedef cub::BlockReduce BlockReduceT; + __shared__ typename BlockReduceT::TempStorage temp_storage; const int tid = threadIdx.x; const int i = blockIdx.x; const int out_value_start = i * out_value_stride; @@ -3089,29 +3019,14 @@ static void _diff_log_softmax(const MatrixDim in_deriv_dim, for (int j = tid; j < in_deriv_dim.cols; j += CU1DBLOCK) { tsum += out_deriv[out_deriv_start + j]; } - ssum[tid] = tsum; - __syncthreads(); - - // Tree reduce to 2x warpSize elements. -# pragma unroll - for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { - if (tid < shift) { - ssum[tid] += ssum[tid + shift]; - } - __syncthreads(); - } - - // Warp reduce to 1 element. Threads implicitly synchronized within a warp. - if (tid < warpSize) { -# pragma unroll - for (int shift = warpSize; shift > 0; shift >>= 1) { - ssum[tid] += ssum[tid + shift]; - } - } + tsum = BlockReduceT(temp_storage).Sum(tsum); // Broadcast result to all threads + if (tid == 0) { + ssum = tsum; + } __syncthreads(); - const Real sum_e = ssum[0]; + const Real sum_e = ssum; // Apply element-wise x = out_deriv - exp(value) * sum_e for (int j = tid; j < in_deriv_dim.cols; j += CU1DBLOCK) { @@ -3699,6 +3614,77 @@ static void _cuda_uncompress(BaseFloat *dest, MatrixDim dim, } } +template +__global__ +void _cuda_mat_copy_range_clamped( + int32_t row_start, int32_t row_end, int32_t num_cols, + const Real * __restrict__ src, int32_t lds, + int32_t clamp_low, int32_t clamp_high, + Real * __restrict__ dst, int32_t ldd) { + int32_t rid = blockIdx.y*blockDim.y+threadIdx.y; + int32_t cid = blockIdx.x*blockDim.x+threadIdx.x; + + int32_t num_rows = row_end - row_start; + // for each row in parallel + for (int32_t r = rid; r < num_rows; r += blockDim.y * gridDim.y) { + // for each column in parallel + for (int32_t c = cid; c < num_cols; c += blockDim.x * gridDim.x) { + // compute offset row + int32_t r_in = r + row_start; + // clamp if necessary + if (r_in < clamp_low) r_in = clamp_low; + if (r_in > clamp_high) r_in = clamp_high; + + // copy data + dst[r * ldd + c] = src[r_in * lds + c]; + } + } +} + +template +struct MatrixCopyDesc { + const Real *input; + Real *output; + int32_t ldi, ldo; + int32_t num_rows, num_cols; +}; + +template +struct BatchedMatrixCopyDesc { + //maximum size allowed in formal parameter list + static const int32_t MAX_BATCH_SIZE=128; + MatrixCopyDesc batch[MAX_BATCH_SIZE]; +}; + +// launched with a block size of 32x32 (32 rows, 32 cols per CTA) +// grid dim x,y expands to fill out average in x/y across batches +// grid dim.z is batch +template +__global__ +void _cuda_batch_copy_mats(BatchedMatrixCopyDesc batch_desc) { + + int32_t rid = blockIdx.y * blockDim.y + threadIdx.y; + int32_t cid = blockIdx.x * blockDim.x + threadIdx.x; + int32_t bid = blockIdx.z; // batch id + + // read copy parameters + MatrixCopyDesc desc = batch_desc.batch[bid]; + int32_t num_rows = desc.num_rows; + int32_t num_cols = 
desc.num_cols; + const Real *input = desc.input; + Real *output = desc.output; + int32_t ldi = desc.ldi; + int32_t ldo = desc.ldo; + + // for each row of output in parallel + for (int32_t r = rid; r < num_rows; r += blockDim.y * gridDim.y) { + // for each of column of output in parallel + for (int32_t c = cid; c < num_cols; c+= blockDim.x * gridDim.x) { + output[r * ldo + c] = input[r * ldi + c]; + } + } +} + __global__ static void _noop_kernel() { } @@ -3760,28 +3746,6 @@ void cudaFD_copy_from_tp(dim3 Gr, dim3 Bl, float* A, const double* B, _copy_from_tp<<>>(A,B,dmat); } -void cudaF_apply_exp(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { - _apply_exp<<>>(mat,d); -} - -void cudaF_apply_exp_limited(dim3 Gr, dim3 Bl, float* mat, MatrixDim d, - float lower_limit, float upper_limit) { - _apply_exp_limited<<>>(mat, d, lower_limit, upper_limit); -} - -void cudaF_apply_pow(dim3 Gr, dim3 Bl, float* mat, float power, MatrixDim d) { - _apply_pow<<>>(mat, power, d); -} - -void cudaF_apply_pow_abs(dim3 Gr, dim3 Bl, float* mat, float power, - bool include_sign, MatrixDim d) { - _apply_pow_abs<<>>(mat, power, include_sign, d); -} - -void cudaF_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { - _apply_heaviside<<>>(mat, d); -} - void cudaF_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) { @@ -3838,16 +3802,6 @@ void cudaF_add_to_rows_direct(dim3 Gr, dim3 Bl, float alpha, float* const * dst, _add_to_rows<<>>(alpha, dst, src, src_dim); } -void cudaF_apply_floor(dim3 Gr, dim3 Bl, float* mat, float floor_val, - MatrixDim d) { - _apply_floor<<>>(mat, floor_val, d); -} - -void cudaF_apply_ceiling(dim3 Gr, dim3 Bl, float* mat, float ceiling_val, - MatrixDim d) { - _apply_ceiling<<>>(mat, ceiling_val, d); -} - void cudaF_set_diag(int Gr, int Bl, float* mat, float value, MatrixDim d) { _set_diag<<>>(mat,value,d); } @@ -3880,10 +3834,6 @@ void cudaF_scale(dim3 Gr, dim3 Bl, float* mat, float value, MatrixDim d) { _scale<<>>(mat,value,d); } -void cudaF_apply_log(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { - _apply_log<<>>(mat,d); -} - void cudaF_mul_elements(dim3 Gr, dim3 Bl, float* mat, const float* A, MatrixDim dst_d, int src_stride) { _mul_elements<<>>(mat,A,dst_d,src_stride); @@ -4030,6 +3980,12 @@ void cudaF_sum_mat_cols(int Gr, int Bl, float* result, const float* mat, _transform_reduce_mat_cols<<>>(result,mat,d, TransReduceOp()); } +void cudaF_add_row_sum_mat(int Gr, int Bl, float* result, const float* mat, + const MatrixDim d, const float alpha, + const float beta) { + _transform_reduce_mat_rows<<>>(result, mat, d, + TransReduceOp(alpha, beta)); +} void cudaF_add_col_sum_mat(int Gr, int Bl, float* result, const float* mat, const MatrixDim d, const float alpha, const float beta) { @@ -4037,6 +3993,7 @@ void cudaF_add_col_sum_mat(int Gr, int Bl, float* result, const float* mat, TransReduceOp(alpha, beta)); } + void cudaF_replace_value(int Gr, int Bl, float *v, int dim, float orig, float changed) { _replace_value<<>>(v, dim, orig, changed); @@ -4301,6 +4258,45 @@ void cudaF_heaviside(dim3 Gr, dim3 Bl, float* y, const float* x, MatrixDim d, _heaviside<<>>(y, x, d, src_stride); } +void cudaF_exp(dim3 Gr, dim3 Bl, float* y, const float* x, MatrixDim d, + int src_stride) { + _exp<<>>(y, x, d, src_stride); +} + +void cudaF_pow(dim3 Gr, dim3 Bl, float* y, const float* x, float power, MatrixDim d, + int src_stride) { + _pow<<>>(y, x, power, d, src_stride); +} + +void cudaF_ceiling(dim3 Gr, dim3 Bl, float* y, const float* x, float 
ceiling_val, + MatrixDim d, int src_stride) { + _ceiling<<>>(y, x, ceiling_val, d, src_stride); +} + +void cudaF_floor(dim3 Gr, dim3 Bl, float* y, const float* x, float floor_val, + MatrixDim d, int src_stride) { + _floor<<>>(y, x, floor_val, d, src_stride); +} + +void cudaF_exp_limited(dim3 Gr, dim3 Bl, float* y, const float* x, + float lower_limit, float upper_limit, MatrixDim d, int src_stride) { + _exp_limited<<>>(y, x, lower_limit, upper_limit, d, src_stride); +} + +void cudaF_exp_special(dim3 Gr, dim3 Bl, float* y, const float* x, MatrixDim d, + int src_stride) { + _exp_special<<>>(y, x, d, src_stride); +} + +void cudaF_log(dim3 Gr, dim3 Bl, float* y, const float* x, MatrixDim d, int src_stride) { + _log<<>>(y, x, d, src_stride); +} + +void cudaF_pow_abs(dim3 Gr, dim3 Bl, float* y, const float* x, float power, + bool include_sign, MatrixDim d, int src_stride) { + _pow_abs<<>>(y, x, power, include_sign, d, src_stride); +} + void cudaF_softmax_reduce(size_t Gr, size_t Bl, float* y, const float* x, MatrixDim d, int src_stride) { _softmax_reduce<<>>(y, x, d, src_stride); @@ -4464,30 +4460,6 @@ void cudaDF_copy_from_tp(dim3 Gr, dim3 Bl, double* A, const float* B, _copy_from_tp<<>>(A,B,dmat); } -void cudaD_apply_exp(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) { - _apply_exp<<>>(mat,d); -} - -void cudaD_apply_exp_limited(dim3 Gr, dim3 Bl, double* mat, MatrixDim d, - double lower_limit, double upper_limit) { - _apply_exp_limited<<>>(mat, d, lower_limit, upper_limit); -} - - - -void cudaD_apply_pow(dim3 Gr, dim3 Bl, double* mat, double power, MatrixDim d) { - _apply_pow<<>>(mat, power, d); -} - -void cudaD_apply_pow_abs(dim3 Gr, dim3 Bl, double* mat, double power, - bool include_sign, MatrixDim d) { - _apply_pow_abs<<>>(mat, power, include_sign, d); -} - -void cudaD_apply_heaviside(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) { - _apply_heaviside<<>>(mat, d); -} - void cudaD_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) { @@ -4545,16 +4517,6 @@ void cudaD_add_to_rows_direct(dim3 Gr, dim3 Bl, double alpha, _add_to_rows<<>>(alpha, dst, src, src_dim); } -void cudaD_apply_floor(dim3 Gr, dim3 Bl, double* mat, double floor_val, - MatrixDim d) { - _apply_floor<<>>(mat, floor_val, d); -} - -void cudaD_apply_ceiling(dim3 Gr, dim3 Bl, double* mat, double ceiling_val, - MatrixDim d) { - _apply_ceiling<<>>(mat, ceiling_val, d); -} - void cudaD_set_diag(int Gr, int Bl, double* mat, double value, MatrixDim d) { _set_diag<<>>(mat,value,d); } @@ -4588,10 +4550,6 @@ void cudaD_scale(dim3 Gr, dim3 Bl, double* mat, double value, MatrixDim d) { _scale<<>>(mat,value,d); } -void cudaD_apply_log(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) { - _apply_log<<>>(mat,d); -} - void cudaD_mul_elements(dim3 Gr, dim3 Bl, double* mat, const double* A, MatrixDim dst_d, int src_stride) { _mul_elements<<>>(mat,A,dst_d,src_stride); @@ -4738,6 +4696,12 @@ void cudaD_sum_mat_cols(int Gr, int Bl, double* result, const double* mat, _transform_reduce_mat_cols<<>>(result,mat,d, TransReduceOp()); } +void cudaD_add_row_sum_mat(int Gr, int Bl, double* result, const double* mat, + const MatrixDim d, const double alpha, + const double beta) { + _transform_reduce_mat_rows<<>>(result, mat, d, + TransReduceOp(alpha, beta)); +} void cudaD_add_col_sum_mat(int Gr, int Bl, double* result, const double* mat, const MatrixDim d, const double alpha, const double beta) { @@ -4998,6 +4962,45 @@ void cudaD_heaviside(dim3 Gr, dim3 Bl, double* y, const double* x, MatrixDim d, 
_heaviside<<>>(y, x, d, src_stride); } +void cudaD_exp(dim3 Gr, dim3 Bl, double* y, const double* x, MatrixDim d, + int src_stride) { + _exp<<>>(y, x, d, src_stride); +} + +void cudaD_pow(dim3 Gr, dim3 Bl, double* y, const double* x, double power, MatrixDim d, + int src_stride) { + _pow<<>>(y, x, power, d, src_stride); +} + +void cudaD_ceiling(dim3 Gr, dim3 Bl, double* y, const double* x, double ceiling_val, + MatrixDim d, int src_stride) { + _ceiling<<>>(y, x, ceiling_val, d, src_stride); +} + +void cudaD_floor(dim3 Gr, dim3 Bl, double* y, const double* x, double floor_val, + MatrixDim d, int src_stride) { + _floor<<>>(y, x, floor_val, d, src_stride); +} + +void cudaD_exp_limited(dim3 Gr, dim3 Bl, double* y, const double* x, + double lower_limit, double upper_limit, MatrixDim d, int src_stride) { + _exp_limited<<>>(y, x, lower_limit, upper_limit, d, src_stride); +} + +void cudaD_exp_special(dim3 Gr, dim3 Bl, double* y, const double* x, MatrixDim d, + int src_stride) { + _exp_special<<>>(y, x, d, src_stride); +} + +void cudaD_log(dim3 Gr, dim3 Bl, double* y, const double* x, MatrixDim d, int src_stride) { + _log<<>>(y, x, d, src_stride); +} + +void cudaD_pow_abs(dim3 Gr, dim3 Bl, double* y, const double* x, double power, + bool include_sign, MatrixDim d, int src_stride) { + _pow_abs<<>>(y, x, power, include_sign, d, src_stride); +} + void cudaD_softmax_reduce(size_t Gr, size_t Bl, double* y, const double* x, MatrixDim d, int src_stride) { _softmax_reduce<<>>(y, x, d, src_stride); @@ -5386,14 +5389,6 @@ void cudaF_add_smat_trans(dim3 Gr, dim3 Bl, float* mat, MatrixDim mat_dim, _add_smat_trans<<>>(mat, mat_dim, alpha, smat_row_ptr, smat_col_idx, smat_val); } -void cudaD_apply_exp_special(dim3 Gr, dim3 Bl, double* out, MatrixDim out_dim, - const double* in, int in_stride) { - _apply_exp_special<<>>(out, out_dim, in, in_stride); -} -void cudaF_apply_exp_special(dim3 Gr, dim3 Bl, float* out, MatrixDim out_dim, - const float* in, int in_stride) { - _apply_exp_special<<>>(out, out_dim, in, in_stride); -} void cuda_compress_uint8_sign(dim3 Gr, dim3 Bl, const BaseFloat *src, MatrixDim dim, unsigned char *dest, int dest_stride) { @@ -5468,3 +5463,161 @@ void cuda_uncompress_int16(dim3 Gr, dim3 Bl, BaseFloat *dest, void cuda_legacy_noop() { _noop_kernel<<<1, 1, 0, cudaStreamLegacy>>>(); } + +void cudaF_mat_copy_range_clamped( + int32_t row_start, int32_t row_end, int32_t num_cols, + const float *src, int32_t lds, + int32_t clamp_low, int32_t clamp_high, + float *dst, int32_t ldd) { + + int32_t num_rows = row_end - row_start; + dim3 threads(32,32); + dim3 blocks((num_cols+31)/32,(num_rows+31)/32); + + _cuda_mat_copy_range_clamped<<>>(row_start, row_end, num_cols, + src, lds, clamp_low, clamp_high, dst, ldd); +} + +void cudaD_mat_copy_range_clamped( + int32_t row_start, int32_t row_end, int32_t num_cols, + const double *src, int32_t lds, + int32_t clamp_low, int32_t clamp_high, + double *dst, int32_t ldd) { + + int32_t num_rows = row_end - row_start; + dim3 threads(32,32); + dim3 blocks((num_cols+31)/32,(num_rows+31)/32); + + _cuda_mat_copy_range_clamped<<>>(row_start, row_end, num_cols, + src, lds, clamp_low, clamp_high, dst, ldd); +} + +void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows, + int32_t *num_cols, const float **inputs, int32_t *ldi, float **outputs, + int32_t *ldo) { + + dim3 threads(32,32); + int32_t total_rows=0, total_cols=0; + + BatchedMatrixCopyDesc batch_desc; + const int32_t MAX_BATCH_SIZE=batch_desc.MAX_BATCH_SIZE; + + int i; + for (i = 0; i < num_mats; i++) { + int b 
= i%MAX_BATCH_SIZE; + + // fill in desc + MatrixCopyDesc &desc = batch_desc.batch[b]; + desc.num_rows = num_rows[i]; + desc.num_cols = num_cols[i]; + desc.input = inputs[i]; + desc.output = outputs[i]; + desc.ldi = ldi[i]; + desc.ldo = ldo[i]; + + total_rows+=desc.num_rows; + total_cols+=desc.num_cols; + + if (b==MAX_BATCH_SIZE-1) { + // compute average number of rows/cols across batch + int32_t rows = ceilf(total_rows / (float)MAX_BATCH_SIZE); + int32_t cols = ceilf(total_cols / (float)MAX_BATCH_SIZE); + dim3 blocks((cols + 31) / 32, + (rows + 31) / 32, + MAX_BATCH_SIZE); + + // no memcpy needed here. Memory will be passed down directly + // through parameter passing and live in constant memory + + // launch batch + _cuda_batch_copy_mats<<>>(batch_desc); + + // reset total counters + total_rows=0; + total_cols=0; + } + } + + int32_t remaining = i%MAX_BATCH_SIZE; + + if (remaining > 0) { + // compute average number of rows/cols across batch + int32_t rows = ceilf(total_rows / (float)remaining); + int32_t cols = ceilf(total_cols / (float)remaining); + + dim3 blocks((cols + 31) / 32, + (rows + 31) / 32, + remaining); + + // no memcpy needed here. Memory will be passed down directly + // through parameter passing and live in constant memory + + // launch batch + _cuda_batch_copy_mats<<>>(batch_desc); + } +} + +void cudaD_batched_copy_mats(int32_t num_mats, int32_t *num_rows, + int32_t *num_cols, const double **inputs, int32_t *ldi, double **outputs, + int32_t *ldo) { + + dim3 threads(32,32); + int32_t total_rows=0, total_cols=0; + + BatchedMatrixCopyDesc batch_desc; + const int32_t MAX_BATCH_SIZE=batch_desc.MAX_BATCH_SIZE; + + int i; + for (i = 0; i < num_mats; i++) { + int b = i%MAX_BATCH_SIZE; + + // fill in desc + MatrixCopyDesc &desc = batch_desc.batch[b]; + desc.num_rows = num_rows[i]; + desc.num_cols = num_cols[i]; + desc.input = inputs[i]; + desc.output = outputs[i]; + desc.ldi = ldi[i]; + desc.ldo = ldo[i]; + + total_rows+=desc.num_rows; + total_cols+=desc.num_cols; + + if (b==MAX_BATCH_SIZE-1) { + // compute average number of rows/cols across batch + int32_t rows = ceilf(total_rows / (float)MAX_BATCH_SIZE); + int32_t cols = ceilf(total_cols / (float)MAX_BATCH_SIZE); + dim3 blocks((cols + 31) / 32, + (rows + 31) / 32, + MAX_BATCH_SIZE); + + // no memcpy needed here. Memory will be passed down directly + // through parameter passing and live in constant memory + + // launch batch + _cuda_batch_copy_mats<<>>(batch_desc); + + // reset total counters + total_rows=0; + total_cols=0; + } + } + + int32_t remaining = i%MAX_BATCH_SIZE; + + if (remaining > 0) { + // compute average number of rows/cols across batch + int32_t rows = ceilf(total_rows / (float)remaining); + int32_t cols = ceilf(total_cols / (float)remaining); + + dim3 blocks((cols + 31) / 32, + (rows + 31) / 32, + remaining); + + // no memcpy needed here.
Memory will be passed down directly + // through parameter passing and live in constant memory + + // launch batch + _cuda_batch_copy_mats<<>>(batch_desc); + } +} diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index 6c24ce0dd58..1df1626fc6d 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -7,6 +7,7 @@ // 2013 Xiaohui Zhang // 2013-2015 Guoguo Chen // 2016-2018 Shiyin Kang +// 2019 Yiwen Shao // See ../../COPYING for clarification regarding multiple authors // @@ -38,6 +39,16 @@ namespace kaldi { +inline void cuda_add_row_sum_mat(int Gr, int Bl, double* result, + const double* mat, const MatrixDim d, + const double alpha, const double beta) { + cudaD_add_row_sum_mat(Gr, Bl, result, mat, d, alpha, beta); +} +inline void cuda_add_row_sum_mat(int Gr, int Bl, float* result, + const float* mat, const MatrixDim d, + const float alpha, const float beta) { + cudaF_add_row_sum_mat(Gr, Bl, result, mat, d, alpha, beta); +} inline void cuda_add_col_sum_mat(int Gr, int Bl, double* result, const double* mat, const MatrixDim d, const double alpha, const double beta) { @@ -335,74 +346,6 @@ inline void cuda_add_vec_vec(int Gr, int Bl, float alpha, float* v, int dim) { cudaF_add_vec_vec(Gr, Bl, alpha, v, x, y, beta, dim); } -inline void cuda_apply_ceiling(dim3 Gr, dim3 Bl, double* mat, - double ceiling_val, MatrixDim dim) { - cudaD_apply_ceiling(Gr, Bl, mat, ceiling_val, dim); -} -inline void cuda_apply_ceiling(dim3 Gr, dim3 Bl, float* mat, float ceiling_val, - MatrixDim dim) { - cudaF_apply_ceiling(Gr, Bl, mat, ceiling_val, dim); -} -inline void cuda_apply_exp(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) { - cudaD_apply_exp(Gr, Bl, mat, d); -} -inline void cuda_apply_exp(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { - cudaF_apply_exp(Gr, Bl, mat, d); -} -inline void cuda_apply_exp_limited(dim3 Gr, dim3 Bl, double* mat, MatrixDim d, - double lower_limit, double upper_limit) { - cudaD_apply_exp_limited(Gr, Bl, mat, d, lower_limit, upper_limit); -} -inline void cuda_apply_exp_limited(dim3 Gr, dim3 Bl, float* mat, MatrixDim d, - float lower_limit, float upper_limit) { - cudaF_apply_exp_limited(Gr, Bl, mat, d, lower_limit, upper_limit); -} -inline void cuda_apply_exp_special(dim3 Gr, dim3 Bl, double* out, - MatrixDim out_dim, const double* in, - int in_stride) { - cudaD_apply_exp_special(Gr, Bl, out, out_dim, in, in_stride); -} -inline void cuda_apply_exp_special(dim3 Gr, dim3 Bl, float* out, - MatrixDim out_dim, const float* in, - int in_stride) { - cudaF_apply_exp_special(Gr, Bl, out, out_dim, in, in_stride); -} -inline void cuda_apply_floor(dim3 Gr, dim3 Bl, double* mat, double floor_val, - MatrixDim dim) { - cudaD_apply_floor(Gr, Bl, mat, floor_val, dim); -} -inline void cuda_apply_floor(dim3 Gr, dim3 Bl, float* mat, float floor_val, - MatrixDim dim) { - cudaF_apply_floor(Gr, Bl, mat, floor_val, dim); -} -inline void cuda_apply_heaviside(dim3 Gr, dim3 Bl, double* mat, MatrixDim dim) { - cudaD_apply_heaviside(Gr, Bl, mat, dim); -} -inline void cuda_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim dim) { - cudaF_apply_heaviside(Gr, Bl, mat, dim); -} -inline void cuda_apply_log(dim3 Gr, dim3 Bl, double *mat, MatrixDim d) { - cudaD_apply_log(Gr, Bl, mat, d); -} -inline void cuda_apply_log(dim3 Gr, dim3 Bl, float *mat, MatrixDim d) { - cudaF_apply_log(Gr, Bl, mat, d); -} -inline void cuda_apply_pow_abs(dim3 Gr, dim3 Bl, double* mat, double power, - bool include_sign, MatrixDim dim) { - cudaD_apply_pow_abs(Gr, Bl, mat, power, include_sign, dim); -} -inline
void cuda_apply_pow_abs(dim3 Gr, dim3 Bl, float* mat, float power, - bool include_sign, MatrixDim dim) { - cudaF_apply_pow_abs(Gr, Bl, mat, power, include_sign, dim); -} -inline void cuda_apply_pow(dim3 Gr, dim3 Bl, double* mat, double power, - MatrixDim dim) { - cudaD_apply_pow(Gr, Bl, mat, power, dim); -} -inline void cuda_apply_pow(dim3 Gr, dim3 Bl, float* mat, float power, - MatrixDim dim) { - cudaF_apply_pow(Gr, Bl, mat, power, dim); -} inline cublasStatus_t cuda_axpy(cublasHandle_t handle, int n, double alpha, const double *x, int incx, double *y, int incy) { @@ -929,19 +872,81 @@ inline void cuda_group_spec_pnorm(dim3 Gr, dim3 Bl, double *y, const double *x, double power) { cudaD_group_spec_pnorm(Gr, Bl, y, x, d, src_stride, group_size, power); } -inline void cuda_group_spec_pnorm(dim3 Gr, dim3 Bl, float *y, const float *x, +inline void cuda_group_spec_pnorm(dim3 Gr, dim3 Bl, float* y, const float* x, MatrixDim d, int src_stride, int group_size, float power) { cudaF_group_spec_pnorm(Gr, Bl, y, x, d, src_stride, group_size, power); } -inline void cuda_heaviside(dim3 Gr, dim3 Bl, double *y, const double *x, +inline void cuda_heaviside(dim3 Gr, dim3 Bl, double* y, const double* x, MatrixDim d, int src_stride) { cudaD_heaviside(Gr, Bl, y, x, d, src_stride); } -inline void cuda_heaviside(dim3 Gr, dim3 Bl, float *y, const float *x, +inline void cuda_heaviside(dim3 Gr, dim3 Bl, float* y, const float* x, MatrixDim d, int src_stride) { cudaF_heaviside(Gr, Bl, y, x, d, src_stride); } +inline void cuda_exp(dim3 Gr, dim3 Bl, double* y, const double* x, + MatrixDim d, int src_stride) { + cudaD_exp(Gr, Bl, y, x, d, src_stride); +} +inline void cuda_exp(dim3 Gr, dim3 Bl, float* y, const float* x, + MatrixDim d, int src_stride) { + cudaF_exp(Gr, Bl, y, x, d, src_stride); +} +inline void cuda_pow(dim3 Gr, dim3 Bl, double* y, const double* x, double power, + MatrixDim d, int src_stride) { + cudaD_pow(Gr, Bl, y, x, power, d, src_stride); +} +inline void cuda_pow(dim3 Gr, dim3 Bl, float* y, const float* x, float power, + MatrixDim d, int src_stride) { + cudaF_pow(Gr, Bl, y, x, power, d, src_stride); +} +inline void cuda_ceiling(dim3 Gr, dim3 Bl, double* y, const double* x, double ceiling_val, + MatrixDim dim, int src_stride) { + cudaD_ceiling(Gr, Bl, y, x, ceiling_val, dim, src_stride); +} +inline void cuda_ceiling(dim3 Gr, dim3 Bl, float* y, const float* x, float ceiling_val, + MatrixDim dim, int src_stride) { + cudaF_ceiling(Gr, Bl, y, x, ceiling_val, dim, src_stride); +} +inline void cuda_floor(dim3 Gr, dim3 Bl, double* y, const double* x, double floor_val, + MatrixDim dim, int src_stride) { + cudaD_floor(Gr, Bl, y, x, floor_val, dim, src_stride); +} +inline void cuda_floor(dim3 Gr, dim3 Bl, float* y, const float* x, float floor_val, + MatrixDim dim, int src_stride) { + cudaF_floor(Gr, Bl, y, x, floor_val, dim, src_stride); +} +inline void cuda_exp_limited(dim3 Gr, dim3 Bl, double* y, const double* x, + double lower_limit, double upper_limit, MatrixDim d, int src_stride) { + cudaD_exp_limited(Gr, Bl, y, x, lower_limit, upper_limit, d, src_stride); +} +inline void cuda_exp_limited(dim3 Gr, dim3 Bl, float* y, const float* x, + float lower_limit, float upper_limit, MatrixDim d, int src_stride) { + cudaF_exp_limited(Gr, Bl, y, x, lower_limit, upper_limit, d, src_stride); +} +inline void cuda_exp_special(dim3 Gr, dim3 Bl, double* y, const double* x, + MatrixDim d, int src_stride) { + cudaD_exp_special(Gr, Bl, y, x, d, src_stride); +} +inline void cuda_exp_special(dim3 Gr, dim3 Bl, float* y, const float* 
x, + MatrixDim d, int src_stride) { + cudaF_exp_special(Gr, Bl, y, x, d, src_stride); +} +inline void cuda_log(dim3 Gr, dim3 Bl, double* y, const double* x, MatrixDim d, int src_stride) { + cudaD_log(Gr, Bl, y, x, d, src_stride); +} +inline void cuda_log(dim3 Gr, dim3 Bl, float* y, const float* x, MatrixDim d, int src_stride) { + cudaF_log(Gr, Bl, y, x, d, src_stride); +} +inline void cuda_pow_abs(dim3 Gr, dim3 Bl, double* y, const double* x, double power, + bool include_sign, MatrixDim dim, int src_stride) { + cudaD_pow_abs(Gr, Bl, y, x, power, include_sign, dim, src_stride); +} +inline void cuda_pow_abs(dim3 Gr, dim3 Bl, float* y, const float* x, float power, + bool include_sign, MatrixDim dim, int src_stride) { + cudaF_pow_abs(Gr, Bl, y, x, power, include_sign, dim, src_stride); +} inline void cuda_invert_elements(dim3 Gr, dim3 Bl, double *data, MatrixDim d) { cudaD_invert_elements(Gr, Bl, data, d); } @@ -1551,6 +1556,38 @@ inline void cuda_mat_uncompress(dim3 Gr, dim3 Bl, BaseFloat *dest, cuda_uncompress_uint16(Gr, Bl, dest, dim, src, src_stride, scale); } +inline void cuda_mat_copy_range_clamped( + int32_t row_start, int32_t row_end, int32_t num_cols, + const double *src, int32_t lds, + int32_t clamp_low, int32_t clamp_high, + double *dst, int32_t ldd) { + cudaD_mat_copy_range_clamped(row_start, row_end, num_cols, + src, lds, clamp_low, clamp_high, dst, ldd); +} + +inline void cuda_mat_copy_range_clamped( + int32_t row_start, int32_t row_end, int32_t num_cols, + const float *src, int32_t lds, + int32_t clamp_low, int32_t clamp_high, + float *dst, int32_t ldd) { + cudaF_mat_copy_range_clamped(row_start, row_end, num_cols, + src, lds, clamp_low, clamp_high, dst, ldd); +} + +inline void cuda_batched_copy_mats(int32_t num_mats, int32_t *num_rows, + int32_t *num_cols, const float **inputs, int32_t *ldi, float **outputs, + int32_t *ldo) { + cudaF_batched_copy_mats(num_mats, num_rows, num_cols, inputs, ldi, + outputs, ldo); +} + +inline void cuda_batched_copy_mats(int32_t num_mats, int32_t *num_rows, + int32_t *num_cols, const double **inputs, int32_t *ldi, double **outputs, + int32_t *ldo) { + cudaD_batched_copy_mats(num_mats, num_rows, num_cols, inputs, ldi, + outputs, ldo); +} + } // namespace kaldi diff --git a/src/cudamatrix/cu-math-test.cc b/src/cudamatrix/cu-math-test.cc index 09255c9587b..022742ed29f 100644 --- a/src/cudamatrix/cu-math-test.cc +++ b/src/cudamatrix/cu-math-test.cc @@ -545,6 +545,50 @@ static void UnitTestCuMathNormalizePerRow() { } } + +template +static void UnitTestCuMathNormalizePerRow_v2() { + + int row = 128; + int col = 1024; + + Matrix Hi(row,col); + Matrix Ho(row,col); + Hi.SetRandn(); + Hi.Scale(5.0); + Hi.ApplyFloor(0.0); // like ReLU, + + CuMatrix Di(row, col); + CuMatrix Do(row, col); + Di.CopyFromMat(Hi); + + Real target_rms = 0.3456; + bool add_log_stddev = false; + const Real kSquaredNormFloor = 1.35525271560688e-20; // 2^-66 + + //gpu + cu::NormalizePerRow(Di, target_rms, add_log_stddev, &Do); + + //cpu + { + MatrixBase& in(Hi); + MatrixBase& out(Ho); + Real target_rms=0.3456; + Vector in_norm(in.NumRows()); + Real d_scaled = in.NumCols() * target_rms * target_rms; + in_norm.AddDiagMat2(1.0 / d_scaled, in, kNoTrans, 0.0); + in_norm.ApplyFloor(kSquaredNormFloor); + in_norm.ApplyPow(-0.5); + out.CopyFromMat(in); + out.MulRowsVec(in_norm); + } + + Matrix Ho2(Do); + // here the BUG was detected (by processing big-enough matrix), + AssertEqual(Ho,Ho2,0.00001); +} + + template static void UnitTestCuDiffNormalizePerRow() { for (int32 i = 0; i < 2; i++) { @@ 
-660,6 +704,7 @@ template void CudaMathUnitTest() { UnitTestEnsureNonzero(); UnitTestBackpropLstmNonlinearity(); UnitTestCuMathNormalizePerRow(); + UnitTestCuMathNormalizePerRow_v2(); UnitTestCuDiffNormalizePerRow(); } @@ -673,9 +718,9 @@ int main() { for (; loop < 2; loop++) { CuDevice::Instantiate().SetDebugStrideMode(true); if (loop == 0) - CuDevice::Instantiate().SelectGpuId("no"); // -1 means no GPU + CuDevice::Instantiate().SelectGpuId("no"); // 0 means no GPU else - CuDevice::Instantiate().SelectGpuId("yes"); // -2 .. automatic selection + CuDevice::Instantiate().SelectGpuId("yes"); // 1 .. automatic selection #endif srand(time(NULL)); kaldi::CudaMathUnitTest(); diff --git a/src/cudamatrix/cu-matrix-speed-test.cc b/src/cudamatrix/cu-matrix-speed-test.cc index c67eaf220b8..230112b1bd0 100644 --- a/src/cudamatrix/cu-matrix-speed-test.cc +++ b/src/cudamatrix/cu-matrix-speed-test.cc @@ -505,7 +505,7 @@ template void TestCuMatrixSoftmax(int32 dim) { Timer tim; int32 iter = 0; for (;tim.Elapsed() < time_in_secs; iter++) { - N.ApplySoftMaxPerRow(M); + N.SoftMaxPerRow(M); } BaseFloat fdim = dim; @@ -523,7 +523,7 @@ template void TestCuMatrixLogSoftmax(int32 dim) { Timer tim; int32 iter = 0; for (;tim.Elapsed() < time_in_secs; iter++) { - N.ApplyLogSoftMaxPerRow(M); + N.LogSoftMaxPerRow(M); } BaseFloat fdim = dim; diff --git a/src/cudamatrix/cu-matrix-test.cc b/src/cudamatrix/cu-matrix-test.cc index 46bc6ea0cb2..be8483e48f5 100644 --- a/src/cudamatrix/cu-matrix-test.cc +++ b/src/cudamatrix/cu-matrix-test.cc @@ -77,8 +77,8 @@ static void UnitTestCuMatrixTraceMatMat() { for (int32 i = 0; i < 2; i++) { int32 M = 100 + Rand() % 200, N = 100 + Rand() % 200; CuMatrix A(M, N); - A.SetRandn(); - // add a bias to avoid numerical failure when comparing r2 and r3 + A.SetRandUniform(); + // Add bias to avoid numbers close to zero A.Add(0.1); if (i % 2 == 1) { CuMatrix B(M, N); @@ -143,7 +143,8 @@ template static void UnitTestCuMatrixApplyLog() { int32 M = 100 + Rand() % 200, N = 100 + Rand() % 200; Matrix H(M, N); - H.SetRandn(); + H.SetRandUniform(); // Using uniform distribution to ensure positive numbers + H.Add(0.1); // Add bias to eliminate zeros H.MulElements(H); // make numbers positive CuMatrix D(H); @@ -153,7 +154,7 @@ static void UnitTestCuMatrixApplyLog() { Matrix H2(D); - AssertEqual(H,H2); + KALDI_ASSERT(ApproxEqual(H,H2)); } @@ -173,8 +174,7 @@ static void UnitTestCuMatrixApplyExpSpecial() { H.ApplyExpSpecial(); Matrix H2(D); - - AssertEqual(H,H2); + KALDI_ASSERT(ApproxEqual(H,H2)); } template @@ -190,7 +190,7 @@ static void UnitTestCuMatrixApplyExp() { Matrix H2(D); - AssertEqual(H,H2); + KALDI_ASSERT(ApproxEqual(H,H2)); } @@ -200,21 +200,17 @@ static void UnitTestCuMatrixApplyExpLimited() { Matrix H(M, N); H.SetRandn(); - BaseFloat lower_limit = -0.2, upper_limit = 0.2; CuMatrix D(H); - D.ApplyExpLimited(lower_limit, upper_limit); - H.ApplyFloor(lower_limit); H.ApplyCeiling(upper_limit); H.ApplyExp(); - Matrix H2(D); - AssertEqual(H,H2); + KALDI_ASSERT(ApproxEqual(H,H2)); } @@ -235,7 +231,7 @@ static void UnitTestCuMatrixSigmoid() { Matrix H2(E); - AssertEqual(H, H2); + KALDI_ASSERT(ApproxEqual(H, H2)); } } @@ -251,7 +247,7 @@ static void UnitTestCuMatrixScale() { H.Scale(scale); Matrix E(D); - AssertEqual(H, E); + KALDI_ASSERT(ApproxEqual(H, E)); } template @@ -266,7 +262,7 @@ static void UnitTestCuMatrixAdd() { H.Add(offset); Matrix E(D); - AssertEqual(H, E); + KALDI_ASSERT(ApproxEqual(H, E)); } @@ -285,7 +281,7 @@ static void UnitTestCuMatrixSoftHinge() { Matrix H2(E); - 
AssertEqual(H,H2); + KALDI_ASSERT(ApproxEqual(H,H2)); } template @@ -308,7 +304,7 @@ static void UnitTestCuMatrixGroupPnorm() { CuMatrix E(M, N); E.GroupPnorm(D, p); Matrix H2(E); - AssertEqual(H, H2); + KALDI_ASSERT(ApproxEqual(H, H2)); } } } @@ -330,7 +326,7 @@ static void UnitTestCuMatrixGroupMax() { CuMatrix E(M, N); E.GroupMax(D); Matrix H2(E); - AssertEqual(H,H2); + KALDI_ASSERT(ApproxEqual(H,H2)); } } @@ -344,7 +340,7 @@ static void UnitTestCuMatrixSet() { m1.Set(value); m2.Set(value); Matrix m3(m1); - AssertEqual(m2, m3); + KALDI_ASSERT(ApproxEqual(m2, m3)); } } @@ -369,7 +365,7 @@ static void UnitTestCuMatrixApplyPow() { H.ApplyPow(pow); Matrix H2(cH); - AssertEqual(H, H2); + KALDI_ASSERT(ApproxEqual(H, H2)); } } @@ -390,7 +386,7 @@ static void UnitTestCuMatrixApplyPowAbs() { H.ApplyPowAbs(pow, true); Matrix H2(cH); - AssertEqual(H, H2); + KALDI_ASSERT(ApproxEqual(H, H2)); } } @@ -417,7 +413,7 @@ static void UnitTestCuMatrixCopyRowsFromVec() { mat.CopyRowsFromVec(vec); Matrix mat2(cu_mat); - AssertEqual(mat, mat2); + KALDI_ASSERT(ApproxEqual(mat, mat2)); } } @@ -442,7 +438,7 @@ static void UnitTestCuMatrixCopyColsFromVec() { mat.CopyColsFromVec(vec); Matrix mat2(cu_mat); - AssertEqual(mat, mat2); + KALDI_ASSERT(ApproxEqual(mat, mat2)); } } @@ -477,8 +473,8 @@ static void UnitTestCuMatrixCopyRows() { if (reorder[i] < 0) O(i, j) = 0; else O(i, j) = M(reorder[i], j); - AssertEqual(N1, O); - AssertEqual(N2, O); + KALDI_ASSERT(ApproxEqual(N1, O)); + KALDI_ASSERT(ApproxEqual(N2, O)); } } @@ -512,7 +508,7 @@ static void UnitTestCuMatrixCopyToRows() { CuArray reorder_dst_cuda(reorder_dst); M.CopyToRows(reorder_dst_cuda); - AssertEqual(N, O); + KALDI_ASSERT(ApproxEqual(N, O)); } } @@ -552,8 +548,8 @@ static void UnitTestCuMatrixAddRows() { } } - AssertEqual(N1, O); - AssertEqual(N2, O); + KALDI_ASSERT(ApproxEqual(N1, O)); + KALDI_ASSERT(ApproxEqual(N2, O)); } } @@ -588,7 +584,7 @@ static void UnitTestCuMatrixMulRows() { } } - AssertEqual(N1, O); + KALDI_ASSERT(ApproxEqual(N1, O)); } } @@ -630,8 +626,8 @@ static void UnitTestCuMatrixAddToRows() { CuArray reorder_dst_cuda(reorder_dst); M.AddToRows(alpha, reorder_dst_cuda); M.AddToRows(alpha, reorder_cuda, &N2); - AssertEqual(N1, O); - AssertEqual(N2, O); + KALDI_ASSERT(ApproxEqual(N1, O)); + KALDI_ASSERT(ApproxEqual(N2, O)); } } @@ -648,13 +644,13 @@ void UnitTestCuMatrixCopyCross() { mat2.CopyFromMat(mat1); CuMatrix mat3(M, N); mat3.CopyFromMat(mat2); - AssertEqual(mat1, mat3); + KALDI_ASSERT(ApproxEqual(mat1, mat3)); } else { CuMatrix mat2(N, M); mat2.CopyFromMat(mat1, kTrans); CuMatrix mat3(M, N); mat3.CopyFromMat(mat2, kTrans); - AssertEqual(mat1, mat3); + KALDI_ASSERT(ApproxEqual(mat1, mat3)); } } } @@ -669,7 +665,7 @@ template void UnitTestCuMatrixCopyCross2() { mat2.CopyFromMat(mat1); CuMatrix mat3(M, N); mat3.CopyFromMat(mat2); - AssertEqual(mat1, mat3); + KALDI_ASSERT(ApproxEqual(mat1, mat3)); } } @@ -708,7 +704,7 @@ static void UnitTestCuMatrixSumColumnRanges() { CuArray indices_tmp(indices); cu_dst.SumColumnRanges(cu_src, indices_tmp); Matrix dst2(cu_dst); - AssertEqual(dst, dst2); + KALDI_ASSERT(ApproxEqual(dst, dst2)); } } @@ -748,7 +744,7 @@ static void UnitTestCuMatrixAddRowRanges() { CuArray cu_indexes(indexes); cu_dst.AddRowRanges(cu_src, cu_indexes); Matrix dst2(cu_dst); - AssertEqual(dst1, dst2); + KALDI_ASSERT(ApproxEqual(dst1, dst2)); } } @@ -774,7 +770,7 @@ static void UnitTestCuMatrixCopyCols() { for (int32 j = 0; j < num_cols2; j++) if (reorder[j] < 0) O(i, j) = 0; else O(i, j) = M(i, reorder[j]); - AssertEqual(N, O); 
+ KALDI_ASSERT(ApproxEqual(N, O)); } } @@ -806,7 +802,7 @@ static void UnitTextCuMatrixAddSmat() { Matrix mat2(cumat); - AssertEqual(mat, mat2); + KALDI_ASSERT(ApproxEqual(mat, mat2)); } } @@ -844,7 +840,7 @@ static void UnitTextCuMatrixAddMatSmat() { Matrix result2(curesult); - AssertEqual(result, result2); + KALDI_ASSERT(ApproxEqual(result, result2)); } } @@ -882,7 +878,7 @@ static void UnitTextCuMatrixAddSmatMat() { Matrix result2(curesult); - AssertEqual(result, result2); + KALDI_ASSERT(ApproxEqual(result, result2)); } } @@ -907,7 +903,7 @@ static void UnitTestCuMatrixAddCols() { for (int32 j = 0; j < num_cols2; j++) if (reorder[j] < 0) O(i, j) = 0; else O(i, j) = M(i, reorder[j]); - AssertEqual(N, O); + KALDI_ASSERT(ApproxEqual(N, O)); } } @@ -929,7 +925,7 @@ static void UnitTestCuMatrixApplyFloor() { H.ApplyFloor(floor); Matrix H2(cH); - AssertEqual(H, H2); + KALDI_ASSERT(ApproxEqual(H, H2)); } } @@ -950,7 +946,7 @@ static void UnitTestCuMatrixApplyCeiling() { H.ApplyCeiling(ceiling); Matrix H2(cH); - AssertEqual(H, H2); + KALDI_ASSERT(ApproxEqual(H, H2)); } } @@ -969,7 +965,7 @@ static void UnitTestCuMatrixApplyHeaviside() { cH.ApplyHeaviside(); H.ApplyHeaviside(); Matrix H2(cH); - AssertEqual(H, H2); + KALDI_ASSERT(ApproxEqual(H, H2)); } } @@ -988,7 +984,7 @@ static void UnitTestCuMatrixHeaviside() { cH2.Heaviside(cH); H.ApplyHeaviside(); Matrix H2(cH2); - AssertEqual(H, H2); + KALDI_ASSERT(ApproxEqual(H, H2)); } } @@ -1014,7 +1010,7 @@ static void UnitTestCuMatrixMulElements() { Matrix Ha2(dimM, dimN); Da.CopyToMat(&Ha2); - AssertEqual(Ha,Ha2); + KALDI_ASSERT(ApproxEqual(Ha,Ha2)); } } @@ -1026,7 +1022,9 @@ static void UnitTestCuMatrixDivElements() { Matrix Ha(dimM, dimN); Matrix Hb(dimM, dimN); Ha.SetRandn(); - Hb.SetRandn(); + + Hb.SetRandUniform(); // Use uniform distirbution t ensure positive numbers + Hb.Add(0.1); // Add bias to ensure we do not divide by zero CuMatrix Da(dimM, dimN); CuMatrix Db(dimM, dimN); @@ -1039,7 +1037,7 @@ static void UnitTestCuMatrixDivElements() { Matrix Ha2(dimM, dimN); Da.CopyToMat(&Ha2); - AssertEqual(Ha,Ha2); + KALDI_ASSERT(ApproxEqual(Ha,Ha2)); } } @@ -1052,6 +1050,7 @@ static void UnitTestCuMatrixMax() { CuMatrix Da(100,100); CuMatrix Db(100,100); + Da.CopyFromMat(Ha); Db.CopyFromMat(Hb); @@ -1061,7 +1060,7 @@ static void UnitTestCuMatrixMax() { Matrix Ha2(100,100); Da.CopyToMat(&Ha2); - AssertEqual(Ha,Ha2); + KALDI_ASSERT(ApproxEqual(Ha,Ha2)); } template @@ -1082,7 +1081,7 @@ static void UnitTestCuMatrixMin() { Matrix Ha2(100,100); Da.CopyToMat(&Ha2); - AssertEqual(Ha, Ha2); + KALDI_ASSERT(ApproxEqual(Ha, Ha2)); } @@ -1105,7 +1104,7 @@ static void UnitTestCuMatrixMulColsVec() { Matrix Hm2(100,99); Dm.CopyToMat(&Hm2); - AssertEqual(Hm,Hm2); + KALDI_ASSERT(ApproxEqual(Hm,Hm2)); } @@ -1131,7 +1130,7 @@ static void UnitTestCuMatrixMulRowsVec() { Matrix Hm2(dimM, dimN); Dm.CopyToMat(&Hm2); - AssertEqual(Hm,Hm2); + KALDI_ASSERT(ApproxEqual(Hm,Hm2)); } } @@ -1157,7 +1156,7 @@ static void UnitTestCuMatrixMulRowsGroupMat() { Matrix Hm2(dimM, dimN); Dm.CopyToMat(&Hm2); - AssertEqual(Hm,Hm2); + KALDI_ASSERT(ApproxEqual(Hm,Hm2)); } } @@ -1196,7 +1195,7 @@ static void UnitTestCuMatrixDiffGroupPnorm() { Matrix Hid2(dimM, dimN); Did.CopyToMat(&Hid2); - AssertEqual(Hid, Hid2); + KALDI_ASSERT(ApproxEqual(Hid, Hid2)); } } @@ -1231,7 +1230,7 @@ static void UnitTestCuMatrixGroupMaxDeriv() { // KALDI_LOG << "Hr " << Hr << " Dr " << Dr << "Ds" << Ds << " Hs " << Hs ; Matrix Hr2(dimM, dimN); Dr.CopyToMat(&Hr2); - AssertEqual(Hr,Hr2); + 
KALDI_ASSERT(ApproxEqual(Hr,Hr2)); } template static void UnitTestCuMatrixAddDiagVecMat() { @@ -1266,7 +1265,7 @@ template static void UnitTestCuMatrixAddDiagVecMat() { } M.AddDiagVecMat(alpha, V, N, trans, beta); - AssertEqual(M, Mcheck); + KALDI_ASSERT(ApproxEqual(M, Mcheck)); KALDI_ASSERT(M.Sum() != 0.0); } } @@ -1294,7 +1293,7 @@ template static void UnitTestCuMatrixAddMatDiagVec() { Mcheck.AddMat(alpha, buf, kNoTrans); M.AddMatDiagVec(alpha, N, trans, V, beta); - AssertEqual(M, Mcheck); + KALDI_ASSERT(ApproxEqual(M, Mcheck)); KALDI_ASSERT(M.Sum() != 0.0); } } @@ -1313,7 +1312,7 @@ template static void UnitTestCuMatrixAddMatMatElements() { Mcheck.Scale(beta); Mcheck.AddMat(alpha, buf, kNoTrans); M.AddMatMatElements(alpha, A, B, beta); - AssertEqual(M, Mcheck); + KALDI_ASSERT(ApproxEqual(M, Mcheck)); KALDI_ASSERT(M.Sum() != 0.0); } @@ -1332,11 +1331,11 @@ template static void UnitTestCuMatrixSetMatMatDivMat() { M.SetMatMatDivMat(A,B,C); ref.AddMatMatElements(1.0, A, B, 0.0); ref.DivElements(C); - AssertEqual(M, ref); + KALDI_ASSERT(ApproxEqual(M, ref)); C.SetZero(); M.SetMatMatDivMat(A,B,C); - AssertEqual(M, A); + KALDI_ASSERT(ApproxEqual(M, A)); } template @@ -1359,7 +1358,7 @@ static void UnitTestCuMatrixDivRowsVec() { Matrix Hm2(dimM, dimN); Dm.CopyToMat(&Hm2); - AssertEqual(Hm, Hm2); + KALDI_ASSERT(ApproxEqual(Hm, Hm2)); } @@ -1382,13 +1381,13 @@ static void UnitTestCuMatrixAddMat() { Matrix Ha2(100,100); Da.CopyToMat(&Ha2); - AssertEqual(Ha,Ha2); + KALDI_ASSERT(ApproxEqual(Ha,Ha2)); //check use with submatrix CuMatrix mat1(10,10,kSetZero); mat1.AddMat(1.0,Da.Range(5,10,12,10)); //different stride for mat1,mat2 CuMatrix mat2(Da.Range(5,10,12,10)); - AssertEqual(mat1,mat2); + KALDI_ASSERT(ApproxEqual(mat1,mat2)); for (int i = 0; i < 10; i++) { int32 N = 5 * (10 + Rand() % 10), M = 100 + Rand() % 50; @@ -1408,14 +1407,14 @@ static void UnitTestCuMatrixAddMat() { Matrix Hc2(N,M); Dc.CopyToMat(&Hc2); - AssertEqual(Hc,Hc2); + KALDI_ASSERT(ApproxEqual(Hc,Hc2)); // check use with submatrix CuMatrix mat3(N/5,M,kSetZero); mat3.AddMat(1.0, Dd.Range(0,M,0,N/5),kTrans); CuMatrix mat4(Dd.Range(0,M,0,N/5),kTrans); - AssertEqual(mat3,mat4); + KALDI_ASSERT(ApproxEqual(mat3,mat4)); } } @@ -1444,7 +1443,7 @@ static void UnitTestCuMatrixAddMatBlocks1() { } } dst.AddMatBlocks(alpha, src); - AssertEqual(dst, dst_copy); + KALDI_ASSERT(ApproxEqual(dst, dst_copy)); } } @@ -1471,7 +1470,7 @@ static void UnitTestCuMatrixAddMatBlocks1Trans() { } } dst.AddMatBlocks(alpha, src, kTrans); - AssertEqual(dst, dst_copy); + KALDI_ASSERT(ApproxEqual(dst, dst_copy)); } } @@ -1501,7 +1500,7 @@ static void UnitTestCuMatrixAddMatBlocks2() { } } dst.AddMatBlocks(alpha, src); - AssertEqual(dst, dst_copy); + KALDI_ASSERT(ApproxEqual(dst, dst_copy)); } } @@ -1553,7 +1552,7 @@ static void UnitTestCuMatrixAddVecToCols() { Matrix Hm2(100,99); Dm.CopyToMat(&Hm2); - AssertEqual(Hm,Hm2); + KALDI_ASSERT(ApproxEqual(Hm,Hm2)); } @@ -1576,7 +1575,7 @@ static void UnitTestCuMatrixAddVecToRows() { Matrix Hm2(100,99); Dm.CopyToMat(&Hm2); - AssertEqual(Hm,Hm2); + KALDI_ASSERT(ApproxEqual(Hm,Hm2)); } @@ -1603,7 +1602,7 @@ static void UnitTestCuMatrixSymAddMat2() { CuTpMatrix T1(M), T2(M2); CuMatrix X1(T1), X2(T2); // so we can test equality. 
- AssertEqual(X1, X2); + KALDI_ASSERT(ApproxEqual(X1, X2)); KALDI_ASSERT(dimM == 0 || X1.Trace() != 0); } } @@ -1683,8 +1682,8 @@ static void UnitTestCuMatrixAddMatMat() { Dc1.CopyToMat(&Hc1a); Dc2.CopyToMat(&Hc2a); - AssertEqual(Hc1,Hc1a); - AssertEqual(Hc2,Hc2a); + KALDI_ASSERT(ApproxEqual(Hc1,Hc1a)); + KALDI_ASSERT(ApproxEqual(Hc2,Hc2a)); } @@ -1708,7 +1707,7 @@ static void UnitTestCuMatrixAddVecVec() { Matrix A2(100, 200); CuA.CopyToMat(&A2); - AssertEqual(A,A2); + KALDI_ASSERT(ApproxEqual(A,A2)); } @@ -1773,8 +1772,8 @@ static void UnitTestCuMatrixAddMatMatBatched() { (*HC2[i]).AddMatMat(0.5f, *(HA[i]), kTrans, *(HB[i]), kTrans, 0.0f); DC1[i]->CopyToMat(&Hca1); DC2[i]->CopyToMat(&Hca2); - AssertEqual(*(HC1[i]), Hca1); - AssertEqual(*(HC2[i]), Hca2); + KALDI_ASSERT(ApproxEqual(*(HC1[i]), Hca1)); + KALDI_ASSERT(ApproxEqual(*(HC2[i]), Hca2)); delete Ha[i]; delete Hb[i]; delete Hc1[i]; delete Hc2[i]; delete HA[i]; delete HB[i]; delete HC1[i]; delete HC2[i]; delete Da[i]; delete Db[i]; delete Dc1[i]; delete Dc2[i]; @@ -1796,7 +1795,7 @@ static void UnitTestCuMatrixAddToDiag() { M.AddToDiag(alpha); Mc.AddToDiag(alpha); Matrix M2(Mc); - AssertEqual(M, M2); + KALDI_ASSERT(ApproxEqual(M, M2)); } } @@ -1810,7 +1809,7 @@ static void UnitTestCuMatrixAdd2() { M.Add(alpha); Mc.Add(alpha); Matrix M2(Mc); - AssertEqual(M, M2); + KALDI_ASSERT(ApproxEqual(M, M2)); } } @@ -1826,7 +1825,7 @@ static void UnitTestCuMatrixCopyFromMat() { CuMatrix B(dim, dim); B.CopyFromMat(E); - AssertEqual(B, E); + KALDI_ASSERT(ApproxEqual(B, E)); } } @@ -1842,7 +1841,7 @@ static void UnitTestCuMatrixCopyFromTp() { B.CopyFromTp(A, kNoTrans); C.CopyFromTp(E, kNoTrans); CuMatrix D(B); - AssertEqual(D, C); + KALDI_ASSERT(ApproxEqual(D, C)); } } @@ -1865,7 +1864,7 @@ static void UnitTestCuMatrixAddMatTp() { D.AddMatTp(1.0, E, kNoTrans, F, kNoTrans, 1.0); CuMatrix G(A); - AssertEqual(G, D); + KALDI_ASSERT(ApproxEqual(G, D)); } } @@ -1884,7 +1883,7 @@ static void UnitTestCuMatrixTranspose() { Matrix hA(A); Matrix hB(B); hB.Transpose(); - AssertEqual(hA, hB); + KALDI_ASSERT(ApproxEqual(hA, hB)); } } @@ -1907,7 +1906,7 @@ static void UnitTestCuMatrixAddTpMat() { D.AddTpMat(1.0, F, kNoTrans, E, kNoTrans, 1.0); CuMatrix G(A); - AssertEqual(G, D); + KALDI_ASSERT(ApproxEqual(G, D)); } } @@ -1933,7 +1932,7 @@ static void UnitTestCuVectorAddVec() { Vector Hv2(777); Dv.CopyToVec(&Hv2); - AssertEqual(Hv,Hv2); + KALDI_ASSERT(ApproxEqual(Hv,Hv2)); } @@ -1964,7 +1963,7 @@ static void UnitTestCuVectorAddRowSumMat() { Vector Hv2(Y); Dv.CopyToVec(&Hv2); - AssertEqual(Hv,Hv2); + KALDI_ASSERT(ApproxEqual(Hv,Hv2)); } @@ -1992,7 +1991,7 @@ static void UnitTestCuVectorAddRowSumMatLarge() { Vector Hv2(990); Dv.CopyToVec(&Hv2); - AssertEqual(Hv,Hv2); + KALDI_ASSERT(ApproxEqual(Hv,Hv2)); } @@ -2023,7 +2022,7 @@ static void UnitTestCuVectorAddColSumMat() { Vector Hv2(X); Dv.CopyToVec(&Hv2); - AssertEqual(Hv,Hv2); + KALDI_ASSERT(ApproxEqual(Hv,Hv2)); } template @@ -2068,7 +2067,7 @@ static void UnitTestCuVectorAddColSumMatLarge() { Vector Hv2(1000); Dv.CopyToVec(&Hv2); - AssertEqual(Hv,Hv2); + KALDI_ASSERT(ApproxEqual(Hv,Hv2)); } @@ -2087,7 +2086,7 @@ static void UnitTestCuVectorInvertElements() { Vector Hv2(777); Dv.CopyToVec(&Hv2); - AssertEqual(Hv,Hv2); + KALDI_ASSERT(ApproxEqual(Hv,Hv2)); } template @@ -2104,7 +2103,7 @@ static void UnitTestCuMatrixInvertElements() { Matrix Hm2(77, 77); Dm.CopyToMat(&Hm2); - AssertEqual(Hm,Hm2); + KALDI_ASSERT(ApproxEqual(Hm,Hm2)); } @@ -2123,7 +2122,7 @@ static void UnitTestCuMatrixIO() { CuMatrix mat2; 
std::istringstream is(os.str()); mat2.Read(is, binary); - AssertEqual(mat, mat2); + KALDI_ASSERT(ApproxEqual(mat, mat2)); } } @@ -2151,7 +2150,7 @@ static void UnitTestCuVectorAddTpVec() { Vector Hv2(300); Dv.CopyToVec(&Hv2); - AssertEqual(Hv,Hv2); + KALDI_ASSERT(ApproxEqual(Hv,Hv2)); } template @@ -2189,7 +2188,7 @@ static void UnitTestCuVectorMulTp() { Vector Hv2(300); Dv.CopyToVec(&Hv2); - AssertEqual(Hv,Hv2); + KALDI_ASSERT(ApproxEqual(Hv,Hv2)); } template @@ -2243,7 +2242,7 @@ static void UnitTestCuSigmoid() { Matrix Ho2(100,111); Do.CopyToMat(&Ho2); - AssertEqual(Ho,Ho2); + KALDI_ASSERT(ApproxEqual(Ho,Ho2)); } @@ -2274,7 +2273,7 @@ static void UnitTestCuDiffSigmoid() { Matrix Ho2(100,111); Do.CopyToMat(&Ho2); - AssertEqual(Ho,Ho2); + KALDI_ASSERT(ApproxEqual(Ho,Ho2)); } @@ -2317,7 +2316,7 @@ static void UnitTestCuDiffSoftmax() { Matrix Ho2(m, n); Do.CopyToMat(&Ho2); - AssertEqual(Ho, Ho2); + KALDI_ASSERT(ApproxEqual(Ho, Ho2)); } } @@ -2362,7 +2361,7 @@ static void UnitTestCuDiffLogSoftmax() { Matrix Ho2(m, n); Do.CopyToMat(&Ho2); - AssertEqual(Ho, Ho2); + KALDI_ASSERT(ApproxEqual(Ho, Ho2)); } } @@ -2385,11 +2384,11 @@ static void UnitTestCuSoftmax() { //gpu if (i % 2 == 0) { - Do.ApplySoftMaxPerRow(Di); + Do.SoftMaxPerRow(Di); } else { // in-place Do.CopyFromMat(Di); - Do.ApplySoftMaxPerRow(Do); + Do.SoftMaxPerRow(Do); } //cpu Ho.CopyFromMat(Hi); @@ -2399,7 +2398,7 @@ static void UnitTestCuSoftmax() { Matrix Ho2(Do); - AssertEqual(Ho,Ho2,0.00001); + KALDI_ASSERT(ApproxEqual(Ho,Ho2,(Real)0.00001)); } } @@ -2422,11 +2421,11 @@ static void UnitTestCuLogSoftmax() { //gpu if (i % 2 == 0) { - Do.ApplyLogSoftMaxPerRow(Di); + Do.LogSoftMaxPerRow(Di); } else { // in-place. Do.CopyFromMat(Di); - Do.ApplyLogSoftMaxPerRow(Do); + Do.LogSoftMaxPerRow(Do); } //cpu Ho.CopyFromMat(Hi); @@ -2436,7 +2435,7 @@ static void UnitTestCuLogSoftmax() { Matrix Ho2(Do); - AssertEqual(Ho, Ho2, 0.00001); + KALDI_ASSERT(ApproxEqual(Ho, Ho2, (Real)0.00001)); } } @@ -2469,7 +2468,12 @@ static void UnitTestCuFindRowMaxId() { std::vector Hmax2(dimM); Dmax.CopyToVec(&Hmax2); - KALDI_ASSERT(Hmax == Hmax2); + // If the same value were generated randomly we can get to a case + // where the GPU and CPU return different columns. Both would be correct. + // Thus check that the max for each row is the same and not the index. + for (MatrixIndexT r=0; r Hlogpost2(X); Dlogpost.CopyToVec(&Hlogpost2); - AssertEqual(Hi,Hi2); - AssertEqual(Hlogpost,Hlogpost2); + KALDI_ASSERT(ApproxEqual(Hi,Hi2)); + KALDI_ASSERT(ApproxEqual(Hlogpost,Hlogpost2)); } template void UnitTestCheck() { @@ -2542,8 +2546,8 @@ void UnitTestSwapCu2Cu() { Di.CopyToMat(&Hf); Matrix Hf2(Di2.NumRows(), Di2.NumCols()); Di2.CopyToMat(&Hf2); - AssertEqual(Hi,Hf2); - AssertEqual(Hi2,Hf); + KALDI_ASSERT(ApproxEqual(Hi,Hf2)); + KALDI_ASSERT(ApproxEqual(Hi2,Hf)); } template @@ -2561,8 +2565,8 @@ void UnitTestSwapCu2M() { Di.Swap(&Hi2); Matrix Hf(Di.NumRows(), Di.NumCols()); Di.CopyToMat(&Hf); - AssertEqual(Di2,Hf); - AssertEqual(Hi2,Hi); + KALDI_ASSERT(ApproxEqual(Di2,Hf)); + KALDI_ASSERT(ApproxEqual(Hi2,Hi)); } @@ -2582,7 +2586,7 @@ void UnitTestCuTanh() { //cpu Matrix Hf(H.NumRows(), H.NumCols()); Hf.Tanh(H); - AssertEqual(Df,Hf); + KALDI_ASSERT(ApproxEqual(Df,Hf)); } template @@ -2611,7 +2615,7 @@ static void UnitTestCuDiffTanh() { Matrix Ho2(100,111); Do.CopyToMat(&Ho2); - AssertEqual(Ho,Ho2); + KALDI_ASSERT(ApproxEqual(Ho,Ho2)); } // just need this for testing function below. Compute n!! 
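The UnitTestCuFindRowMaxId fix above deserves a short note: with randomly generated data, two columns of a row can hold exactly the same maximum, and the CPU and GPU are then free to return different, equally correct, argmax indices. Comparing the values at the returned indices rather than the indices themselves makes the test deterministic. A self-contained sketch of that check follows; the helper name and signature are illustrative, not from the patch.

```cpp
#include <vector>
#include "base/kaldi-common.h"
#include "matrix/kaldi-matrix.h"

namespace kaldi {
// Accept any pair of argmax results whose selected values match, so ties
// broken differently on CPU and GPU do not fail the test.
template<typename Real>
static void CheckRowMaxIdsEquivalent(const MatrixBase<Real> &m,
                                     const std::vector<int32> &cpu_ids,
                                     const std::vector<int32> &gpu_ids) {
  KALDI_ASSERT(cpu_ids.size() == gpu_ids.size() &&
               static_cast<MatrixIndexT>(cpu_ids.size()) == m.NumRows());
  for (MatrixIndexT r = 0; r < m.NumRows(); r++)
    KALDI_ASSERT(m(r, cpu_ids[r]) == m(r, gpu_ids[r]));  // same value, any index
}
}  // namespace kaldi
```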
@@ -2633,7 +2637,7 @@ static void UnitTestCuMatrixSetRandn() { M.SetRandn(); srand(104); N.SetRandn(); - AssertEqual(M, N); + KALDI_ASSERT(ApproxEqual(M, N)); } for (int32 i = 0; i < 5; i++) { @@ -2650,10 +2654,13 @@ static void UnitTestCuMatrixSetRandn() { // see http://en.wikipedia.org/wiki/Normal_distribution#Moments, // note that mu = 0 and sigma = 1. Real expected_moment = (pow % 2 == 1 ? 0 : DoubleFactorial(pow - 1)); + Real expected_twice_moment = DoubleFactorial(2 * pow - 1); Real k = 10.0; // This is just a constant we use to give us some wiggle // room before rejecting the distribution... e.g. 20 sigma, // quite approximately. - Real allowed_deviation = k * pow / sqrt(static_cast(rows * cols)); + // VAR(X) = E(X^2) - (E(X))^2 + Real deviation = sqrt(expected_twice_moment - expected_moment * expected_moment); + Real allowed_deviation = k * deviation / sqrt(static_cast(rows * cols)); // give it a bit more wiggle room for higher powers.. this is quite // unscientific, it would be better to involve the absolute moments or // something like that, and use one of those statistical inequalities, @@ -2822,29 +2829,37 @@ static void UnitTestCuMatrixAddElements() { CuMatrix M(H); int32 num_elements = 100 + Rand() % 10; std::vector > input; - std::vector input_index; + std::set input_index; //Set used to ensure unique elements + std::vector input_index_v; Real *input_value = new Real[num_elements]; BaseFloat scale = -1 + (0.33 * (Rand() % 5)); for (int32 j = 0; j < num_elements; j++) { - MatrixIndexT r = Rand() % dimM; - MatrixIndexT c = Rand() % dimN; Int32Pair tmp_pair; - tmp_pair.first = r; - tmp_pair.second = c; + // Generate a unique random index + do { + tmp_pair.first = Rand() % dimM; + tmp_pair.second = Rand() % dimN; + } while (input_index.find(tmp_pair)!=input_index.end()); + input_index.insert(tmp_pair); + + MatrixIndexT r = tmp_pair.first; + MatrixIndexT c = tmp_pair.second; + input_index_v.push_back(tmp_pair); + Real offset = -1 + (0.33 * (Rand() % 5)); M(r, c) += scale * offset; MatrixElement t = {r, c, offset}; input.push_back(t); - input_index.push_back(tmp_pair); input_value[j] = offset; } + H.AddElements(scale, input); - CuArray cu_input_index(input_index); + CuArray cu_input_index(input_index_v); H_copy.AddElements(scale, cu_input_index, input_value); delete[] input_value; - AssertEqual(H, M); - AssertEqual(H_copy, M); + KALDI_ASSERT(ApproxEqual(H, M)); + KALDI_ASSERT(ApproxEqual(H_copy, M)); } } @@ -2866,7 +2881,7 @@ static void UnitTestCuMatrixAddToElements() { } CuArray cu_elements(elements); A_copy.AddToElements(alpha, cu_elements); - AssertEqual(A_copy, A); + KALDI_ASSERT(ApproxEqual(A_copy, A)); } } diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index 247c2236565..9ce2c356881 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -8,6 +8,7 @@ // 2013-2015 Guoguo Chen // 2016-2017 Shiyin Kang // 2017 Hossein Hadian +// 2019 Yiwen Shao // See ../../COPYING for clarification regarding multiple authors // @@ -321,8 +322,10 @@ void CuMatrixBase::CopyFromMat(const MatrixBase &src, MatrixIndexT dst_pitch = stride_*sizeof(Real); MatrixIndexT src_pitch = src.Stride()*sizeof(Real); MatrixIndexT width = src.NumCols()*sizeof(Real); - CU_SAFE_CALL(cudaMemcpy2D(data_, dst_pitch, src.Data(), src_pitch, - width, src.NumRows(), cudaMemcpyHostToDevice)); + CU_SAFE_CALL(cudaMemcpy2DAsync(data_, dst_pitch, src.Data(), src_pitch, + width, src.NumRows(), cudaMemcpyHostToDevice, + cudaStreamPerThread)); + 
CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile("CuMatrixBase::CopyFromMat(from CPU)", tim); } else { @@ -412,6 +415,33 @@ template CuMatrix::CuMatrix(const MatrixBase &other, MatrixTransposeType trans); +template +void CuMatrixBase:: CopyRangeFromMatClamped(const CuMatrixBase & src, + int32_t start_range, int32_t end_range, + int32_t clamp_low, int32_t clamp_high) { + + KALDI_ASSERT(NumCols() == this->NumCols()); + KALDI_ASSERT(NumRows() == end_range-start_range); + +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + cuda_mat_copy_range_clamped(start_range, end_range, NumCols(), + src.Data(), src.Stride(), clamp_low, clamp_high, + Data(), Stride()); + } else +#endif + { + for (int32 t = start_range; t < end_range; t++) { + int32 t_clamped = t; + if (t_clamped < clamp_low) t_clamped = clamp_low; + if (t_clamped >= clamp_high) t_clamped = clamp_high; + CuSubVector dest_row=this->Row(t - start_range); + const CuSubVector src_row=src.Row(t_clamped); + dest_row.CopyFromVec(src_row); + } + } +} + template template void CuMatrixBase::CopyToMat(MatrixBase *dst, @@ -429,9 +459,10 @@ void CuMatrixBase::CopyToMat(MatrixBase *dst, MatrixIndexT src_pitch = stride_*sizeof(Real); MatrixIndexT dst_pitch = dst->Stride()*sizeof(Real); MatrixIndexT width = NumCols()*sizeof(Real); - CU_SAFE_CALL(cudaMemcpy2D(dst->Data(), dst_pitch, this->data_, src_pitch, - width, this->num_rows_, cudaMemcpyDeviceToHost)); - + CU_SAFE_CALL(cudaMemcpy2DAsync(dst->Data(), dst_pitch, this->data_, + src_pitch, width, this->num_rows_, + cudaMemcpyDeviceToHost, cudaStreamPerThread)); + CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile("CuMatrix::CopyToMatD2H", tim); } } else @@ -479,8 +510,9 @@ void CuMatrixBase::SetZero() { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; - CU_SAFE_CALL(cudaMemset2D(data_, stride_ * sizeof(Real), 0, - num_cols_ * sizeof(Real), num_rows_ )); + CU_SAFE_CALL(cudaMemset2DAsync(data_, stride_ * sizeof(Real), 0, + num_cols_ * sizeof(Real), num_rows_ , + cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile("CuMatrix::SetZero", tim); } else #endif @@ -630,27 +662,6 @@ void CuMatrixBase::Scale(Real value) { } } -template -void CuMatrixBase::ApplyLog() { - #if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - if (num_rows_ == 0) return; - CuTimer tim; - - dim3 dimGrid, dimBlock; - GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), - &dimGrid, &dimBlock); - - cuda_apply_log(dimGrid, dimBlock, data_, Dim()); - CU_SAFE_CALL(cudaGetLastError()); - - CuDevice::Instantiate().AccuProfile(__func__, tim); - } else - #endif - { - Mat().ApplyLog(); - } -} template void CuMatrixBase::MulElements(const CuMatrixBase& A) { @@ -1668,7 +1679,10 @@ void CuMatrix::CompObjfAndDeriv(const std::vector >& s return; } void *addr = CuDevice::Instantiate().Malloc(sv_labels.size() * sizeof(MatrixElement)); - CU_SAFE_CALL(cudaMemcpy(addr, sv_labels.data(), sv_labels.size() * sizeof(MatrixElement), cudaMemcpyHostToDevice)); + CU_SAFE_CALL(cudaMemcpyAsync(addr, sv_labels.data(), sv_labels.size() * + sizeof(MatrixElement), + cudaMemcpyHostToDevice, + cudaStreamPerThread)); CuTimer tim; CuVector tmp(2, kUndefined); int dimBlock(CU1DBLOCK); @@ -1700,7 +1714,7 @@ void CuMatrix::CompObjfAndDeriv(const std::vector >& s } template // Y->this, X->src -void CuMatrixBase::ApplySoftMaxPerRow(const CuMatrixBase &src) { +void CuMatrixBase::SoftMaxPerRow(const CuMatrixBase &src) { 
KALDI_ASSERT(SameDim(*this, src)); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { @@ -1723,7 +1737,7 @@ void CuMatrixBase::ApplySoftMaxPerRow(const CuMatrixBase &src) { } template // Y->this, X->src -void CuMatrixBase::ApplyLogSoftMaxPerRow(const CuMatrixBase &src) { +void CuMatrixBase::LogSoftMaxPerRow(const CuMatrixBase &src) { KALDI_ASSERT(SameDim(*this, src)); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { @@ -1962,7 +1976,7 @@ void CuMatrixBase::DiffXent(const CuArrayBase &tgt, for(int32 r = 0; r < num_rows; r++) { int32 col_tgt = tgt.Data()[r]; Real &value = Mat()(r, col_tgt); - log_post_tgt->Vec()(r) = Log(value); + log_post_tgt->Vec()(r) = kaldi::Log(value); value -= 1.0; } } @@ -2088,6 +2102,7 @@ void CuMatrixBase::Cholesky(CuMatrixBase *inv_cholesky) { // (5)(d) zero L12 and M12. this_12.SetZero(); inv_12.SetZero(); + } @@ -2243,7 +2258,9 @@ void AddMatMatBatched(const Real alpha, std::vector* > &C, host_c_array[i] = C[i]->data_; } - CU_SAFE_CALL(cudaMemcpy(device_abc_array, host_abc_array, 3*size*sizeof(Real*), cudaMemcpyHostToDevice)); + CU_SAFE_CALL(cudaMemcpyAsync(device_abc_array, host_abc_array, + 3*size*sizeof(Real*), cudaMemcpyHostToDevice, + cudaStreamPerThread)); CUBLAS_SAFE_CALL(cublas_gemmBatched(GetCublasHandle(), (transB==kTrans? CUBLAS_OP_T:CUBLAS_OP_N), @@ -2323,15 +2340,21 @@ void CuMatrixBase::CopyRowsFromVec(const VectorBase &v) { if (v.Dim() == num_rows_*num_cols_) { if (stride_ == num_cols_) { const Real* v_data = v.Data(); - cudaMemcpy(data_, v_data, sizeof(Real)*num_rows_*num_cols_, cudaMemcpyHostToDevice); + CU_SAFE_CALL(cudaMemcpyAsync(data_, v_data, + sizeof(Real)*num_rows_*num_cols_, + cudaMemcpyHostToDevice, + cudaStreamPerThread)); } else { const Real *v_data = v.Data(); for (MatrixIndexT r = 0; r < num_rows_; r++) { Real *row_data = RowData(r); - cudaMemcpy(row_data, v_data, sizeof(Real)*num_cols_, cudaMemcpyHostToDevice); + CU_SAFE_CALL(cudaMemcpyAsync(row_data, v_data, sizeof(Real)*num_cols_, + cudaMemcpyHostToDevice, + cudaStreamPerThread)); v_data += num_cols_; } } + CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread)); } else if (v.Dim() == num_cols_) { dim3 dimGrid, dimBlock; GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), @@ -2409,61 +2432,72 @@ void CuMatrixBase::CopyColFromVec(const CuVectorBase &v, } template -void CuMatrixBase::ApplyPow(Real power) { +void CuMatrixBase::Heaviside(const CuMatrixBase &src) { + KALDI_ASSERT(SameDim(*this, src)); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; dim3 dimGrid, dimBlock; GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), &dimGrid, &dimBlock); - cuda_apply_pow(dimGrid, dimBlock, data_, power, Dim()); + cuda_heaviside(dimGrid, dimBlock, this->data_, src.data_, this->Dim(), + src.Stride()); CU_SAFE_CALL(cudaGetLastError()); + CuDevice::Instantiate().AccuProfile(__func__, tim); } else -#endif + #endif { - Mat().ApplyPow(power); + Mat().Heaviside(src.Mat()); } } template -void CuMatrixBase::ApplyPowAbs(Real power, bool include_sign) { +void CuMatrixBase::Exp(const CuMatrixBase &src) { + KALDI_ASSERT(SameDim(*this, src)); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; dim3 dimGrid, dimBlock; GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), &dimGrid, &dimBlock); - cuda_apply_pow_abs(dimGrid, dimBlock, data_, power, include_sign, Dim()); + cuda_exp(dimGrid, dimBlock, this->data_, src.data_, this->Dim(), + src.Stride()); CU_SAFE_CALL(cudaGetLastError()); + 
CuDevice::Instantiate().AccuProfile(__func__, tim); } else -#endif + #endif { - Mat().ApplyPowAbs(power, include_sign); + Mat().Exp(src.Mat()); } } template -void CuMatrixBase::ApplyHeaviside() { +void CuMatrixBase::Log(const CuMatrixBase &src) { + KALDI_ASSERT(SameDim(*this, src)); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { + if (num_rows_ == 0) return; CuTimer tim; dim3 dimGrid, dimBlock; GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), &dimGrid, &dimBlock); - cuda_apply_heaviside(dimGrid, dimBlock, data_, Dim()); + + cuda_log(dimGrid, dimBlock, this->data_, src.data_, this->Dim(), + src.Stride()); CU_SAFE_CALL(cudaGetLastError()); + CuDevice::Instantiate().AccuProfile(__func__, tim); } else -#endif + #endif { - Mat().ApplyHeaviside(); + Mat().Log(src.Mat()); } } template -void CuMatrixBase::Heaviside(const CuMatrixBase &src) { +void CuMatrixBase::Pow(const CuMatrixBase &src, Real power) { KALDI_ASSERT(SameDim(*this, src)); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { @@ -2471,38 +2505,41 @@ void CuMatrixBase::Heaviside(const CuMatrixBase &src) { dim3 dimGrid, dimBlock; GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), &dimGrid, &dimBlock); - cuda_heaviside(dimGrid, dimBlock, this->data_, src.data_, this->Dim(), - src.Stride()); + cuda_pow(dimGrid, dimBlock, this->data_, src.data_, power, this->Dim(), + src.Stride()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { - Mat().Heaviside(src.Mat()); + Mat().Pow(src.Mat(), power); } } template -void CuMatrixBase::ApplyExp() { +void CuMatrixBase::PowAbs(const CuMatrixBase &src, Real power, bool include_sign) { + KALDI_ASSERT(SameDim(*this, src)); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; dim3 dimGrid, dimBlock; GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), &dimGrid, &dimBlock); - cuda_apply_exp(dimGrid, dimBlock, data_, Dim()); + cuda_pow_abs(dimGrid, dimBlock, this->data_, src.data_, power, include_sign, + this->Dim(), src.Stride()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { - Mat().ApplyExp(); + Mat().PowAbs(src.Mat(), power, include_sign); } } - + template -void CuMatrixBase::ApplyExpLimited(Real lower_limit, Real upper_limit) { +void CuMatrixBase::ExpLimited(const CuMatrixBase &src, Real lower_limit, Real upper_limit) { + KALDI_ASSERT(SameDim(*this, src)); KALDI_ASSERT(upper_limit > lower_limit); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { @@ -2510,82 +2547,72 @@ void CuMatrixBase::ApplyExpLimited(Real lower_limit, Real upper_limit) { dim3 dimGrid, dimBlock; GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), &dimGrid, &dimBlock); - cuda_apply_exp_limited(dimGrid, dimBlock, data_, Dim(), lower_limit, upper_limit); + cuda_exp_limited(dimGrid, dimBlock, this->data_, src.data_, lower_limit, upper_limit, + this->Dim(), src.Stride()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { - int32 num_rows = num_rows_, num_cols = num_cols_; - for (int32 r = 0; r < num_rows; r++) { - Real *row_data = this->RowData(r); - for (int32 c = 0; c < num_cols; c++) { - Real x = row_data[c]; - if (!(x >= lower_limit)) - x = lower_limit; - if (x > upper_limit) - x = upper_limit; - row_data[c] = Exp(x); - } - } + Mat().ExpLimited(src.Mat(), lower_limit, upper_limit); } } template -void CuMatrixBase::ApplyExpSpecial() { +void CuMatrixBase::ExpSpecial(const CuMatrixBase &src) { + 
KALDI_ASSERT(SameDim(*this, src)); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; - - const int warpSize = 32; - dim3 dimBlock(CU1DBLOCK / warpSize, warpSize); - dim3 dimGrid(n_blocks(NumRows(), dimBlock.x), - n_blocks(NumCols(), dimBlock.y)); - - cuda_apply_exp_special(dimGrid, dimBlock, Data(), Dim(), Data(), Stride()); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + cuda_exp_special(dimGrid, dimBlock, this->data_, src.data_, Dim(), src.Stride()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { - Mat().ApplyExpSpecial(); + Mat().ExpSpecial(src.Mat()); } } template -void CuMatrixBase::ApplyFloor(Real floor_val) { +void CuMatrixBase::Floor(const CuMatrixBase &src, Real floor_val) { + KALDI_ASSERT(SameDim(*this, src)); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; dim3 dimGrid, dimBlock; GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), &dimGrid, &dimBlock); - cuda_apply_floor(dimGrid, dimBlock, data_, floor_val, Dim()); + cuda_floor(dimGrid, dimBlock, data_, src.data_, floor_val, this->Dim(), src.Stride()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { - Mat().ApplyFloor(floor_val); + Mat().Floor(src.Mat(), floor_val); } } template -void CuMatrixBase::ApplyCeiling(Real ceiling_val) { +void CuMatrixBase::Ceiling(const CuMatrixBase &src, Real ceiling_val) { + KALDI_ASSERT(SameDim(*this, src)); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; dim3 dimGrid, dimBlock; GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), &dimGrid, &dimBlock); - cuda_apply_ceiling(dimGrid, dimBlock, data_, ceiling_val, Dim()); + cuda_ceiling(dimGrid, dimBlock, this->data_, src.data_, ceiling_val, this->Dim(), src.Stride()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { - Mat().ApplyCeiling(ceiling_val); + Mat().Ceiling(src.Mat(), ceiling_val); } } @@ -2597,16 +2624,19 @@ void VectorBase::CopyRowsFromMat(const CuMatrixBase &mat) { if (CuDevice::Instantiate().Enabled()) { CuTimer tim; if (mat.Stride() == mat.NumCols()) { - cudaMemcpy(data_, mat.Data(), sizeof(Real)*dim_, cudaMemcpyDeviceToHost); + CU_SAFE_CALL(cudaMemcpyAsync(data_, mat.Data(), sizeof(Real)*dim_, + cudaMemcpyDeviceToHost, cudaStreamPerThread)); } else { // we could definitely do better than the following. 
Real* vec_data = data_; for (MatrixIndexT r = 0; r < mat.NumRows(); r++) { - cudaMemcpy(vec_data, mat.RowData(r), sizeof(Real) * mat.NumCols(), - cudaMemcpyDeviceToHost); + CU_SAFE_CALL(cudaMemcpyAsync(vec_data, mat.RowData(r), + sizeof(Real) * mat.NumCols(), cudaMemcpyDeviceToHost, + cudaStreamPerThread)); vec_data += mat.NumCols(); } } + CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile("CuVectorBase::CopyRowsFromMat", tim); } else #endif @@ -3255,9 +3285,9 @@ void CuMatrixBase::AddElements(Real alpha, #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { void *addr = CuDevice::Instantiate().Malloc(input.size() * sizeof(MatrixElement)); - CU_SAFE_CALL(cudaMemcpy(addr, input.data(), - input.size() * sizeof(MatrixElement), - cudaMemcpyHostToDevice)); + CU_SAFE_CALL(cudaMemcpyAsync(addr, input.data(), + input.size() * sizeof(MatrixElement), + cudaMemcpyHostToDevice, cudaStreamPerThread)); CuTimer tim; int dimBlock(CU1DBLOCK); @@ -3287,8 +3317,9 @@ void CuMatrixBase::AddElements(Real alpha, const CuArrayBase &i if (CuDevice::Instantiate().Enabled()) { CuTimer tim; CuVector tmp_vec(indexes.Dim(), kUndefined); - CU_SAFE_CALL(cudaMemcpy(tmp_vec.Data(), input, indexes.Dim() * sizeof(Real), - cudaMemcpyHostToDevice)); + CU_SAFE_CALL(cudaMemcpyAsync(tmp_vec.Data(), input, + indexes.Dim() * sizeof(Real), + cudaMemcpyHostToDevice, cudaStreamPerThread)); int dimBlock(CU1DBLOCK); int dimGrid = n_blocks(indexes.Dim(), CU1DBLOCK); diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index 85aa4c049e7..a531ecd45b9 100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -6,6 +6,7 @@ // 2013 Xiaohui Zhang // 2013-2015 Guoguo Chen // 2017 Shiyin Kang +// 2019 Yiwen Shao // See ../../COPYING for clarification regarding multiple authors // @@ -237,6 +238,7 @@ class CuMatrixBase { void CopyFromMat(const MatrixBase &src, MatrixTransposeType trans = kNoTrans); + void CopyFromGeneralMat(const GeneralMatrix &src, MatrixTransposeType trans = kNoTrans); @@ -248,6 +250,13 @@ class CuMatrixBase { template void CopyFromTp(const CuTpMatrix &M, MatrixTransposeType trans = kNoTrans); + + // This function will copy from source rows (start_range, end_range] + // if the range is outside of the clamped region then the clamped + // row will be replicated across the out of range areas + void CopyRangeFromMatClamped(const CuMatrixBase & src, + int32_t start_range, int32_t end_range, + int32_t clamp_low, int32_t clamp_high); template void CopyFromMat(const CuMatrixBase &M, @@ -283,6 +292,48 @@ class CuMatrixBase { /// in general, there are different ways to deal with the situation when x==0.] void Heaviside(const CuMatrixBase &src); + void Exp(const CuMatrixBase &src); + + void Log(const CuMatrixBase &src); + + void Pow(const CuMatrixBase &src, Real power); + + /// Apply power to the absolute value of each element. + /// If include_sign is true, the result will be multiplied with + /// the sign of the input value. + /// If the power is negative and the input to the power is zero, + /// The output will be set zero. If include_sign is true, it will + /// multiply the result by the sign of the input. 
+ void PowAbs(const CuMatrixBase &src, Real power, bool include_sign=false); + + void Floor(const CuMatrixBase &src, Real floor_val); + + void Ceiling(const CuMatrixBase &src, Real ceiling_val); + + /// This is equivalent to running: + /// Floor(src, lower_limit); + /// Ceiling(src, upper_limit); + /// Exp(src) + void ExpLimited(const CuMatrixBase &src, Real lower_limit, Real upper_limit); + + /// For each element x of the matrix, set it to + /// (x < 0 ? exp(x) : x + 1). This function is used + /// in our RNNLM training. + void ExpSpecial(const CuMatrixBase &src); + + /// Softmax nonlinearity + /// Y = Softmax(X) : Yij = e^Xij / sum_k(e^Xik), done to each row, + /// with attention to avoiding overflow or underflow. + /// Supports in-place operation (i.e. this == &src). + void SoftMaxPerRow(const CuMatrixBase &src); + + /// LogSoftmax nonlinearity + /// Y = LogSoftmax(X) : Yij = Xij - log(sum_k(e^Xik)), done to each row, + /// with attention to avoiding overflow or underflow. + /// Supports in-place operation (i.e. this == &src). + void LogSoftMaxPerRow(const CuMatrixBase &src); + + /// Apply the function y = log(1 + exp(x)), to each element. /// Note: the derivative of this function is the sigmoid function. /// This is like a soft ReLU. @@ -384,44 +435,51 @@ class CuMatrixBase { /// The output is symmetric. void SymInvertPosDef(); - void ApplyPow(Real power); - /// Apply power to the absolute value of each element. - /// If include_sign is true, the result will be multiplied with - /// the sign of the input value. - /// If the power is negative and the input to the power is zero, - /// The output will be set zero. If include_sign is true, it will - /// multiply the result by the sign of the input. - void ApplyPowAbs(Real power, bool include_sign=false); - /// For each element, sets x = (x > 0 ? 1.0 : 0.0). - /// See also Heaviside(). - void ApplyHeaviside(); - void ApplyFloor(Real floor_val); - void ApplyCeiling(Real ceiling_val); - void ApplyExp(); - - - /// This is equivalent to running: - /// ApplyFloor(lower_limit); - /// ApplyCeiling(upper_limit); - /// ApplyExp() - void ApplyExpLimited(Real lower_limit, Real upper_limit); - - /// For each element x of the matrix, set it to - /// (x < 0 ? exp(x) : x + 1). This function is used - /// in our RNNLM training. - void ApplyExpSpecial(); - - /// Softmax nonlinearity - /// Y = Softmax(X) : Yij = e^Xij / sum_k(e^Xik), done to each row, - /// with attention to avoiding overflow or underflow. - /// Supports in-place operation (i.e. this == &src). - void ApplySoftMaxPerRow(const CuMatrixBase &src); - - /// LogSoftmax nonlinearity - /// Y = LogSoftmax(X) : Yij = Xij - log(sum_k(e^Xik)), done to each row, - /// with attention to avoiding overflow or underflow. - /// Supports in-place operation (i.e. this == &src). 
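As a side note on the "attention to avoiding overflow or underflow" mentioned in the comments above: the standard trick is to subtract the row maximum before exponentiating. The sketch below is plain CPU-side C++ showing the arithmetic the per-row kernels are expected to perform; it is an illustration, not the CUDA kernel itself, and it assumes a non-empty row.

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

// y_k = x_k - log(sum_j e^{x_j}), computed as
// (x_k - max) - log(sum_j e^{x_j - max}) so exp() never overflows.
static void LogSoftMaxRow(std::vector<double> *row) {
  double max = *std::max_element(row->begin(), row->end());
  double sum = 0.0;
  for (double x : *row) sum += std::exp(x - max);
  double log_sum = max + std::log(sum);
  for (double &x : *row) x -= log_sum;
}
```

Exponentiating the result of LogSoftMaxRow() gives the plain softmax, which is why both per-row kernels can share the same max-subtraction step.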
- void ApplyLogSoftMaxPerRow(const CuMatrixBase &src); + inline void ApplyPow(Real power) { + this -> Pow(*this, power); + }; + + + inline void ApplyPowAbs(Real power, bool include_sign=false) { + this -> PowAbs(*this, power, include_sign); + }; + + inline void ApplyHeaviside() { + this -> Heaviside(*this); + }; + + inline void ApplyFloor(Real floor_val) { + this -> Floor(*this, floor_val); + }; + + inline void ApplyCeiling(Real ceiling_val) { + this -> Ceiling(*this, ceiling_val); + }; + + inline void ApplyExp() { + this -> Exp(*this); + }; + + + inline void ApplyExpLimited(Real lower_limit, Real upper_limit) { + this -> ExpLimited(*this, lower_limit, upper_limit); + }; + + inline void ApplyExpSpecial() { + this -> ExpSpecial(*this); + }; + + inline void ApplySoftMaxPerRow() { + this -> SoftMaxPerRow(*this); + }; + + inline void ApplyLogSoftMaxPerRow() { + this -> LogSoftMaxPerRow(*this); + }; + + inline void ApplyLog() { + this -> Log(*this); + }; /// Find the id of the maximal element for each row (resizes the 'id' /// array to the appropriate size). @@ -434,7 +492,6 @@ class CuMatrixBase { /// Zeroes all elements for which col > row. void SetZeroAboveDiag(); void Scale(Real value); - void ApplyLog(); /// Multiply two matrices elementwise: C = C .* A void MulElements(const CuMatrixBase &A); diff --git a/src/cudamatrix/cu-matrixdim.h b/src/cudamatrix/cu-matrixdim.h index 74912dad6e3..248e08199a1 100644 --- a/src/cudamatrix/cu-matrixdim.h +++ b/src/cudamatrix/cu-matrixdim.h @@ -79,6 +79,14 @@ extern "C" { int32_cuda first; int32_cuda second; } Int32Pair; + + inline bool operator<(const Int32Pair &a, const Int32Pair &b) { + if (a.first < b.first) + return true; + if (a.first > b.first) + return false; + return a.second < b.second; + } } #endif diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index d4dbdf12143..756d580c7cf 100644 --- a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -162,8 +162,9 @@ void CuPackedMatrix::CopyFromPacked(const PackedMatrix &src) { if (CuDevice::Instantiate().Enabled()) { if (num_rows_ == 0) return; // Nothing to do. 
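The inline wrappers above keep the old in-place Apply* names while routing them through the new out-of-place methods, so existing callers compile unchanged. A hedged usage sketch is below; the function name is illustrative, and it assumes the new Floor(src, val) overload behaves as documented above.

```cpp
#include "cudamatrix/cu-matrix.h"

namespace kaldi {
// Both paths are expected to produce the same result after this change,
// since ApplyFloor(f) is now an inline forward to Floor(*this, f).
void FloorBothWays(const CuMatrixBase<BaseFloat> &src,
                   CuMatrix<BaseFloat> *out_of_place,
                   CuMatrix<BaseFloat> *in_place) {
  out_of_place->Resize(src.NumRows(), src.NumCols(), kUndefined);
  out_of_place->Floor(src, 0.0);  // new API: source passed explicitly
  *in_place = src;                // copy src, then floor it in place
  in_place->ApplyFloor(0.0);      // legacy spelling, still supported
}
}  // namespace kaldi
```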
CuTimer tim; - CU_SAFE_CALL(cudaMemcpy(data_, src.data_, src.SizeInBytes(), - cudaMemcpyHostToDevice)); + CU_SAFE_CALL(cudaMemcpyAsync(data_, src.data_, src.SizeInBytes(), + cudaMemcpyHostToDevice, cudaStreamPerThread)); + CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile("CuPackedMatrix::CopyFromPacked2", tim); } else #endif @@ -184,8 +185,9 @@ void CuPackedMatrix::CopyToPacked(PackedMatrix *dst) const { size_t nr = static_cast(num_rows_), num_bytes = ((nr * (nr+1)) / 2) * sizeof(Real); - CU_SAFE_CALL(cudaMemcpy(dst->data_, data_, num_bytes, - cudaMemcpyDeviceToHost)); + CU_SAFE_CALL(cudaMemcpyAsync(dst->data_, data_, num_bytes, + cudaMemcpyDeviceToHost, cudaStreamPerThread)); + CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile("CuPackedMatrix::CopyToPackedD2H", tim); } else #endif @@ -248,7 +250,8 @@ void CuPackedMatrix::SetZero() { size_t nr = static_cast(num_rows_), num_bytes = ((nr * (nr+1)) / 2) * sizeof(Real); - CU_SAFE_CALL(cudaMemset(reinterpret_cast(this->data_), 0, num_bytes)); + CU_SAFE_CALL(cudaMemsetAsync(reinterpret_cast(this->data_), 0, + num_bytes, cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile("CuPackedMatrix::SetZero", tim); } else #endif diff --git a/src/cudamatrix/cu-packed-matrix.h b/src/cudamatrix/cu-packed-matrix.h index 0131ba6c101..8ed7ed79f7b 100644 --- a/src/cudamatrix/cu-packed-matrix.h +++ b/src/cudamatrix/cu-packed-matrix.h @@ -122,8 +122,10 @@ class CuPackedMatrix { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Real value; - CU_SAFE_CALL(cudaMemcpy(&value, this->data_ + (r * (r+1)) / 2 + c, - sizeof(Real), cudaMemcpyDeviceToHost)); + CU_SAFE_CALL(cudaMemcpyAsync(&value, this->data_ + (r * (r+1)) / 2 + c, + sizeof(Real), cudaMemcpyDeviceToHost, + cudaStreamPerThread)); + CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread)); return value; } else #endif diff --git a/src/cudamatrix/cu-rand.cc b/src/cudamatrix/cu-rand.cc index 20439834a98..63d858c25e9 100644 --- a/src/cudamatrix/cu-rand.cc +++ b/src/cudamatrix/cu-rand.cc @@ -69,7 +69,8 @@ void CuRand::RandUniform(CuMatrixBase *tgt) { CuMatrix tmp(tgt->NumRows(), tgt->NumCols(), kUndefined, kStrideEqualNumCols); size_t s = static_cast(tmp.NumRows()) * static_cast(tmp.Stride()); - CURAND_SAFE_CALL(curandGenerateUniformWrap(gen_, tmp.Data(), s)); + CURAND_SAFE_CALL(curandGenerateUniformWrap( + GetCurandHandle(), tmp.Data(), s)); tgt->CopyFromMat(tmp); CuDevice::Instantiate().AccuProfile(__func__, tim); } else @@ -86,7 +87,8 @@ void CuRand::RandUniform(CuMatrix *tgt) { CuTimer tim; // Here we don't need to use 'tmp' matrix, size_t s = static_cast(tgt->NumRows()) * static_cast(tgt->Stride()); - CURAND_SAFE_CALL(curandGenerateUniformWrap(gen_, tgt->Data(), s)); + CURAND_SAFE_CALL(curandGenerateUniformWrap( + GetCurandHandle(), tgt->Data(), s)); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif @@ -100,7 +102,8 @@ void CuRand::RandUniform(CuVectorBase *tgt) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; - CURAND_SAFE_CALL(curandGenerateUniformWrap(gen_, tgt->Data(), tgt->Dim())); + CURAND_SAFE_CALL(curandGenerateUniformWrap( + GetCurandHandle(), tgt->Data(), tgt->Dim())); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif @@ -125,7 +128,8 @@ void CuRand::RandGaussian(CuMatrixBase *tgt) { MatrixIndexT num_cols_even = tgt->NumCols() + (tgt->NumCols() % 2); // + 0 or 1, CuMatrix tmp(tgt->NumRows(), num_cols_even, kUndefined, kStrideEqualNumCols); - 
CURAND_SAFE_CALL(curandGenerateNormalWrap(gen_, tmp.Data(), tmp.NumRows()*tmp.Stride())); + CURAND_SAFE_CALL(curandGenerateNormalWrap( + GetCurandHandle(), tmp.Data(), tmp.NumRows()*tmp.Stride())); tgt->CopyFromMat(tmp.ColRange(0,tgt->NumCols())); CuDevice::Instantiate().AccuProfile(__func__, tim); } else @@ -143,7 +147,8 @@ void CuRand::RandGaussian(CuMatrix *tgt) { // Here we don't need to use 'tmp' matrix, if the number of elements is even, MatrixIndexT num_elements = tgt->NumRows() * tgt->Stride(); if (0 == (num_elements % 2)) { - CURAND_SAFE_CALL(curandGenerateNormalWrap(gen_, tgt->Data(), num_elements)); + CURAND_SAFE_CALL(curandGenerateNormalWrap( + GetCurandHandle(), tgt->Data(), num_elements)); } else { // We use 'tmp' matrix with one column added, this guarantees an even // number of elements. Use the option kStrideEqualNumCols to ensure @@ -152,8 +157,8 @@ void CuRand::RandGaussian(CuMatrix *tgt) { MatrixIndexT num_cols_even = tgt->NumCols() + (tgt->NumCols() % 2); // + 0 or 1, CuMatrix tmp(tgt->NumRows(), num_cols_even, kUndefined, kStrideEqualNumCols); - CURAND_SAFE_CALL(curandGenerateNormalWrap(gen_, tmp.Data(), - tmp.NumRows() * tmp.Stride())); + CURAND_SAFE_CALL(curandGenerateNormalWrap( + GetCurandHandle(), tmp.Data(), tmp.NumRows() * tmp.Stride())); tgt->CopyFromMat(tmp.ColRange(0,tgt->NumCols())); } CuDevice::Instantiate().AccuProfile(__func__, tim); @@ -174,11 +179,13 @@ void CuRand::RandGaussian(CuVectorBase *tgt) { // curandGenerateUniform(), curandGenerateUniformDouble(). MatrixIndexT num_elements = tgt->Dim(); if (0 == (num_elements % 2)) { - CURAND_SAFE_CALL(curandGenerateNormalWrap(gen_, tgt->Data(), tgt->Dim())); + CURAND_SAFE_CALL(curandGenerateNormalWrap( + GetCurandHandle(), tgt->Data(), tgt->Dim())); } else { MatrixIndexT dim_even = tgt->Dim() + (tgt->Dim() % 2); // + 0 or 1, CuVector tmp(dim_even, kUndefined); - CURAND_SAFE_CALL(curandGenerateNormalWrap(gen_, tmp.Data(), tmp.Dim())); + CURAND_SAFE_CALL(curandGenerateNormalWrap( + GetCurandHandle(), tmp.Data(), tmp.Dim())); tgt->CopyFromVec(tmp.Range(0,tgt->Dim())); } CuDevice::Instantiate().AccuProfile(__func__, tim); diff --git a/src/cudamatrix/cu-rand.h b/src/cudamatrix/cu-rand.h index fafc747df8d..6e0be648270 100644 --- a/src/cudamatrix/cu-rand.h +++ b/src/cudamatrix/cu-rand.h @@ -20,10 +20,7 @@ #ifndef KALDI_CUDAMATRIX_CU_RAND_H_ #define KALDI_CUDAMATRIX_CU_RAND_H_ -#if HAVE_CUDA == 1 - #include -#endif - +#include "cudamatrix/cu-device.h" #include "cudamatrix/cu-matrix.h" #include "cudamatrix/cu-vector.h" #include "base/kaldi-math.h" @@ -33,36 +30,10 @@ namespace kaldi { template class CuRand { public: - CuRand() { - #if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - // Initialize the generator, - CURAND_SAFE_CALL(curandCreateGenerator(&gen_, CURAND_RNG_PSEUDO_DEFAULT)); - // To get same random sequence, call srand() before the constructor is invoked, - CURAND_SAFE_CALL(curandSetGeneratorOrdering(gen_, CURAND_ORDERING_PSEUDO_DEFAULT)); - CURAND_SAFE_CALL(curandSetPseudoRandomGeneratorSeed(gen_, RandInt(128, RAND_MAX))); - CURAND_SAFE_CALL(curandSetGeneratorOffset(gen_, 0)); - } - #endif - } - ~CuRand() { + void SeedGpu() { #if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - // Release the generator, - CURAND_SAFE_CALL(curandDestroyGenerator(gen_)); - } - #endif - } - - /// Generate new seed for the GPU, - void SeedGpu() { - #if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - // To get same random sequence, call srand() before the method is invoked, - 
CURAND_SAFE_CALL(curandSetPseudoRandomGeneratorSeed(gen_, RandInt(128, RAND_MAX))); - CURAND_SAFE_CALL(curandSetGeneratorOffset(gen_, 0)); - } + CuDevice::Instantiate().SeedGpu(); #endif } @@ -88,11 +59,6 @@ class CuRand { void BinarizeProbs(const CuMatrix &probs, CuMatrix *states); /// add gaussian noise to each element, void AddGaussNoise(CuMatrix *tgt, Real gscale = 1.0); - - private: - #if HAVE_CUDA == 1 - curandGenerator_t gen_; - #endif }; } // namsepace diff --git a/src/cudamatrix/cu-sparse-matrix-test.cc b/src/cudamatrix/cu-sparse-matrix-test.cc index 38f78c7c5e5..aad34b5dd54 100644 --- a/src/cudamatrix/cu-sparse-matrix-test.cc +++ b/src/cudamatrix/cu-sparse-matrix-test.cc @@ -173,7 +173,7 @@ static void UnitTestCuSparseMatrixSum() { Real sum1 = cu_smat.Sum(); Real sum2 = mat.Sum(); - KALDI_ASSERT(fabs(sum1 - sum2) < 1.0e-05); + KALDI_ASSERT(fabs(sum1 - sum2) < 1.0e-04); } } diff --git a/src/cudamatrix/cu-value.h b/src/cudamatrix/cu-value.h index cab0a3235d7..af8e19987ce 100644 --- a/src/cudamatrix/cu-value.h +++ b/src/cudamatrix/cu-value.h @@ -54,7 +54,9 @@ class CuValue { inline Real operator = (Real r) { // assignment from Real #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { - CU_SAFE_CALL(cudaMemcpy(data_, &r, sizeof(Real), cudaMemcpyHostToDevice)); + CU_SAFE_CALL(cudaMemcpyAsync(data_, &r, sizeof(Real), + cudaMemcpyHostToDevice, cudaStreamPerThread)); + CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread)); return r; } else #endif @@ -65,14 +67,16 @@ class CuValue { } inline Real operator += (Real r) { return (*this = r + Real(*this)); } + inline Real operator -= (Real r) { return (*this = Real(*this) - r); } inline operator Real () const { // assignment to Real #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Real value; - CU_SAFE_CALL(cudaMemcpy(&value, data_, - sizeof(Real), cudaMemcpyDeviceToHost)); + CU_SAFE_CALL(cudaMemcpyAsync(&value, data_, sizeof(Real), + cudaMemcpyDeviceToHost, cudaStreamPerThread)); + CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread)); return value; } else #endif diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index 536e55d8a3b..5ee5d578511 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -4,6 +4,7 @@ // 2012-2014 Johns Hopkins University (author: Daniel Povey) // 2017 Daniel Galvez // 2016-2018 Shiyin Kang +// 2019 Yiwen Shao // See ../../COPYING for clarification regarding multiple authors // @@ -221,18 +222,18 @@ void CuVectorBase::CopyRowsFromMat(const MatrixBase &mat) { if (dim_ == 0) return; CuTimer tim; if (mat.Stride() == mat.NumCols()) { - CU_SAFE_CALL(cudaMemcpy(data_, mat.Data(), sizeof(Real)*dim_, - cudaMemcpyHostToDevice)); + CU_SAFE_CALL(cudaMemcpyAsync(data_, mat.Data(), sizeof(Real)*dim_, + cudaMemcpyHostToDevice, cudaStreamPerThread)); } else { Real* vec_data = data_; for (MatrixIndexT r = 0; r < mat.NumRows(); r++) { - CU_SAFE_CALL(cudaMemcpy(vec_data, mat.RowData(r), + CU_SAFE_CALL(cudaMemcpyAsync(vec_data, mat.RowData(r), sizeof(Real) * mat.NumCols(), - cudaMemcpyHostToDevice)); + cudaMemcpyHostToDevice, cudaStreamPerThread)); vec_data += mat.NumCols(); } } - CU_SAFE_CALL(cudaGetLastError()); + CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif @@ -249,18 +250,21 @@ void MatrixBase::CopyRowsFromVec(const CuVectorBase &v) { if (num_rows_ == 0) return; CuTimer tim; if (Stride() == NumCols()) { - CU_SAFE_CALL(cudaMemcpy(data_, v.Data(), + CU_SAFE_CALL(cudaMemcpyAsync(data_, 
v.Data(), sizeof(Real)*v.Dim(), - cudaMemcpyDeviceToHost)); + cudaMemcpyDeviceToHost, + cudaStreamPerThread)); } else { const Real* vec_data = v.Data(); for (MatrixIndexT r = 0; r < NumRows(); r++) { - CU_SAFE_CALL(cudaMemcpy(RowData(r), vec_data, + CU_SAFE_CALL(cudaMemcpyAsync(RowData(r), vec_data, sizeof(Real) * NumCols(), - cudaMemcpyDeviceToHost)); + cudaMemcpyDeviceToHost, + cudaStreamPerThread)); vec_data += NumCols(); } } + CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif @@ -346,7 +350,7 @@ void CuVectorBase::ApplySoftMax() { } template -void CuVectorBase::ApplyFloor(Real floor_val, MatrixIndexT *floored_count) { +void CuVectorBase::Floor(const CuVectorBase &src, Real floor_val, MatrixIndexT *floored_count) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { int dimBlock(CU1DBLOCK); @@ -357,8 +361,8 @@ void CuVectorBase::ApplyFloor(Real floor_val, MatrixIndexT *floored_count) // We are calling a function meant for matrices, by viewing the // vector as a matrix with a single row. ::MatrixDim dim = {1, Dim(), 1}; - cuda_apply_floor(dimGrid, dimBlock, data_, floor_val, dim); - CuDevice::Instantiate().AccuProfile("CuVectorBase::ApplyFloorNoCount", tim); + cuda_floor(dimGrid, dimBlock, this->data_, src.Data(), floor_val, dim, 1); + CuDevice::Instantiate().AccuProfile("CuVectorBase::FloorNoCount", tim); } else { if (dim_ == 0) { *floored_count = 0; return; } CuTimer tim; @@ -368,17 +372,18 @@ void CuVectorBase::ApplyFloor(Real floor_val, MatrixIndexT *floored_count) cuda_vec_apply_floor(dimGrid, dimBlock, data_, floor_val, count_vec.Data(), dim_); CU_SAFE_CALL(cudaGetLastError()); *floored_count = count_vec.Sum(); - CuDevice::Instantiate().AccuProfile("CuVectorBase::ApplyFloor", tim); + CuDevice::Instantiate().AccuProfile("CuVectorBase::Floor", tim); } } else #endif { - Vec().ApplyFloor(floor_val, floored_count); + Vec().Floor(src.Vec(), floor_val, floored_count); } } template -void CuVectorBase::ApplyCeiling(Real ceiling_val, MatrixIndexT *ceiled_count) { +void CuVectorBase::Ceiling(const CuVectorBase &src, Real ceiling_val, + MatrixIndexT *ceiled_count) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { int dimBlock(CU1DBLOCK); @@ -389,9 +394,9 @@ void CuVectorBase::ApplyCeiling(Real ceiling_val, MatrixIndexT *ceiled_cou // We are calling a function meant for matrices, by viewing the // vector as a matrix with a single row. 
::MatrixDim dim = {1, Dim(), 1}; - cuda_apply_ceiling(dimGrid, dimBlock, data_, ceiling_val, dim); + cuda_ceiling(dimGrid, dimBlock, this->data_, src.Data(), ceiling_val, dim, 1); - CuDevice::Instantiate().AccuProfile("CuVectorBase::ApplyCeilingNoCount", tim); + CuDevice::Instantiate().AccuProfile("CuVectorBase::CeilingNoCount", tim); } else { if (dim_ == 0) { *ceiled_count = 0; return; } CuTimer tim; @@ -401,17 +406,17 @@ void CuVectorBase::ApplyCeiling(Real ceiling_val, MatrixIndexT *ceiled_cou cuda_vec_apply_ceiling(dimGrid, dimBlock, data_, ceiling_val, count_vec.Data(), dim_); CU_SAFE_CALL(cudaGetLastError()); *ceiled_count = count_vec.Sum(); - CuDevice::Instantiate().AccuProfile("CuVectorBase::ApplyCeiling", tim); + CuDevice::Instantiate().AccuProfile("CuVectorBase::Ceiling", tim); } } else #endif { - Vec().ApplyCeiling(ceiling_val, ceiled_count); + Vec().Ceiling(src.Vec(), ceiling_val, ceiled_count); } } template -void CuVectorBase::ApplyPow(Real power) { +void CuVectorBase::Pow(const CuVectorBase &src, Real power) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (dim_ == 0) return; @@ -422,13 +427,13 @@ void CuVectorBase::ApplyPow(Real power) { dim3 dimGrid(n_blocks(Dim(), CU1DBLOCK), 1); ::MatrixDim fake_matrix_dim = { 1, Dim(), 1 }; // num_cols is Dim(), num_rows is 1, stride is 1 (it's a don't-care). - cuda_apply_pow(dimGrid, dimBlock, data_, power, fake_matrix_dim); + cuda_pow(dimGrid, dimBlock, this->data_, src.Data(), power, fake_matrix_dim, 1); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile("CuVectorBase::ApplyPow", tim); } else #endif { - Vec().ApplyPow(power); + Vec().Pow(src.Vec(), power); } } @@ -475,6 +480,27 @@ void CuVectorBase::ApplyLog() { } } +template +void CuVectorBase::ApplyLogSoftMax() { +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + if (dim_ == 0) return; + CuTimer tim; + size_t dimBlock = CU1DBLOCK; + size_t dimGrid = 1; // dimGrid value represent the number of rows + ::MatrixDim dim = { 1, this->dim_, this->dim_}; + + cuda_log_softmax_reduce(dimGrid, dimBlock, data_, data_, dim, this->dim_); + CU_SAFE_CALL(cudaGetLastError()); + CuDevice::Instantiate().AccuProfile(__func__, tim); + } else +#endif + { + Vec().ApplyLogSoftMax(); + } +} + + template void CuVectorBase::AddMatVec(const Real alpha, @@ -886,7 +912,9 @@ void CuVectorBase::CopyFromVec(const VectorBase &src) { KALDI_ASSERT(src.Dim() == dim_); if (dim_ == 0) return; CuTimer tim; - CU_SAFE_CALL(cudaMemcpy(data_, src.Data(), src.Dim()*sizeof(Real), cudaMemcpyHostToDevice)); + CU_SAFE_CALL(cudaMemcpyAsync(data_, src.Data(), src.Dim()*sizeof(Real), + cudaMemcpyHostToDevice, cudaStreamPerThread)); + CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile("CuVector::CopyFromVecH2D", tim); } } else @@ -917,8 +945,10 @@ void CuVectorBase::CopyToVec(VectorBase *dst) const { } else { if (dim_ == 0) return; CuTimer tim; - CU_SAFE_CALL(cudaMemcpy(dst->Data(), this->data_, - sizeof(Real) * dim_, cudaMemcpyDeviceToHost)); + CU_SAFE_CALL(cudaMemcpyAsync(dst->Data(), this->data_, + sizeof(Real) * dim_, cudaMemcpyDeviceToHost, + cudaStreamPerThread)); + CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile(__func__, tim); } } else @@ -1072,7 +1102,8 @@ void CuVectorBase::SetZero() { KALDI_ASSERT(dim_>=0); KALDI_ASSERT(data_!=NULL); CuTimer tim; - CU_SAFE_CALL(cudaMemset(data_, 0, dim_*sizeof(Real))); + CU_SAFE_CALL(cudaMemsetAsync(data_, 0, dim_*sizeof(Real), + cudaStreamPerThread)); 
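The memory-transfer hunks above (and the earlier ones in cu-matrix.cc, cu-packed-matrix.cc and cu-value.h) all switch from plain cudaMemcpy/cudaMemset to the Async variants on cudaStreamPerThread, followed by an explicit cudaStreamSynchronize wherever the host needs the data, which avoids the legacy default stream's device-wide implicit synchronization. A standalone sketch of that pattern follows, using bare CUDA runtime calls without Kaldi's CU_SAFE_CALL error checking.

```cpp
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

int main() {
  const size_t n = 1024;
  std::vector<float> host(n, 1.0f);
  float *device = nullptr;
  cudaMalloc(&device, n * sizeof(float));

  // Issue the copy on the per-thread default stream...
  cudaMemcpyAsync(device, host.data(), n * sizeof(float),
                  cudaMemcpyHostToDevice, cudaStreamPerThread);
  // ...and wait only for that stream before reusing the host buffer.
  cudaStreamSynchronize(cudaStreamPerThread);

  cudaFree(device);
  std::printf("copied %zu floats\n", n);
  return 0;
}
```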
CuDevice::Instantiate().AccuProfile("CuVector::SetZero", tim); } else #endif @@ -1248,10 +1279,19 @@ void CuVectorBase::AddRowSumMat(Real alpha, const CuMatrixBase &mat, KALDI_ASSERT(mat.NumCols() == Dim()); if (Dim() == 0) return; - CuVector ones(mat.NumRows()); - ones.Set(1.0); - this->AddMatVec(alpha, mat, kTrans, ones, beta); - +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + CuTimer tim; + cuda_add_row_sum_mat(mat.NumCols(), CU1DBLOCK, Data(), mat.Data(), + mat.Dim(), alpha, beta); + CU_SAFE_CALL(cudaGetLastError()); + + CuDevice::Instantiate().AccuProfile(__func__, tim); + } else +#endif + { + Vec().AddRowSumMat(alpha, mat.Mat(), beta); + } } template diff --git a/src/cudamatrix/cu-vector.h b/src/cudamatrix/cu-vector.h index 69ca2ae3125..9c532b52f39 100644 --- a/src/cudamatrix/cu-vector.h +++ b/src/cudamatrix/cu-vector.h @@ -6,6 +6,7 @@ // 2013 Xiaohui Zhang // 2015 Guoguo Chen // 2017 Daniel Galvez +// 2019 Yiwen Shao // See ../../COPYING for clarification regarding multiple authors // @@ -131,12 +132,26 @@ class CuVectorBase { const MatrixTransposeType trans, const CuArrayBase &elements); + void Floor(const CuVectorBase &src, Real floor_val, MatrixIndexT *floored_count = NULL); + void Ceiling(const CuVectorBase &src, Real ceiling_val, MatrixIndexT *ceiled_count = NULL); + void Pow(const CuVectorBase &src, Real power); + + inline void ApplyFloor(Real floor_val, MatrixIndexT *floored_count = NULL) { + this -> Floor(*this, floor_val, floored_count); + }; + + inline void ApplyCeiling(Real ceiling_val, MatrixIndexT *ceiled_count = NULL) { + this -> Ceiling(*this, ceiling_val, ceiled_count); + }; + + inline void ApplyPow(Real power) { + this -> Pow(*this, power); + }; + void ApplySoftMax(); + void ApplyLogSoftMax(); void ApplyExp(); void ApplyLog(); - void ApplyFloor(Real floor_val, MatrixIndexT *floored_count = NULL); - void ApplyCeiling(Real ceiling_val, MatrixIndexT *ceiled_count = NULL); - void ApplyPow(Real power); Real Sum() const; void SetRandn(); diff --git a/src/decoder/biglm-faster-decoder.h b/src/decoder/biglm-faster-decoder.h index a6b99fba95e..8e36deb8bb6 100644 --- a/src/decoder/biglm-faster-decoder.h +++ b/src/decoder/biglm-faster-decoder.h @@ -397,13 +397,11 @@ class BiglmFasterDecoder { if (new_weight < next_weight_cutoff) { // not pruned.. PairId next_pair = ConstructPair(arc.nextstate, next_lm_state); Token *new_tok = new Token(arc, ac_weight, tok); - Elem *e_found = toks_.Find(next_pair); + Elem *e_found = toks_.Insert(next_pair, new_tok); if (new_weight + adaptive_beam < next_weight_cutoff) next_weight_cutoff = new_weight + adaptive_beam; - if (e_found == NULL) { - toks_.Insert(next_pair, new_tok); - } else { - if ( *(e_found->val) < *new_tok ) { + if (e_found->val != new_tok) { + if (*(e_found->val) < *new_tok) { Token::TokenDelete(e_found->val); e_found->val = new_tok; } else { @@ -426,11 +424,12 @@ class BiglmFasterDecoder { // Processes nonemitting arcs for one frame. KALDI_ASSERT(queue_.empty()); for (const Elem *e = toks_.GetList(); e != NULL; e = e->tail) - queue_.push_back(e->key); + queue_.push_back(e); while (!queue_.empty()) { - PairId state_pair = queue_.back(); + const Elem *e = queue_.back(); queue_.pop_back(); - Token *tok = toks_.Find(state_pair)->val; // would segfault if state not + PairId state_pair = e->key; + Token *tok = e->val; // would segfault if state not // in toks_ but this can't happen. if (tok->weight_.Value() > cutoff) { // Don't bother processing successors. 
continue; @@ -450,15 +449,14 @@ class BiglmFasterDecoder { if (new_tok->weight_.Value() > cutoff) { // prune Token::TokenDelete(new_tok); } else { - Elem *e_found = toks_.Find(next_pair); - if (e_found == NULL) { - toks_.Insert(next_pair, new_tok); - queue_.push_back(next_pair); + Elem *e_found = toks_.Insert(next_pair, new_tok); + if (e_found->val == new_tok) { + queue_.push_back(e_found); } else { if ( *(e_found->val) < *new_tok ) { Token::TokenDelete(e_found->val); e_found->val = new_tok; - queue_.push_back(next_pair); + queue_.push_back(e_found); } else { Token::TokenDelete(new_tok); } @@ -477,7 +475,7 @@ class BiglmFasterDecoder { fst::DeterministicOnDemandFst *lm_diff_fst_; BiglmFasterDecoderOptions opts_; bool warned_noarc_; - std::vector queue_; // temp variable used in ProcessNonemitting, + std::vector queue_; // temp variable used in ProcessNonemitting, std::vector tmp_array_; // used in GetCutoff. // make it class member to avoid internal new/delete. diff --git a/src/decoder/faster-decoder.cc b/src/decoder/faster-decoder.cc index 105289eb6d7..84b3424f119 100644 --- a/src/decoder/faster-decoder.cc +++ b/src/decoder/faster-decoder.cc @@ -277,13 +277,11 @@ double FasterDecoder::ProcessEmitting(DecodableInterface *decodable) { double new_weight = arc.weight.Value() + tok->cost_ + ac_cost; if (new_weight < next_weight_cutoff) { // not pruned.. Token *new_tok = new Token(arc, ac_cost, tok); - Elem *e_found = toks_.Find(arc.nextstate); + Elem *e_found = toks_.Insert(arc.nextstate, new_tok); if (new_weight + adaptive_beam < next_weight_cutoff) next_weight_cutoff = new_weight + adaptive_beam; - if (e_found == NULL) { - toks_.Insert(arc.nextstate, new_tok); - } else { - if ( *(e_found->val) < *new_tok ) { + if (e_found->val != new_tok) { + if (*(e_found->val) < *new_tok) { Token::TokenDelete(e_found->val); e_found->val = new_tok; } else { @@ -307,11 +305,12 @@ void FasterDecoder::ProcessNonemitting(double cutoff) { // Processes nonemitting arcs for one frame. KALDI_ASSERT(queue_.empty()); for (const Elem *e = toks_.GetList(); e != NULL; e = e->tail) - queue_.push_back(e->key); + queue_.push_back(e); while (!queue_.empty()) { - StateId state = queue_.back(); + const Elem* e = queue_.back(); queue_.pop_back(); - Token *tok = toks_.Find(state)->val; // would segfault if state not + StateId state = e->key; + Token *tok = e->val; // would segfault if state not // in toks_ but this can't happen. if (tok->cost_ > cutoff) { // Don't bother processing successors. 
continue; @@ -326,15 +325,14 @@ void FasterDecoder::ProcessNonemitting(double cutoff) { if (new_tok->cost_ > cutoff) { // prune Token::TokenDelete(new_tok); } else { - Elem *e_found = toks_.Find(arc.nextstate); - if (e_found == NULL) { - toks_.Insert(arc.nextstate, new_tok); - queue_.push_back(arc.nextstate); + Elem *e_found = toks_.Insert(arc.nextstate, new_tok); + if (e_found->val == new_tok) { + queue_.push_back(e_found); } else { - if ( *(e_found->val) < *new_tok ) { + if (*(e_found->val) < *new_tok) { Token::TokenDelete(e_found->val); e_found->val = new_tok; - queue_.push_back(arc.nextstate); + queue_.push_back(e_found); } else { Token::TokenDelete(new_tok); } diff --git a/src/decoder/faster-decoder.h b/src/decoder/faster-decoder.h index baedcc022b6..db03569614f 100644 --- a/src/decoder/faster-decoder.h +++ b/src/decoder/faster-decoder.h @@ -170,7 +170,7 @@ class FasterDecoder { HashList toks_; const fst::Fst &fst_; FasterDecoderOptions config_; - std::vector queue_; // temp variable used in ProcessNonemitting, + std::vector queue_; // temp variable used in ProcessNonemitting, std::vector tmp_array_; // used in GetCutoff. // make it class member to avoid internal new/delete. diff --git a/src/decoder/grammar-fst.cc b/src/decoder/grammar-fst.cc index 6f95993d078..1b79e7b5521 100644 --- a/src/decoder/grammar-fst.cc +++ b/src/decoder/grammar-fst.cc @@ -25,10 +25,10 @@ namespace fst { GrammarFst::GrammarFst( int32 nonterm_phones_offset, - const ConstFst &top_fst, - const std::vector *> > &ifsts): + std::shared_ptr > top_fst, + const std::vector > > > &ifsts): nonterm_phones_offset_(nonterm_phones_offset), - top_fst_(&top_fst), + top_fst_(top_fst), ifsts_(ifsts) { Init(); } @@ -69,11 +69,6 @@ void GrammarFst::Destroy() { nonterminal_map_.clear(); entry_arcs_.clear(); instances_.clear(); - // the following will only do something if we read this object from disk using - // its Read() function. - for (size_t i = 0; i < fsts_to_delete_.size(); i++) - delete fsts_to_delete_[i]; - fsts_to_delete_.clear(); } @@ -127,7 +122,7 @@ void GrammarFst::InitInstances() { KALDI_ASSERT(instances_.empty()); instances_.resize(1); instances_[0].ifst_index = -1; - instances_[0].fst = top_fst_; + instances_[0].fst = top_fst_.get(); instances_[0].parent_instance = -1; instances_[0].parent_state = -1; } @@ -314,7 +309,7 @@ int32 GrammarFst::GetChildInstanceId(int32 instance_id, int32 nonterminal, } int32 ifst_index = iter->second; child_instance.ifst_index = ifst_index; - child_instance.fst = ifsts_[ifst_index].second; + child_instance.fst = ifsts_[ifst_index].second.get(); child_instance.parent_instance = instance_id; child_instance.parent_state = state; InitEntryOrReentryArcs(*(parent_instance.fst), state, @@ -429,20 +424,111 @@ void GrammarFst::Read(std::istream &is, bool binary) { "update your code."; ReadBasicType(is, binary, &num_ifsts); ReadBasicType(is, binary, &nonterm_phones_offset_); - top_fst_ = ReadConstFstFromStream(is); - fsts_to_delete_.push_back(top_fst_); + top_fst_ = std::shared_ptr >(ReadConstFstFromStream(is)); for (int32 i = 0; i < num_ifsts; i++) { int32 nonterminal; ReadBasicType(is, binary, &nonterminal); - ConstFst *this_fst = ReadConstFstFromStream(is); - fsts_to_delete_.push_back(this_fst); - ifsts_.push_back(std::pair* >(nonterminal, - this_fst)); + std::shared_ptr > + this_fst(ReadConstFstFromStream(is)); + ifsts_.push_back(std::pair > >( + nonterminal, this_fst)); } Init(); } +/** + This utility function input-determinizes a specified state s of the FST + 'fst'. 
(This input-determinizes while treating epsilon as a real symbol, + although for the application we expect to use it, there won't be epsilons). + + What this function does is: for any symbol i that appears as the ilabel of + more than one arc leaving state s of FST 'fst', it creates an additional + state, it creates a new state t with epsilon-input transitions leaving it for + each of those multiple arcs leaving state s; it deletes the original arcs + leaving state s; and it creates a single arc leaving state s to the newly + created state with the ilabel i on it. It sets the weights as necessary to + preserve equivalence and also to ensure that if, prior to this modification, + the FST was stochastic when cast to the log semiring (see + IsStochasticInLog()), it still will be. I.e. when interpreted as + negative logprobs, the weight from state s to t would be the sum of + the weights on the original arcs leaving state s. + + This is used as a very cheap solution when preparing FSTs for the grammar + decoder, to ensure that there is only one entry-state to the sub-FST for each + phonetic left-context; this keeps the grammar-FST code (i.e. the code that + stitches them together) simple. Of course it will tend to introduce + unnecessary epsilons, and if we were careful we might be able to remove + some of those, but this wouldn't have a substantial impact on overall + decoder performance so we don't bother. + */ +static void InputDeterminizeSingleState(StdArc::StateId s, + VectorFst *fst) { + bool was_input_deterministic = true; + typedef StdArc Arc; + typedef Arc::StateId StateId; + typedef Arc::Label Label; + typedef Arc::Weight Weight; + + struct InfoForIlabel { + std::vector arc_indexes; // indexes of all arcs with this ilabel + float tot_cost; // total cost of all arcs leaving state s for this + // ilabel, summed as if they were negative log-probs. + StateId new_state; // state-id of new state, if any, that we have created + // to remove duplicate symbols with this ilabel. + InfoForIlabel(): new_state(-1) { } + }; + + std::unordered_map label_map; + + size_t arc_index = 0; + for (ArcIterator > aiter(*fst, s); + !aiter.Done(); aiter.Next(), ++arc_index) { + const Arc &arc = aiter.Value(); + InfoForIlabel &info = label_map[arc.ilabel]; + if (info.arc_indexes.empty()) { + info.tot_cost = arc.weight.Value(); + } else { + info.tot_cost = -kaldi::LogAdd(-info.tot_cost, -arc.weight.Value()); + was_input_deterministic = false; + } + info.arc_indexes.push_back(arc_index); + } + + if (was_input_deterministic) + return; // Nothing to do. + + // 'new_arcs' will contain the modified list of arcs + // leaving state s + std::vector new_arcs; + new_arcs.reserve(arc_index); + arc_index = 0; + for (ArcIterator > aiter(*fst, s); + !aiter.Done(); aiter.Next(), ++arc_index) { + const Arc &arc = aiter.Value(); + Label ilabel = arc.ilabel; + InfoForIlabel &info = label_map[ilabel]; + if (info.arc_indexes.size() == 1) { + new_arcs.push_back(arc); // no changes needed + } else { + if (info.new_state < 0) { + info.new_state = fst->AddState(); + // add arc from state 's' to newly created state. + new_arcs.push_back(Arc(ilabel, 0, Weight(info.tot_cost), + info.new_state)); + } + // add arc from new state to original destination of this arc. 
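+      // (Spelling out the weight bookkeeping: the single entry arc from s
+      // carries cost c = -log(sum_i exp(-w_i)) over the original arcs' costs
+      // w_i, and each epsilon arc carries w_i - c, so every two-arc path
+      // costs c + (w_i - c) = w_i, exactly the original arc's cost; the
+      // epsilon arcs' probabilities also sum to one, which is what preserves
+      // stochasticity in the log semiring.)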
+ fst->AddArc(info.new_state, Arc(0, arc.olabel, + Weight(arc.weight.Value() - info.tot_cost), + arc.nextstate)); + } + } + fst->DeleteArcs(s); + for (size_t i = 0; i < new_arcs.size(); i++) + fst->AddArc(s, new_arcs[i]); +} + + // This class contains the implementation of the function // PrepareForGrammarFst(), which is declared in grammar-fst.h. class GrammarFstPreparer { @@ -475,6 +561,12 @@ class GrammarFstPreparer { // OK, state s is a special state. FixArcsToFinalStates(s); MaybeAddFinalProbToState(s); + // The following ensures that the start-state of sub-FSTs only has + // a single arc per left-context phone (the graph-building recipe can + // end up creating more than one if there were disambiguation symbols, + // e.g. for langauge model backoff). + if (s == fst_->Start() && IsEntryState(s)) + InputDeterminizeSingleState(s, fst_); } } } @@ -487,7 +579,7 @@ class GrammarFstPreparer { // Returns true if state 's' has at least one arc coming out of it with a // special nonterminal-related ilabel on it (i.e. an ilabel >= - // kNontermBigNumber) + // kNontermBigNumber), and false otherwise. bool IsSpecialState(StateId s) const; // This function verifies that state s does not currently have any @@ -509,6 +601,10 @@ class GrammarFstPreparer { // modify this state (by adding input-epsilon arcs), and false otherwise. bool NeedEpsilons(StateId s) const; + // Returns true if state s (which is expected to be the start state, although we + // don't check this) has arcs with nonterminal symbols #nonterm_begin. + bool IsEntryState(StateId s) const; + // Fixes any final-prob-related problems with this state. The problem we aim // to fix is that there may be arcs with nonterminal symbol #nonterm_end which // transition from this state to a state with non-unit final prob. This @@ -599,6 +695,24 @@ bool GrammarFstPreparer::IsSpecialState(StateId s) const { return false; } +bool GrammarFstPreparer::IsEntryState(StateId s) const { + int32 big_number = kNontermBigNumber, + encoding_multiple = GetEncodingMultiple(nonterm_phones_offset_); + + for (ArcIterator aiter(*fst_, s ); !aiter.Done(); aiter.Next()) { + const Arc &arc = aiter.Value(); + int32 nonterminal = (arc.ilabel - big_number) / + encoding_multiple; + // we check that at least one has label with nonterminal equal to #nonterm_begin... + // in fact they will all have this value if at least one does, and this was checked + // in NeedEpsilons(). + if (nonterminal == GetPhoneSymbolFor(kNontermBegin)) + return true; + } + return false; +} + + bool GrammarFstPreparer::NeedEpsilons(StateId s) const { // See the documentation for GetCategoryOfArc() for explanation of what these are. @@ -647,7 +761,7 @@ bool GrammarFstPreparer::NeedEpsilons(StateId s) const { if (nonterminal == GetPhoneSymbolFor(kNontermBegin) && s != fst_->Start()) { KALDI_ERR << "#nonterm_begin symbol is present but this is not the " - "first arc. Did you do fstdeterminizestar while compiling?"; + "first state. Did you do fstdeterminizestar while compiling?"; } if (nonterminal == GetPhoneSymbolFor(kNontermEnd)) { if (fst_->NumArcs(arc.nextstate) != 0 || diff --git a/src/decoder/grammar-fst.h b/src/decoder/grammar-fst.h index f66933c132d..cfbfcad4ec6 100644 --- a/src/decoder/grammar-fst.h +++ b/src/decoder/grammar-fst.h @@ -88,9 +88,11 @@ template<> class ArcIterator; points whenever we invoke a nonterminal. For more information see \ref grammar (i.e. ../doc/grammar.dox). - Caution: this class is not thread safe, i.e. you shouldn't access the same - GrammarFst from multiple threads. 
We can fix this later if needed. - */ + THREAD SAFETY: you can't use this object from multiple threads; you should + create lightweight copies of this object using the copy constructor, + e.g. `new GrammarFst(this_grammar_fst)`, if you want to decode from multiple + threads using the same GrammarFst. +*/ class GrammarFst { public: typedef GrammarFstArc Arc; @@ -136,16 +138,20 @@ class GrammarFst { phones.txt, i.e. the things with names like "#nonterm:foo" and "#nonterm:bar" in phones.txt. Also no nonterminal may appear more than once in 'fsts'. ifsts may be empty, even though that doesn't - make much sense. This function does not take ownership of - these pointers (i.e. it will not delete them when it is destroyed). + make much sense. */ GrammarFst( int32 nonterm_phones_offset, - const ConstFst &top_fst, - const std::vector *> > &ifsts); + std::shared_ptr > top_fst, + const std::vector > > > &ifsts); + + /// Copy constructor. Useful because this object is not thread safe so cannot + /// be used by multiple parallel decoder threads, but it is lightweight and + /// can copy it without causing the stored FSTs to be copied. + GrammarFst(const GrammarFst &other) = default; /// This constructor should only be used prior to calling Read(). - GrammarFst(): top_fst_(NULL) { } + GrammarFst() { } // This Write function allows you to dump a GrammarFst to disk as a single // object. It only supports binary mode, but the option is allowed for @@ -229,14 +235,15 @@ class GrammarFst { an arc-index leaving a particular state in an FST (i.e. an index that we could use to Seek() to the matching arc). - @param [in] fst The FST we are looking for state-indexes for - @param [in] entry_state The state in the FST-- must have arcs with - ilabels decodable as (nonterminal_symbol, left_context_phone). - Will either be the start state (if 'nonterminal_symbol' - corresponds to #nonterm_begin), or an internal state - (if 'nonterminal_symbol' corresponds to #nonterm_reenter). - The arc-indexes of those arcs will be the values - we set in 'phone_to_arc' + @param [in] fst The FST that is being entered (or reentered) + @param [in] entry_state The state in 'fst' which is being entered + (or reentered); will be fst.Start() if it's being + entered. It must have arcs with ilabels decodable as + (nonterminal_symbol, left_context_phone). Will either be the + start state (if 'nonterminal_symbol' corresponds to + #nonterm_begin), or an internal state (if 'nonterminal_symbol' + corresponds to #nonterm_reenter). The arc-indexes of those + arcs will be the values we set in 'phone_to_arc' @param [in] nonterminal_symbol The index in phones.txt of the nonterminal symbol we expect to be encoded in the ilabels of the arcs leaving 'entry_state'. Will either correspond @@ -448,12 +455,12 @@ class GrammarFst { // The top-level FST passed in by the user; contains the start state and // final-states, and may invoke FSTs in 'ifsts_' (which can also invoke // each other recursively). - const ConstFst *top_fst_; + std::shared_ptr > top_fst_; // A list of pairs (nonterm, fst), where 'nonterm' is a user-defined // nonterminal symbol as numbered in phones.txt (e.g. #nonterm:foo), and // 'fst' is the corresponding FST. - std::vector *> > ifsts_; + std::vector > > > ifsts_; // Maps from the user-defined nonterminals like #nonterm:foo as numbered // in phones.txt, to the corresponding index into 'ifsts_', i.e. the ifst_index. @@ -473,11 +480,6 @@ class GrammarFst { // representing top_fst_, and it will be populated with more elements on // demand. 
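// Editor's sketch (not part of this header): how the shared_ptr-based
// constructor and the defaulted copy constructor above are intended to be
// used for multi-threaded decoding.  The template arguments, which the
// rendering of this diff strips down to "std::shared_ptr >", are assumed to
// be const ConstFst<StdArc>; DecodeOneThread() is a hypothetical worker
// function standing in for whatever your decoding loop does.
#include <memory>
#include <utility>
#include <vector>
#include "base/kaldi-common.h"
#include "decoder/grammar-fst.h"

void DecodeOneThread(fst::GrammarFst grammar);  // each thread takes its own copy

void LaunchDecoding(
    kaldi::int32 nonterm_phones_offset,
    std::shared_ptr<const fst::ConstFst<fst::StdArc> > top_fst,
    const std::vector<std::pair<kaldi::int32,
        std::shared_ptr<const fst::ConstFst<fst::StdArc> > > > &ifsts) {
  // The GrammarFst only shares the ConstFsts; nothing is deep-copied or owned
  // exclusively, so constructing it is cheap.
  fst::GrammarFst grammar(nonterm_phones_offset, top_fst, ifsts);
  // Copying is also cheap, so each decoding thread gets its own GrammarFst
  // instead of sharing one non-thread-safe object.
  DecodeOneThread(fst::GrammarFst(grammar));
}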
An instance_id refers to an index into this vector. std::vector instances_; - - // A list of FSTs that are to be deleted when this object is destroyed. This - // will only be nonempty if we have read this object from the disk using - // Read(). - std::vector *> fsts_to_delete_; }; diff --git a/src/decoder/lattice-biglm-faster-decoder.h b/src/decoder/lattice-biglm-faster-decoder.h index 6276c25a83d..9ea53a95836 100644 --- a/src/decoder/lattice-biglm-faster-decoder.h +++ b/src/decoder/lattice-biglm-faster-decoder.h @@ -312,14 +312,14 @@ class LatticeBiglmFasterDecoder { // for the current frame. [note: it's inserted if necessary into hash toks_ // and also into the singly linked list of tokens active on this frame // (whose head is at active_toks_[frame]). - inline Token *FindOrAddToken(PairId state_pair, int32 frame, BaseFloat tot_cost, - bool emitting, bool *changed) { + inline Elem *FindOrAddToken(PairId state_pair, int32 frame, + BaseFloat tot_cost, bool emitting, bool *changed) { // Returns the Token pointer. Sets "changed" (if non-NULL) to true // if the token was newly created or the cost changed. KALDI_ASSERT(frame < active_toks_.size()); Token *&toks = active_toks_[frame].toks; - Elem *e_found = toks_.Find(state_pair); - if (e_found == NULL) { // no such token presently. + Elem *e_found = toks_.Insert(state_pair, NULL); + if (e_found->val == NULL) { // no such token presently. const BaseFloat extra_cost = 0.0; // tokens on the currently final frame have zero extra_cost // as any of them could end up @@ -328,9 +328,9 @@ class LatticeBiglmFasterDecoder { // NULL: no forward links yet toks = new_tok; num_toks_++; - toks_.Insert(state_pair, new_tok); + e_found->val = new_tok; if (changed) *changed = true; - return new_tok; + return e_found; } else { Token *tok = e_found->val; // There is an existing Token for this state. if (tok->tot_cost > tot_cost) { // replace old token @@ -346,7 +346,7 @@ class LatticeBiglmFasterDecoder { } else { if (changed) *changed = false; } - return tok; + return e_found; } } @@ -744,11 +744,11 @@ class LatticeBiglmFasterDecoder { else if (tot_cost + config_.beam < next_cutoff) next_cutoff = tot_cost + config_.beam; // prune by best current token PairId next_pair = ConstructPair(arc.nextstate, next_lm_state); - Token *next_tok = FindOrAddToken(next_pair, frame, tot_cost, true, NULL); + Elem *e_next = FindOrAddToken(next_pair, frame, tot_cost, true, NULL); // true: emitting, NULL: no change indicator needed // Add ForwardLink from tok to next_tok (put on head of list tok->links) - tok->links = new ForwardLink(next_tok, arc.ilabel, arc.olabel, + tok->links = new ForwardLink(e_next->val, arc.ilabel, arc.olabel, graph_cost, ac_cost, tok->links); } } // for all arcs @@ -770,7 +770,7 @@ class LatticeBiglmFasterDecoder { KALDI_ASSERT(queue_.empty()); BaseFloat best_cost = std::numeric_limits::infinity(); for (const Elem *e = toks_.GetList(); e != NULL; e = e->tail) { - queue_.push_back(e->key); + queue_.push_back(e); // for pruning with current best token best_cost = std::min(best_cost, static_cast(e->val->tot_cost)); } @@ -784,11 +784,12 @@ class LatticeBiglmFasterDecoder { BaseFloat cutoff = best_cost + config_.beam; while (!queue_.empty()) { - PairId state_pair = queue_.back(); + const Elem *e = queue_.back(); queue_.pop_back(); - Token *tok = toks_.Find(state_pair)->val; // would segfault if state not in - // toks_ but this can't happen. + PairId state_pair = e->key; + Token *tok = e->val; // would segfault if state not in + // toks_ but this can't happen. 
BaseFloat cur_cost = tok->tot_cost; if (cur_cost > cutoff) // Don't bother processing successors. continue; @@ -812,15 +813,15 @@ class LatticeBiglmFasterDecoder { if (tot_cost < cutoff) { bool changed; PairId next_pair = ConstructPair(arc.nextstate, next_lm_state); - Token *new_tok = FindOrAddToken(next_pair, frame, tot_cost, - false, &changed); // false: non-emit + Elem *e_new = FindOrAddToken(next_pair, frame, tot_cost, + false, &changed); // false: non-emit - tok->links = new ForwardLink(new_tok, 0, arc.olabel, + tok->links = new ForwardLink(e_new->val, 0, arc.olabel, graph_cost, 0, tok->links); // "changed" tells us whether the new token has a different // cost from before, or is new [if so, add into queue]. - if (changed) queue_.push_back(next_pair); + if (changed) queue_.push_back(e_new); } } } // for all arcs @@ -835,7 +836,7 @@ class LatticeBiglmFasterDecoder { std::vector active_toks_; // Lists of tokens, indexed by // frame (members of TokenList are toks, must_prune_forward_links, // must_prune_tokens). - std::vector queue_; // temp variable used in ProcessNonemitting, + std::vector queue_; // temp variable used in ProcessNonemitting, std::vector tmp_array_; // used in GetCutoff. // make it class member to avoid internal new/delete. const fst::Fst &fst_; diff --git a/src/decoder/lattice-faster-decoder.cc b/src/decoder/lattice-faster-decoder.cc index 2bc8c7cdef4..f9dc933e6f4 100644 --- a/src/decoder/lattice-faster-decoder.cc +++ b/src/decoder/lattice-faster-decoder.cc @@ -263,15 +263,16 @@ void LatticeFasterDecoderTpl::PossiblyResizeHash(size_t num_toks) { // and also into the singly linked list of tokens active on this frame // (whose head is at active_toks_[frame]). template -inline Token* LatticeFasterDecoderTpl::FindOrAddToken( +inline typename LatticeFasterDecoderTpl::Elem* +LatticeFasterDecoderTpl::FindOrAddToken( StateId state, int32 frame_plus_one, BaseFloat tot_cost, Token *backpointer, bool *changed) { // Returns the Token pointer. Sets "changed" (if non-NULL) to true // if the token was newly created or the cost changed. KALDI_ASSERT(frame_plus_one < active_toks_.size()); Token *&toks = active_toks_[frame_plus_one].toks; - Elem *e_found = toks_.Find(state); - if (e_found == NULL) { // no such token presently. + Elem *e_found = toks_.Insert(state, NULL); + if (e_found->val == NULL) { // no such token presently. const BaseFloat extra_cost = 0.0; // tokens on the currently final frame have zero extra_cost // as any of them could end up @@ -280,9 +281,9 @@ inline Token* LatticeFasterDecoderTpl::FindOrAddToken( // NULL: no forward links yet toks = new_tok; num_toks_++; - toks_.Insert(state, new_tok); + e_found->val = new_tok; if (changed) *changed = true; - return new_tok; + return e_found; } else { Token *tok = e_found->val; // There is an existing Token for this state. if (tok->tot_cost > tot_cost) { // replace old token @@ -301,7 +302,7 @@ inline Token* LatticeFasterDecoderTpl::FindOrAddToken( } else { if (changed) *changed = false; } - return tok; + return e_found; } } @@ -800,12 +801,12 @@ BaseFloat LatticeFasterDecoderTpl::ProcessEmitting( next_cutoff = tot_cost + adaptive_beam; // prune by best current token // Note: the frame indexes into active_toks_ are one-based, // hence the + 1. 
- Token *next_tok = FindOrAddToken(arc.nextstate, - frame + 1, tot_cost, tok, NULL); + Elem *e_next = FindOrAddToken(arc.nextstate, + frame + 1, tot_cost, tok, NULL); // NULL: no change indicator needed // Add ForwardLink from tok to next_tok (put on head of list tok->links) - tok->links = new ForwardLinkT(next_tok, arc.ilabel, arc.olabel, + tok->links = new ForwardLinkT(e_next->val, arc.ilabel, arc.olabel, graph_cost, ac_cost, tok->links); } } // for all arcs @@ -838,7 +839,7 @@ void LatticeFasterDecoderTpl::ProcessNonemitting(BaseFloat cutoff) { // first frame (called from InitDecoding()). // Processes nonemitting arcs for one frame. Propagates within toks_. - // Note-- this queue structure is is not very optimal as + // Note-- this queue structure is not very optimal as // it may cause us to process states unnecessarily (e.g. more than once), // but in the baseline code, turning this vector into a set to fix this // problem did not improve overall speed. @@ -855,14 +856,15 @@ void LatticeFasterDecoderTpl::ProcessNonemitting(BaseFloat cutoff) { for (const Elem *e = toks_.GetList(); e != NULL; e = e->tail) { StateId state = e->key; if (fst_->NumInputEpsilons(state) != 0) - queue_.push_back(state); + queue_.push_back(e); } while (!queue_.empty()) { - StateId state = queue_.back(); + const Elem *e = queue_.back(); queue_.pop_back(); - Token *tok = toks_.Find(state)->val; // would segfault if state not in toks_ but this can't happen. + StateId state = e->key; + Token *tok = e->val; // would segfault if e is a NULL pointer but this can't happen. BaseFloat cur_cost = tok->tot_cost; if (cur_cost > cutoff) // Don't bother processing successors. continue; @@ -882,16 +884,16 @@ void LatticeFasterDecoderTpl::ProcessNonemitting(BaseFloat cutoff) { if (tot_cost < cutoff) { bool changed; - Token *new_tok = FindOrAddToken(arc.nextstate, frame + 1, tot_cost, + Elem *e_new = FindOrAddToken(arc.nextstate, frame + 1, tot_cost, tok, &changed); - tok->links = new ForwardLinkT(new_tok, 0, arc.olabel, + tok->links = new ForwardLinkT(e_new->val, 0, arc.olabel, graph_cost, 0, tok->links); // "changed" tells us whether the new token has a different // cost from before, or is new [if so, add into queue]. if (changed && fst_->NumInputEpsilons(arc.nextstate) != 0) - queue_.push_back(arc.nextstate); + queue_.push_back(e_new); } } } // for all arcs diff --git a/src/decoder/lattice-faster-decoder.h b/src/decoder/lattice-faster-decoder.h index c611ec9dc05..e0cf7dea8d6 100644 --- a/src/decoder/lattice-faster-decoder.h +++ b/src/decoder/lattice-faster-decoder.h @@ -131,12 +131,12 @@ struct StdToken { // to keep it in a good numerical range). BaseFloat tot_cost; - // exta_cost is >= 0. After calling PruneForwardLinks, this equals - // the minimum difference between the cost of the best path, and the cost of - // this is on, and the cost of the absolute best path, under the assumption - // that any of the currently active states at the decoding front may - // eventually succeed (e.g. if you were to take the currently active states - // one by one and compute this difference, and then take the minimum). + // exta_cost is >= 0. After calling PruneForwardLinks, this equals the + // minimum difference between the cost of the best path that this link is a + // part of, and the cost of the absolute best path, under the assumption that + // any of the currently active states at the decoding front may eventually + // succeed (e.g. 
if you were to take the currently active states one by one + // and compute this difference, and then take the minimum). BaseFloat extra_cost; // 'links' is the head of singly-linked list of ForwardLinks, which is what we @@ -216,7 +216,7 @@ struct BackpointerToken { will normally be StdToken, but also may be BackpointerToken which is to support quick lookup of the current best path (see lattice-faster-online-decoder.h) - The FST you invoke this decoder with is expected to equal + The FST you invoke this decoder which is expected to equal Fst::Fst, a.k.a. StdFst, or GrammarFst. If you invoke it with FST == StdFst and it notices that the actual FST type is fst::VectorFst or fst::ConstFst, the decoder object @@ -380,9 +380,9 @@ class LatticeFasterDecoderTpl { // token was newly created or the cost changed. // If Token == StdToken, the 'backpointer' argument has no purpose (and will // hopefully be optimized out). - inline Token *FindOrAddToken(StateId state, int32 frame_plus_one, - BaseFloat tot_cost, Token *backpointer, - bool *changed); + inline Elem *FindOrAddToken(StateId state, int32 frame_plus_one, + BaseFloat tot_cost, Token *backpointer, + bool *changed); // prunes outgoing links for all tokens in active_toks_[frame] // it's called by PruneActiveTokens @@ -464,7 +464,7 @@ class LatticeFasterDecoderTpl { std::vector active_toks_; // Lists of tokens, indexed by // frame (members of TokenList are toks, must_prune_forward_links, // must_prune_tokens). - std::vector queue_; // temp variable used in ProcessNonemitting, + std::vector queue_; // temp variable used in ProcessNonemitting, std::vector tmp_array_; // used in GetCutoff. // fst_ is a pointer to the FST we are decoding from. @@ -495,7 +495,7 @@ class LatticeFasterDecoderTpl { BaseFloat final_relative_cost_; BaseFloat final_best_cost_; - // There are various cleanup tasks... the the toks_ structure contains + // There are various cleanup tasks... the toks_ structure contains // singly linked lists of Token pointers, where Elem is the list type. // It also indexes them in a hash, indexed by state (this hash is only // maintained for the most recent frame). toks_.Clear() diff --git a/src/decoder/simple-decoder.cc b/src/decoder/simple-decoder.cc index 836f87556c8..b347c57570d 100644 --- a/src/decoder/simple-decoder.cc +++ b/src/decoder/simple-decoder.cc @@ -214,20 +214,20 @@ void SimpleDecoder::ProcessEmitting(DecodableInterface *decodable) { void SimpleDecoder::ProcessNonemitting() { // Processes nonemitting arcs for one frame. Propagates within // cur_toks_. 
- std::vector queue_; + std::vector queue; double infinity = std::numeric_limits::infinity(); double best_cost = infinity; for (unordered_map::iterator iter = cur_toks_.begin(); iter != cur_toks_.end(); ++iter) { - queue_.push_back(iter->first); + queue.push_back(iter->first); best_cost = std::min(best_cost, iter->second->cost_); } double cutoff = best_cost + beam_; - while (!queue_.empty()) { - StateId state = queue_.back(); - queue_.pop_back(); + while (!queue.empty()) { + StateId state = queue.back(); + queue.pop_back(); Token *tok = cur_toks_[state]; KALDI_ASSERT(tok != NULL && state == tok->arc_.nextstate); for (fst::ArcIterator > aiter(fst_, state); @@ -244,12 +244,12 @@ void SimpleDecoder::ProcessNonemitting() { = cur_toks_.find(arc.nextstate); if (find_iter == cur_toks_.end()) { cur_toks_[arc.nextstate] = new_tok; - queue_.push_back(arc.nextstate); + queue.push_back(arc.nextstate); } else { if ( *(find_iter->second) < *new_tok ) { Token::TokenDelete(find_iter->second); find_iter->second = new_tok; - queue_.push_back(arc.nextstate); + queue.push_back(arc.nextstate); } else { Token::TokenDelete(new_tok); } diff --git a/src/doc/build_setup.dox b/src/doc/build_setup.dox index 47ff7e033a8..5ea2e212b20 100644 --- a/src/doc/build_setup.dox +++ b/src/doc/build_setup.dox @@ -32,12 +32,12 @@ The build process for Windows is separate from the build process for UNIX-like systems, and is described in windows/INSTALL (tested some time ago with - Windows 7 and Microsoft Visual Studio 10.0). We use scripts to + Windows 7 and Microsoft Visual Studio 2013). We use scripts to create the Visual Studio 10.0 solution file. There are two options for - the math library on Windows: either you can use Cygwin to compile ATLAS, or you - can use the Intel MKL library. Detailed instructions are provided. However, note + the math library on Windows: either Intel MKL, or use Cygwin to compile ATLAS. + Detailed instructions are provided. However, note that the Windows setup is becoming out of date and is not regularly tested, - and not all the code currently compiles on it. + and not all the may compile. \section build_setup_configure How our configure script works (for UNIX variants) @@ -143,6 +143,6 @@ preprocessor variables, setting compile options, linking with libraries, and so We have compiled Kaldi on Windows, Cygwin, various flavors of Linux (including Ubuntu, CentOS, Debian, Red Hat and SUSE), and Darwin. We recommend you use g++ version -4.4 or above, although other compilers such as llvm and Intel's icc are also known to work. +4.7 or above, although other compilers such as llvm and Intel's icc are also known to work. */ diff --git a/src/doc/data_prep.dox b/src/doc/data_prep.dox index d8fe1746df1..e81032537cc 100644 --- a/src/doc/data_prep.dox +++ b/src/doc/data_prep.dox @@ -191,7 +191,7 @@ the speaker identities, you can just make the speaker-ids the same as the uttera so the format of the file would be just \ \. We have made the previous sentence bold because we have encountered people creating a "global" speaker-id. This is a bad idea because it makes cepstral mean normalization -ineffective in traning (since it's applied globally), and because it will create problems +ineffective in training (since it's applied globally), and because it will create problems when you use utils/split_data_dir.sh to split your data into pieces. 
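An aside on the decoder hunks earlier in this diff (faster-decoder, biglm-faster-decoder, lattice-faster-decoder, lattice-biglm-faster-decoder): they all replace a Find() followed by a conditional Insert() with a single Insert() call whose return value is the hash element, whether it was already present or newly added, so each destination state is hashed only once per arc and the element itself can be pushed onto the queue. The same idiom, sketched with std::unordered_map rather than Kaldi's HashList so it compiles standalone:

\code{.cpp}
#include <iostream>
#include <unordered_map>

// insert() returns the element already in the map, or the newly inserted one,
// so no separate find() is needed -- the same idea as the single
// HashList::Insert() call in the decoder changes above.
int main() {
  std::unordered_map<int, double> toks;  // state -> best cost so far
  auto Relax = [&toks](int state, double cost) {
    std::pair<std::unordered_map<int, double>::iterator, bool> result =
        toks.insert(std::make_pair(state, cost));
    if (!result.second && cost < result.first->second)  // existing entry, new cost better
      result.first->second = cost;                      // replace in place
    return result.first;                                // caller can enqueue this element
  };
  Relax(3, 1.5);
  Relax(3, 0.7);                 // improves the stored cost
  std::cout << toks[3] << "\n";  // prints 0.7
  return 0;
}
\endcode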
There is another file that exists in some setups; it is used only occasionally and diff --git a/src/doc/dependencies.dox b/src/doc/dependencies.dox index 63d2658b726..d8a5591955f 100644 --- a/src/doc/dependencies.dox +++ b/src/doc/dependencies.dox @@ -113,7 +113,7 @@ - CLAPACK, the linear algebra library (we download the headers). This is useful only on systems where you don't have ATLAS and are instead compiling with CLAPACK. - - OpenBLAS: this is an alernative to ATLAS or CLAPACK. The scripts don't + - OpenBLAS: this is an alternative to ATLAS or CLAPACK. The scripts don't use it by default but we provide installation scripts so you can install it if you want to compare it against ATLAS (it's more actively maintained than ATLAS). diff --git a/src/doc/dnn.dox b/src/doc/dnn.dox index 5b3d2b98261..bab4658e552 100644 --- a/src/doc/dnn.dox +++ b/src/doc/dnn.dox @@ -37,7 +37,7 @@ namespace kaldi { We currently have three separate codebases for deep neural nets in Kaldi. All are still active in the sense that the up-to-date recipes refer to all of them. The first one ("nnet1"( is located in code subdirectories nnet/ and - nnetbin/, and is primiarly maintained by Karel Vesely. The second is located + nnetbin/, and is primarily maintained by Karel Vesely. The second is located in code subdirectories nnet2/ and nnet2bin/, and is primarily maintained by Daniel Povey (this code was originally based on an earlier version of Karel's code, but it has been extensively rewritten). The third is located diff --git a/src/doc/dnn1.dox b/src/doc/dnn1.dox index 223b7665274..e8dcfd90d3f 100644 --- a/src/doc/dnn1.dox +++ b/src/doc/dnn1.dox @@ -35,13 +35,13 @@ show some \ref dnn1_advanced_features, and do a light introduction to the \ref d


\section dnn1_toplevel_scripts Top-level script -Let's have a look at the script egs/wsj/s5/local/nnet/run_dnn.sh. +Let's have a look at the script egs/wsj/s5/local/nnet/run_dnn.sh. This script assumes to use a single CUDA GPU, and that kaldi was compiled with CUDA (check for 'CUDA = true' in src/kaldi.mk). Also we assume that 'cuda_cmd' is set properly in egs/wsj/s5/cmd.sh either to a GPU cluster node using 'queue.pl' or to a local machine using 'run.pl'. And finally the script assumes we already have a SAT GMM system exp/tri4b and corresponding fMLLR transforms, as generated by egs/wsj/s5/run.sh. Note that for other databases the run_dnn.sh is typically in the same location s5/local/nnet/run_dnn.sh. -The script egs/wsj/s5/local/nnet/run_dnn.sh is split into several stages: +The script egs/wsj/s5/local/nnet/run_dnn.sh is split into several stages: 0. storing 40-dimensional fMLLR features to disk, steps/nnet/make_fmllr_feats.sh, this simplifies the training scripts, the 40-dimensional features are MFCC-LDA-MLLT-fMLLR with CMN @@ -100,7 +100,7 @@ Besides the DNN recipe, there are also other example scripts which can be handy:
\section dnn1_training_script_internals Training script internals -The main neural network training script steps/nnet/train.sh is invoked as: +The main neural network training script steps/nnet/train.sh is invoked as: \verbatim steps/nnet/train.sh @@ -111,11 +111,11 @@ The is used only in the special case when using LDA feature-transform The output (i.e. the trained networks and logfiles) goes into . Internally the script prepares the feature+target pipelines, generates a neural-network prototype and initialization, creates feature_transform and calls the scheduler script -steps/nnet/train_scheduler.sh, +steps/nnet/train_scheduler.sh, which runs the training epochs and controls the learning rate. -While looking inside steps/nnet/train.sh we see: +While looking inside steps/nnet/train.sh we see: 1. CUDA is required, the scripts exit if no GPU was detected or was CUDA not compiled in (one can still use '--skip-cuda-check true' to run on CPU, but it is 10-20x slower) @@ -165,12 +165,12 @@ $ cat exp/dnn5b_pretrain-dbn_dnn/nnet.proto 7. the network is initialized by : \ref nnet-initialize.cc , the DBN gets prepended in the next step using \ref nnet-concat.cc -8. finally the training gets called by running scheduler script steps/nnet/train_scheduler.sh +8. finally the training gets called by running scheduler script steps/nnet/train_scheduler.sh Note : both neural networks and feature transforms can be viewed by \ref nnet-info.cc, or shown in ascii by \ref nnet-copy.cc -While looking inside steps/nnet/train_scheduler.sh we see: +While looking inside steps/nnet/train_scheduler.sh we see: the initial cross-validation run and the main for-loop over $iter which runs the epochs and controls the learning rate. Typically, the train_scheduler.sh is called from train.sh. - the default learning-rate scheduling is based on the relative improvement of the objective function: @@ -310,7 +310,7 @@ AddMat 174.307s AddMatMat 1922.11s \endverbatim - Running steps/nnet/train_scheduler.sh directly: + Running steps/nnet/train_scheduler.sh directly: - The script train_scheduler.sh can be called outside train.sh, it allows to override the default NN-input and NN-target streams, which can be handy. - However the script assumes everything is set-up correctly, and there are almost no sanity checks, which makes it suitable for more advanced users only. - It is highly recommended to have a look at how train_scheduler.sh is usually called before trying to call it directly. diff --git a/src/doc/grammar.dox b/src/doc/grammar.dox index d1c6f51f349..30396041d22 100644 --- a/src/doc/grammar.dox +++ b/src/doc/grammar.dox @@ -352,7 +352,7 @@ Z_S 243 The special symbols in CLG.fst will be as follows. 
The following special symbols may appear in any CLG graph, top-level or not: - - When any graph invokes a sub-graph, there will be n arc with an ilabel + - When any graph invokes a sub-graph, there will be an arc with an ilabel (\#nonterm:foo, left-context-phone) representing the user-specified nonterminal and the actual left-context, which will be followed by arcs with ilabels of the form (\#nonterm_reenter, diff --git a/src/doc/history.dox b/src/doc/history.dox index 40d46c7e32f..0813f2331cc 100644 --- a/src/doc/history.dox +++ b/src/doc/history.dox @@ -54,7 +54,8 @@ Sandeep Boda, Sandeep Reddy and Haihua Xu (who helped with coding, code cleanup and documentation); we were visited by Michael Riley (who helped us to understand OpenFst and gave some lectures on FSTs), and would like to acknowledge the help of - Honza Cernocky (for allowing us to have the workshop and helping to organize it), + Honza Cernocky (for negotiating the venue and some support for the workshop from + the Faculty of Information Technology of BUT and helping to organize it), Renata Kohlova (administration), and Tomas Kasparek (system administration). It is possible that this list of contributors contains oversights; any important omissions are unlikely to be intentional. @@ -62,13 +63,16 @@ A lot of code was written during the summer of 2010 but we still did not have a complete working system. Some of the participants of the 2010 workshop continued working to complete the toolkit and get a working set of training scripts. - The code was released on May 14th, 2011. + The code was released on May 14th, 2011, and presented to public at ICASSP 2011 + in Prague, + + see the recordings. Since the initial release, Kaldi has been maintained and developed to a large extent by Daniel Povey, working at Microsoft Research until early 2012 and since then at Johns Hopkins University; but also with major contributions by others: notably Karel Vesely, who developed the neural-net training framework, - and Arnab Ghoshal, who co-ordinated the acoustic modeling work early on; but + and Arnab Ghoshal, who coordinated the acoustic modeling work early on; but also other major contributors whom we do not name here because it is too hard to determine where to cut off the list; and a long tail of minor contributors; the total number of people who have contributed code or scripts or patches is diff --git a/src/doc/io.dox b/src/doc/io.dox index dc958f57a6f..8f3a3cc05b6 100644 --- a/src/doc/io.dox +++ b/src/doc/io.dox @@ -383,7 +383,7 @@ namespace kaldi { std::string rspecifier2 = "ark:-"; // archive read from stdin. // write to a gzipped text archive. std::string wspecifier1 = "ark,t:| gzip -c > /some/dir/foo.ark.gz"; - std::string wspecifier2 = "ark,scp:data/my.ark,data/my.ark"; + std::string wspecifier2 = "ark,scp:data/my.ark,data/my.scp"; \endcode Usually, an rspecifier or wspecifier consists of a comma-separated, unordered @@ -401,7 +401,7 @@ namespace kaldi { \endverbatim This will write an archive, and a script file with lines like "utt_id /somedir/foo.ark:1234" that specify offsets into the - archive for more efficient random access. You can then do what you like which + archive for more efficient random access. You can then do whatever you like with the script file, including breaking it up into segments, and it will behave like any other script file. 
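To make the corrected wspecifier above concrete ("ark,scp:data/my.ark,data/my.scp" writes the archive and, at the same time, a script file that indexes it), here is a small program in the style of Kaldi's table code; the typedefs are the usual ones from util/table-types.h, but treat the exact include set as an assumption rather than gospel:

\code{.cpp}
#include "base/kaldi-common.h"
#include "util/table-types.h"
#include "matrix/kaldi-matrix.h"

int main() {
  using namespace kaldi;
  {
    // Writes the archive and a script file whose lines hold byte offsets into
    // it, e.g. "utt1 data/my.ark:12" (the directory data/ must already exist).
    BaseFloatMatrixWriter writer("ark,scp:data/my.ark,data/my.scp");
    Matrix<BaseFloat> feats(10, 13);  // dummy 10-frame, 13-dimensional "features"
    writer.Write("utt1", feats);
  }
  // The script file now behaves like any other rspecifier: it can be split,
  // re-ordered, or simply read back.
  SequentialBaseFloatMatrixReader reader("scp:data/my.scp");
  for (; !reader.Done(); reader.Next())
    KALDI_LOG << reader.Key() << ": " << reader.Value().NumRows() << " rows";
  return 0;
}
\endcode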
Note that although the order of options before the colon doesn't generally matter, in this particular case the "ark" must come before diff --git a/src/doc/kaldi_for_dummies.dox b/src/doc/kaldi_for_dummies.dox index c04e0d0c3e9..b48d6dd8dac 100644 --- a/src/doc/kaldi_for_dummies.dox +++ b/src/doc/kaldi_for_dummies.dox @@ -71,7 +71,7 @@ and installation, - \c awk – programming language, used for searching and processing patterns in files and data streams, - \c bash – Unix shell and script programming language, - - \c grep – command-line utility for searching plain-text data sets for lines + - \c grep – command-line utility for searching plain-text datasets for lines matching a regular expression, - \c make – automatically builds executable programs and libraries from source code, @@ -87,16 +87,16 @@ If you do not have much idea about how to use GIT, please read about it: \ref tutorial_git. I installed Kaldi in this directory (called 'Kaldi root path'): -\c /home/{user}/kaldi-trunk +\c /home/{user}/kaldi \section kaldi_for_dummies_directories Kaldi directories structure Try to acknowledge where particular Kaldi components are placed. Also it would be nice if you read any \c README files you find. -\c kaldi-trunk - main Kaldi directory which contains: +\c kaldi - main Kaldi directory which contains: - \c egs – example scripts allowing you to quickly build ASR -systems for over 30 popular speech corporas (documentation is attached for each +systems for over 30 popular speech corpora (documentation is attached for each project), - \c misc – additional tools and supplies, not needed for proper Kaldi functionality, @@ -127,7 +127,7 @@ train it, test it and get some decoding results.

Your first task

Something to begin with - create a folder \c digits in -\c kaldi-trunk/egs/ directory. This is a place where you will put all +\c kaldi/egs/ directory. This is a place where you will put all the stuff related to your project. \section kaldi_for_dummies_data Data preparation @@ -136,34 +136,34 @@ the stuff related to your project. I assume that you want to set up an ASR system, basing on your own audio data. For example - let it be a set of 100 files. File format is WAV. Each file -contains 3 spoken digits recorded in english language, one by one. Each of +contains 3 spoken digits recorded in English language, one by one. Each of these audio files is named in a recognizable way (e.g. \c 1_5_6.wav, which in my pattern means that the spoken sentence is 'one, five, six') and placed in the recognizable folder representing particular speaker during a particular recording session (there may be a situation that you have recordings of the same person but in two different quality/noise environments - put these -in separate folders). So to sum up, my exemplary data set looks like this: +in separate folders). So to sum up, my exemplary dataset looks like this: - 10 different speakers (ASR systems must be trained and tested on different speakers, the more speakers you have the better), - each speaker says 10 sentences, - - 100 senteces/utterances (in 100 *.wav files placed in 10 folders related to + - 100 sentences/utterances (in 100 *.wav files placed in 10 folders related to particular speakers - 10 *.wav files in each folder), - 300 words (digits from zero to nine), - each sentence/utterance consist of 3 words. -Whatever your first data set is, adjust my example to your particular case. Be -careful with big data sets and complex grammars - start with something simple. +Whatever your first dataset is, adjust my example to your particular case. Be +careful with big datasets and complex grammars - start with something simple. Sentences that contain only digits are perfect in this case.

Task

-Go to \c kaldi-trunk/egs/digits directory and create -\c digits_audio folder. In \c kaldi-trunk/egs/digits/digits_audio +Go to \c kaldi/egs/digits directory and create +\c digits_audio folder. In \c kaldi/egs/digits/digits_audio create two folders: \c train and \c test. Select one speaker -of your choice to represent testing data set. Use this speaker's 'speakerID' as -a name for an another new folder in \c kaldi-trunk/egs/digits/digits_audio/test +of your choice to represent testing dataset. Use this speaker's 'speakerID' as +a name for an another new folder in \c kaldi/egs/digits/digits_audio/test directory. Then put there all the audio files related to that person. Put the rest (9 speakers) into \c train folder - this will be your training -data set. Also create subfolders for each speaker. +dataset. Also create subfolders for each speaker. \subsection kaldi_for_dummies_acoustic Acoustic data @@ -174,14 +174,14 @@ section as well) can be considered as a text file with some number of strings (each string in a new line). These strings need to be sorted. If you will encounter any sorting issues you can use Kaldi scripts for checking (\c utils/validate_data_dir.sh) and fixing (\c utils/fix_data_dir.sh) data order. -And for you information - \c utils directory will be attached to your project in +And for your information - \c utils directory will be attached to your project in \ref kaldi_for_dummies_tools "Tools attachment" section.

Task

-In \c kaldi-trunk/egs/digits directory, create a folder \c data. Then create +In \c kaldi/egs/digits directory, create a folder \c data. Then create \c test and \c train subfolders inside. Create in each subfolder following files (so you have files named in the same way in \c test and \c train subfolders -but they relate to two different data sets that you created before): +but they relate to two different datasets that you created before): a.) \c spk2gender
This file informs about speakers gender. As we assumed, 'speakerID' is a unique @@ -207,9 +207,9 @@ for examples below). Pattern: \verbatim -dad_4_4_2 /home/{user}/kaldi-trunk/egs/digits/digits_audio/train/dad/4_4_2.wav -july_1_2_5 /home/{user}/kaldi-trunk/egs/digits/digits_audio/train/july/1_2_5.wav -july_6_8_3 /home/{user}/kaldi-trunk/egs/digits/digits_audio/train/july/6_8_3.wav +dad_4_4_2 /home/{user}/kaldi/egs/digits/digits_audio/train/dad/4_4_2.wav +july_1_2_5 /home/{user}/kaldi/egs/digits/digits_audio/train/july/1_2_5.wav +july_6_8_3 /home/{user}/kaldi/egs/digits/digits_audio/train/july/6_8_3.wav # and so on... \endverbatim @@ -236,8 +236,8 @@ july_6_8_3 july \endverbatim e.) \c corpus.txt
-This file has a slightly different directory. In \c kaldi-trunk/egs/digits/data -create another folder \c local. In \c kaldi-trunk/egs/digits/data/local create a +This file has a slightly different directory. In \c kaldi/egs/digits/data +create another folder \c local. In \c kaldi/egs/digits/data/local create a file \c corpus.txt which should contain every single utterance transcription that can occur in your ASR system (in our case it will be 100 lines from 100 audio files). @@ -252,14 +252,14 @@ four four two \subsection kaldi_for_dummies_language Language data -This section relates to language modelling files that also need to be considered +This section relates to language modeling files that also need to be considered as 'must be done'. Look for the syntax details here: \ref data_prep (each file is precisely described). Also feel free to read some examples in other \c egs scripts. Now is the perfect time.

Task

-In \c kaldi-trunk/egs/digits/data/local directory, create a folder \c dict. In -\c kaldi-trunk/egs/digits/data/local/dict create following files: +In \c kaldi/egs/digits/data/local directory, create a folder \c dict. In +\c kaldi/egs/digits/data/local/dict create following files: a.) \c lexicon.txt
This file contains every word from your dictionary with its 'phone @@ -337,19 +337,19 @@ complete. You need to add necessary Kaldi tools that are widely used in exemplary scripts.

Task

-From \c kaldi-trunk/egs/wsj/s5 copy two folders (with the whole content) - +From \c kaldi/egs/wsj/s5 copy two folders (with the whole content) - \c utils and \c steps - and put them in your -\c kaldi-trunk/egs/digits directory. You can also create links to these +\c kaldi/egs/digits directory. You can also create links to these directories. You may find such links in, for example, -\c kaldi-trunk/egs/voxforge/s5. +\c kaldi/egs/voxforge/s5. \subsection kaldi_for_dummies_scoring Scoring script This script will help you to get decoding results.

Task

-From \c kaldi-trunk/egs/voxforge/s5/local copy the script \c score.sh into -similar location in your project (\c kaldi-trunk/egs/digits/local). +From \c kaldi/egs/voxforge/s5/local copy the script \c score.sh into +similar location in your project (\c kaldi/egs/digits/local). \subsection kaldi_for_dummies_srilm SRILM installation @@ -358,7 +358,7 @@ example - SRI Language Modeling Toolkit (SRILM).

Task

For detailed installation instructions go to -\c kaldi-trunk/tools/install_srilm.sh (read all comments inside). +\c kaldi/tools/install_srilm.sh (read all comments inside). \subsection kaldi_for_dummies_configuration Configuration files @@ -366,8 +366,8 @@ It is not necessary to create configuration files but it can be a good habit for future.

Task

-In \c kaldi-trunk/egs/digits create a folder \c conf. Inside -\c kaldi-trunk/egs/digits/conf create two files (for some configuration +In \c kaldi/egs/digits create a folder \c conf. Inside +\c kaldi/egs/digits/conf create two files (for some configuration modifications in decoding and mfcc feature extraction processes - taken from \c /egs/voxforge): @@ -395,10 +395,10 @@ decided to use two different training methods: - TRI1 - simple triphone training (first triphone pass). These two methods are enough to show noticable differences in decoding results -using only digits lexicon and small training data set. +using only digits lexicon and small training dataset.

Task

-In \c kaldi-trunk/egs/digits directory create 3 scripts: +In \c kaldi/egs/digits directory create 3 scripts: a.) \c cmd.sh
\code{.sh} @@ -416,7 +416,7 @@ export KALDI_ROOT=`pwd`/../.. export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$PWD:$PATH # Defining audio data directory (modify it for your installation directory!) -export DATA_ROOT="/home/{user}/kaldi-trunk/egs/digits/digits_audio" +export DATA_ROOT="/home/{user}/kaldi/egs/digits/digits_audio" # Enable SRILM . $KALDI_ROOT/tools/env.sh @@ -432,7 +432,7 @@ c.) \c run.sh . ./path.sh || exit 1 . ./cmd.sh || exit 1 -nj=1 # number of parallel jobs - 1 is perfect for such a small data set +nj=1 # number of parallel jobs - 1 is perfect for such a small dataset lm_order=1 # language model order (n-gram quantity) - 1 is enough for digits grammar # Safety mechanism (possible running this script with modified arguments) @@ -564,7 +564,7 @@ Now all you have to do is to run \c run.sh script. If I have made any mistakes in this tutorial, logs from the terminal should guide you how to deal with it. Besides the fact that you will notice some decoding results in the terminal -window, go to newly made \c kaldi-trunk/egs/digits/exp. You may notice there +window, go to newly made \c kaldi/egs/digits/exp. You may notice there folders with \c mono and \c tri1 results as well - directories structure are the same. Go to \c mono/decode directory. Here you may find result files (named in a wer_{number} way). Logs for decoding process may be found in \c log @@ -575,7 +575,7 @@ folder (same directory). This is just an example. The point of this short tutorial is to show you how to create 'anything' in Kaldi and to get a better understanding of how to think while using this toolkit. Personally I started with looking for tutorials made -by the Kaldi authors/developers. After succesful Kaldi installation I launched +by the Kaldi authors/developers. After successful Kaldi installation I launched some example scripts (Yesno, Voxforge, LibriSpeech - they are relatively easy and have free acoustic/language data to download - I used these three as a base for my own scripts). @@ -586,7 +586,7 @@ There are two very useful sections for beginners inside:
a.) \ref tutorial - almost 'step by step' tutorial on how to set up an ASR system; up to some point this can be done without RM dataset. It is good to read it,
-b.) \ref data_prep - very detailed explaination of how to use your own data +b.) \ref data_prep - very detailed explanation of how to use your own data in Kaldi. More useful links about Kaldi I found:
diff --git a/src/doc/matrixwrap.dox b/src/doc/matrixwrap.dox index fb595d581fe..9cf5e92ca48 100644 --- a/src/doc/matrixwrap.dox +++ b/src/doc/matrixwrap.dox @@ -22,93 +22,155 @@ namespace kaldi { /** \page matrixwrap External matrix libraries - Here we describe how our \ref matrix "matrix library" makes use of + Here we describe how our \ref matrix "matrix library" makes use of external libraries. \section matrixwrap_summary Overview - - The matrix code in Kaldi is mostly a wrapper on top of the - linear-algebra libraries BLAS and LAPACK. The code has been designed to be as flexible - as possible in terms of what libraries it can use. Currently it supports four options: + + The matrix code in Kaldi is mostly a wrapper on top of the linear-algebra + libraries BLAS and LAPACK. The code has been designed to be as flexible as + possible in terms of what libraries it can use. Currently it supports four + options: + - Intel MKL, which provides both BLAS and LAPACK (the default) + - OpenBLAS, which provides BLAS and LAPACK - ATLAS, which is an implementation of BLAS plus a subset of LAPACK (with a different interface) - Some implementation of BLAS plus CLAPACK (note: this has not been tested recently). - - Intel's MKL, which provides both BLAS and LAPACK - - OpenBLAS, which provides BLAS and LAPACK - The code has to "know" which of these four options is being used, because although in principle - BLAS and LAPACK are standardized, there are some differences in the interfaces. - The Kaldi code requires exactly one - of the three strings HAVE_ATLAS, HAVE_CLAPACK, HAVE_OPENBLAS or HAVE_MKL to be defined - (e.g. using -DHAVE_ATLAS as an option to the compiler). It must then be - linked with the appropriate libraries. The code that deals most directly - with including the external libraries and setting up the appropriate - typedef's and defines, is in \ref kaldi-blas.h. However, the rest of - the matrix code is not completely insulated from these issues because the ATLAS - and CLAPACK versions of higher-level routines are called differently (so - we have a lot of "#ifdef HAVE_ATLAS" directives and the like). Additionally, some routines - are not even available in ATLAS so we have had to implement them ourselves. - - The "configure" script in the "src" directory is responsible for setting up Kaldi to use the libraries. - It does this by creating the file "kaldi.mk" in the "src" directory, which gives appropriate flags - to the compiler. If called with no arguments it will use any ATLAS installation it can find in "normal" places - in your system, but it is quite configurable. See the script itself for usage. - - \section matrixwrap_blas Basic Linear Algebra Subroutines (BLAS) - - Because we refer a lot to BLAS in this section, we briefly explain what it is. - BLAS is a set of subroutine declarations that correspond to low-level - matrix-vector operations. There is Level 1 Blas (vector-vector), Level 2 - (vector-matrix) and Level 3 (matrix-matrix). They have names like daxpy (for - double-precision a*x plus y), and dgemm (for double general matrix-matrix - multiply). BLAS has various actual implementations. The "reference BLAS", - supplied I believe by Netlib (the folks who also brought us the most common version - of LAPACK), is one. ATLAS is another one (but it also implements some functions - from LAPACK). - - \section matrixwrap_lapack Linear Algebra PACKage (LAPACK) - - Lapack is a set of linear-algebra routines, originally written in Fortran. 
It includes - higher-level routines than BLAS, such as matrix inversion, SVD, etc. - Netlib has implemented this (this is the "normal" LAPACK). LAPACK requires - BLAS. It is possible to mix-and-match LAPACK and BLAS implementations - (e.g. Netlib's LAPACK with ATLAS's BLAS). - - CLAPACK is a version of LAPACK that has been converted from Fortan to C automatically - using the f2c utility. When we talk about using LAPACK, we are actually - talking about using CLAPACK. Because CLAPACK has been converted to C using the - f2c utility, when we link against it we need to include the f2c library (e.g. -lf2c, - or -lg2c if using recent versions of gcc), otherwise we will get linking errors. - - - \section matrixwrap_atlas Automatically Tuned Linear Algebra Software (ATLAS) + The code has to "know" which of these four options is being used, because + although in principle BLAS and LAPACK are standardized, there are some + differences in the interfaces. The Kaldi code requires exactly one of the + three macros \c HAVE_ATLAS, \c HAVE_CLAPACK, \c HAVE_OPENBLAS or \c HAVE_MKL + to be defined (normally using \c -DHAVE_ATLAS as an option to the compiler). + It must then be linked with the appropriate libraries. The code that deals + most directly with including the external libraries and setting up the + appropriate typedef's and defines, is in \ref kaldi-blas.h. However, the rest + of the matrix code is not completely insulated from these issues because the + ATLAS and CLAPACK versions of higher-level routines are called differently (so + we have a lot of "#ifdef HAVE_ATLAS" directives and the like). Additionally, + some routines are not even available in ATLAS so we have had to implement them + ourselves. + + The "configure" script in the "src" directory is responsible for setting up + Kaldi to use the libraries. It does this by creating the file "kaldi.mk" in + the "src" directory, which gives appropriate flags to the compiler. If called + with no arguments it will use any Intel MKL installation it can find in + "normal" places in your system, but it is configurable. Run the script with + the \c \--help option for the complete option list. + + \section matrixwrap_matalgebra Understanding BLAS and LAPACK + + Because we refer a lot to BLAS (and more often CBLAS) and LAPACK (or, rarely, + CLAPACK) in this section, we briefly explain what it is. + + \subsection matrixwrap_blas Basic Linear Algebra Subroutines (BLAS) + + BLAS is a set of subroutine declarations that correspond to low-level + matrix-vector operations. There is BLAS Level 1 (vector-vector), Level 2 + (vector-matrix) and Level 3 (matrix-matrix). They have names like \c daxpy + (for \"double-precision \b a \b x plus \b y\"), and \c dgemm + (for "double-precision general matrix-matrix multiply"). BLAS has various + actual implementations. The reference + implementation of BLAS originated back in 1979, and has been maintained + since by Netlib. The reference implementation lacks any optimization + whatsoever, and exists solely as a touchstone to validate the correctness of + other implementations. MKL, ATLAS and OpenBLAS provide optimized + implementations of BLAS. + + CBLAS is just the C language interface to BLAS. + + \subsection matrixwrap_lapack Linear Algebra PACKage (LAPACK) + + LAPACK is a set of linear-algebra routines, originally written in Fortran. It + includes higher-level routines than BLAS, such as matrix inversion, SVD, etc. + The reference implementation of + LAPACK was implemented and has been maintained by Netlib. 
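The requirement mentioned above, that exactly one of \c HAVE_MKL, \c HAVE_ATLAS, \c HAVE_CLAPACK and \c HAVE_OPENBLAS be defined, is easy to enforce at compile time; the following is a sketch of the idea (kaldi-blas.h has its own version of this guard, so this is illustrative rather than a quote):

\code{.cpp}
// Inside #if, defined(X) evaluates to 1 or 0, so the sum counts how many of
// the four math-library macros were passed to the compiler.
#if (defined(HAVE_MKL) + defined(HAVE_ATLAS) + \
     defined(HAVE_CLAPACK) + defined(HAVE_OPENBLAS)) != 1
#error "Define exactly one of HAVE_MKL, HAVE_ATLAS, HAVE_CLAPACK, HAVE_OPENBLAS."
#endif
\endcode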
LAPACK + internally uses BLAS. It is possible to mix-and-match LAPACK and BLAS + implementations (e.g. Netlib's LAPACK with ATLAS's BLAS). + + CLAPACK is a version of LAPACK that has been converted from Fortan to C + automatically using the f2c utility. Because of this, the f2c library is + required during linking with the "original" CLAPACK (usually \c -lg2c or + \c -lf2c). + + MKL provides complete C-callable interfaces for its own BLAS and LAPACK + implementations; no additional libraries are required. + + \section matrixwrap_mkl Intel Math Kernel Library (MKL) + + Intel MKL provides C-language interface to a high-performance implementation + of the BLAS and LAPACK routines, and is currently the preferred CBLAS/CLAPACK + provider for Kaldi. To use MKL with Kaldi use the \c -DHAVE_MKL compiler flag. + + Previously MKL used to be a paid product. Starting 2017, Intel made MKL freely + available and allows royalty-freely runtime redistribution even for commercial + application (although, just like, for example, CUDA, it is still a + closed-source commercial product). + + MKL provides a very highly optimized implementation of linear algebra + routines, and especially on Intel CPUs. In fact, the library contains multiple + code paths, which are selected at runtime depending on individual features of + the CPU it is being loaded on. Thus with MKL you will automatically benefit + from all features and instruction sets (such as AVX2 and AVX512) if they are + available on your CPU, without any additional configuration. These + instructions accelerate linear algebra operations on CPU significantly. It is + usually a good idea to use a recent MKL version if your CPU is of a newer + architecture. + + To simplify MKL setup on Linux, we provide a script + \c tools/extras/install_mkl.sh. We install only 64-bit binaries for MKL, but + once the \c install_mkl.sh script completes successfully once, the Intel + repositories are registered on your system, and you can both obtain new + versions and 32-bit libraries using your system's package manager. + + For Mac and Windows, + download the installer from Intel's Web site (registration may be + required). Refer to the same page in case the above Linux script does not + support your Linux distribution. The Intel installers (Mac, Windows) let you + select the 32-bit and 64-bit packages separately. To run Kaldi training + recipes only the 64-bit version is required. + + We have tested Kaldi extensively with 64-bit libraries under Linux and + Windows. + + The + MKL Link Line Advisor is an interactive Web tool that allows configuring + the compiler flags for various systems and compilers, in case our "configure" + script does not cover it. + \n \b NOTE: Do not use the the multithreaded mode for + Kaldi training (select "sequential" as the threading option). Our script and + binary setups are designed to run multiple processes on a single machine, + presumably maxing out its CPU, and an attempt to multi-thread linear algebra + computations will only adversely impact the performance. + + \section matrixwrap_atlas Automatically Tuned Linear Algebra Software (ATLAS) ATLAS is a well known implementation of BLAS plus a subset of LAPACK. The general idea of ATLAS is to tune to the particular processor setup, so the compilation process is quite complex and can take a while. For this reason, - it can be quite tricky to compile ATLAS. On UNIX-based systems, you can't even do it unless you + it can be quite tricky to compile ATLAS. 
On UNIX-based systems, you can't even do it unless you are root or are friendly with your system administrator, because to compile it you need to turn off CPU throttling; and on Windows, ATLAS does not compile "natively", only in Cygwin. Sometimes it can be a better bet to find libraries that have been compiled by someone else for your particular platform, but we can't offer - much advice on how to do this. ATLAS generally performs better + much advice on how to do this. ATLAS generally performs better than the "reference BLAS" available from Netlib. ATLAS only includes a few LAPACK routines. These include matrix inversion and Cholesky factorization, but not SVD. For this reason we have implemented a couple more of the LAPACK - routines (SVD and eigenvalue decomposition); see + routines (SVD and eigenvalue decomposition); see the next section. - + ATLAS conforms to the BLAS interface, but its interface for the subset of - LAPACK routines that it provides is not the same as Netlib's (it's more - C-like and less FORTRAN-ish). For this reason, there are quite a number of #ifdef's in our code - to switch between the calling styles, depending whether we are + LAPACK routines that it provides is not the same as Netlib's (it's more C-like + and less FORTRAN-ish). For this reason, there are quite a number of \#ifdef's + in our code to switch between the calling styles, depending whether we are linking with ATLAS or CLAPACK. - + \subsection matrixwrap_atlas_install_windows Installing ATLAS (on Windows) For instructions on how to install ATLAS on Windows (and note that these instructions require Cygwin), see the file windows/INSTALL.atlas in our source distribution. Note that our Windows setup is not being - actvely maintained at the moment and we don't anticipate that it will work + actively maintained at the moment and we don't anticipate that it will work very cleanly. \subsection matrixwrap_atlas_install_linux Installing ATLAS (on Linux) @@ -118,39 +180,31 @@ namespace kaldi { pre-built binaries available, they may not be the best binaries possible for your architecture so it is probably a better idea to compile from source. The easiest way to do this - is to cd from "src" to "../tools" and to run ./install_atlas.sh. + is to cd from "src" to "../tools" and to run ./install_atlas.sh. If this does not work, the detailed installation - instructions can be found at: http://math-atlas.sourceforge.net/atlas_install/. - + instructions can be found at: http://math-atlas.sourceforge.net/atlas_install/. + One useful note is that before installing ATLAS you should turn off CPU - throttling using "cpufreq-selector -g performance" (cpufreq-selector may be in - sbin), if it is enabled (see the ATLAS install page). You can first try running the - "install_atlas.sh" script before doing this, to see whether it works-- if CPU + throttling using "cpufreq-selector -g performance" (cpufreq-selector may be in + sbin), if it is enabled (see the ATLAS install page). You can first try running the + "install_atlas.sh" script before doing this, to see whether it works-- if CPU throttling is enabled, the ATLAS installation scripts will die with an error. - - \section matrixwrap_mkl Intel Math Kernel Library (MKL) - Intel MKL also provides C-language interface to the BLAS and LAPACK routines, - and can be used with Kaldi by using the -DHAVE_MKL compiler flag. The linker - flags for MKL tend to be quite different depending on the OS, architecture, - compiler, etc. used. 
We have tested Kaldi on 32-bit Windows and x86_64 (or EMT64) Linux. - Flags for other platforms can be obtained from: - http://software.intel.com/en-us/articles/intel-mkl-link-line-advisor/ \section matrixwrap_openblas OpenBLAS - Kaldi now supports linking against the OpenBLAS library, which is an implementation + Kaldi now supports linking against the OpenBLAS library, which is an implementation of BLAS and parts of LAPACK. OpenBLAS also automatically compiles Netlib's implementation of LAPACK, - so that it can explort LAPACK in its entirety. + so that it can export LAPACK in its entirety. OpenBLAS is a fork of the GotoBLAS project (an assembler-heavy implementation of BLAS) which is no longer being maintained. In order to use GotoBLAS you can cd from "src" to "../tools", type "make openblas", then cd to "../src" and give the correct option to the "configure" script to use OpenBLAS (look at the comments at the top of the configure script to find this option). Thanks to Sola Aina for suggesting this and helping us to get this to work. - + \section matrixwrap_jama Java Matrix Package (JAMA) JAMA is an implementation of linear-algebra routines for Java, written - in collaboration between NIST and MathWorks and put into the public domain + in collaboration between NIST and MathWorks and put into the public domain (see math.nist.gov/javanumerics/jama). We used some of this code to fill in a couple of holes in ATLAS-- specifically, if we're compiling with -DHAVE_ATLAS, we don't have the CLAPACK routines for SVD and eigenvalue @@ -165,7 +219,7 @@ namespace kaldi { directory and see if it succeeds. A lot of compilation issues will manifest themselves as linking errors. In this section we give a summary of some of the more common linking errors (at least, those that relate specifically to the matrix library). - + Depending on the compilation option (-DHAVE_CLAPACK, -DHAVE_LAPACK or -DHAVE_MKL), the code will be expecting to link with different things. When debugging linking errors, bear in mind that the problem could be a mismatch between the compilation @@ -182,7 +236,7 @@ namespace kaldi { s_cat, pow_dd, r_sign, pow_ri, pow_di, s_copy, s_cmp, d_sign \subsection matrix_err_clapack CLAPACK linking errors - + You will get these errors if you compiled with -DHAVE_CLAPACK but did not provide the CLAPACK library. The symbols you will be missing are: @@ -195,15 +249,15 @@ namespace kaldi { but it supplies different symbols. The native CLAPACK version of liblapack has symbols like those above (e.g. sgesvd_, sgetrf_), but the ATLAS version has symbols like clapack_sgetrf and also ones like ATL_sgetrf. - + \subsection matrix_err_blas BLAS linking errors - + You will get these errors if you failed to link against an implementation of BLAS. These errors can also occur if libraries are linked in the wrong order. CLAPACK requires BLAS, so you have to link BLAS after CLAPACK. - + The symbols you will see if you failed to link with BLAS include: - + cblas_sger, cblas_saxpy, cblas_dapy, cblas_ddot, cblas_sdot, cblas_sgemm, cblas_dgemm To fix these, link with a static library like libcblas.a, or do -lcblas (assuming @@ -220,7 +274,7 @@ namespace kaldi { CLAPACK. The cblaswrap library should be invoked before the cblas one. If you are missing cblaswrap, you will see errors about symbols like: - f2c_sgemm, f2c_strsm, f2c_sswap, f2c_scopy, f2c_sspmv, f2c_sdot, f2c_sgemv + f2c_sgemm, f2c_strsm, f2c_sswap, f2c_scopy, f2c_sspmv, f2c_sdot, f2c_sgemv and so on (there are a lot of these symbols). 
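If you are unsure which of these symbols your link line actually provides, one low-tech check (besides running "nm" on the candidate libraries) is to compile and link a minimal translation unit that references a single symbol. The sketch below is not part of Kaldi and assumes a \c cblas.h header is available:

\verbatim
// link_check.cc -- minimal sketch for diagnosing BLAS linking errors.
// If the link step fails with "undefined reference to cblas_ddot", the
// CBLAS library is missing from the link line or appears in the wrong
// position (remember that BLAS must come after CLAPACK).
#include <cblas.h>
#include <cstdio>

int main() {
  double x[2] = {1.0, 2.0}, y[2] = {3.0, 4.0};
  std::printf("dot = %g\n", cblas_ddot(2, x, 1, y, 1));  // expect 11
  return 0;
}
\endverbatim

The same trick works for the ATLAS-style clapack_* symbols or the f2c_* wrappers: reference one symbol, try to link, and see which library resolves it.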
@@ -235,15 +289,15 @@ namespace kaldi { \subsection matrix_err_atl_clapack Missing the ATLAS implementation of (parts of) CLAPACK - These errors can only occur if you compiled wiht the -DHAVE_ATLAS option. + These errors can only occur if you compiled with the -DHAVE_ATLAS option. Atlas's name for the CLAPACK routines are different from clapack's own (they have clapack_ prepended to indicate the origin, which can be quite confusing). If you have undefined references to the following symbols: - + clapack_sgetrf, clapack_sgetri, clapack_dgetrf, clapack_dgetri - then it means you failed to link with an ATLAS library containing these symbols. + then it means you failed to link with an ATLAS library containing these symbols. This may be variously called liblapack.a, libclapack.a or liblapack_atlas.a, but you can tell that it is the right one if it defines a symbol called ATL_cgetrf (type "nm | grep ATL_cgetrf" to see). You may be able to link @@ -254,7 +308,6 @@ namespace kaldi { out is to look inside it using "nm" or "strings". - */ } diff --git a/src/doc/online_decoding.dox b/src/doc/online_decoding.dox index 799bfb5895f..9bcc2575be1 100644 --- a/src/doc/online_decoding.dox +++ b/src/doc/online_decoding.dox @@ -438,6 +438,89 @@ and downloadable models that can be used with online nnet3 decoding, please see http://kaldi-asr.org/models.html (the first model there, the ASPIRE model, includes instructions in a README file). +\subsection online_decoding_nnet3_tcp TCP server for nnet3 online decoding + +The program to run the TCP server is online2-tcp-nnet3-decode-faster located in the +~/src/online2bin folder. The usage is as follows: + +\verbatim +online2-tcp-nnet3-decode-faster <nnet3-in> <fst-in> <word-symbol-table> +\endverbatim + +For example: + +\verbatim +online2-tcp-nnet3-decode-faster model/final.mdl graph/HCLG.fst graph/words.txt +\endverbatim + +The word symbol table is mandatory (unlike other nnet3 online decoding programs) because +the server outputs word strings. Endpointing is mandatory to make the operation of the +program reasonable. Other, non-standard options include: + - port-num - the port the server listens on (by default 5050) + - samp-freq - sampling frequency of the audio (usually 8000 for telephony and 16000 for other uses) + - chunk-length - length of the signal the decoder processes at each step + - output-period - how often we check for changes in the decoding (i.e. the output refresh rate, default 1s) + - num-threads-startup - number of threads used when initializing the iVector extractor + - read-timeout - if the program doesn't receive data within this timeout, the server terminates the connection. + Use -1 to disable this feature. +
+The TCP protocol simply takes a raw signal on input (16-bit signed integer +encoding at the chosen sampling frequency) and outputs simple text using the following +logic: + - each refresh period (output-period argument) the current state of decoding is output + - each line is terminated by '\r' + - once an utterance boundary is detected due to endpointing, a '\n' char is output + +Each output string (delimited by '\r') should be treated as uncertain and can change +entirely until the utterance delimiter ('\n') is sent. The delimiter chars are chosen +specifically in order to make the output look neat in the terminal. It is possible to +use the server with other interfaces, and a web demo (HTML/JS AudioAPI+WebSockets) exists. + +To run the program from the terminal you can use one of the following commands. First, +make sure the server is running and accepting connections.
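As a concrete illustration of the protocol described above, the following is a minimal, hypothetical client sketch (not part of Kaldi; the file name, chunk size and one-second grace period are arbitrary choices) that streams a raw 16-bit PCM file to the server and prints the '\r'/'\n'-delimited hypotheses as they arrive. It assumes a POSIX system. The server start-up command itself is shown next.

\verbatim
// tcp_decode_client.cc -- hypothetical example client, not part of Kaldi.
// Streams a raw 16-bit signed PCM file to the TCP decoding server and
// prints whatever text the server sends back.
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

int main(int argc, char *argv[]) {
  if (argc != 4) {
    std::cerr << "usage: " << argv[0] << " <server-ip> <port> <raw-pcm-file>\n";
    return 1;
  }
  int sock = socket(AF_INET, SOCK_STREAM, 0);
  if (sock < 0) { std::perror("socket"); return 1; }
  sockaddr_in addr;
  std::memset(&addr, 0, sizeof(addr));
  addr.sin_family = AF_INET;
  addr.sin_port = htons(static_cast<uint16_t>(std::stoi(argv[2])));
  if (inet_pton(AF_INET, argv[1], &addr.sin_addr) != 1) {
    std::cerr << "bad server ip\n";
    return 1;
  }
  if (connect(sock, reinterpret_cast<sockaddr *>(&addr), sizeof(addr)) < 0) {
    std::perror("connect");
    return 1;
  }

  std::ifstream audio(argv[3], std::ios::binary);
  std::vector<char> chunk(3200);  // 0.2 s of 8 kHz, 16-bit mono audio.
  char buf[4096];
  ssize_t got;
  while (audio.read(chunk.data(), chunk.size()) || audio.gcount() > 0) {
    ssize_t n = audio.gcount();
    if (send(sock, chunk.data(), n, 0) != n) { std::perror("send"); break; }
    // Print partial results already available; '\r'-terminated lines may
    // still change, '\n' marks the end of an utterance.
    while ((got = recv(sock, buf, sizeof(buf), MSG_DONTWAIT)) > 0)
      std::cout << std::string(buf, got) << std::flush;
  }
  // The server does not know the stream is over, so give it a moment to
  // flush the final result, then disconnect (the client's responsibility).
  sleep(1);
  while ((got = recv(sock, buf, sizeof(buf), MSG_DONTWAIT)) > 0)
    std::cout << std::string(buf, got) << std::flush;
  close(sock);
  return 0;
}
\endverbatim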
Using the Aspire models, the +command should look like this: +\verbatim +online2-tcp-nnet3-decode-faster --samp-freq=8000 --frames-per-chunk=20 --extra-left-context-initial=0 + --frame-subsampling-factor=3 --config=model/conf/online.conf --min-active=200 --max-active=7000 + --beam=15.0 --lattice-beam=6.0 --acoustic-scale=1.0 --port-num=5050 model/final.mdl graph/HCLG.fst graph/words.txt +\endverbatim + +Note that in order to make the communication as simple as possible, the server has to accept +any data on input and cannot figure out when the stream is over. It will therefore not +be able to terminate the connection, and it is the client's responsibility to disconnect +when it is ready to do so. As a fallback for certain situations, the read-timeout option +was added, which automatically disconnects after a chosen number of seconds has passed. +Keep in mind that this is not an ideal solution; it is better to design your +client to properly close the connection when necessary. + +For testing purposes, we will use the netcat program. We will also use sox to re-encode the +files properly from any source. Netcat has an issue that, similarly to what was stated above +about the server, it cannot always interpret the data and usually it won't automatically +disconnect the TCP connection. To get around this, we will use the '-N' switch, which kills +the connection once streaming of the file is complete, but this can have a small side effect of +not reading the whole output from the Kaldi server if the disconnect comes too fast. Just +keep this in mind if you intend to use any of these programs in a production environment. + +To send a WAV file into the server, it first needs to be decoded into raw audio, then it can be +sent to the socket: +\verbatim +sox audio.wav -t raw -c 1 -b 16 -r 8k -e signed-integer - | nc -N localhost 5050 +\endverbatim + +It is possible to play the audio (almost) simultaneously with decoding. It may require installing the +'pv' program (used to throttle the signal into Kaldi at the same speed as the playback): + +\verbatim +sox audio.wav -t raw -c 1 -b 16 -r 8k -e signed-integer - | \ + tee >(play -t raw -r 8k -e signed-integer -b 16 -c 1 -q -) | \ + pv -L 16000 -q | nc -N localhost 5050 +\endverbatim + +Finally, it is possible to send audio from the microphone directly into the server: + +\verbatim +rec -r 8k -e signed-integer -c 1 -b 16 -t raw -q - | nc -N localhost 5050 +\endverbatim */ diff --git a/src/doc/tree_externals.dox b/src/doc/tree_externals.dox index 01f8dc433bf..cc753ef390f 100644 --- a/src/doc/tree_externals.dox +++ b/src/doc/tree_externals.dox @@ -344,7 +344,7 @@ has value 0 (i.e. the leftmost HMM-state). Assuming the answer is "yes", the ne is "SE 2 [ 220 221 222 223 ]", which is asking whether the phone to the right is one of various forms of the phone "M" (a rather unintuitive question to ask, since we're in the leftmost HMM-state); if yes, we ask "SE 0 [ 104 105 106 107... 286 287 ]" which is -a question about the phone to the right; if yes, then the pdf-id is 5 ("CE 5") and if +a question about the phone to the left; if yes, then the pdf-id is 5 ("CE 5") and if no, 696 ("CE 696"). \verbatim s3# copy-tree --binary=false exp/tri1/tree - 2>/dev/null | head -100 diff --git a/src/doc/tutorial_looking.dox b/src/doc/tutorial_looking.dox index 420abfc9bce..831d721c7eb 100644 --- a/src/doc/tutorial_looking.dox +++ b/src/doc/tutorial_looking.dox @@ -171,7 +171,7 @@ making sure have their normal values, begin with KALDI_.
This is a precaution to avoid future conflicts with other codebases (since \#defines don't limit themselves to the kaldi namespace). Notice the style of the function names: LikeThis(). Our style is generally based on - this one , + this one , to conform with OpenFst, but there are some differences. To see other elements of the style, which will help you to understand Kaldi @@ -190,7 +190,7 @@ It prints out the usage, which should give you a generic idea of how Kaldi progr are called. Note that while there is a --config option that can be used to pass a configuration file, in general Kaldi is not as config-driven as HTK and these files are not widely used. You will see a --binary option. In general, Kaldi file -formats come in both binary and test forms, and the --binary option controls how +formats come in both binary and text forms, and the --binary option controls how they are written. However, this only controls how single objects (e.g. acoustic models) are written. For whole collections of objects (e.g. collections of feature files), there is a different mechanism that we will come to later. diff --git a/src/doc/tutorial_prereqs.dox b/src/doc/tutorial_prereqs.dox index 82079a281b9..72b1fcf8ad8 100644 --- a/src/doc/tutorial_prereqs.dox +++ b/src/doc/tutorial_prereqs.dox @@ -51,7 +51,7 @@ The most difficult part of the installation process relates to the math library ATLAS; if this is not already installed as a library on your system you will have to compile it, and this requires that CPU throttling be turned off, which - may require root priveleges. We provide scripts and detailed instructions for + may require root privileges. We provide scripts and detailed instructions for all installation steps. When scripts fail, read the output carefully because it tries to provide guidance as to how to fix problems. Please inform us if there are problems at any point, however minor; see \ref other. diff --git a/src/doc/tutorial_running.dox b/src/doc/tutorial_running.dox index f977348a3cb..d639cd4e664 100644 --- a/src/doc/tutorial_running.dox +++ b/src/doc/tutorial_running.dox @@ -115,14 +115,14 @@ Now go back to the data directory and change directory to /train. Then execute t \verbatim head text -head spk2gender.map +head spk2gender head spk2utt head utt2spk head wav.scp \endverbatim - text - This file contains mappings between utterances and utterance ids which will be used by Kaldi. This file will be turned into an integer format-- still a text file, but with the words replaced with integers. -- spk2gender.map - This file contains mappings between speakers and their gender. This also acts as a list of unique users involved in training. +- spk2gender - This file contains mappings between speakers and their gender. This also acts as a list of unique users involved in training. - spk2utt - This is a mapping between the speaker identifiers and all the utterance identifiers associated with the speaker. - utt2spk - This is a one-to-one mapping between utterance ids and the corresponding speaker identifiers. - wav.scp - This file is actually read directly by Kaldi programs when doing feature extraction. Look at the file again. It is parsed as a set of key-value pairs, where the key is the first string on each line. The value is a kind of "extended filename", and you can guess how it works. Since it is for reading we will refer to this type of string as an "rxfilename" (for writing we use the term wxfilename). See \ref io_sec_xfilename if you are curious. 
Note that although we use the extension .scp, this is not a script file in the HTK sense (i.e. it is not viewed as an extension to the command-line arguments). @@ -383,7 +383,7 @@ do copy-tree --binary=false exp/mono/tree - | less \endverbatim Note that this is a monophone "tree" so it is very trivial-- it -does not have any "splits". Although this tree format was not indended to be +does not have any "splits". Although this tree format was not intended to be very human-readable, we have received a number of queries about the tree format so we will explain it. The rest of this paragraph can be skipped over by the casual reader. After "ToPdf", the tree file contains an object of the @@ -442,7 +442,7 @@ Type \verbatim grep Overall exp/mono/log/acc.{?,??}.{?,??}.log \endverbatim -You can see the acoustic likelihods on each iteration. Next look at one of the files +You can see the acoustic likelihoods on each iteration. Next look at one of the files exp/mono/log/update.*.log to see what kind of information is in the update log. When the monophone training is finished, we can test the monophone decoding. Before decoding, we have to create the decode graph. Type: @@ -505,7 +505,7 @@ gmm-decode-faster \endverbatim to see the usage message, and match up the arguments with what you see in the log file. Recall that "rspecifier" is one of those strings that specifies how to read a table, -and "wspecifier" specifies how to write one. Look carefuly at these arguments and try +and "wspecifier" specifies how to write one. Look carefully at these arguments and try to figure out what they mean. Look at the rspecifier that corresponds to the features, and try to understand it (this one has spaces inside, so Kaldi prints it out with single quotes around it so that you could paste it into the shell and the program would run as intended). diff --git a/src/doc/tutorial_setup.dox b/src/doc/tutorial_setup.dox index 11d97a945f9..13f5e3e9c74 100644 --- a/src/doc/tutorial_setup.dox +++ b/src/doc/tutorial_setup.dox @@ -34,16 +34,16 @@ Assuming Git is installed, to get the latest code you can type \verbatim - git clone https://github.com/kaldi-asr/kaldi.git kaldi-trunk --origin golden + git clone https://github.com/kaldi-asr/kaldi.git \endverbatim - Then cd to kaldi-trunk. Look at the INSTALL file and follow the instructions + Then cd to kaldi. Look at the INSTALL file and follow the instructions (it points you to two subdirectories). Look carefully at the output of the installation scripts, as they try to guide you what to do. Some installation errors are non-fatal, and the installation scripts will tell you so (i.e. there are some things it installs which are nice to have but are not really needed). The "best-case" scenario is that you do: \verbatim - cd kaldi-trunk/tools/; make; cd ../src; ./configure; make + cd kaldi/tools/; make; cd ../src; ./configure; make \endverbatim and everything will just work; however, if this does not happen there are fallback plans (e.g. you may have to install some package on your machine, or run diff --git a/src/doc/versions.dox b/src/doc/versions.dox index b26978b6e4d..08e2c2bbda7 100644 --- a/src/doc/versions.dox +++ b/src/doc/versions.dox @@ -28,7 +28,7 @@ \section versions_scheme Versioning scheme - During its lifetime, Kaldi has has three different versioning methods. + During its lifetime, Kaldi has three different versioning methods. Originally Kaldi was a subversion (svn)-based project, and was hosted on Sourceforge. 
Then Kaldi was moved to github, and for some time the only version-number available was the git hash of the commit. @@ -121,7 +121,7 @@ - Create a nnet3-based setup for RNN language models (i.e. recurrent and neural net based language models) - Some extentions to the core of the nnet3 framework to support constant values and - scalar multiplication without dedicated compoennts. + scalar multiplication without dedicated components. Below are commits corresponding to minor version numbers 5.3.x. diff --git a/src/feat/feature-common-inl.h b/src/feat/feature-common-inl.h index b9c5794a629..26127a4dc4d 100644 --- a/src/feat/feature-common-inl.h +++ b/src/feat/feature-common-inl.h @@ -33,26 +33,26 @@ void OfflineFeatureTpl::ComputeFeatures( Matrix *output) { KALDI_ASSERT(output != NULL); BaseFloat new_sample_freq = computer_.GetFrameOptions().samp_freq; - if (sample_freq == new_sample_freq) + if (sample_freq == new_sample_freq) { Compute(wave, vtln_warp, output); - else { - if (new_sample_freq < sample_freq) { - if (! computer_.GetFrameOptions().allow_downsample) + } else { + if (new_sample_freq < sample_freq && + ! computer_.GetFrameOptions().allow_downsample) KALDI_ERR << "Waveform and config sample Frequency mismatch: " << sample_freq << " .vs " << new_sample_freq - << " ( use --allow_downsample=true option to allow " + << " (use --allow-downsample=true to allow " << " downsampling the waveform)."; - - // Downsample the waveform. - Vector downsampled_wave(wave); - DownsampleWaveForm(sample_freq, wave, - new_sample_freq, &downsampled_wave); - Compute(downsampled_wave, vtln_warp, output); - } else - KALDI_ERR << "New sample Frequency " << new_sample_freq - << " is larger than waveform original sampling frequency " - << sample_freq; - + else if (new_sample_freq > sample_freq && + ! computer_.GetFrameOptions().allow_upsample) + KALDI_ERR << "Waveform and config sample Frequency mismatch: " + << sample_freq << " .vs " << new_sample_freq + << " (use --allow-upsample=true option to allow " + << " upsampling the waveform)."; + // Resample the waveform. + Vector resampled_wave(wave); + ResampleWaveform(sample_freq, wave, + new_sample_freq, &resampled_wave); + Compute(resampled_wave, vtln_warp, output); } } diff --git a/src/feat/feature-common.h b/src/feat/feature-common.h index 45911cef585..3c2fbd37381 100644 --- a/src/feat/feature-common.h +++ b/src/feat/feature-common.h @@ -53,12 +53,12 @@ class ExampleFeatureComputer { } /// Returns the feature dimension - int32 Dim(); + int32 Dim() const; /// Returns true if this function may inspect the raw log-energy of the signal /// (before windowing and pre-emphasis); it's safe to always return true, but /// setting it to false enables an optimization. - bool NeedRawLogEnergy() { return true; } + bool NeedRawLogEnergy() const { return true; } /// constructor from options class; it should not store a reference or pointer /// to the options class but should copy it. @@ -89,7 +89,7 @@ class ExampleFeatureComputer { @param [out] feature Pointer to a vector of size this->Dim(), to which the computed feature will be written. 
*/ - void Compute(BaseFloat signal_log_energy, + void Compute(BaseFloat signal_raw_log_energy, BaseFloat vtln_warp, VectorBase *signal_frame, VectorBase *feature); diff --git a/src/feat/feature-fbank.cc b/src/feat/feature-fbank.cc index 10f7e67d607..8c32cbb1484 100644 --- a/src/feat/feature-fbank.cc +++ b/src/feat/feature-fbank.cc @@ -69,7 +69,7 @@ const MelBanks* FbankComputer::GetMelBanks(BaseFloat vtln_warp) { return this_mel_banks; } -void FbankComputer::Compute(BaseFloat signal_log_energy, +void FbankComputer::Compute(BaseFloat signal_raw_log_energy, BaseFloat vtln_warp, VectorBase *signal_frame, VectorBase *feature) { @@ -82,7 +82,7 @@ void FbankComputer::Compute(BaseFloat signal_log_energy, // Compute energy after window function (not the raw one). if (opts_.use_energy && !opts_.raw_energy) - signal_log_energy = Log(std::max(VecVec(*signal_frame, *signal_frame), + signal_raw_log_energy = Log(std::max(VecVec(*signal_frame, *signal_frame), std::numeric_limits::min())); if (srfft_ != NULL) // Compute FFT using split-radix algorithm. @@ -114,11 +114,11 @@ void FbankComputer::Compute(BaseFloat signal_log_energy, // Copy energy as first value (or the last, if htk_compat == true). if (opts_.use_energy) { - if (opts_.energy_floor > 0.0 && signal_log_energy < log_energy_floor_) { - signal_log_energy = log_energy_floor_; + if (opts_.energy_floor > 0.0 && signal_raw_log_energy < log_energy_floor_) { + signal_raw_log_energy = log_energy_floor_; } int32 energy_index = opts_.htk_compat ? opts_.mel_opts.num_bins : 0; - (*feature)(energy_index) = signal_log_energy; + (*feature)(energy_index) = signal_raw_log_energy; } } diff --git a/src/feat/feature-fbank.h b/src/feat/feature-fbank.h index 724d7d148dc..f57d185a41c 100644 --- a/src/feat/feature-fbank.h +++ b/src/feat/feature-fbank.h @@ -94,7 +94,7 @@ class FbankComputer { return opts_.mel_opts.num_bins + (opts_.use_energy ? 1 : 0); } - bool NeedRawLogEnergy() { return opts_.use_energy && opts_.raw_energy; } + bool NeedRawLogEnergy() const { return opts_.use_energy && opts_.raw_energy; } const FrameExtractionOptions &GetFrameOptions() const { return opts_.frame_opts; @@ -121,7 +121,7 @@ class FbankComputer { @param [out] feature Pointer to a vector of size this->Dim(), to which the computed feature will be written. */ - void Compute(BaseFloat signal_log_energy, + void Compute(BaseFloat signal_raw_log_energy, BaseFloat vtln_warp, VectorBase *signal_frame, VectorBase *feature); diff --git a/src/feat/feature-mfcc.cc b/src/feat/feature-mfcc.cc index 899988c2822..73ab4b312c4 100644 --- a/src/feat/feature-mfcc.cc +++ b/src/feat/feature-mfcc.cc @@ -25,7 +25,7 @@ namespace kaldi { -void MfccComputer::Compute(BaseFloat signal_log_energy, +void MfccComputer::Compute(BaseFloat signal_raw_log_energy, BaseFloat vtln_warp, VectorBase *signal_frame, VectorBase *feature) { @@ -35,8 +35,8 @@ void MfccComputer::Compute(BaseFloat signal_log_energy, const MelBanks &mel_banks = *(GetMelBanks(vtln_warp)); if (opts_.use_energy && !opts_.raw_energy) - signal_log_energy = Log(std::max(VecVec(*signal_frame, *signal_frame), - std::numeric_limits::min())); + signal_raw_log_energy = Log(std::max(VecVec(*signal_frame, *signal_frame), + std::numeric_limits::epsilon())); if (srfft_ != NULL) // Compute FFT using the split-radix algorithm. 
srfft_->Compute(signal_frame->Data(), true); @@ -62,9 +62,9 @@ void MfccComputer::Compute(BaseFloat signal_log_energy, feature->MulElements(lifter_coeffs_); if (opts_.use_energy) { - if (opts_.energy_floor > 0.0 && signal_log_energy < log_energy_floor_) - signal_log_energy = log_energy_floor_; - (*feature)(0) = signal_log_energy; + if (opts_.energy_floor > 0.0 && signal_raw_log_energy < log_energy_floor_) + signal_raw_log_energy = log_energy_floor_; + (*feature)(0) = signal_raw_log_energy; } if (opts_.htk_compat) { diff --git a/src/feat/feature-mfcc.h b/src/feat/feature-mfcc.h index 66c52e89821..dbfb9d60364 100644 --- a/src/feat/feature-mfcc.h +++ b/src/feat/feature-mfcc.h @@ -96,7 +96,7 @@ class MfccComputer { int32 Dim() const { return opts_.num_ceps; } - bool NeedRawLogEnergy() { return opts_.use_energy && opts_.raw_energy; } + bool NeedRawLogEnergy() const { return opts_.use_energy && opts_.raw_energy; } /** Function that computes one frame of features from @@ -119,7 +119,7 @@ class MfccComputer { @param [out] feature Pointer to a vector of size this->Dim(), to which the computed feature will be written. */ - void Compute(BaseFloat signal_log_energy, + void Compute(BaseFloat signal_raw_log_energy, BaseFloat vtln_warp, VectorBase *signal_frame, VectorBase *feature); @@ -129,6 +129,7 @@ class MfccComputer { // disallow assignment. MfccComputer &operator = (const MfccComputer &in); + protected: const MelBanks *GetMelBanks(BaseFloat vtln_warp); MfccOptions opts_; diff --git a/src/feat/feature-plp.cc b/src/feat/feature-plp.cc index 8f4a7d66161..e0c270c7061 100644 --- a/src/feat/feature-plp.cc +++ b/src/feat/feature-plp.cc @@ -109,7 +109,7 @@ const Vector *PlpComputer::GetEqualLoudness(BaseFloat vtln_warp) { return ans; } -void PlpComputer::Compute(BaseFloat signal_log_energy, +void PlpComputer::Compute(BaseFloat signal_raw_log_energy, BaseFloat vtln_warp, VectorBase *signal_frame, VectorBase *feature) { @@ -124,7 +124,7 @@ void PlpComputer::Compute(BaseFloat signal_log_energy, if (opts_.use_energy && !opts_.raw_energy) - signal_log_energy = Log(std::max(VecVec(*signal_frame, *signal_frame), + signal_raw_log_energy = Log(std::max(VecVec(*signal_frame, *signal_frame), std::numeric_limits::min())); if (srfft_ != NULL) // Compute FFT using split-radix algorithm. @@ -174,9 +174,9 @@ void PlpComputer::Compute(BaseFloat signal_log_energy, feature->Scale(opts_.cepstral_scale); if (opts_.use_energy) { - if (opts_.energy_floor > 0.0 && signal_log_energy < log_energy_floor_) - signal_log_energy = log_energy_floor_; - (*feature)(0) = signal_log_energy; + if (opts_.energy_floor > 0.0 && signal_raw_log_energy < log_energy_floor_) + signal_raw_log_energy = log_energy_floor_; + (*feature)(0) = signal_raw_log_energy; } if (opts_.htk_compat) { // reorder the features. diff --git a/src/feat/feature-plp.h b/src/feat/feature-plp.h index 958c5706e89..4f156ca1e88 100644 --- a/src/feat/feature-plp.h +++ b/src/feat/feature-plp.h @@ -110,7 +110,7 @@ class PlpComputer { int32 Dim() const { return opts_.num_ceps; } - bool NeedRawLogEnergy() { return opts_.use_energy && opts_.raw_energy; } + bool NeedRawLogEnergy() const { return opts_.use_energy && opts_.raw_energy; } /** Function that computes one frame of features from @@ -133,7 +133,7 @@ class PlpComputer { @param [out] feature Pointer to a vector of size this->Dim(), to which the computed feature will be written. 
*/ - void Compute(BaseFloat signal_log_energy, + void Compute(BaseFloat signal_raw_log_energy, BaseFloat vtln_warp, VectorBase *signal_frame, VectorBase *feature); diff --git a/src/feat/feature-spectrogram.cc b/src/feat/feature-spectrogram.cc index d2daa7aa829..7eee2643cf5 100644 --- a/src/feat/feature-spectrogram.cc +++ b/src/feat/feature-spectrogram.cc @@ -44,7 +44,7 @@ SpectrogramComputer::~SpectrogramComputer() { delete srfft_; } -void SpectrogramComputer::Compute(BaseFloat signal_log_energy, +void SpectrogramComputer::Compute(BaseFloat signal_raw_log_energy, BaseFloat vtln_warp, VectorBase *signal_frame, VectorBase *feature) { @@ -54,7 +54,7 @@ void SpectrogramComputer::Compute(BaseFloat signal_log_energy, // Compute energy after window function (not the raw one) if (!opts_.raw_energy) - signal_log_energy = Log(std::max(VecVec(*signal_frame, *signal_frame), + signal_raw_log_energy = Log(std::max(VecVec(*signal_frame, *signal_frame), std::numeric_limits::epsilon())); if (srfft_ != NULL) // Compute FFT using split-radix algorithm. @@ -72,11 +72,11 @@ void SpectrogramComputer::Compute(BaseFloat signal_log_energy, feature->CopyFromVec(power_spectrum); - if (opts_.energy_floor > 0.0 && signal_log_energy < log_energy_floor_) - signal_log_energy = log_energy_floor_; + if (opts_.energy_floor > 0.0 && signal_raw_log_energy < log_energy_floor_) + signal_raw_log_energy = log_energy_floor_; // The zeroth spectrogram component is always set to the signal energy, // instead of the square of the constant component of the signal. - (*feature)(0) = signal_log_energy; + (*feature)(0) = signal_raw_log_energy; } } // namespace kaldi diff --git a/src/feat/feature-spectrogram.h b/src/feat/feature-spectrogram.h index 9aeb68c8df8..132a6875e00 100644 --- a/src/feat/feature-spectrogram.h +++ b/src/feat/feature-spectrogram.h @@ -70,7 +70,7 @@ class SpectrogramComputer { int32 Dim() const { return opts_.frame_opts.PaddedWindowSize() / 2 + 1; } - bool NeedRawLogEnergy() { return opts_.raw_energy; } + bool NeedRawLogEnergy() const { return opts_.raw_energy; } /** @@ -91,7 +91,7 @@ class SpectrogramComputer { @param [out] feature Pointer to a vector of size this->Dim(), to which the computed feature will be written. */ - void Compute(BaseFloat signal_log_energy, + void Compute(BaseFloat signal_raw_log_energy, BaseFloat vtln_warp, VectorBase *signal_frame, VectorBase *feature); diff --git a/src/feat/feature-window.h b/src/feat/feature-window.h index c249414259c..a7abba50eca 100644 --- a/src/feat/feature-window.h +++ b/src/feat/feature-window.h @@ -40,14 +40,16 @@ struct FrameExtractionOptions { BaseFloat preemph_coeff; // Preemphasis coefficient. bool remove_dc_offset; // Subtract mean of wave before FFT. std::string window_type; // e.g. Hamming window - bool round_to_power_of_two; - BaseFloat blackman_coeff; - bool snip_edges; - bool allow_downsample; // May be "hamming", "rectangular", "povey", "hanning", "blackman" // "povey" is a window I made to be similar to Hamming but to go to zero at the // edges, it's pow((0.5 - 0.5*cos(n/N*2*pi)), 0.85) // I just don't think the Hamming window makes sense as a windowing function. 
+ bool round_to_power_of_two; + BaseFloat blackman_coeff; + bool snip_edges; + bool allow_downsample; + bool allow_upsample; + int max_feature_vectors; FrameExtractionOptions(): samp_freq(16000), frame_shift_ms(10.0), @@ -59,7 +61,10 @@ struct FrameExtractionOptions { round_to_power_of_two(true), blackman_coeff(0.42), snip_edges(true), - allow_downsample(false) { } + allow_downsample(false), + allow_upsample(false), + max_feature_vectors(-1) + { } void Register(OptionsItf *opts) { opts->Register("sample-frequency", &samp_freq, @@ -90,6 +95,13 @@ struct FrameExtractionOptions { opts->Register("allow-downsample", &allow_downsample, "If true, allow the input waveform to have a higher frequency than " "the specified --sample-frequency (and we'll downsample)."); + opts->Register("max-feature-vectors", &max_feature_vectors, + "Memory optimization. If larger than 0, periodically remove feature " + "vectors so that only this number of the latest feature vectors is " + "retained."); + opts->Register("allow-upsample", &allow_upsample, + "If true, allow the input waveform to have a lower frequency than " + "the specified --sample-frequency (and we'll upsample)."); } int32 WindowShift() const { return static_cast(samp_freq * 0.001 * frame_shift_ms); @@ -150,7 +162,7 @@ void Preemphasize(VectorBase *waveform, BaseFloat preemph_coeff); /** This function does all the windowing steps after actually - extracting the windowed signal: depeding on the + extracting the windowed signal: depending on the configuration, it does dithering, dc offset removal, preemphasis, and multiplication by the windowing function. @param [in] opts The options class to be used diff --git a/src/feat/mel-computations.h b/src/feat/mel-computations.h index 5df36c8cb90..7ac2aea528f 100644 --- a/src/feat/mel-computations.h +++ b/src/feat/mel-computations.h @@ -50,7 +50,7 @@ struct MelBanksOptions { // to the Nyquist frequency to get the cutoff. bool debug_mel; // htk_mode is a "hidden" config, it does not show up on command line. - // Enables more exact compatibibility with HTK, for testing purposes. Affects + // Enables more exact compatibility with HTK, for testing purposes. Affects // mel-energy flooring and reproduces a bug in HTK. bool htk_mode; explicit MelBanksOptions(int num_bins = 25) @@ -63,7 +63,7 @@ struct MelBanksOptions { opts->Register("low-freq", &low_freq, "Low cutoff frequency for mel bins"); opts->Register("high-freq", &high_freq, - "High cutoff frequency for mel bins (if < 0, offset from Nyquist)"); + "High cutoff frequency for mel bins (if <= 0, offset from Nyquist)"); opts->Register("vtln-low", &vtln_low, "Low inflection point in piecewise linear VTLN warping function"); opts->Register("vtln-high", &vtln_high, @@ -116,6 +116,10 @@ class MelBanks { // returns vector of central freq of each bin; needed by plp code. 
const Vector &GetCenterFreqs() const { return center_freqs_; } + const std::vector > >& GetBins() const { + return bins_; + } + // Copy constructor MelBanks(const MelBanks &other); private: @@ -145,7 +149,7 @@ void ComputeLifterCoeffs(BaseFloat Q, VectorBase *coeffs); // pTmp - temporal place [n] // pAC - autocorrelation coefficients [n + 1] // pLP - linear prediction coefficients [n] (predicted_sn = sum_1^P{a[i] * s[n-i]}}) -// F(z) = 1 / (1 - A(z)), 1 is not stored in the demoninator +// F(z) = 1 / (1 - A(z)), 1 is not stored in the denominator // Returns log energy of residual (I think) BaseFloat Durbin(int n, const BaseFloat *pAC, BaseFloat *pLP, BaseFloat *pTmp); diff --git a/src/feat/online-feature-test.cc b/src/feat/online-feature-test.cc index e3a1d5f99f3..7ba6c7c32be 100644 --- a/src/feat/online-feature-test.cc +++ b/src/feat/online-feature-test.cc @@ -375,6 +375,45 @@ void TestOnlineAppendFeature() { } } +void TestRecyclingVector() { + RecyclingVector full_vec; + RecyclingVector shrinking_vec(10); + for (int i = 0; i != 100; ++i) { + Vector data(1); + data.Set(i); + full_vec.PushBack(new Vector(data)); + shrinking_vec.PushBack(new Vector(data)); + } + KALDI_ASSERT(full_vec.Size() == 100); + KALDI_ASSERT(shrinking_vec.Size() == 100); + + // full_vec should contain everything + for (int i = 0; i != 100; ++i) { + Vector *data = full_vec.At(i); + KALDI_ASSERT(data != nullptr); + KALDI_ASSERT((*data)(0) == static_cast(i)); + } + + // shrinking_vec may throw an exception for the first 90 elements + int caught_exceptions = 0; + for (int i = 0; i != 90; ++i) { + try { + shrinking_vec.At(i); + } catch (const std::runtime_error &) { + ++caught_exceptions; + } + } + // it may actually store a bit more elements for performance efficiency considerations + KALDI_ASSERT(caught_exceptions >= 80); + + // shrinking_vec should contain the last 10 elements + for (int i = 90; i != 100; ++i) { + Vector *data = shrinking_vec.At(i); + KALDI_ASSERT(data != nullptr); + KALDI_ASSERT((*data)(0) == static_cast(i)); + } +} + } // end namespace kaldi int main() { @@ -387,6 +426,7 @@ int main() { TestOnlinePlp(); TestOnlineTransform(); TestOnlineAppendFeature(); + TestRecyclingVector(); } std::cout << "Test OK.\n"; } diff --git a/src/feat/online-feature.cc b/src/feat/online-feature.cc index 88d21473b9f..b7f5b3ebc60 100644 --- a/src/feat/online-feature.cc +++ b/src/feat/online-feature.cc @@ -24,50 +24,142 @@ namespace kaldi { -template +RecyclingVector::RecyclingVector(int items_to_hold): + items_to_hold_(items_to_hold == 0 ? -1 : items_to_hold), + first_available_index_(0) { +} + +RecyclingVector::~RecyclingVector() { + for (auto *item : items_) { + delete item; + } +} + +Vector *RecyclingVector::At(int index) const { + if (index < first_available_index_) { + KALDI_ERR << "Attempted to retrieve feature vector that was " + "already removed by the RecyclingVector (index = " + << index << "; " + << "first_available_index = " << first_available_index_ << "; " + << "size = " << Size() << ")"; + } + // 'at' does size checking. + return items_.at(index - first_available_index_); +} + +void RecyclingVector::PushBack(Vector *item) { + if (items_.size() == items_to_hold_) { + delete items_.front(); + items_.pop_front(); + ++first_available_index_; + } + items_.push_back(item); +} + +int RecyclingVector::Size() const { + return first_available_index_ + items_.size(); +} + +template void OnlineGenericBaseFeature::GetFrame(int32 frame, VectorBase *feat) { - // 'at' does size checking. 
- feat->CopyFromVec(*(features_.at(frame))); + feat->CopyFromVec(*(features_.At(frame))); }; -template +template OnlineGenericBaseFeature::OnlineGenericBaseFeature( const typename C::Options &opts): computer_(opts), window_function_(computer_.GetFrameOptions()), + features_(opts.frame_opts.max_feature_vectors), input_finished_(false), waveform_offset_(0) { } -template -void OnlineGenericBaseFeature::AcceptWaveform(BaseFloat sampling_rate, - const VectorBase &waveform) { + +template +void OnlineGenericBaseFeature::MaybeCreateResampler( + BaseFloat sampling_rate) { BaseFloat expected_sampling_rate = computer_.GetFrameOptions().samp_freq; - if (sampling_rate != expected_sampling_rate) + + if (resampler_ != nullptr) { + KALDI_ASSERT(resampler_->GetInputSamplingRate() == sampling_rate); + KALDI_ASSERT(resampler_->GetOutputSamplingRate() == expected_sampling_rate); + } else if (((sampling_rate > expected_sampling_rate) && + !computer_.GetFrameOptions().allow_downsample) || + ((sampling_rate > expected_sampling_rate) && + !computer_.GetFrameOptions().allow_upsample)) { + resampler_.reset(new LinearResample( + sampling_rate, expected_sampling_rate, + std::min(sampling_rate / 2, expected_sampling_rate / 2), 6)); + } else if (sampling_rate != expected_sampling_rate) { KALDI_ERR << "Sampling frequency mismatch, expected " - << expected_sampling_rate << ", got " << sampling_rate; - if (waveform.Dim() == 0) + << expected_sampling_rate << ", got " << sampling_rate + << "\nPerhaps you want to use the options " + "--allow_{upsample,downsample}"; + } +} + +template +void OnlineGenericBaseFeature::InputFinished() { + if (resampler_ != nullptr) { + // There may be a few samples left once we flush the resampler_ object, telling it + // that the file has finished. This should rarely make any difference. + Vector appended_wave; + Vector resampled_wave; + resampler_->Resample(appended_wave, true, &resampled_wave); + + if (resampled_wave.Dim() != 0) { + appended_wave.Resize(waveform_remainder_.Dim() + + resampled_wave.Dim()); + if (waveform_remainder_.Dim() != 0) + appended_wave.Range(0, waveform_remainder_.Dim()) + .CopyFromVec(waveform_remainder_); + appended_wave.Range(waveform_remainder_.Dim(), resampled_wave.Dim()) + .CopyFromVec(resampled_wave); + waveform_remainder_.Swap(&appended_wave); + } + } + input_finished_ = true; + ComputeFeatures(); +} + +template +void OnlineGenericBaseFeature::AcceptWaveform( + BaseFloat sampling_rate, const VectorBase &original_waveform) { + if (original_waveform.Dim() == 0) return; // Nothing to do. if (input_finished_) KALDI_ERR << "AcceptWaveform called after InputFinished() was called."; - // append 'waveform' to 'waveform_remainder_.' 
- Vector appended_wave(waveform_remainder_.Dim() + waveform.Dim()); + + Vector appended_wave; + Vector resampled_wave; + + const VectorBase *waveform; + + MaybeCreateResampler(sampling_rate); + if (resampler_ == nullptr) { + waveform = &original_waveform; + } else { + resampler_->Resample(original_waveform, false, &resampled_wave); + waveform = &resampled_wave; + } + + appended_wave.Resize(waveform_remainder_.Dim() + waveform->Dim()); if (waveform_remainder_.Dim() != 0) - appended_wave.Range(0, waveform_remainder_.Dim()).CopyFromVec( - waveform_remainder_); - appended_wave.Range(waveform_remainder_.Dim(), waveform.Dim()).CopyFromVec( - waveform); + appended_wave.Range(0, waveform_remainder_.Dim()) + .CopyFromVec(waveform_remainder_); + appended_wave.Range(waveform_remainder_.Dim(), waveform->Dim()) + .CopyFromVec(*waveform); waveform_remainder_.Swap(&appended_wave); ComputeFeatures(); } -template +template void OnlineGenericBaseFeature::ComputeFeatures() { const FrameExtractionOptions &frame_opts = computer_.GetFrameOptions(); int64 num_samples_total = waveform_offset_ + waveform_remainder_.Dim(); - int32 num_frames_old = features_.size(), + int32 num_frames_old = features_.Size(), num_frames_new = NumFrames(num_samples_total, frame_opts, input_finished_); KALDI_ASSERT(num_frames_new >= num_frames_old); - features_.resize(num_frames_new, NULL); Vector window; bool need_raw_log_energy = computer_.NeedRawLogEnergy(); @@ -81,7 +173,7 @@ void OnlineGenericBaseFeature::ComputeFeatures() { // note: this online feature-extraction code does not support VTLN. BaseFloat vtln_warp = 1.0; computer_.Compute(raw_log_energy, vtln_warp, &window, this_feature); - features_[frame] = this_feature; + features_.PushBack(this_feature); } // OK, we will now discard any portion of the signal that will not be // necessary to compute frames in the future. @@ -110,7 +202,6 @@ template class OnlineGenericBaseFeature; template class OnlineGenericBaseFeature; template class OnlineGenericBaseFeature; - OnlineCmvnState::OnlineCmvnState(const OnlineCmvnState &other): speaker_cmvn_stats(other.speaker_cmvn_stats), global_cmvn_stats(other.global_cmvn_stats), @@ -138,8 +229,6 @@ void OnlineCmvnState::Read(std::istream &is, bool binary) { ExpectToken(is, binary, ""); } - - OnlineCmvn::OnlineCmvn(const OnlineCmvnOptions &opts, const OnlineCmvnState &cmvn_state, OnlineFeatureInterface *src): @@ -293,7 +382,8 @@ void OnlineCmvn::SmoothOnlineCmvnStats(const MatrixBase &speaker_stats, // If count exceeded cmn_window it would be an error in how "window_stats" // was accumulated. KALDI_ASSERT(cur_count <= 1.001 * opts.cmn_window); - if (cur_count >= opts.cmn_window) return; + if (cur_count >= opts.cmn_window) + return; if (speaker_stats.NumRows() != 0) { // if we have speaker stats.. 
double count_from_speaker = opts.cmn_window - cur_count, speaker_count = speaker_stats(0, dim); @@ -306,7 +396,8 @@ void OnlineCmvn::SmoothOnlineCmvnStats(const MatrixBase &speaker_stats, speaker_stats); cur_count = (*stats)(0, dim); } - if (cur_count >= opts.cmn_window) return; + if (cur_count >= opts.cmn_window) + return; if (global_stats.NumRows() != 0) { double count_from_global = opts.cmn_window - cur_count, global_count = global_stats(0, dim); @@ -398,7 +489,7 @@ void OnlineCmvn::SetState(const OnlineCmvnState &cmvn_state) { int32 OnlineSpliceFrames::NumFramesReady() const { int32 num_frames = src_->NumFramesReady(); - if (num_frames > 0 && src_->IsLastFrame(num_frames-1)) + if (num_frames > 0 && src_->IsLastFrame(num_frames - 1)) return num_frames; else return std::max(0, num_frames - right_context_); diff --git a/src/feat/online-feature.h b/src/feat/online-feature.h index d41bb6747c7..4f66ffef2ff 100644 --- a/src/feat/online-feature.h +++ b/src/feat/online-feature.h @@ -41,6 +41,36 @@ namespace kaldi { /// @{ +/// This class serves as a storage for feature vectors with an option to limit +/// the memory usage by removing old elements. The deleted frames indices are +/// "remembered" so that regardless of the MAX_ITEMS setting, the user always +/// provides the indices as if no deletion was being performed. +/// This is useful when processing very long recordings which would otherwise +/// cause the memory to eventually blow up when the features are not being removed. +class RecyclingVector { +public: + /// By default it does not remove any elements. + RecyclingVector(int items_to_hold = -1); + + /// The ownership is being retained by this collection - do not delete the item. + Vector *At(int index) const; + + /// The ownership of the item is passed to this collection - do not delete the item. + void PushBack(Vector *item); + + /// This method returns the size as if no "recycling" had happened, + /// i.e. equivalent to the number of times the PushBack method has been called. + int Size() const; + + ~RecyclingVector(); + +private: + std::deque*> items_; + int items_to_hold_; + int first_available_index_; +}; + + /// This is a templated class for online feature extraction; /// it's templated on a class like MfccComputer or PlpComputer /// that does the basic feature extraction. @@ -61,7 +91,7 @@ class OnlineGenericBaseFeature: public OnlineBaseFeature { return computer_.GetFrameOptions().frame_shift_ms / 1000.0f; } - virtual int32 NumFramesReady() const { return features_.size(); } + virtual int32 NumFramesReady() const { return features_.Size(); } virtual void GetFrame(int32 frame, VectorBase *feat); @@ -83,14 +113,7 @@ class OnlineGenericBaseFeature: public OnlineBaseFeature { // more waveform. This will help flush out the last frame or two // of features, in the case where snip-edges == false; it also // affects the return value of IsLastFrame(). - virtual void InputFinished() { - input_finished_ = true; - ComputeFeatures(); - } - - ~OnlineGenericBaseFeature() { - DeletePointers(&features_); - } + virtual void InputFinished(); private: // This function computes any additional feature frames that it is possible to @@ -101,13 +124,19 @@ class OnlineGenericBaseFeature: public OnlineBaseFeature { // waveform_remainder_ while incrementing waveform_offset_ by the same amount. 
void ComputeFeatures(); + void MaybeCreateResampler(BaseFloat sampling_rate); + C computer_; // class that does the MFCC or PLP or filterbank computation + // resampler in cases when the input sampling frequency is not equal to + // the expected sampling rate + std::unique_ptr resampler_; + FeatureWindowFunction window_function_; // features_ is the Mfcc or Plp or Fbank features that we have already computed. - std::vector*> features_; + RecyclingVector features_; // True if the user has called "InputFinished()" bool input_finished_; diff --git a/src/feat/pitch-functions-test.cc b/src/feat/pitch-functions-test.cc index 098e590a8e9..0e481c18674 100644 --- a/src/feat/pitch-functions-test.cc +++ b/src/feat/pitch-functions-test.cc @@ -449,7 +449,7 @@ static void UnitTestKeeleNccfBallast() { // use pitch code with default configuration.. PitchExtractionOptions op; op.nccf_ballast = 0.05 * k; - KALDI_LOG << " nccf_ballast " << op.nccf_ballast << std::endl; + KALDI_LOG << " nccf_ballast " << op.nccf_ballast; // compute pitch. Matrix m; ComputeKaldiPitch(op, waveform, &m); @@ -493,7 +493,7 @@ static void UnitTestPitchExtractionSpeed() { double tot_time = timer.Elapsed(), speech_time = test_num * waveform.Dim() / wave.SampFreq(); KALDI_LOG << " Pitch extraction time per second of speech is " - << (tot_time / speech_time) << " seconds " << std::endl; + << (tot_time / speech_time) << " seconds."; } } static void UnitTestPitchExtractorCompareKeele() { diff --git a/src/feat/resample.cc b/src/feat/resample.cc index 518685d85c8..11f4c62bf1c 100644 --- a/src/feat/resample.cc +++ b/src/feat/resample.cc @@ -302,7 +302,7 @@ void ArbitraryResample::Resample(const VectorBase &input, VectorBase *output) const { KALDI_ASSERT(input.Dim() == num_samples_in_ && output->Dim() == weights_.size()); - + int32 output_dim = output->Dim(); for (int32 i = 0; i < output_dim; i++) { SubVector input_part(input, first_index_[i], weights_[i].Dim()); @@ -365,13 +365,13 @@ BaseFloat ArbitraryResample::FilterFunc(BaseFloat t) const { return filter * window; } -void DownsampleWaveForm(BaseFloat orig_freq, const VectorBase &wave, - BaseFloat new_freq, Vector *new_wave) { - KALDI_ASSERT(new_freq < orig_freq); - BaseFloat lowpass_cutoff = 0.99 * 0.5 * new_freq; +void ResampleWaveform(BaseFloat orig_freq, const VectorBase &wave, + BaseFloat new_freq, Vector *new_wave) { + BaseFloat min_freq = std::min(orig_freq, new_freq); + BaseFloat lowpass_cutoff = 0.99 * 0.5 * min_freq; int32 lowpass_filter_width = 6; - LinearResample signal_downsampler(orig_freq, new_freq, - lowpass_cutoff, lowpass_filter_width); - signal_downsampler.Resample(wave, true, new_wave); + LinearResample resampler(orig_freq, new_freq, + lowpass_cutoff, lowpass_filter_width); + resampler.Resample(wave, true, new_wave); } } // namespace kaldi diff --git a/src/feat/resample.h b/src/feat/resample.h index cc3e5064863..e0b4688c99b 100644 --- a/src/feat/resample.h +++ b/src/feat/resample.h @@ -40,7 +40,7 @@ namespace kaldi { /** \file[resample.h] - + This header contains declarations of classes for resampling signals. The normal cases of resampling a signal are upsampling and downsampling (increasing and decreasing the sample rate of a signal, respectively), @@ -51,7 +51,7 @@ namespace kaldi { The input signal is always evenly spaced, say sampled with frequency S, and we assume the original signal was band-limited to S/2 or lower. The n'th input sample x_n (with n = 0, 1, ...) is interpreted as the original - signal's value at time n/S. + signal's value at time n/S. 
For resampling, it is convenient to view the input signal as a continuous function x(t) of t, where each sample x_n becomes a delta function @@ -73,14 +73,14 @@ namespace kaldi { means we window the sinc function out to its first zero on the left and right, w = 2 means the second zero, and so on; we normally choose w to be at least two. We call this num_zeros, not w, in the code. - + Convolving the signal x(t) with this windowed filter h(t) = f(t)g(t) and evaluating the resulting signal s(t) at an arbitrary time t is easy: we have \f[ s(t) = 1/S \sum_n x_n h(t - n/S) \f]. (note: the sign of t - n/S might be wrong, but it doesn't matter as the filter and window are symmetric). This is true for arbitrary values of t. What the class ArbitraryResample does - is to allow you to evaluate the signal for specified values of t. + is to allow you to evaluate the signal for specified values of t. */ @@ -90,7 +90,7 @@ namespace kaldi { don't have to be linearly spaced. The low-pass filter cutoff "filter_cutoff_hz" should be less than half the sample rate; "num_zeros" should probably be at least two preferably more; higher numbers give - sharper filters but will be less efficient. + sharper filters but will be less efficient. */ class ArbitraryResample { public: @@ -115,7 +115,7 @@ class ArbitraryResample { /// This version of the Resample function processes just /// one vector. void Resample(const VectorBase &input, - VectorBase *output) const; + VectorBase *output) const; private: void SetIndexes(const Vector &sample_points); @@ -185,6 +185,10 @@ class LinearResample { /// Resample(x, y, true) for the last piece. Call it unnecessarily between /// signals will not do any harm. void Reset(); + + //// Return the input and output sampling rates (for checks, for example) + inline int32 GetInputSamplingRate() { return samp_rate_in_; } + inline int32 GetOutputSamplingRate() { return samp_rate_out_; } private: /// This function outputs the number of output samples we will output /// for a signal with "input_num_samp" input samples. If flush == true, @@ -248,20 +252,35 @@ class LinearResample { ///< previously seen input signal. }; -/// Downsample a waveform. This is a convenience wrapper for the -/// class 'LinearResample'. -/// The low-pass filter cutoff used in 'LinearResample' is 0.99 of half of the -/// new_freq and num_zeros is 6. -/// The downsampling results is also checked wit sox resampling toolkit. -/// Sox design is inspired by Laurent De Soras' paper, -/// https://ccrma.stanford.edu/~jos/resample/Implementation.html -/// It designs low pass filter using pass-band, stop-band, Nyquist freq -/// and stop-band attenuation. -/// e.g. The mainlob for Hanning window is 4pi/M, where the main-lobe width is -/// equal to (pass-band-freq - stop-band-freq). -/// Also the cutoff frequency is equal to (pass-band-freq - stop-band-freq). -void DownsampleWaveForm(BaseFloat orig_freq, const VectorBase &wave, - BaseFloat new_freq, Vector *new_wave); +/** + Downsample or upsample a waveform. This is a convenience wrapper for the + class 'LinearResample'. + The low-pass filter cutoff used in 'LinearResample' is 0.99 of the Nyquist, + where the Nyquist is half of the minimum of (orig_freq, new_freq). The + resampling is done with a symmetric FIR filter with N_z (number of zeros) + as 6. + + We compared the downsampling results with those from the sox resampling + toolkit. 
+ Sox's design is inspired by Laurent De Soras' paper, + https://ccrma.stanford.edu/~jos/resample/Implementation.html + + Note: we expect that while orig_freq and new_freq are of type BaseFloat, they + are actually required to have exact integer values (like 16000 or 8000) with + a ratio between them that can be expressed as a rational number with + reasonably small integer factors. +*/ +void ResampleWaveform(BaseFloat orig_freq, const VectorBase &wave, + BaseFloat new_freq, Vector *new_wave); + + +/// This function is deprecated. It is provided for backward compatibility, to avoid +/// breaking older code. +inline void DownsampleWaveForm(BaseFloat orig_freq, const VectorBase &wave, + BaseFloat new_freq, Vector *new_wave) { + ResampleWaveform(orig_freq, wave, new_freq, new_wave); +} + /// @} End of "addtogroup feat" } // namespace kaldi diff --git a/src/feat/wave-reader.h b/src/feat/wave-reader.h index 7ba981c2c24..dae74139232 100644 --- a/src/feat/wave-reader.h +++ b/src/feat/wave-reader.h @@ -119,7 +119,7 @@ class WaveData { void Write(std::ostream &os) const; // This function returns the wave data-- it's in a matrix - // becase there may be multiple channels. In the normal case + // because there may be multiple channels. In the normal case // there's just one channel so Data() will have one row. const Matrix &Data() const { return data_; } diff --git a/src/featbin/compute-fbank-feats.cc b/src/featbin/compute-fbank-feats.cc index 41df621d62d..e52b30bafb6 100644 --- a/src/featbin/compute-fbank-feats.cc +++ b/src/featbin/compute-fbank-feats.cc @@ -19,9 +19,9 @@ // limitations under the License. #include "base/kaldi-common.h" -#include "util/common-utils.h" #include "feat/feature-fbank.h" #include "feat/wave-reader.h" +#include "util/common-utils.h" int main(int argc, char *argv[]) { @@ -29,35 +29,42 @@ int main(int argc, char *argv[]) { using namespace kaldi; const char *usage = "Create Mel-filter bank (FBANK) feature files.\n" - "Usage: compute-fbank-feats [options...] \n"; + "Usage: compute-fbank-feats [options...] " + "\n"; - // construct all the global objects + // Construct all the global objects. ParseOptions po(usage); FbankOptions fbank_opts; + // Define defaults for global options. bool subtract_mean = false; BaseFloat vtln_warp = 1.0; std::string vtln_map_rspecifier; std::string utt2spk_rspecifier; int32 channel = -1; BaseFloat min_duration = 0.0; - // Define defaults for gobal options std::string output_format = "kaldi"; + std::string utt2dur_wspecifier; - // Register the option struct + // Register the option struct. fbank_opts.Register(&po); - // Register the options - po.Register("output-format", &output_format, "Format of the output files [kaldi, htk]"); - po.Register("subtract-mean", &subtract_mean, "Subtract mean of each feature file [CMS]; not recommended to do it this way. "); - po.Register("vtln-warp", &vtln_warp, "Vtln warp factor (only applicable if vtln-map not specified)"); - po.Register("vtln-map", &vtln_map_rspecifier, "Map from utterance or speaker-id to vtln warp factor (rspecifier)"); - po.Register("utt2spk", &utt2spk_rspecifier, "Utterance to speaker-id map (if doing VTLN and you have warps per speaker)"); - po.Register("channel", &channel, "Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right)"); - po.Register("min-duration", &min_duration, "Minimum duration of segments to process (in seconds)."); - - // OPTION PARSING .......................................................... 
- // - - // parse options (+filling the registered variables) + // Register the options. + po.Register("output-format", &output_format, + "Format of the output files [kaldi, htk]"); + po.Register("subtract-mean", &subtract_mean, "Subtract mean of each " + "feature file [CMS]; not recommended to do it this way. "); + po.Register("vtln-warp", &vtln_warp, + "Vtln warp factor (only applicable if vtln-map not specified)"); + po.Register("vtln-map", &vtln_map_rspecifier,"Map from utterance or " + "speaker-id to vtln warp factor (rspecifier)"); + po.Register("utt2spk", &utt2spk_rspecifier, "Utterance to speaker-id map " + "(if doing VTLN and you have warps per speaker)"); + po.Register("channel", &channel, "Channel to extract (-1 -> expect mono, " + "0 -> left, 1 -> right)"); + po.Register("min-duration", &min_duration, "Minimum duration of segments " + "to process (in seconds)."); + po.Register("write-utt2dur", &utt2dur_wspecifier, "Wspecifier to write " + "duration of each utterance in seconds, e.g. 'ark,t:utt2dur'."); + po.Read(argc, argv); if (po.NumArgs() != 2) { @@ -71,16 +78,16 @@ int main(int argc, char *argv[]) { Fbank fbank(fbank_opts); + if (utt2spk_rspecifier != "" && vtln_map_rspecifier != "") + KALDI_ERR << ("The --utt2spk option is only needed if " + "the --vtln-map option is used."); + RandomAccessBaseFloatReaderMapped vtln_map_reader(vtln_map_rspecifier, + utt2spk_rspecifier); + SequentialTableReader reader(wav_rspecifier); BaseFloatMatrixWriter kaldi_writer; // typedef to TableWriter. TableWriter htk_writer; - if (utt2spk_rspecifier != "") - KALDI_ASSERT(vtln_map_rspecifier != "" && "the utt2spk option is only " - "needed if the vtln-map option is used."); - RandomAccessBaseFloatReaderMapped vtln_map_reader(vtln_map_rspecifier, - utt2spk_rspecifier); - if (output_format == "kaldi") { if (!kaldi_writer.Open(output_wspecifier)) KALDI_ERR << "Could not initialize output with wspecifier " @@ -93,6 +100,8 @@ int main(int argc, char *argv[]) { KALDI_ERR << "Invalid output_format string " << output_format; } + DoubleWriter utt2dur_writer(utt2dur_wspecifier); + int32 num_utts = 0, num_success = 0; for (; !reader.Done(); reader.Next()) { num_utts++; @@ -105,7 +114,7 @@ int main(int argc, char *argv[]) { } int32 num_chan = wave_data.Data().NumRows(), this_chan = channel; { // This block works out the channel (0=left, 1=right...) - KALDI_ASSERT(num_chan > 0); // should have been caught in + KALDI_ASSERT(num_chan > 0); // This should have been caught in // reading code if no channels. if (channel == -1) { this_chan = 0; @@ -136,10 +145,10 @@ int main(int argc, char *argv[]) { SubVector waveform(wave_data.Data(), this_chan); Matrix features; try { - fbank.ComputeFeatures(waveform, wave_data.SampFreq(), vtln_warp_local, &features); + fbank.ComputeFeatures(waveform, wave_data.SampFreq(), + vtln_warp_local, &features); } catch (...) 
{ - KALDI_WARN << "Failed to compute features for utterance " - << utt; + KALDI_WARN << "Failed to compute features for utterance " << utt; continue; } if (subtract_mean) { @@ -165,6 +174,9 @@ int main(int argc, char *argv[]) { p.second = header; htk_writer.Write(utt, p); } + if (utt2dur_writer.IsOpen()) { + utt2dur_writer.Write(utt, wave_data.Duration()); + } if (num_utts % 10 == 0) KALDI_LOG << "Processed " << num_utts << " utterances"; KALDI_VLOG(2) << "Processed features for key " << utt; @@ -177,6 +189,4 @@ int main(int argc, char *argv[]) { std::cerr << e.what(); return -1; } - return 0; } - diff --git a/src/featbin/compute-mfcc-feats.cc b/src/featbin/compute-mfcc-feats.cc index 09efcd38dd0..0827d0a9360 100644 --- a/src/featbin/compute-mfcc-feats.cc +++ b/src/featbin/compute-mfcc-feats.cc @@ -19,33 +19,35 @@ // limitations under the License. #include "base/kaldi-common.h" -#include "util/common-utils.h" #include "feat/feature-mfcc.h" #include "feat/wave-reader.h" +#include "util/common-utils.h" int main(int argc, char *argv[]) { try { using namespace kaldi; const char *usage = "Create MFCC feature files.\n" - "Usage: compute-mfcc-feats [options...] \n"; + "Usage: compute-mfcc-feats [options...] " + "\n"; - // construct all the global objects + // Construct all the global objects. ParseOptions po(usage); MfccOptions mfcc_opts; + // Define defaults for global options. bool subtract_mean = false; BaseFloat vtln_warp = 1.0; std::string vtln_map_rspecifier; std::string utt2spk_rspecifier; int32 channel = -1; BaseFloat min_duration = 0.0; - // Define defaults for gobal options std::string output_format = "kaldi"; + std::string utt2dur_wspecifier; - // Register the MFCC option struct + // Register the MFCC option struct. mfcc_opts.Register(&po); - // Register the options + // Register the options. po.Register("output-format", &output_format, "Format of the output " "files [kaldi, htk]"); po.Register("subtract-mean", &subtract_mean, "Subtract mean of each " @@ -60,6 +62,8 @@ int main(int argc, char *argv[]) { "0 -> left, 1 -> right)"); po.Register("min-duration", &min_duration, "Minimum duration of segments " "to process (in seconds)."); + po.Register("write-utt2dur", &utt2dur_wspecifier, "Wspecifier to write " + "duration of each utterance in seconds, e.g. 'ark,t:utt2dur'."); po.Read(argc, argv); @@ -74,16 +78,16 @@ int main(int argc, char *argv[]) { Mfcc mfcc(mfcc_opts); + if (utt2spk_rspecifier != "" && vtln_map_rspecifier != "") + KALDI_ERR << ("The --utt2spk option is only needed if " + "the --vtln-map option is used."); + RandomAccessBaseFloatReaderMapped vtln_map_reader(vtln_map_rspecifier, + utt2spk_rspecifier); + SequentialTableReader reader(wav_rspecifier); BaseFloatMatrixWriter kaldi_writer; // typedef to TableWriter. 
TableWriter htk_writer; - if (utt2spk_rspecifier != "") - KALDI_ASSERT(vtln_map_rspecifier != "" && "the utt2spk option is only " - "needed if the vtln-map option is used."); - RandomAccessBaseFloatReaderMapped vtln_map_reader(vtln_map_rspecifier, - utt2spk_rspecifier); - if (output_format == "kaldi") { if (!kaldi_writer.Open(output_wspecifier)) KALDI_ERR << "Could not initialize output with wspecifier " @@ -96,6 +100,8 @@ int main(int argc, char *argv[]) { KALDI_ERR << "Invalid output_format string " << output_format; } + DoubleWriter utt2dur_writer(utt2dur_wspecifier); + int32 num_utts = 0, num_success = 0; for (; !reader.Done(); reader.Next()) { num_utts++; @@ -139,10 +145,10 @@ int main(int argc, char *argv[]) { SubVector waveform(wave_data.Data(), this_chan); Matrix features; try { - mfcc.ComputeFeatures(waveform, wave_data.SampFreq(), vtln_warp_local, &features); + mfcc.ComputeFeatures(waveform, wave_data.SampFreq(), + vtln_warp_local, &features); } catch (...) { - KALDI_WARN << "Failed to compute features for utterance " - << utt; + KALDI_WARN << "Failed to compute features for utterance " << utt; continue; } if (subtract_mean) { @@ -168,6 +174,9 @@ int main(int argc, char *argv[]) { p.second = header; htk_writer.Write(utt, p); } + if (utt2dur_writer.IsOpen()) { + utt2dur_writer.Write(utt, wave_data.Duration()); + } if (num_utts % 10 == 0) KALDI_LOG << "Processed " << num_utts << " utterances"; KALDI_VLOG(2) << "Processed features for key " << utt; @@ -181,4 +190,3 @@ int main(int argc, char *argv[]) { return -1; } } - diff --git a/src/featbin/compute-plp-feats.cc b/src/featbin/compute-plp-feats.cc index 3e9fe9d7423..5c3b9843b4d 100644 --- a/src/featbin/compute-plp-feats.cc +++ b/src/featbin/compute-plp-feats.cc @@ -19,9 +19,9 @@ // limitations under the License. #include "base/kaldi-common.h" -#include "util/common-utils.h" #include "feat/feature-plp.h" #include "feat/wave-reader.h" +#include "util/common-utils.h" int main(int argc, char *argv[]) { @@ -29,21 +29,23 @@ int main(int argc, char *argv[]) { using namespace kaldi; const char *usage = "Create PLP feature files.\n" - "Usage: compute-plp-feats [options...] \n"; + "Usage: compute-plp-feats [options...] " + "\n"; - // construct all the global objects + // Construct all the global objects. ParseOptions po(usage); PlpOptions plp_opts; + // Define defaults for global options. bool subtract_mean = false; BaseFloat vtln_warp = 1.0; std::string vtln_map_rspecifier; std::string utt2spk_rspecifier; int32 channel = -1; BaseFloat min_duration = 0.0; - // Define defaults for gobal options std::string output_format = "kaldi"; + std::string utt2dur_wspecifier; - // Register the options + // Register the options. po.Register("output-format", &output_format, "Format of the output " "files [kaldi, htk]"); po.Register("subtract-mean", &subtract_mean, "Subtract mean of each " @@ -58,11 +60,13 @@ int main(int argc, char *argv[]) { "0 -> left, 1 -> right)"); po.Register("min-duration", &min_duration, "Minimum duration of segments " "to process (in seconds)."); + po.Register("write-utt2dur", &utt2dur_wspecifier, "Wspecifier to write " + "duration of each utterance in seconds, e.g. 
'ark,t:utt2dur'."); plp_opts.Register(&po); po.Read(argc, argv); - + if (po.NumArgs() != 2) { po.PrintUsage(); exit(1); @@ -74,16 +78,16 @@ int main(int argc, char *argv[]) { Plp plp(plp_opts); + if (utt2spk_rspecifier != "" && vtln_map_rspecifier != "") + KALDI_ERR << ("The --utt2spk option is only needed if " + "the --vtln-map option is used."); + RandomAccessBaseFloatReaderMapped vtln_map_reader(vtln_map_rspecifier, + utt2spk_rspecifier); + SequentialTableReader reader(wav_rspecifier); BaseFloatMatrixWriter kaldi_writer; // typedef to TableWriter. TableWriter htk_writer; - if (utt2spk_rspecifier != "") - KALDI_ASSERT(vtln_map_rspecifier != "" && "the utt2spk option is only " - "needed if the vtln-map option is used."); - RandomAccessBaseFloatReaderMapped vtln_map_reader(vtln_map_rspecifier, - utt2spk_rspecifier); - if (output_format == "kaldi") { if (!kaldi_writer.Open(output_wspecifier)) KALDI_ERR << "Could not initialize output with wspecifier " @@ -96,6 +100,8 @@ int main(int argc, char *argv[]) { KALDI_ERR << "Invalid output_format string " << output_format; } + DoubleWriter utt2dur_writer(utt2dur_wspecifier); + int32 num_utts = 0, num_success = 0; for (; !reader.Done(); reader.Next()) { num_utts++; @@ -107,8 +113,8 @@ int main(int argc, char *argv[]) { continue; } int32 num_chan = wave_data.Data().NumRows(), this_chan = channel; - { // This block works out the channel (0=left, 1=right...) - KALDI_ASSERT(num_chan > 0); // should have been caught in + { // This block works out the channel (0=left, 1=right...). + KALDI_ASSERT(num_chan > 0); // This should have been caught in // reading code if no channels. if (channel == -1) { this_chan = 0; @@ -139,10 +145,10 @@ int main(int argc, char *argv[]) { SubVector waveform(wave_data.Data(), this_chan); Matrix features; try { - plp.ComputeFeatures(waveform, wave_data.SampFreq(), vtln_warp_local, &features); + plp.ComputeFeatures(waveform, wave_data.SampFreq(), + vtln_warp_local, &features); } catch (...) { - KALDI_WARN << "Failed to compute features for utterance " - << utt; + KALDI_WARN << "Failed to compute features for utterance " << utt; continue; } if (subtract_mean) { @@ -168,6 +174,9 @@ int main(int argc, char *argv[]) { p.second = header; htk_writer.Write(utt, p); } + if (utt2dur_writer.IsOpen()) { + utt2dur_writer.Write(utt, wave_data.Duration()); + } if (num_utts % 10 == 0) KALDI_LOG << "Processed " << num_utts << " utterances"; KALDI_VLOG(2) << "Processed features for key " << utt; @@ -181,4 +190,3 @@ int main(int argc, char *argv[]) { return -1; } } - diff --git a/src/featbin/compute-spectrogram-feats.cc b/src/featbin/compute-spectrogram-feats.cc index 3b40a6fa5c7..67932915278 100644 --- a/src/featbin/compute-spectrogram-feats.cc +++ b/src/featbin/compute-spectrogram-feats.cc @@ -18,9 +18,9 @@ // limitations under the License. #include "base/kaldi-common.h" -#include "util/common-utils.h" #include "feat/feature-spectrogram.h" #include "feat/wave-reader.h" +#include "util/common-utils.h" int main(int argc, char *argv[]) { @@ -28,29 +28,33 @@ int main(int argc, char *argv[]) { using namespace kaldi; const char *usage = "Create spectrogram feature files.\n" - "Usage: compute-spectrogram-feats [options...] \n"; + "Usage: compute-spectrogram-feats [options...] " + "\n"; - // construct all the global objects + // Construct all the global objects. ParseOptions po(usage); SpectrogramOptions spec_opts; + // Define defaults for global options. 
bool subtract_mean = false; int32 channel = -1; BaseFloat min_duration = 0.0; - // Define defaults for gobal options std::string output_format = "kaldi"; + std::string utt2dur_wspecifier; // Register the option struct spec_opts.Register(&po); // Register the options - po.Register("output-format", &output_format, "Format of the output files [kaldi, htk]"); - po.Register("subtract-mean", &subtract_mean, "Subtract mean of each feature file [CMS]; not recommended to do it this way. "); - po.Register("channel", &channel, "Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right)"); - po.Register("min-duration", &min_duration, "Minimum duration of segments to process (in seconds)."); - - // OPTION PARSING .......................................................... - // + po.Register("output-format", &output_format, + "Format of the output files [kaldi, htk]"); + po.Register("subtract-mean", &subtract_mean, "Subtract mean of each " + "feature file [CMS]; not recommended to do it this way. "); + po.Register("channel", &channel, "Channel to extract (-1 -> expect mono, " + "0 -> left, 1 -> right)"); + po.Register("min-duration", &min_duration, "Minimum duration of segments " + "to process (in seconds)."); + po.Register("write-utt2dur", &utt2dur_wspecifier, "Wspecifier to write " + "duration of each utterance in seconds, e.g. 'ark,t:utt2dur'."); - // parse options (+filling the registered variables) po.Read(argc, argv); if (po.NumArgs() != 2) { @@ -80,6 +84,8 @@ int main(int argc, char *argv[]) { KALDI_ERR << "Invalid output_format string " << output_format; } + DoubleWriter utt2dur_writer(utt2dur_wspecifier); + int32 num_utts = 0, num_success = 0; for (; !reader.Done(); reader.Next()) { num_utts++; @@ -114,8 +120,7 @@ int main(int argc, char *argv[]) { try { spec.ComputeFeatures(waveform, wave_data.SampFreq(), 1.0, &features); } catch (...) { - KALDI_WARN << "Failed to compute features for utterance " - << utt; + KALDI_WARN << "Failed to compute features for utterance " << utt; continue; } if (subtract_mean) { @@ -141,6 +146,9 @@ int main(int argc, char *argv[]) { p.second = header; htk_writer.Write(utt, p); } + if (utt2dur_writer.IsOpen()) { + utt2dur_writer.Write(utt, wave_data.Duration()); + } if(num_utts % 10 == 0) KALDI_LOG << "Processed " << num_utts << " utterances"; KALDI_VLOG(2) << "Processed features for key " << utt; @@ -153,6 +161,4 @@ int main(int argc, char *argv[]) { std::cerr << e.what(); return -1; } - return 0; } - diff --git a/src/featbin/extract-segments.cc b/src/featbin/extract-segments.cc index dd4f5fbb32c..bda79879483 100644 --- a/src/featbin/extract-segments.cc +++ b/src/featbin/extract-segments.cc @@ -70,18 +70,18 @@ int main(int argc, char *argv[]) { RandomAccessTableReader reader(wav_rspecifier); TableWriter writer(wav_wspecifier); - Input ki(segments_rxfilename); // no binary argment: never binary. + Input ki(segments_rxfilename); // No binary argment: never binary. int32 num_lines = 0, num_success = 0; std::string line; - /* read each line from segments file */ + // Read each line from the segments file. while (std::getline(ki.Stream(), line)) { num_lines++; std::vector split_line; - // Split the line by space or tab and check the number of fields in each - // line. There must be 4 fields--segment name , reacording wav file name, - // start time, end time; 5th field (channel info) is optional. + // Split the line into whitespace-separated fields and verify their + // number. 
There must be 4 or 5 fields: segment name, recording ID, start + time, end time, and the optional channel number. SplitStringToVector(line, " \t\r", true, &split_line); if (split_line.size() != 4 && split_line.size() != 5) { KALDI_WARN << "Invalid line in segments file: " << line; @@ -92,8 +92,8 @@ start_str = split_line[2], end_str = split_line[3]; - // Convert the start time and endtime to real from string. Segment is - // ignored if start or end time cannot be converted to real. + // Parse the start and end times as float values. Segment is ignored if + // either time is malformed. double start, end; if (!ConvertStringToReal(start_str, &start)) { KALDI_WARN << "Invalid line in segments file [bad start]: " << line; @@ -103,24 +103,24 @@ KALDI_WARN << "Invalid line in segments file [bad end]: " << line; continue; } - // start time must not be negative; start time must not be greater than - // end time, except if end time is -1 - if (start < 0 || (end != -1.0 && end <= 0) || ((start >= end) && (end > 0))) { - KALDI_WARN << "Invalid line in segments file [empty or invalid segment]: " - << line; + // Start time must be non-negative and not greater than the end time, + // except if the end time is -1. + if (start < 0 || (end != -1.0 && end <= 0) || + ((start >= end) && (end > 0))) { + KALDI_WARN << ("Invalid line in segments file " + "[empty or invalid segment]: ") << line; continue; } - int32 channel = -1; // means channel info is unspecified. - // if each line has 5 elements then 5th element must be channel identifier + int32 channel = -1; // -1 means channel is unspecified. + // If the line has 5 elements, then the 5th element is the channel number. if (split_line.size() == 5) { if (!ConvertStringToInteger(split_line[4], &channel) || channel < 0) { KALDI_WARN << "Invalid line in segments file [bad channel]: " << line; continue; } } - /* check whether a segment start time and end time exists in recording - * if fails , skips the segment. - */ + + // Check whether the recording ID is in wav.scp; if not, skip the segment. if (!reader.HasKey(recording)) { KALDI_WARN << "Could not find recording " << recording << ", skipping segment " << segment; @@ -129,74 +129,73 @@ const WaveData &wave = reader.Value(recording); const Matrix &wave_data = wave.Data(); - BaseFloat samp_freq = wave.SampFreq(); // read sampling fequency - int32 num_samp = wave_data.NumCols(), // number of samples in recording - num_chan = wave_data.NumRows(); // number of channels in recording - - // Convert starting time of the segment to corresponding sample number. - // If end time is -1 then use the whole file starting from start time. - int32 start_samp = start * samp_freq, - end_samp = (end != -1)? (end * samp_freq) : num_samp; - KALDI_ASSERT(start_samp >= 0 && end_samp > 0 && "Invalid start or end."); - - // start sample must be less than total number of samples, - // otherwise skip the segment - if (start_samp < 0 || start_samp >= num_samp) { - KALDI_WARN << "Start sample out of range " << start_samp << " [length:] " - << num_samp << ", skipping segment " << segment; + BaseFloat samp_freq = wave.SampFreq(); // Sampling frequency. + int32 num_samp = wave_data.NumCols(), // Number of samples in recording. + num_chan = wave_data.NumRows(); // Number of channels in recording. + BaseFloat file_length = num_samp / samp_freq; // In seconds. + + // Start must be within the wave data; otherwise skip the segment.
+ if (start < 0 || start > file_length) { + KALDI_WARN << "Segment start is out of file data range [0, " + << file_length << "s]; skipping segment '" << line << "'"; + continue; } - /* end sample must be less than total number samples - * otherwise skip the segment - */ - if (end_samp > num_samp) { - if ((end_samp >= - num_samp + static_cast(max_overshoot * samp_freq))) { - KALDI_WARN << "End sample too far out of range " << end_samp - << " [length:] " << num_samp << ", skipping segment " - << segment; - continue; - } - end_samp = num_samp; // for small differences, just truncate. + + // End must be less than the file length adjusted for possible overshoot; + // otherwise skip the segment. end == -1 passes the check. + if (end > file_length + max_overshoot) { + KALDI_WARN << "Segment end is too far out of file data range [0," + << file_length << "s]; skipping segment '" << line << "'"; + continue; } - // Skip if segment size is less than minimum segment length (default 0.1s) - if (end_samp <= - start_samp + static_cast(min_segment_length * samp_freq)) { + + // Otherwise ensure the end is not beyond the end of data, and default + // end == -1 to the end of file data. + if (end < 0 || end > file_length) end = file_length; + + // Skip if segment size is less than the minimum allowed. + if (end - start < min_segment_length) { KALDI_WARN << "Segment " << segment << " too short, skipping it."; continue; } - /* check whether the wav file has more than one channel - * if yes, specify the channel info in segments file - * otherwise skips the segment - */ + + // Check that the channel is specified in the segments file for a multi- + // channel file, and that the channel actually exists in the wave data. if (channel == -1) { if (num_chan == 1) channel = 0; else { - KALDI_ERR << "If your data has multiple channels, you must specify the" - " channel in the segments file. Processing segment " << segment; + KALDI_ERR << ("Your data has multiple channels. You must " + "specify the channel in the segments file. " + "Skipping segment ") << segment; } } else { if (channel >= num_chan) { KALDI_WARN << "Invalid channel " << channel << " >= " << num_chan - << ", processing segment " << segment; + << ". Skipping segment " << segment; continue; } } - /* - * This function return a portion of a wav data from the orignial wav data matrix - */ - SubMatrix segment_matrix(wave_data, channel, 1, start_samp, end_samp-start_samp); + + // Convert endpoints of the segment to sample numbers. Note that the + // conversion requires proper rounding. + int32 start_samp = static_cast(start * samp_freq + 0.5f), + end_samp = static_cast(end * samp_freq + 0.5f); + + if (end_samp > num_samp) + end_samp = num_samp; + + // Get the range of data from the original wave_data matrix. + SubMatrix segment_matrix(wave_data, channel, 1, + start_samp, end_samp - start_samp); + WaveData segment_wave(samp_freq, segment_matrix); - writer.Write(segment, segment_wave); // write segment in wave format. + writer.Write(segment, segment_wave); // Write the range in wave format. num_success++; } KALDI_LOG << "Successfully processed " << num_success << " lines out of " << num_lines << " in the segments file. 
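As a worked example of the time-to-sample conversion above (all numbers hypothetical): for samp_freq = 16000, start = 1.23 and end = 4.56,

```cpp
// start_samp = static_cast<int32>(1.23 * 16000 + 0.5f) = 19680
// end_samp   = static_cast<int32>(4.56 * 16000 + 0.5f) = 72960
// The extracted range is samples [19680, 72960) of the chosen channel,
// i.e. 53280 samples, or 3.33 seconds of audio.
```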
"; - /* prints number of segments processed */ return 0; } catch(const std::exception &e) { std::cerr << e.what(); return -1; } } - diff --git a/src/featbin/paste-feats.cc b/src/featbin/paste-feats.cc index 7d2afeb9f37..beadf5e1725 100644 --- a/src/featbin/paste-feats.cc +++ b/src/featbin/paste-feats.cc @@ -28,7 +28,7 @@ namespace kaldi { // returns true if successfully appended. bool AppendFeats(const std::vector > &in, - std::string utt, + const std::string &utt, int32 tolerance, Matrix *out) { // Check the lengths diff --git a/src/fstbin/make-grammar-fst.cc b/src/fstbin/make-grammar-fst.cc index f7fd46a4a55..fc9a17908f9 100644 --- a/src/fstbin/make-grammar-fst.cc +++ b/src/fstbin/make-grammar-fst.cc @@ -114,8 +114,9 @@ int main(int argc, char *argv[]) { std::string top_fst_str = po.GetArg(1), fst_out_str = po.GetArg(po.NumArgs()); - ConstFst *top_fst = ReadAsConstFst(top_fst_str); - std::vector* > > pairs; + std::shared_ptr > top_fst( + ReadAsConstFst(top_fst_str)); + std::vector > > > pairs; int32 num_pairs = (po.NumArgs() - 2) / 2; for (int32 i = 1; i <= num_pairs; i++) { @@ -126,12 +127,13 @@ int main(int argc, char *argv[]) { KALDI_ERR << "Expected positive integer as nonterminal, got: " << nonterm_str; std::string fst_str = po.GetArg(2*i + 1); - ConstFst *fst = ReadAsConstFst(fst_str); - pairs.push_back(std::pair* >(nonterminal, fst)); + std::shared_ptr > this_fst(ReadAsConstFst(fst_str)); + pairs.push_back(std::pair > >( + nonterminal, this_fst)); } GrammarFst *grammar_fst = new GrammarFst(nonterm_phones_offset, - *top_fst, + top_fst, pairs); if (write_as_grammar) { @@ -151,10 +153,6 @@ int main(int argc, char *argv[]) { cfst.Write(ko.Stream(), wopts); } - delete top_fst; - for (size_t i = 0; i < pairs.size(); i++) - delete pairs[i].second; - KALDI_LOG << "Created grammar FST and wrote it to " << fst_out_str; } catch(const std::exception &e) { diff --git a/src/fstext/context-fst.cc b/src/fstext/context-fst.cc index 9936a398e37..1e41adc021f 100644 --- a/src/fstext/context-fst.cc +++ b/src/fstext/context-fst.cc @@ -345,7 +345,7 @@ SymbolTable *CreateILabelInfoSymbolTable(const vector > &info, const SymbolTable &phones_symtab, std::string separator, std::string initial_disambig) { // e.g. 
separator = "/", initial-disambig="#-1" - KALDI_ASSERT(!info.empty() && !info[0].empty()); + KALDI_ASSERT(!info.empty() && info[0].empty()); SymbolTable *ans = new SymbolTable("ilabel-info-symtab"); int64 s = ans->AddSymbol(phones_symtab.Find(static_cast(0))); assert(s == 0); diff --git a/src/fstext/determinize-lattice-inl.h b/src/fstext/determinize-lattice-inl.h index 43ad809f70e..775228bfd21 100644 --- a/src/fstext/determinize-lattice-inl.h +++ b/src/fstext/determinize-lattice-inl.h @@ -510,7 +510,7 @@ template class LatticeDeterminizer { if (!CheckMemoryUsage()) return false; } return (determinized_ = true); - } catch (std::bad_alloc) { + } catch (const std::bad_alloc &) { int32 repo_size = repository_.MemSize(), arcs_size = num_arcs_ * sizeof(TempArc), elems_size = num_elems_ * sizeof(Element), @@ -520,7 +520,7 @@ template class LatticeDeterminizer { << " (repo,arcs,elems) = (" << repo_size << "," << arcs_size << "," << elems_size << ")"; return (determinized_ = false); - } catch (std::runtime_error) { + } catch (const std::runtime_error &) { KALDI_WARN << "Caught exception doing lattice determinization"; return (determinized_ = false); } diff --git a/src/fstext/lattice-weight.h b/src/fstext/lattice-weight.h index af4826f7bed..86bec97d4e8 100644 --- a/src/fstext/lattice-weight.h +++ b/src/fstext/lattice-weight.h @@ -179,8 +179,7 @@ class LatticeWeightTpl { } else if (s == "-Infinity") { f = -numeric_limits::infinity(); } else if (s == "BadNumber") { - f = numeric_limits::infinity(); - f -= f; // get NaN + f = numeric_limits::quiet_NaN(); } else { char *p; f = strtod(s.c_str(), &p); diff --git a/src/fstext/remove-eps-local.h b/src/fstext/remove-eps-local.h index ce6ff067d7b..c45226c11cd 100644 --- a/src/fstext/remove-eps-local.h +++ b/src/fstext/remove-eps-local.h @@ -36,7 +36,7 @@ namespace fst { /// into one. /// The algorithm preserves equivalence and stochasticity in the given semiring. /// If you want to preserve stochasticity in a different semiring (e.g. log), -/// then use RemoveEpsLocalSpecial, which only words for StdArc but which +/// then use RemoveEpsLocalSpecial, which only works for StdArc but which /// preserves stochasticity, where possible (*) in the LogArc sense. The reason that we can't /// just cast to a different semiring is that in that case we would no longer /// be able to guarantee equivalence in the original semiring (this arises from diff --git a/src/gmm/diag-gmm-test.cc b/src/gmm/diag-gmm-test.cc index 8d7ff3d73bd..3308acc2837 100644 --- a/src/gmm/diag-gmm-test.cc +++ b/src/gmm/diag-gmm-test.cc @@ -43,7 +43,7 @@ void InitRandomGmm(DiagGmm *gmm_in) { gmm.SetWeights(weights); gmm.SetInvVarsAndMeans(inv_vars, means); gmm.Perturb(0.5 * RandUniform()); - gmm.ComputeGconsts(); // this is unnecassary; computed in Perturb + gmm.ComputeGconsts(); // this is unnecessary; computed in Perturb } diff --git a/src/gmm/mle-diag-gmm.h b/src/gmm/mle-diag-gmm.h index 24194ef886a..d41d36489bf 100644 --- a/src/gmm/mle-diag-gmm.h +++ b/src/gmm/mle-diag-gmm.h @@ -85,7 +85,7 @@ struct MapDiagGmmOptions { /// Tau value for the weights-- this tau value is applied /// per state, not per Gaussian. BaseFloat weight_tau; - + MapDiagGmmOptions(): mean_tau(10.0), variance_tau(50.0), weight_tau(10.0) { } @@ -150,8 +150,8 @@ class AccumDiagGmm { const MatrixBase &data, const VectorBase &frame_weights, int32 num_threads); - - + + /// Increment the stats for this component by the specified amount /// (not all parts may be taken, depending on flags). 
/// Note: x_stats and x2_stats are assumed to already be multiplied by "occ" @@ -162,7 +162,7 @@ class AccumDiagGmm { /// Increment with stats from this other accumulator (times scale) void Add(double scale, const AccumDiagGmm &acc); - + /// Smooths the accumulated counts by adding 'tau' extra frames. An example /// use for this is I-smoothing for MMIE. Calls SmoothWithAccum. void SmoothStats(BaseFloat tau); @@ -179,13 +179,13 @@ class AccumDiagGmm { void SmoothWithModel(BaseFloat tau, const DiagGmm &src_gmm); // Const accessors - const GmmFlagsType Flags() const { return flags_; } + GmmFlagsType Flags() const { return flags_; } const VectorBase &occupancy() const { return occupancy_; } const MatrixBase &mean_accumulator() const { return mean_accumulator_; } const MatrixBase &variance_accumulator() const { return variance_accumulator_; } // used in testing. - void AssertEqual(const AccumDiagGmm &other); + void AssertEqual(const AccumDiagGmm &other); private: int32 dim_; int32 num_comp_; diff --git a/src/gmm/mle-full-gmm.h b/src/gmm/mle-full-gmm.h index 6e770764e1e..618714b0e9b 100644 --- a/src/gmm/mle-full-gmm.h +++ b/src/gmm/mle-full-gmm.h @@ -1,7 +1,7 @@ // gmm/mle-full-gmm.h // Copyright 2009-2011 Jan Silovsky; Saarland University; -// Microsoft Corporation; +// Microsoft Corporation; // Univ. Erlangen Nuremberg, Korbinian Riedhammer // See ../../COPYING for clarification regarding multiple authors @@ -91,7 +91,7 @@ class AccumFullGmm { void Resize(int32 num_components, int32 dim, GmmFlagsType flags); /// Calls Resize with arguments based on gmm_ptr_ void Resize(const FullGmm &gmm, GmmFlagsType flags); - + void ResizeVarAccumulator(int32 num_comp, int32 dim); /// Returns the number of mixture components int32 NumGauss() const { return num_comp_; } @@ -122,8 +122,8 @@ class AccumFullGmm { const VectorBase &data, BaseFloat frame_posterior); - /// Accessors - const GmmFlagsType Flags() const { return flags_; } + /// Accessors + GmmFlagsType Flags() const { return flags_; } const Vector &occupancy() const { return occupancy_; } const Matrix &mean_accumulator() const { return mean_accumulator_; } const std::vector > &covariance_accumulator() const { return covariance_accumulator_; } diff --git a/src/gmmbin/gmm-init-biphone.cc b/src/gmmbin/gmm-init-biphone.cc index 42a9d1a91a0..0775a5c7b23 100644 --- a/src/gmmbin/gmm-init-biphone.cc +++ b/src/gmmbin/gmm-init-biphone.cc @@ -52,12 +52,14 @@ void ReadSharedPhonesList(std::string rxfilename, std::vector EventMap *GetFullBiphoneStubMap(const std::vector > &phone_sets, const std::vector &phone2num_pdf_classes, - const std::vector &share_roots, - const std::vector &ci_phones_list) { + const std::vector &ci_phones_list, + const std::vector > &bi_counts, + int32 biphone_min_count, + const std::vector &mono_counts, + int32 mono_min_count) { { // Check the inputs - KALDI_ASSERT(!phone_sets.empty() && - share_roots.size() == phone_sets.size()); + KALDI_ASSERT(!phone_sets.empty()); std::set all_phones; for (size_t i = 0; i < phone_sets.size(); i++) { KALDI_ASSERT(IsSortedAndUniq(phone_sets[i])); @@ -81,6 +83,14 @@ EventMap level1_map[ci_phones_list[i]] = new TableEventMap(kPdfClass, level2_map); } + // If there is not enough data for a biphone, we will revert to monophone + // and if there is not enough data for the monophone either, we will revert + // to zerophone (which is like a global garbage pdf) after initializing it. 
+ int32 zerophone_pdf = -1; + // If a monophone state is created for a phone-set, the corresponding pdf will + // be stored in this vector. + std::vector monophone_pdf(phone_sets.size(), -1); + for (size_t i = 0; i < phone_sets.size(); i++) { if (numpdfs_per_phone == 1) { @@ -100,38 +110,68 @@ EventMap level1_map[pset[k]] = new TableEventMap(0, level2_map); } else { KALDI_ASSERT(numpdfs_per_phone == 2); - int32 base_pdfid = current_pdfid; - std::vector pset = phone_sets[i]; // All these will have a shared + std::vector right_phoneset = phone_sets[i]; // All these will have a shared // event-map child - for (size_t k = 0; k < pset.size(); k++) { - // Create an event map for level2: - std::map level2_map; // key is 0 - { - std::map level3_map; // key is kPdfClass + // Create an event map for level2: + std::map level2_map; // key is 0 + { // Handle CI phones + std::map level3_map; // key is kPdfClass + level3_map[0] = current_pdfid++; + level3_map[1] = current_pdfid++; + level2_map[0] = new TableEventMap(kPdfClass, level3_map); // no-left-context case + for (size_t i = 0; i < ci_phones_list.size(); i++) // ci-phone left-context cases + level2_map[ci_phones_list[i]] = new TableEventMap(kPdfClass, level3_map); + } + for (size_t j = 0; j < phone_sets.size(); j++) { + std::vector left_phoneset = phone_sets[j]; // All these will have a + // shared subtree with 2 pdfids + std::map level3_map; // key is kPdfClass + if (bi_counts.empty() || + bi_counts[left_phoneset[0]][right_phoneset[0]] >= biphone_min_count) { level3_map[0] = current_pdfid++; level3_map[1] = current_pdfid++; - level2_map[0] = new TableEventMap(kPdfClass, level3_map); // no-left-context case - for (size_t i = 0; i < ci_phones_list.size(); i++) // ci-phone left-context cases - level2_map[ci_phones_list[i]] = new TableEventMap(kPdfClass, level3_map); + } else if (mono_counts.empty() || + mono_counts[right_phoneset[0]] > mono_min_count) { + // Revert to mono. 
+ KALDI_VLOG(2) << "Reverting to mono for biphone (" << left_phoneset[0] + << "," << right_phoneset[0] << ")"; + if (monophone_pdf[i] == -1) { + KALDI_VLOG(1) << "Reserving mono PDFs for phone-set " << i; + monophone_pdf[i] = current_pdfid++; + current_pdfid++; // num-pdfs-per-phone is 2 + } + level3_map[0] = monophone_pdf[i]; + level3_map[1] = monophone_pdf[i] + 1; + } else { + KALDI_VLOG(2) << "Reverting to zerophone for biphone (" + << left_phoneset[0] + << "," << right_phoneset[0] << ")"; + // Revert to zerophone + if (zerophone_pdf == -1) { + KALDI_VLOG(1) << "Reserving zero PDFs."; + zerophone_pdf = current_pdfid++; + current_pdfid++; // num-pdfs-per-phone is 2 + } + level3_map[0] = zerophone_pdf; + level3_map[1] = zerophone_pdf + 1; } - for (size_t j = 0; j < phone_sets.size(); j++) { - std::map level3_map; // key is kPdfClass - level3_map[0] = current_pdfid++; - level3_map[1] = current_pdfid++; - std::vector ipset = phone_sets[j]; // All these will have a - // shared subtree with 2 pdfids - for (size_t ik = 0; ik < ipset.size(); ik++) { - level2_map[ipset[ik]] = new TableEventMap(kPdfClass, level3_map); - } + for (size_t k = 0; k < left_phoneset.size(); k++) { + int32 left_phone = left_phoneset[k]; + level2_map[left_phone] = new TableEventMap(kPdfClass, level3_map); } - level1_map[pset[k]] = new TableEventMap(0, level2_map); - if (k != pset.size() - 1) - current_pdfid = base_pdfid; + } + for (size_t k = 0; k < right_phoneset.size(); k++) { + std::map level2_copy; + for (auto const& kv: level2_map) + level2_copy[kv.first] = kv.second->Copy(std::vector()); + int32 right_phone = right_phoneset[k]; + level1_map[right_phone] = new TableEventMap(0, level2_copy); } } } + KALDI_LOG << "Num PDFs: " << current_pdfid; return new TableEventMap(1, level1_map); } @@ -139,7 +179,11 @@ EventMap ContextDependency* BiphoneContextDependencyFull(std::vector > phone_sets, const std::vector phone2num_pdf_classes, - const std::vector &ci_phones_list) { + const std::vector &ci_phones_list, + const std::vector > &bi_counts, + int32 biphone_min_count, + const std::vector &mono_counts, + int32 mono_min_count) { // Remove all the CI phones from the phone sets std::set ci_phones; for (size_t i = 0; i < ci_phones_list.size(); i++) @@ -159,13 +203,54 @@ BiphoneContextDependencyFull(std::vector > phone_sets, int32 P = 1, N = 2; EventMap *pdf_map = GetFullBiphoneStubMap(phone_sets, phone2num_pdf_classes, - share_roots, ci_phones_list); + ci_phones_list, bi_counts, + biphone_min_count, mono_counts, + mono_min_count); return new ContextDependency(N, P, pdf_map); } } // end namespace kaldi +/* This function reads the counts of biphones and monophones from a text file + generated for chain flat-start training. On each line there is either a + biphone count or a monophone count: + + + The phone-id's are according to phones.txt. + + It's more efficient to load the biphone counts into a map because + most entries are zero, but since there are not many biphones, a 2-dim vector + is OK. */ +static void ReadPhoneCounts(std::string &filename, int32 num_phones, + std::vector *mono_counts, + std::vector > *bi_counts) { + // The actual phones start from id = 1 (so the last phone has id = num_phones). 
+ mono_counts->resize(num_phones + 1, 0); + bi_counts->resize(num_phones + 1, std::vector(num_phones + 1, 0)); + std::ifstream infile(filename); + std::string line; + while (std::getline(infile, line)) { + std::istringstream iss(line); + int a, b; + long c; + if ((std::istringstream(line) >> a >> b >> c)) { + // It's a biphone count. + KALDI_ASSERT(a >= 0 && a <= num_phones); // 0 means no-left-context + KALDI_ASSERT(b > 0 && b <= num_phones); + KALDI_ASSERT(c >= 0); + (*bi_counts)[a][b] = c; + } else if ((std::istringstream(line) >> b >> c)) { + // It's a monophone count. + KALDI_ASSERT(b > 0 && b <= num_phones); + KALDI_ASSERT(c >= 0); + (*mono_counts)[b] = c; + } else { + KALDI_ERR << "Bad line in phone stats file: " << line; + } + } +} + int main(int argc, char *argv[]) { try { using namespace kaldi; @@ -179,7 +264,8 @@ int main(int argc, char *argv[]) { " gmm-init-biphone topo 39 bi.mdl bi.tree\n"; bool binary = true; - std::string shared_phones_rxfilename; + std::string shared_phones_rxfilename, phone_counts_rxfilename; + int32 min_biphone_count = 100, min_mono_count = 20; std::string ci_phones_str; std::vector ci_phones; // Sorted, uniqe vector of // context-independent phones. @@ -191,6 +277,15 @@ int main(int argc, char *argv[]) { "whose pdfs should be shared."); po.Register("ci-phones", &ci_phones_str, "Colon-separated list of " "integer indices of context-independent phones."); + po.Register("phone-counts", &phone_counts_rxfilename, + "rxfilename containing, on each line, a biphone/phone and " + "its count in the training data."); + po.Register("min-biphone-count", &min_biphone_count, "Minimum number of " + "occurences of a biphone in training data to reserve pdfs " + "for it."); + po.Register("min-monophone-count", &min_mono_count, "Minimum number of " + "occurences of a monophone in training data to reserve pdfs " + "for it."); po.Read(argc, argv); if (po.NumArgs() != 4) { @@ -214,7 +309,6 @@ int main(int argc, char *argv[]) { KALDI_ERR << "Invalid --ci-phones option: " << ci_phones_str; } - Vector glob_inv_var(dim); glob_inv_var.Set(1.0); Vector glob_mean(dim); @@ -235,6 +329,15 @@ int main(int argc, char *argv[]) { phone2num_pdf_classes[phones[i]] == 2); } + std::vector mono_counts; + std::vector > bi_counts; + if (!phone_counts_rxfilename.empty()) { + ReadPhoneCounts(phone_counts_rxfilename, phones.size(), + &mono_counts, &bi_counts); + KALDI_LOG << "Loaded mono/bi phone counts."; + } + + // Now the tree: ContextDependency *ctx_dep = NULL; std::vector > shared_phones; @@ -247,7 +350,9 @@ int main(int argc, char *argv[]) { // ReadSharedPhonesList crashes on error. 
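For reference, here is what a --phone-counts file in the format ReadPhoneCounts() expects might look like (phone IDs and counts are invented for illustration; real files come from the chain flat-start preparation scripts):

```cpp
// Hypothetical phone-counts file, one whitespace-separated entry per line:
//   0 7 113     <- biphone count: no left context (id 0), right phone 7
//   3 7 2764    <- biphone count: left phone 3, right phone 7
//   7 5120      <- monophone count for phone 7
//
// With the defaults --min-biphone-count=100 and --min-monophone-count=20,
// biphone (3,7) keeps its own pair of pdfs; a biphone seen, say, only 40
// times falls back to the shared monophone pdfs of its right phone; and a
// right phone whose own count is not above 20 falls back to the global
// "zerophone" pdfs.
```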
} ctx_dep = BiphoneContextDependencyFull(shared_phones, phone2num_pdf_classes, - ci_phones); + ci_phones, bi_counts, + min_biphone_count, + mono_counts, min_mono_count); int32 num_pdfs = ctx_dep->NumPdfs(); diff --git a/src/gst-plugin/Makefile b/src/gst-plugin/Makefile index 92af0483a6e..4d4764b6006 100644 --- a/src/gst-plugin/Makefile +++ b/src/gst-plugin/Makefile @@ -34,11 +34,6 @@ ifneq ($(wildcard ../../tools/portaudio/install/include/pa_linux_alsa.h),) EXTRA_LDLIBS += -lasound endif -# MKL libs required when linked via shared library -ifdef MKLROOT - EXTRA_LDLIBS+=-lmkl_p4n -lmkl_def -endif - # Library so name and rpath CXX_VERSION=$(shell $(CXX) --version 2>/dev/null) ifneq (,$(findstring clang, $(CXX_VERSION))) diff --git a/src/hmm/hmm-topology.cc b/src/hmm/hmm-topology.cc index cf134065dbf..29634ecda0b 100644 --- a/src/hmm/hmm-topology.cc +++ b/src/hmm/hmm-topology.cc @@ -69,7 +69,7 @@ void HmmTopology::Read(std::istream &is, bool binary) { ReadToken(is, binary, &token); while (token != "") { if (token != "") - KALDI_ERR << "Expected or , got instead "< or , got instead " << token; int32 state; ReadBasicType(is, binary, &state); if (state != static_cast(this_entry.size())) @@ -88,7 +88,8 @@ void HmmTopology::Read(std::istream &is, bool binary) { int32 self_loop_pdf_class = kNoPdf; ReadBasicType(is, binary, &forward_pdf_class); ReadToken(is, binary, &token); - KALDI_ASSERT(token == ""); + if (token != "") + KALDI_ERR << "Expected , got instead " << token; ReadBasicType(is, binary, &self_loop_pdf_class); this_entry.push_back(HmmState(forward_pdf_class, self_loop_pdf_class)); ReadToken(is, binary, &token); @@ -102,10 +103,10 @@ void HmmTopology::Read(std::istream &is, bool binary) { this_entry.back().transitions.push_back(std::make_pair(dst_state, trans_prob)); ReadToken(is, binary, &token); } - if(token == "") // TODO: remove this clause after a while. + if (token == "") // TODO: remove this clause after a while. KALDI_ERR << "You are trying to read old-format topology with new Kaldi."; if (token != "") - KALDI_ERR << "Reading HmmTopology, unexpected token "<, got instead " << token; ReadToken(is, binary, &token); } int32 my_index = entries_.size(); diff --git a/src/hmm/transition-model.cc b/src/hmm/transition-model.cc index 5ecb7776f00..420a94585ea 100644 --- a/src/hmm/transition-model.cc +++ b/src/hmm/transition-model.cc @@ -39,7 +39,7 @@ void TransitionModel::ComputeTuplesIsHmm(const ContextDependencyInterface &ctx_d const std::vector &phones = topo_.GetPhones(); KALDI_ASSERT(!phones.empty()); - // this is the case for normal models. but not fot chain models + // this is the case for normal models. but not for chain models std::vector > > pdf_info; std::vector num_pdf_classes( 1 + *std::max_element(phones.begin(), phones.end()), -1); for (size_t i = 0; i < phones.size(); i++) @@ -85,7 +85,7 @@ void TransitionModel::ComputeTuplesNotHmm(const ContextDependencyInterface &ctx_ // pdf_info is a set of lists indexed by phone. Each list is indexed by // (pdf-class, self-loop pdf-class) of each state of that phone, and the element - // is a list of possible (pdf, self-loop pdf) pairs that that (pdf-class, self-loop pdf-class) + // is a list of possible (pdf, self-loop pdf) pairs that (pdf-class, self-loop pdf-class) // pair generates. std::vector > > > pdf_info; // pdf_class_pairs is a set of lists indexed by phone. 
Each list stores @@ -177,7 +177,7 @@ void TransitionModel::ComputeDerived() { } // The following statements put copies a large number in the region of memory - // past the end of the id2pdf_id_ array, while leaving the aray as it was + // past the end of the id2pdf_id_ array, while leaving the array as it was // before. The goal of this is to speed up decoding by disabling a check // inside TransitionIdToPdf() that the transition-id was within the correct // range. diff --git a/src/hmm/transition-model.h b/src/hmm/transition-model.h index e453c24f9cb..c97980405c1 100644 --- a/src/hmm/transition-model.h +++ b/src/hmm/transition-model.h @@ -47,7 +47,7 @@ namespace kaldi { // this depends on the number of transitions/final-probs in the topology for // that (phone, HMM-state). Each probability has an associated transition-index. // We associate with each (transition-state, transition-index) a unique transition-id. -// Each individual probability estimated by the transition-model is asociated with a +// Each individual probability estimated by the transition-model is associated with a // transition-id. // // List of the various types of quantity referred to here and what they mean: @@ -335,7 +335,7 @@ inline int32 TransitionModel::TransitionIdToPdfFast(int32 trans_id) const { // Note: it's a little dangerous to assert this only in paranoid mode. // However, this function is called in the inner loop of decoders and // the assertion likely takes a significant amount of time. We make - // sure that past the end of thd id2pdf_id_ array there are big + // sure that past the end of the id2pdf_id_ array there are big // numbers, which will make the calling code more likely to segfault // (rather than silently die) if this is called for out-of-range values. KALDI_PARANOID_ASSERT( diff --git a/src/itf/decodable-itf.h b/src/itf/decodable-itf.h index 9f1f2f62e2b..20934dde8c9 100644 --- a/src/itf/decodable-itf.h +++ b/src/itf/decodable-itf.h @@ -50,7 +50,7 @@ namespace kaldi { // Process this frame } \endcode - and the the call to IsLastFrame would block if the features had not arrived yet. + and the call to IsLastFrame would block if the features had not arrived yet. The decodable object would have to know when to terminate the decoding. This online-decoding mode is still supported, it is what happens when you call, for example, LatticeFasterDecoder::Decode(). diff --git a/src/ivector/agglomerative-clustering.cc b/src/ivector/agglomerative-clustering.cc index 30138e00637..ced912ed195 100644 --- a/src/ivector/agglomerative-clustering.cc +++ b/src/ivector/agglomerative-clustering.cc @@ -2,6 +2,7 @@ // Copyright 2017-2018 Matthew Maciejewski // 2018 David Snyder +// 2019 Dogan Can // See ../../COPYING for clarification regarding multiple authors // @@ -24,65 +25,98 @@ namespace kaldi { void AgglomerativeClusterer::Cluster() { - KALDI_VLOG(2) << "Initializing cluster assignments."; - Initialize(); - - KALDI_VLOG(2) << "Clustering..."; - // This is the main algorithm loop. It moves through the queue merging - // clusters until a stopping criterion has been reached. 
- while (num_clusters_ > min_clust_ && !queue_.empty()) { - std::pair > pr = queue_.top(); - int32 i = (int32) pr.second.first, j = (int32) pr.second.second; - queue_.pop(); - // check to make sure clusters have not already been merged - if ((active_clusters_.find(i) != active_clusters_.end()) && - (active_clusters_.find(j) != active_clusters_.end())) - MergeClusters(i, j); - } + if (num_points_ > first_pass_max_points_) + ClusterTwoPass(); + else + ClusterSinglePass(); +} - std::vector new_assignments(num_points_); - int32 label_id = 0; - std::set::iterator it; - // Iterate through the clusters and assign all utterances within the cluster - // an ID label unique to the cluster. This is the final output and frees up - // the cluster memory accordingly. - for (it = active_clusters_.begin(); it != active_clusters_.end(); ++it) { - ++label_id; - AhcCluster *cluster = clusters_map_[*it]; - std::vector::iterator utt_it; - for (utt_it = cluster->utt_ids.begin(); - utt_it != cluster->utt_ids.end(); ++utt_it) - new_assignments[*utt_it] = label_id; - delete cluster; +void AgglomerativeClusterer::ClusterSinglePass() { + InitializeClusters(0, num_points_); + ComputeClusters(min_clusters_); + AssignClusters(); +} + +void AgglomerativeClusterer::ClusterTwoPass() { + // This is the first pass loop. We divide the input into equal size subsets + // making sure each subset has at most first_pass_max_points_ points. Then, we + // cluster the points in each subset separately until a stopping criterion is + // reached. We set the minimum number of clusters to 10 * min_clusters_ for + // each subset to avoid early merging of most clusters that would otherwise be + // kept separate in single pass clustering. + BaseFloat num_points = static_cast(num_points_); + int32 num_subsets = ceil(num_points / first_pass_max_points_); + int32 subset_size = ceil(num_points / num_subsets); + for (int32 n = 0; n < num_points_; n += subset_size) { + InitializeClusters(n, std::min(n + subset_size, num_points_)); + ComputeClusters(min_clusters_ * 10); + AddClustersToSecondPass(); } - assignments_->swap(new_assignments); + + // We swap the contents of the first and second pass data structures so that + // we can use the same method to do second pass clustering. + clusters_map_.swap(second_pass_clusters_map_); + active_clusters_.swap(second_pass_active_clusters_); + cluster_cost_map_.swap(second_pass_cluster_cost_map_); + queue_.swap(second_pass_queue_); + count_ = second_pass_count_; + + // This is the second pass. It moves through the queue merging clusters + // determined in the first pass until a stopping criterion is reached. 
+ ComputeClusters(min_clusters_); + + AssignClusters(); } -BaseFloat AgglomerativeClusterer::GetCost(int32 i, int32 j) { +uint32 AgglomerativeClusterer::EncodePair(int32 i, int32 j) { if (i < j) - return cluster_cost_map_[std::make_pair(i, j)]; + return (static_cast(i) << 16) + static_cast(j); else - return cluster_cost_map_[std::make_pair(j, i)]; + return (static_cast(j) << 16) + static_cast(i); +} + +std::pair AgglomerativeClusterer::DecodePair(uint32 key) { + return std::make_pair(static_cast(key >> 16), + static_cast(key & 0x0000FFFFu)); } -void AgglomerativeClusterer::Initialize() { - KALDI_ASSERT(num_clusters_ != 0); - for (int32 i = 0; i < num_points_; i++) { +void AgglomerativeClusterer::InitializeClusters(int32 first, int32 last) { + KALDI_ASSERT(last > first); + clusters_map_.clear(); + active_clusters_.clear(); + cluster_cost_map_.clear(); + queue_ = QueueType(); // priority_queue does not have a clear method + + for (int32 i = first; i < last; i++) { // create an initial cluster of size 1 for each point std::vector ids; ids.push_back(i); - AhcCluster *c = new AhcCluster(++count_, -1, -1, ids); - clusters_map_[count_] = c; - active_clusters_.insert(count_); + AhcCluster *c = new AhcCluster(i + 1, -1, -1, ids); + clusters_map_[i + 1] = c; + active_clusters_.insert(i + 1); // propagate the queue with all pairs from the cost matrix - for (int32 j = i+1; j < num_clusters_; j++) { - BaseFloat cost = costs_(i,j); - cluster_cost_map_[std::make_pair(i+1, j+1)] = cost; - if (cost <= thresh_) - queue_.push(std::make_pair(cost, - std::make_pair(static_cast(i+1), - static_cast(j+1)))); + for (int32 j = i + 1; j < last; j++) { + BaseFloat cost = costs_(i, j); + uint32 key = EncodePair(i + 1, j + 1); + cluster_cost_map_[key] = cost; + if (cost <= threshold_) + queue_.push(std::make_pair(cost, key)); + } + } +} + +void AgglomerativeClusterer::ComputeClusters(int32 min_clusters) { + while (active_clusters_.size() > min_clusters && !queue_.empty()) { + std::pair pr = queue_.top(); + int32 i, j; + std::tie(i, j) = DecodePair(pr.second); + queue_.pop(); + // check to make sure clusters have not already been merged + if ((active_clusters_.find(i) != active_clusters_.end()) && + (active_clusters_.find(j) != active_clusters_.end())) { + if (clusters_map_[i]->size + clusters_map_[j]->size <= max_cluster_size_) + MergeClusters(i, j); } } } @@ -105,27 +139,99 @@ void AgglomerativeClusterer::MergeClusters(int32 i, int32 j) { std::set::iterator it; for (it = active_clusters_.begin(); it != active_clusters_.end(); ++it) { // The new cost is the sum of the costs of the new cluster's parents - BaseFloat new_cost = GetCost(*it, i) + GetCost(*it, j); - cluster_cost_map_[std::make_pair(*it, count_)] = new_cost; + BaseFloat new_cost = cluster_cost_map_[EncodePair(*it, i)] + + cluster_cost_map_[EncodePair(*it, j)]; + uint32 new_key = EncodePair(*it, count_); + cluster_cost_map_[new_key] = new_cost; BaseFloat norm = clust1->size * (clusters_map_[*it])->size; - if (new_cost / norm <= thresh_) - queue_.push(std::make_pair(new_cost / norm, - std::make_pair(static_cast(*it), - static_cast(count_)))); + if (new_cost / norm <= threshold_) + queue_.push(std::make_pair(new_cost / norm, new_key)); } active_clusters_.insert(count_); clusters_map_[count_] = clust1; delete clust2; - num_clusters_--; +} + +void AgglomerativeClusterer::AddClustersToSecondPass() { + // This method collects the results of first pass clustering for one subset, + // i.e. 
adds the set of active clusters to the set of second pass active + // clusters and computes the costs for the newly formed cluster pairs. + std::set::iterator it1, it2; + int32 count = second_pass_count_; + for (it1 = active_clusters_.begin(); it1 != active_clusters_.end(); ++it1) { + AhcCluster *clust1 = clusters_map_[*it1]; + second_pass_clusters_map_[++count] = clust1; + + // Compute new cluster pair costs + for (it2 = second_pass_active_clusters_.begin(); + it2 != second_pass_active_clusters_.end(); ++it2) { + AhcCluster *clust2 = second_pass_clusters_map_[*it2]; + uint32 new_key = EncodePair(count, *it2); + + BaseFloat new_cost = 0.0; + std::vector::iterator utt_it1, utt_it2; + for (utt_it1 = clust1->utt_ids.begin(); + utt_it1 != clust1->utt_ids.end(); ++utt_it1) { + for (utt_it2 = clust2->utt_ids.begin(); + utt_it2 != clust2->utt_ids.end(); ++utt_it2) { + new_cost += costs_(*utt_it1, *utt_it2); + } + } + + second_pass_cluster_cost_map_[new_key] = new_cost; + BaseFloat norm = clust1->size * clust2->size; + if (new_cost / norm <= threshold_) + second_pass_queue_.push(std::make_pair(new_cost / norm, new_key)); + } + + // Copy cluster pair costs that were already computed in the first pass + int32 count2 = second_pass_count_; + for (it2 = active_clusters_.begin(); it2 != it1; ++it2) { + uint32 key = EncodePair(*it1, *it2); + BaseFloat cost = cluster_cost_map_[key]; + BaseFloat norm = clust1->size * (clusters_map_[*it2])->size; + uint32 new_key = EncodePair(count, ++count2); + second_pass_cluster_cost_map_[new_key] = cost; + if (cost / norm <= threshold_) + second_pass_queue_.push(std::make_pair(cost / norm, new_key)); + } + } + // We update second_pass_count_ and second_pass_active_clusters_ here since + // above loop assumes they do not change while the loop is running. + while (second_pass_count_ < count) + second_pass_active_clusters_.insert(++second_pass_count_); +} + +void AgglomerativeClusterer::AssignClusters() { + assignments_->resize(num_points_); + int32 label_id = 0; + std::set::iterator it; + // Iterate through the clusters and assign all utterances within the cluster + // an ID label unique to the cluster. This is the final output and frees up + // the cluster memory accordingly. 
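The EncodePair()/DecodePair() helpers used throughout the new clustering code pack an unordered cluster-ID pair into a single 32-bit map key; a standalone restatement, for illustration only:

```cpp
#include <cstdint>
#include <utility>

// Mirrors AgglomerativeClusterer::EncodePair()/DecodePair() above.
uint32_t EncodePairExample(int32_t i, int32_t j) {
  if (i < j)
    return (static_cast<uint32_t>(i) << 16) + static_cast<uint32_t>(j);
  else
    return (static_cast<uint32_t>(j) << 16) + static_cast<uint32_t>(i);
}

std::pair<int32_t, int32_t> DecodePairExample(uint32_t key) {
  return std::make_pair(static_cast<int32_t>(key >> 16),
                        static_cast<int32_t>(key & 0x0000FFFFu));
}

// EncodePairExample(7, 3) == EncodePairExample(3, 7) == (3 << 16) + 7 == 196615,
// and DecodePairExample(196615) yields {3, 7}: the smaller ID always lands in
// the high half, so the key is order-independent.  Decoding is exact only
// while cluster IDs stay below 2^16.
```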
+ for (it = active_clusters_.begin(); it != active_clusters_.end(); ++it) { + ++label_id; + AhcCluster *cluster = clusters_map_[*it]; + std::vector::iterator utt_it; + for (utt_it = cluster->utt_ids.begin(); + utt_it != cluster->utt_ids.end(); ++utt_it) + (*assignments_)[*utt_it] = label_id; + delete cluster; + } } void AgglomerativeCluster( const Matrix &costs, - BaseFloat thresh, - int32 min_clust, + BaseFloat threshold, + int32 min_clusters, + int32 first_pass_max_points, + BaseFloat max_cluster_fraction, std::vector *assignments_out) { - KALDI_ASSERT(min_clust >= 0); - AgglomerativeClusterer ac(costs, thresh, min_clust, assignments_out); + KALDI_ASSERT(min_clusters >= 0); + KALDI_ASSERT(max_cluster_fraction >= 1.0 / min_clusters); + AgglomerativeClusterer ac(costs, threshold, min_clusters, + first_pass_max_points, max_cluster_fraction, + assignments_out); ac.Cluster(); } diff --git a/src/ivector/agglomerative-clustering.h b/src/ivector/agglomerative-clustering.h index 310a336f8b5..ffd63a86e29 100644 --- a/src/ivector/agglomerative-clustering.h +++ b/src/ivector/agglomerative-clustering.h @@ -2,6 +2,7 @@ // Copyright 2017-2018 Matthew Maciejewski // 2018 David Snyder +// 2019 Dogan Can // See ../../COPYING for clarification regarding multiple authors // @@ -55,65 +56,108 @@ class AgglomerativeClusterer { public: AgglomerativeClusterer( const Matrix &costs, - BaseFloat thresh, - int32 min_clust, + BaseFloat threshold, + int32 min_clusters, + int32 first_pass_max_points, + BaseFloat max_cluster_fraction, std::vector *assignments_out) - : count_(0), costs_(costs), thresh_(thresh), min_clust_(min_clust), + : costs_(costs), threshold_(threshold), min_clusters_(min_clusters), + first_pass_max_points_(first_pass_max_points), assignments_(assignments_out) { - num_clusters_ = costs.NumRows(); num_points_ = costs.NumRows(); + + // The max_cluster_size_ is a hard limit on the number points in a cluster. + // This is useful for handling degenerate cases where some outlier points + // form their own clusters and force everything else to be clustered + // together, e.g. when min-clusters is provided instead of a threshold. + max_cluster_size_ = ceil(num_points_ * max_cluster_fraction); + + // The count_, which is used for identifying clusters, is initialized to + // num_points_ because cluster IDs 1..num_points_ are reserved for input + // points, which are the initial set of clusters. + count_ = num_points_; + + // The second_pass_count_, which is used for identifying the initial set of + // second pass clusters and initializing count_ before the second pass, is + // initialized to 0 and incremented whenever a new cluster is added to the + // initial set of second pass clusters. + second_pass_count_ = 0; } - // Performs the clustering + // Clusters points. Chooses single pass or two pass algorithm. void Cluster(); + + // Clusters points using single pass algorithm. + void ClusterSinglePass(); + + // Clusters points using two pass algorithm. + void ClusterTwoPass(); + private: - // Returns the cost between clusters with IDs i and j - BaseFloat GetCost(int32 i, int32 j); + // Encodes cluster pair into a 32bit unsigned integer. + uint32 EncodePair(int32 i, int32 j); + // Decodes cluster pair from a 32bit unsigned integer. 
+ std::pair DecodePair(uint32 key); // Initializes the clustering queue with singleton clusters - void Initialize(); + void InitializeClusters(int32 first, int32 last); + // Does hierarchical agglomerative clustering + void ComputeClusters(int32 min_clusters); + // Adds clusters created in first pass to second pass clusters + void AddClustersToSecondPass(); + // Assigns points to clusters + void AssignClusters(); // Merges clusters with IDs i and j and updates cost map and queue void MergeClusters(int32 i, int32 j); - - int32 count_; // Count of clusters that have been created. Also used to give - // clusters unique IDs. const Matrix &costs_; // cost matrix - BaseFloat thresh_; // stopping criterion threshold - int32 min_clust_; // minimum number of clusters + BaseFloat threshold_; // stopping criterion threshold + int32 min_clusters_; // minimum number of clusters + int32 first_pass_max_points_; // maximum number of points in each subset std::vector *assignments_; // assignments out + int32 num_points_; // total number of points to cluster + int32 max_cluster_size_; // maximum number of points in a cluster + int32 count_; // count of first pass clusters, used for identifying clusters + int32 second_pass_count_; // count of second pass clusters + // Priority queue using greater (lowest costs are highest priority). // Elements contain pairs of cluster IDs and their cost. - typedef std::pair > QueueElement; + typedef std::pair QueueElement; typedef std::priority_queue, std::greater > QueueType; - QueueType queue_; + QueueType queue_, second_pass_queue_; // Map from cluster IDs to cost between them - std::unordered_map, BaseFloat, - PairHasher> cluster_cost_map_; + std::unordered_map cluster_cost_map_; // Map from cluster ID to cluster object address std::unordered_map clusters_map_; - std::set active_clusters_; // IDs of unmerged clusters - int32 num_clusters_; // number of active clusters - int32 num_points_; // total number of points to cluster + // Set of unmerged cluster IDs + std::set active_clusters_; + + // Map from second pass cluster IDs to cost between them + std::unordered_map second_pass_cluster_cost_map_; + // Map from second pass cluster ID to cluster object address + std::unordered_map second_pass_clusters_map_; + // Set of unmerged second pass cluster IDs + std::set second_pass_active_clusters_; }; /** This is the function that is called to perform the agglomerative * clustering. It takes the following arguments: * - A matrix of all pairwise costs, with each row/column corresponding * to an utterance ID, and the elements of the matrix containing the - cost for pairing the utterances for its row and column + * cost for pairing the utterances for its row and column * - A threshold which is used as the stopping criterion for the clusters * - A minimum number of clusters that will not be merged past + * - A maximum fraction of points that can be in a cluster * - A vector which will be filled with integer IDs corresponding to each * of the rows/columns of the score matrix. * * The basic algorithm is as follows: * \code - * while (num-clusters > min_clust && smallest-merge-cost <= thresh) - * merge the two clusters with lowest cost. 
+ * while (num-clusters > min-clusters && smallest-merge-cost <= threshold) + * if (size-of-new-cluster <= max-cluster-size) + * merge the two clusters with lowest cost * \endcode * * The cost between two clusters is the average cost of all pairwise @@ -126,11 +170,19 @@ class AgglomerativeClusterer { * costs between clusters I and M and clusters I and N, where * cluster J was formed by merging clusters M and N. * + * If the number of points to cluster is larger than first-pass-max-points, + * then clustering is done in two passes. In the first pass, input points are + * divided into contiguous subsets of size at most first-pass-max-points and + * each subset is clustered separately. In the second pass, the first pass + * clusters are merged into the final set of clusters. + * */ void AgglomerativeCluster( const Matrix &costs, - BaseFloat thresh, - int32 min_clust, + BaseFloat threshold, + int32 min_clusters, + int32 first_pass_max_points, + BaseFloat max_cluster_fraction, std::vector *assignments_out); } // end namespace kaldi. diff --git a/src/ivector/logistic-regression.cc b/src/ivector/logistic-regression.cc index 5d02c013294..4eae2ebe3d7 100644 --- a/src/ivector/logistic-regression.cc +++ b/src/ivector/logistic-regression.cc @@ -55,8 +55,7 @@ void LogisticRegression::Train(const Matrix &xs, weights_.SetZero(); TrainParameters(xs_with_prior, ys, conf, &xw); - KALDI_LOG << - "Finished training parameters without mixture components." << std::endl; + KALDI_LOG << "Finished training parameters without mixture components."; // If we are using mixture components, we add those components // in MixUp and retrain with the extra weights. @@ -64,8 +63,7 @@ void LogisticRegression::Train(const Matrix &xs, MixUp(ys, num_classes, conf); Matrix xw(xs_num_rows, weights_.NumRows()); TrainParameters(xs_with_prior, ys, conf, &xw); - KALDI_LOG << - "Finished training mixture components." << std::endl; + KALDI_LOG << "Finished training mixture components."; } } @@ -87,8 +85,7 @@ void LogisticRegression::MixUp(const std::vector &ys, static_cast(0)); KALDI_LOG << "Target number mixture components was " << conf.mix_up - << ". Training " << new_dim << " mixture components. " - << std::endl; + << ". 
Training " << new_dim << " mixture components."; int32 old_dim = weights_.NumRows(), num_components = old_dim, diff --git a/src/ivectorbin/Makefile b/src/ivectorbin/Makefile index 5a738352d9c..9f50bd232fa 100644 --- a/src/ivectorbin/Makefile +++ b/src/ivectorbin/Makefile @@ -16,7 +16,7 @@ BINFILES = ivector-extractor-init ivector-extractor-acc-stats \ logistic-regression-train logistic-regression-eval \ logistic-regression-copy ivector-extract-online \ ivector-adapt-plda ivector-plda-scoring-dense \ - agglomerative-cluster + agglomerative-cluster ivector-extractor-copy OBJFILES = diff --git a/src/ivectorbin/agglomerative-cluster.cc b/src/ivectorbin/agglomerative-cluster.cc index 9dca9bfeb83..4812dd291e1 100644 --- a/src/ivectorbin/agglomerative-cluster.cc +++ b/src/ivectorbin/agglomerative-cluster.cc @@ -2,6 +2,7 @@ // Copyright 2016-2018 David Snyder // 2017-2018 Matthew Maciejewski +// 2019 Dogan Can // See ../../COPYING for clarification regarding multiple authors // @@ -47,8 +48,9 @@ int main(int argc, char *argv[]) { ParseOptions po(usage); std::string reco2num_spk_rspecifier; - BaseFloat threshold = 0.0; + BaseFloat threshold = 0.0, max_spk_fraction = 1.0; bool read_costs = false; + int32 first_pass_max_utterances = std::numeric_limits::max(); po.Register("reco2num-spk-rspecifier", &reco2num_spk_rspecifier, "If supplied, clustering creates exactly this many clusters for each" @@ -58,6 +60,16 @@ int main(int argc, char *argv[]) { po.Register("read-costs", &read_costs, "If true, the first" " argument is interpreted as a matrix of costs rather than a" " similarity matrix."); + po.Register("first-pass-max-utterances", &first_pass_max_utterances, + "If the number of utterances is larger than first-pass-max-utterances," + " then clustering is done in two passes. In the first pass, input points" + " are divided into contiguous subsets of size first-pass-max-utterances" + " and each subset is clustered separately. In the second pass, the first" + " pass clusters are merged into the final set of clusters."); + po.Register("max-spk-fraction", &max_spk_fraction, "Merge clusters if the" + " total fraction of utterances in them is less than this threshold." 
+ " This is active only when reco2num-spk-rspecifier is supplied and" + " 1.0 / num-spk <= max-spk-fraction <= 1.0."); po.Read(argc, argv); @@ -90,10 +102,17 @@ int main(int argc, char *argv[]) { std::vector spk_ids; if (reco2num_spk_rspecifier.size()) { int32 num_speakers = reco2num_spk_reader.Value(reco); - AgglomerativeCluster(costs, - std::numeric_limits::max(), num_speakers, &spk_ids); + if (1.0 / num_speakers <= max_spk_fraction && max_spk_fraction <= 1.0) + AgglomerativeCluster(costs, std::numeric_limits::max(), + num_speakers, first_pass_max_utterances, + max_spk_fraction, &spk_ids); + else + AgglomerativeCluster(costs, std::numeric_limits::max(), + num_speakers, first_pass_max_utterances, + 1.0, &spk_ids); } else { - AgglomerativeCluster(costs, threshold, 1, &spk_ids); + AgglomerativeCluster(costs, threshold, 1, first_pass_max_utterances, + 1.0, &spk_ids); } for (int32 i = 0; i < spk_ids.size(); i++) label_writer.Write(uttlist[i], spk_ids[i]); diff --git a/src/ivectorbin/ivector-extractor-copy.cc b/src/ivectorbin/ivector-extractor-copy.cc new file mode 100755 index 00000000000..4b443a1b7b5 --- /dev/null +++ b/src/ivectorbin/ivector-extractor-copy.cc @@ -0,0 +1,68 @@ + +// Copyright 2019 Valluri Saikiran, Nagendra Goel, Govivace.inc + + +// This file copies ivector extractor into text format +// which is to be loaded by dump_model.py, to parse the model parameters, +// before VB resegmentation. + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "ivector/ivector-extractor.h" + + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using kaldi::int32; + + const char *usage = + "Copy an ivector-extractor\n" + "Usage: ivector-extractor-copy [options] \n" + "e.g.:\n" + " ivector-extractor-copy --binary=false 0.ie 0_txt.ie\n"; + + bool binary = true; + IvectorExtractorOptions ivector_opts; + ParseOptions po(usage); + po.Register("binary", &binary, "Write output in binary mode"); + ivector_opts.Register(&po); + + po.Read(argc, argv); + + if (po.NumArgs() != 2) { + po.PrintUsage(); + exit(1); + } + + std::string ivector_extractor_rxfilename = po.GetArg(1), + ivector_extractor_wxfilename = po.GetArg(2); + + IvectorExtractor extractor; + ReadKaldiObject(ivector_extractor_rxfilename, &extractor); + + WriteKaldiObject(extractor, ivector_extractor_wxfilename, binary); + + return 0; + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} + diff --git a/src/kwsbin/compute-atwv.cc b/src/kwsbin/compute-atwv.cc index 6d9f6d2c2bb..0907baf268a 100644 --- a/src/kwsbin/compute-atwv.cc +++ b/src/kwsbin/compute-atwv.cc @@ -112,7 +112,7 @@ int main(int argc, char *argv[]) { if (vals.size() != 4) { KALDI_ERR << "Incorrect format of the reference file" << " -- 4 entries expected, " << vals.size() << " given!\n" - << "Key: " << kwid << std::endl; + << "Key: " << kwid; } KwsTerm inst(kwid, vals); aligner.AddRef(inst); @@ -127,7 +127,7 @@ int main(int argc, char *argv[]) { if (vals.size() != 4) { KALDI_ERR << "Incorrect format of the hypotheses file" << " -- 4 entries expected, " << vals.size() << " given!\n" - << "Key: " << kwid << std::endl; + << "Key: " << kwid; } KwsTerm inst(kwid, vals); aligner.AddHyp(inst); @@ -171,4 +171,3 @@ int main(int argc, char *argv[]) { return -1; } } - diff --git a/src/lat/compose-lattice-pruned.cc b/src/lat/compose-lattice-pruned.cc index c6e4dafc008..cc71db38eab 100644 --- a/src/lat/compose-lattice-pruned.cc +++ b/src/lat/compose-lattice-pruned.cc @@ -658,6 +658,7 @@ void PrunedCompactLatticeComposer::AddFirstState() { composed_state_queue_.push( std::pair(expected_cost_offset, state_id)); // actually (0.0, 0). + } @@ -771,7 +772,14 @@ void PrunedCompactLatticeComposer::ProcessTransition(int32 src_composed_state, // Note: we expect that ilabel == olabel, since this is a CompactLattice, but this // may not be so if we extend this to work with Lattice. fst::StdArc lm_arc; - if (!det_fst_->GetArc(src_info->lm_state, olabel, &lm_arc)) { + + // the input lattice might have epsilons + if (olabel == 0) { + lm_arc.ilabel = 0; + lm_arc.olabel = 0; + lm_arc.nextstate = src_info->lm_state; + lm_arc.weight = fst::StdArc::Weight(0.0); + } else if (!det_fst_->GetArc(src_info->lm_state, olabel, &lm_arc)) { // for normal language models we don't expect this to happen, but the // appropriate behavior is to do nothing; the composed arc does not exist, // so there is no arc to add and no new state to create. 
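A small stand-in example of the epsilon handling added to ProcessTransition above: when the lattice arc's output label is 0, the LM-side deterministic FST is not queried at all; the composed arc keeps the current LM state and contributes no extra cost (weight 0.0 in the tropical semiring). The types and names below (`ToyArc`, `GetLmArcOrEpsilon`, `lm_get_arc`) are invented for illustration and deliberately avoid the real OpenFst/Kaldi types.

```cpp
// Simplified stand-in (not OpenFst / not Kaldi types) for the epsilon handling
// added above: an epsilon output label passes through without consulting the
// LM and without advancing or penalizing the LM state.
#include <functional>
#include <iostream>

struct ToyArc {
  int ilabel, olabel, nextstate;
  float weight;  // tropical: 0.0 means "no extra cost"
};

// lm_get_arc mimics det_fst_->GetArc(state, label, &arc): it returns false if
// the LM has no arc for this label, which normally means "drop the transition".
bool GetLmArcOrEpsilon(const std::function<bool(int, int, ToyArc*)> &lm_get_arc,
                       int lm_state, int olabel, ToyArc *lm_arc) {
  if (olabel == 0) {            // epsilon: stay in lm_state, free of charge
    lm_arc->ilabel = 0;
    lm_arc->olabel = 0;
    lm_arc->nextstate = lm_state;
    lm_arc->weight = 0.0f;
    return true;
  }
  return lm_get_arc(lm_state, olabel, lm_arc);
}

int main() {
  // A toy "LM" with a single arc on label 7 from state 3 to state 4, cost 1.5.
  std::function<bool(int, int, ToyArc*)> lm =
      [](int state, int label, ToyArc *arc) {
        if (state == 3 && label == 7) {
          arc->ilabel = arc->olabel = 7;
          arc->nextstate = 4;
          arc->weight = 1.5f;
          return true;
        }
        return false;
      };
  ToyArc a;
  if (GetLmArcOrEpsilon(lm, 3, 0, &a))   // epsilon label
    std::cout << "eps: stay in state " << a.nextstate
              << " at cost " << a.weight << std::endl;  // state 3, cost 0
  if (GetLmArcOrEpsilon(lm, 3, 7, &a))   // regular label
    std::cout << "label 7: go to state " << a.nextstate
              << " at cost " << a.weight << std::endl;  // state 4, cost 1.5
  return 0;
}
```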
diff --git a/src/lat/determinize-lattice-pruned.cc b/src/lat/determinize-lattice-pruned.cc index 447c951d02c..22eae8199ff 100644 --- a/src/lat/determinize-lattice-pruned.cc +++ b/src/lat/determinize-lattice-pruned.cc @@ -665,8 +665,7 @@ template class LatticeDeterminizerPruned { continue; if (opts_.max_loop > 0 && counter++ > opts_.max_loop) { KALDI_ERR << "Lattice determinization aborted since looped more than " - << opts_.max_loop << " times during epsilon closure.\n"; - throw std::runtime_error("looped more than max-arcs times in lattice determinization"); + << opts_.max_loop << " times during epsilon closure."; } for (ArcIterator > aiter(*ifst_, elem.state); !aiter.Done(); aiter.Next()) { const Arc &arc = aiter.Value(); diff --git a/src/lat/sausages.cc b/src/lat/sausages.cc index e376272510c..b851bc3604c 100644 --- a/src/lat/sausages.cc +++ b/src/lat/sausages.cc @@ -53,7 +53,17 @@ void MinimumBayesRisk::MbrDecode() { } // build the outputs (time, confidences), if (R_[q] != 0 || opts_.print_silence) { - one_best_times_.push_back(times_[q][0]); + // see which 'item' from the sausage-bin should we select, + // (not necessarily the 1st one when MBR decoding disabled) + int32 s = 0; + for (int32 j=0; j 1 && one_best_times_[i-2].second > one_best_times_[i-1].first) { // It's quite possible for this to happen, but it seems like it would @@ -76,8 +86,12 @@ void MinimumBayesRisk::MbrDecode() { one_best_times_[i-1].second = right; } BaseFloat confidence = 0.0; - for (int32 j = 0; j < gamma_[q].size(); j++) - if (gamma_[q][j].first == R_[q]) confidence = gamma_[q][j].second; + for (int32 j = 0; j < gamma_[q].size(); j++) { + if (gamma_[q][j].first == R_[q]) { + confidence = gamma_[q][j].second; + break; + } + } one_best_confidences_.push_back(confidence); } } diff --git a/src/latbin/lattice-expand-ngram.cc b/src/latbin/lattice-expand-ngram.cc index 1b8cfbee24b..1e7625d79e0 100644 --- a/src/latbin/lattice-expand-ngram.cc +++ b/src/latbin/lattice-expand-ngram.cc @@ -36,15 +36,15 @@ int main(int argc, char *argv[]) { "Usage: lattice-expand-ngram [options] lattice-rspecifier " "lattice-wspecifier\n" "e.g.: lattice-expand-ngram --n=3 ark:lat ark:expanded_lat\n"; - + ParseOptions po(usage); int32 n = 3; std::string word_syms_filename; po.Register("n", &n, "n-gram context to expand to."); - + po.Read(argc, argv); - + if (po.NumArgs() != 2) { po.PrintUsage(); exit(1); @@ -58,10 +58,10 @@ int main(int argc, char *argv[]) { fst::UnweightedNgramFst expand_fst(n); SequentialCompactLatticeReader lat_reader(lats_rspecifier); - CompactLatticeWriter lat_writer(lats_wspecifier); + CompactLatticeWriter lat_writer(lats_wspecifier); int32 n_done = 0, n_fail = 0; - + for (; !lat_reader.Done(); lat_reader.Next()) { std::string key = lat_reader.Key(); KALDI_LOG << "Processing lattice for key " << key; @@ -69,14 +69,14 @@ int main(int argc, char *argv[]) { CompactLattice expanded_lat; ComposeDeterministicOnDemand(lat, &expand_fst, &expanded_lat); if (expanded_lat.Start() == fst::kNoStateId) { - KALDI_WARN << "Empty lattice for utterance " << key << std::endl; + KALDI_WARN << "Empty lattice for utterance " << key; n_fail++; } else { if (lat.NumStates() == expanded_lat.NumStates()) { - KALDI_LOG << "Lattice for key " << key + KALDI_LOG << "Lattice for key " << key << " did not need to be expanded for order " << n << "."; } else { - KALDI_LOG << "Lattice expanded from " << lat.NumStates() << " to " + KALDI_LOG << "Lattice expanded from " << lat.NumStates() << " to " << expanded_lat.NumStates() << " states for order " << n 
<< "."; } lat_writer.Write(key, expanded_lat); @@ -84,7 +84,7 @@ int main(int argc, char *argv[]) { } lat_reader.FreeCurrent(); } - KALDI_LOG << "Processed " << n_done << " lattices with " << n_fail + KALDI_LOG << "Processed " << n_done << " lattices with " << n_fail << " failures."; return 0; } catch(const std::exception &e) { diff --git a/src/lm/arpa-file-parser.cc b/src/lm/arpa-file-parser.cc index f3565eabf4e..53e4a1b61bf 100644 --- a/src/lm/arpa-file-parser.cc +++ b/src/lm/arpa-file-parser.cc @@ -74,7 +74,7 @@ void ArpaFileParser::Read(std::istream &is) { warning_count_ = 0; current_line_.clear(); -#define PARSE_ERR (KALDI_ERR << LineReference() << ": ") +#define PARSE_ERR KALDI_ERR << LineReference() << ": " // Give derived class an opportunity to prepare its state. ReadStarted(); diff --git a/src/lm/arpa-lm-compiler-test.cc b/src/lm/arpa-lm-compiler-test.cc index 697d70c416a..ccfd26af7e5 100644 --- a/src/lm/arpa-lm-compiler-test.cc +++ b/src/lm/arpa-lm-compiler-test.cc @@ -209,8 +209,7 @@ bool ThrowsExceptionTest(bool seps, const string &infile) { // Make memory cleanup easy in both cases of try-catch block. std::unique_ptr compiler(Compile(seps, infile)); return false; - } catch (const std::runtime_error&) { - // Kaldi throws only std::runtime_error in kaldi-error.cc + } catch (const KaldiFatalError&) { return true; } } diff --git a/src/makefiles/cuda_64bit.mk b/src/makefiles/cuda_64bit.mk index d66ae03602f..459cfa652ef 100644 --- a/src/makefiles/cuda_64bit.mk +++ b/src/makefiles/cuda_64bit.mk @@ -5,7 +5,7 @@ ifndef CUDATKDIR $(error CUDATKDIR not defined.) endif -CXXFLAGS += -DHAVE_CUDA -I$(CUDATKDIR)/include -fPIC -pthread -isystem $(OPENFSTINC) -rdynamic +CXXFLAGS += -DHAVE_CUDA -I$(CUDATKDIR)/include -fPIC -pthread -isystem $(OPENFSTINC) CUDA_INCLUDE= -I$(CUDATKDIR)/include -I$(CUBROOT) CUDA_FLAGS = --machine 64 -DHAVE_CUDA \ @@ -14,4 +14,4 @@ CUDA_FLAGS = --machine 64 -DHAVE_CUDA \ --verbose -Xcompiler "$(CXXFLAGS)" CUDA_LDFLAGS += -L$(CUDATKDIR)/lib64 -Wl,-rpath,$(CUDATKDIR)/lib64 -CUDA_LDLIBS += -lcublas -lcusparse -lcudart -lcurand -lnvToolsExt #LDLIBS : The libs are loaded later than static libs in implicit rule +CUDA_LDLIBS += -lcublas -lcusparse -lcudart -lcurand -lcufft -lnvToolsExt #LDLIBS : The .so libs are loaded later than static libs in implicit rule diff --git a/src/makefiles/default_rules.mk b/src/makefiles/default_rules.mk index 25dafae2f3a..14989e8afaf 100644 --- a/src/makefiles/default_rules.mk +++ b/src/makefiles/default_rules.mk @@ -3,11 +3,19 @@ SHELL := /bin/bash ifeq ($(KALDI_FLAVOR), dynamic) ifeq ($(shell uname), Darwin) - ifdef LIBNAME - LIBFILE = lib$(LIBNAME).dylib + ifdef ANDROIDINC # cross-compiling enabled on host MacOS + ifdef LIBNAME + LIBFILE = lib$(LIBNAME).so + endif + LDFLAGS += -Wl,-rpath -Wl,$(KALDILIBDIR) + EXTRA_LDLIBS += $(foreach dep,$(ADDLIBS), $(dir $(dep))$(notdir $(basename $(dep))).a) + else + ifdef LIBNAME + LIBFILE = lib$(LIBNAME).dylib + endif + LDFLAGS += -Wl,-rpath -Wl,$(KALDILIBDIR) + EXTRA_LDLIBS += $(foreach dep,$(ADDLIBS), $(dir $(dep))lib$(notdir $(basename $(dep))).dylib) endif - LDFLAGS += -Wl,-rpath -Wl,$(KALDILIBDIR) - EXTRA_LDLIBS += $(foreach dep,$(ADDLIBS), $(dir $(dep))lib$(notdir $(basename $(dep))).dylib) else ifeq ($(shell uname), Linux) ifdef LIBNAME LIBFILE = lib$(LIBNAME).so @@ -125,7 +133,7 @@ valgrind: .valgrind #buid up dependency commands CC_SRCS=$(wildcard *.cc) #check if files exist to run dependency commands on -ifneq ($(CC_SRCS),) +ifneq ($(CC_SRCS),) CC_DEP_COMMAND=$(CXX) -M $(CXXFLAGS) $(CC_SRCS) 
endif diff --git a/src/makefiles/linux_x86_64_mkl.mk b/src/makefiles/linux_x86_64_mkl.mk index 7a70fa51a65..d1c399d9796 100644 --- a/src/makefiles/linux_x86_64_mkl.mk +++ b/src/makefiles/linux_x86_64_mkl.mk @@ -22,7 +22,7 @@ ifndef MKLROOT $(error MKLROOT not defined.) endif -MKLLIB ?= $(MKLROOT)/lib/em64t +MKLLIB ?= $(MKLROOT)/lib/intel64 CXXFLAGS = -std=c++11 -I.. -isystem $(OPENFSTINC) -O1 $(EXTRA_CXXFLAGS) \ -Wall -Wno-sign-compare -Wno-unused-local-typedefs \ diff --git a/src/matrix/kaldi-matrix.cc b/src/matrix/kaldi-matrix.cc index fcfe0616b64..faf23cdf0c5 100644 --- a/src/matrix/kaldi-matrix.cc +++ b/src/matrix/kaldi-matrix.cc @@ -5,6 +5,7 @@ // Yanmin Qian; Petr Schwarz; Jan Silovsky; // Haihua Xu // 2017 Shiyin Kang +// 2019 Yiwen Shao // See ../../COPYING for clarification regarding multiple authors // @@ -93,7 +94,7 @@ void MatrixBase::Invert(Real *log_det, Real *det_sign, prod *= (*this)(i, i); if (i == num_rows_ - 1 || std::fabs(prod) < 1.0e-10 || std::fabs(prod) > 1.0e+10) { - if (log_det != NULL) *log_det += Log(std::fabs(prod)); + if (log_det != NULL) *log_det += kaldi::Log(std::fabs(prod)); if (det_sign != NULL) *det_sign *= (prod > 0 ? 1.0 : -1.0); prod = 1.0; } @@ -1470,7 +1471,7 @@ void Matrix::Read(std::istream & is, bool binary, bool add) { if (binary) { // Read in binary mode. int peekval = Peek(is, binary); if (peekval == 'C') { - // This code enable us to read CompressedMatrix as a regular matrix. + // This code enables us to read CompressedMatrix as a regular matrix. CompressedMatrix compressed_mat; compressed_mat.Read(is, binary); // at this point, add == false. this->Resize(compressed_mat.NumRows(), compressed_mat.NumCols()); @@ -2098,90 +2099,135 @@ void Matrix::Transpose() { } template -void MatrixBase::ApplyFloor(Real floor_val) { +void MatrixBase::Heaviside(const MatrixBase &src) { + KALDI_ASSERT(SameDim(*this, src)); MatrixIndexT num_rows = num_rows_, num_cols = num_cols_; - for (MatrixIndexT i = 0; i < num_rows; i++) { - Real *data = this->RowData(i); - for (MatrixIndexT j = 0; j < num_cols; j++) - data[j] = (data[j] < floor_val ? floor_val : data[j]); + Real *row_data = data_; + const Real *src_row_data = src.Data(); + for (MatrixIndexT row = 0; row < num_rows; + row++,row_data += stride_, src_row_data += src.stride_) { + for (MatrixIndexT col = 0; col < num_cols; col++) + row_data[col] = (src_row_data[col] > 0 ? 1.0 : 0.0); } } template -void MatrixBase::ApplyCeiling(Real ceiling_val) { +void MatrixBase::Exp(const MatrixBase &src) { + KALDI_ASSERT(SameDim(*this, src)); MatrixIndexT num_rows = num_rows_, num_cols = num_cols_; - for (MatrixIndexT i = 0; i < num_rows; i++) { - Real *data = this->RowData(i); - for (MatrixIndexT j = 0; j < num_cols; j++) - data[j] = (data[j] > ceiling_val ? 
ceiling_val : data[j]); + Real *row_data = data_; + const Real *src_row_data = src.Data(); + for (MatrixIndexT row = 0; row < num_rows; + row++,row_data += stride_, src_row_data += src.stride_) { + for (MatrixIndexT col = 0; col < num_cols; col++) + row_data[col] = kaldi::Exp(src_row_data[col]); } } template -void MatrixBase::ApplyLog() { - for (MatrixIndexT i = 0; i < num_rows_; i++) { - Row(i).ApplyLog(); +void MatrixBase::Pow(const MatrixBase &src, Real power) { + KALDI_ASSERT(SameDim(*this, src)); + MatrixIndexT num_rows = num_rows_, num_cols = num_cols_; + Real *row_data = data_; + const Real *src_row_data = src.Data(); + for (MatrixIndexT row = 0; row < num_rows; + row++,row_data += stride_, src_row_data += src.stride_) { + for (MatrixIndexT col = 0; col < num_cols; col++) { + row_data[col] = pow(src_row_data[col], power); + } } } template -void MatrixBase::ApplyExp() { - for (MatrixIndexT i = 0; i < num_rows_; i++) { - Row(i).ApplyExp(); +void MatrixBase::PowAbs(const MatrixBase &src, Real power, bool include_sign) { + KALDI_ASSERT(SameDim(*this, src)); + MatrixIndexT num_rows = num_rows_, num_cols = num_cols_; + Real *row_data = data_; + const Real *src_row_data = src.Data(); + for (MatrixIndexT row = 0; row < num_rows; + row++,row_data += stride_, src_row_data += src.stride_) { + for (MatrixIndexT col = 0; col < num_cols; col ++) { + if (include_sign == true && src_row_data[col] < 0) { + row_data[col] = -pow(std::abs(src_row_data[col]), power); + } else { + row_data[col] = pow(std::abs(src_row_data[col]), power); + } + } } } template -void MatrixBase::ApplyExpSpecial() { - int32 num_rows = num_rows_, num_cols = num_cols_, - stride = stride_; - Real *data = data_; - for (MatrixIndexT i = 0; i < num_rows; ++i) { - for (MatrixIndexT j = 0; j < num_cols; ++j) { - Real &x = *(data + j + stride * i); - x = x < Real(0) ? Exp(x) : x + Real(1); - } +void MatrixBase::Floor(const MatrixBase &src, Real floor_val) { + KALDI_ASSERT(SameDim(*this, src)); + MatrixIndexT num_rows = num_rows_, num_cols = num_cols_; + Real *row_data = data_; + const Real *src_row_data = src.Data(); + for (MatrixIndexT row = 0; row < num_rows; + row++,row_data += stride_, src_row_data += src.stride_) { + for (MatrixIndexT col = 0; col < num_cols; col++) + row_data[col] = (src_row_data[col] < floor_val ? floor_val : src_row_data[col]); } } template -void MatrixBase::ApplyPow(Real power) { - for (MatrixIndexT i = 0; i < num_rows_; i++) { - Row(i).ApplyPow(power); +void MatrixBase::Ceiling(const MatrixBase &src, Real ceiling_val) { + KALDI_ASSERT(SameDim(*this, src)); + MatrixIndexT num_rows = num_rows_, num_cols = num_cols_; + Real *row_data = data_; + const Real *src_row_data = src.Data(); + for (MatrixIndexT row = 0; row < num_rows; + row++,row_data += stride_, src_row_data += src.stride_) { + for (MatrixIndexT col = 0; col < num_cols; col++) + row_data[col] = (src_row_data[col] > ceiling_val ? 
ceiling_val : src_row_data[col]); } } template -void MatrixBase::ApplyPowAbs(Real power, bool include_sign) { - for (MatrixIndexT i = 0; i < num_rows_; i++) { - Row(i).ApplyPowAbs(power, include_sign); +void MatrixBase::Log(const MatrixBase &src) { + KALDI_ASSERT(SameDim(*this, src)); + MatrixIndexT num_rows = num_rows_, num_cols = num_cols_; + Real *row_data = data_; + const Real *src_row_data = src.Data(); + for (MatrixIndexT row = 0; row < num_rows; + row++,row_data += stride_, src_row_data += src.stride_) { + for (MatrixIndexT col = 0; col < num_cols; col++) + row_data[col] = kaldi::Log(src_row_data[col]); } } template -void MatrixBase::ApplyHeaviside() { +void MatrixBase::ExpSpecial(const MatrixBase &src) { + KALDI_ASSERT(SameDim(*this, src)); MatrixIndexT num_rows = num_rows_, num_cols = num_cols_; - for (MatrixIndexT i = 0; i < num_rows; i++) { - Real *data = this->RowData(i); - for (MatrixIndexT j = 0; j < num_cols; j++) - data[j] = (data[j] > 0 ? 1.0 : 0.0); + Real *row_data = data_; + const Real *src_row_data = src.Data(); + for (MatrixIndexT row = 0; row < num_rows; + row++,row_data += stride_, src_row_data += src.stride_) { + for (MatrixIndexT col = 0; col < num_cols; col++) + row_data[col] = (src_row_data[col] < Real(0) ? kaldi::Exp(src_row_data[col]) : (src_row_data[col] + Real(1))); } } template -void MatrixBase::Heaviside(const MatrixBase &src) { +void MatrixBase::ExpLimited(const MatrixBase &src, Real lower_limit, Real upper_limit) { KALDI_ASSERT(SameDim(*this, src)); MatrixIndexT num_rows = num_rows_, num_cols = num_cols_; Real *row_data = data_; const Real *src_row_data = src.Data(); for (MatrixIndexT row = 0; row < num_rows; row++,row_data += stride_, src_row_data += src.stride_) { - for (MatrixIndexT col = 0; col < num_cols; col++) - row_data[col] = (src_row_data[col] > 0 ? 1.0 : 0.0); + for (MatrixIndexT col = 0; col < num_cols; col++) { + const Real x = src_row_data[col]; + if (!(x >= lower_limit)) + row_data[col] = kaldi::Exp(lower_limit); + else if (x > upper_limit) + row_data[col] = kaldi::Exp(upper_limit); + else + row_data[col] = kaldi::Exp(x); + } } } - template bool MatrixBase::Power(Real power) { KALDI_ASSERT(num_rows_ > 0 && num_rows_ == num_cols_); @@ -2695,10 +2741,10 @@ Real MatrixBase::LogSumExp(Real prune) const { for (MatrixIndexT j = 0; j < num_cols_; j++) { BaseFloat f = (*this)(i, j); if (f >= cutoff) - sum_relto_max_elem += Exp(f - max_elem); + sum_relto_max_elem += kaldi::Exp(f - max_elem); } } - return max_elem + Log(sum_relto_max_elem); + return max_elem + kaldi::Log(sum_relto_max_elem); } template @@ -2707,9 +2753,9 @@ Real MatrixBase::ApplySoftMax() { // the 'max' helps to get in good numeric range. for (MatrixIndexT i = 0; i < num_rows_; i++) for (MatrixIndexT j = 0; j < num_cols_; j++) - sum += ((*this)(i, j) = Exp((*this)(i, j) - max)); + sum += ((*this)(i, j) = kaldi::Exp((*this)(i, j) - max)); this->Scale(1.0 / sum); - return max + Log(sum); + return max + kaldi::Log(sum); } template @@ -2739,7 +2785,7 @@ void MatrixBase::SoftHinge(const MatrixBase &src) { Real x = src_row_data[c], y; if (x > 10.0) y = x; // avoid exponentiating large numbers; function // approaches y=x. 
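Before the header-side wrappers shown next, here is a minimal sketch of how the reworked element-wise operations are meant to be called, based only on the signatures introduced in this patch (destination.Op(source, ...), with the old Apply* methods kept as thin in-place wrappers). It assumes a normal Kaldi build for the headers; SetRandn and Max are pre-existing MatrixBase helpers used just to make the fragment runnable.

```cpp
// Sketch of calling the reworked element-wise ops from this patch.
#include "base/kaldi-common.h"
#include "matrix/kaldi-matrix.h"

int main() {
  using namespace kaldi;
  Matrix<BaseFloat> src(2, 3), dst(2, 3);
  src.SetRandn();                 // arbitrary values

  dst.Floor(src, 0.0);            // dst = max(src, 0); src is untouched
  dst.ApplyCeiling(1.0);          // in-place wrapper: dst.Ceiling(dst, 1.0)

  Matrix<BaseFloat> e(2, 3);
  e.ExpLimited(src, -5.0, 5.0);   // clamp src to [-5, 5], then exponentiate

  KALDI_LOG << "max after floor+ceiling: " << dst.Max();
  return 0;
}
```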
- else y = Log1p(Exp(x)); // these defined in kaldi-math.h + else y = Log1p(kaldi::Exp(x)); // these defined in kaldi-math.h row_data[c] = y; } } diff --git a/src/matrix/kaldi-matrix.h b/src/matrix/kaldi-matrix.h index 11a5e08b15d..4387538c472 100644 --- a/src/matrix/kaldi-matrix.h +++ b/src/matrix/kaldi-matrix.h @@ -4,6 +4,7 @@ // Saarland University; Petr Schwarz; Yanmin Qian; // Karel Vesely; Go Vivace Inc.; Haihua Xu // 2017 Shiyin Kang +// 2019 Yiwen Shao // See ../../COPYING for clarification regarding multiple authors // @@ -59,10 +60,10 @@ class MatrixBase { friend class SparseMatrix; friend class SparseMatrix; - /// Returns number of rows (or zero for emtpy matrix). + /// Returns number of rows (or zero for empty matrix). inline MatrixIndexT NumRows() const { return num_rows_; } - /// Returns number of columns (or zero for emtpy matrix). + /// Returns number of columns (or zero for empty matrix). inline MatrixIndexT NumCols() const { return num_cols_; } /// Stride (distance in memory between each row). Will be >= NumCols. @@ -337,37 +338,42 @@ class MatrixBase { const MatrixIndexT *indexes, MatrixBase *dst) const; - /// Applies floor to all matrix elements - void ApplyFloor(Real floor_val); + inline void ApplyPow(Real power) { + this -> Pow(*this, power); + } - /// Applies floor to all matrix elements - void ApplyCeiling(Real ceiling_val); - /// Calculates log of all the matrix elemnts - void ApplyLog(); + inline void ApplyPowAbs(Real power, bool include_sign=false) { + this -> PowAbs(*this, power, include_sign); + } - /// Exponentiate each of the elements. - void ApplyExp(); + inline void ApplyHeaviside() { + this -> Heaviside(*this); + } - /// For each element x of the matrix, set it to - /// (x < 0 ? exp(x) : x + 1). This function is used - /// in our RNNLM training. - void ApplyExpSpecial(); + inline void ApplyFloor(Real floor_val) { + this -> Floor(*this, floor_val); + } - /// Applies power to all matrix elements - void ApplyPow(Real power); + inline void ApplyCeiling(Real ceiling_val) { + this -> Ceiling(*this, ceiling_val); + } - /// Apply power to the absolute value of each element. - /// Include the sign of the input element if include_sign == true. - /// If the power is negative and the input to the power is zero, - /// The output will be set zero. - void ApplyPowAbs(Real power, bool include_sign=false); + inline void ApplyExp() { + this -> Exp(*this); + } - /// Applies the Heaviside step function (x > 0 ? 1 : 0) to all matrix elements - /// Note: in general you can make different choices for x = 0, but for now - /// please leave it as it (i.e. returning zero) because it affects the - /// RectifiedLinearComponent in the neural net code. - void ApplyHeaviside(); + inline void ApplyExpSpecial() { + this -> ExpSpecial(*this); + } + + inline void ApplyExpLimited(Real lower_limit, Real upper_limit) { + this -> ExpLimited(*this, lower_limit, upper_limit); + } + + inline void ApplyLog() { + this -> Log(*this); + } /// Eigenvalue Decomposition of a square NxN matrix into the form (*this) = P D /// P^{-1}. Be careful: the relationship of D to the eigenvalues we output is @@ -483,6 +489,35 @@ class MatrixBase { /// because it affects the RectifiedLinearComponent in the neural net code. void Heaviside(const MatrixBase &src); + void Exp(const MatrixBase &src); + + void Pow(const MatrixBase &src, Real power); + + void Log(const MatrixBase &src); + + /// Apply power to the absolute value of each element. 
+ /// If include_sign is true, the result will be multiplied with + /// the sign of the input value. + /// If the power is negative and the input to the power is zero, + /// The output will be set zero. If include_sign is true, it will + /// multiply the result by the sign of the input. + void PowAbs(const MatrixBase &src, Real power, bool include_sign=false); + + void Floor(const MatrixBase &src, Real floor_val); + + void Ceiling(const MatrixBase &src, Real ceiling_val); + + /// For each element x of the matrix, set it to + /// (x < 0 ? exp(x) : x + 1). This function is used + /// in our RNNLM training. + void ExpSpecial(const MatrixBase &src); + + /// This is equivalent to running: + /// Floor(src, lower_limit); + /// Ceiling(src, upper_limit); + /// Exp(src) + void ExpLimited(const MatrixBase &src, Real lower_limit, Real upper_limit); + /// Set each element to y = log(1 + exp(x)) void SoftHinge(const MatrixBase &src); @@ -531,6 +566,10 @@ class MatrixBase { * positive semi-definite (check_thresh controls how stringent the check is; * set it to 2 to ensure it won't ever complain, but it will zero out negative * dimensions in your matrix. + * + * Caution: if you want the eigenvalues, it may make more sense to convert to + * SpMatrix and use Eig() function there, which uses eigenvalue decomposition + * directly rather than SVD. */ void SymPosSemiDefEig(VectorBase *s, MatrixBase *P, Real check_thresh = 0.001); @@ -763,7 +802,7 @@ class MatrixBase { /// data memory area Real* data_; - /// these atributes store the real matrix size as it is stored in memory + /// these attributes store the real matrix size as it is stored in memory /// including memalignment MatrixIndexT num_cols_; /// < Number of columns MatrixIndexT num_rows_; /// < Number of rows diff --git a/src/matrix/kaldi-vector.cc b/src/matrix/kaldi-vector.cc index c8ea35112ea..2671bf5224b 100644 --- a/src/matrix/kaldi-vector.cc +++ b/src/matrix/kaldi-vector.cc @@ -6,7 +6,7 @@ // Haihua Xu; Wei Shi // 2015 Guoguo Chen // 2017 Daniel Galvez - +// 2019 Yiwen Shao // See ../../COPYING for clarification regarding multiple authors // @@ -448,32 +448,20 @@ void VectorBase::CopyRowFromSp(const SpMatrix &mat, MatrixIndexT #ifdef HAVE_MKL template<> -void VectorBase::ApplyPow(float power) { vsPowx(dim_, data_, power, data_); } +void VectorBase::Pow(const VectorBase &v, float power) { + vsPowx(dim_, data_, power, v.data_); +} template<> -void VectorBase::ApplyPow(double power) { vdPowx(dim_, data_, power, data_); } +void VectorBase::Pow(const VectorBase &v, double power) { + vdPowx(dim_, data_, power, v.data_); +} #else -// takes elements to a power. Throws exception if could not (but only for power != 1 and power != 2). + +// takes elements to a power. Does not check output. template -void VectorBase::ApplyPow(Real power) { - if (power == 1.0) return; - if (power == 2.0) { - for (MatrixIndexT i = 0; i < dim_; i++) - data_[i] = data_[i] * data_[i]; - } else if (power == 0.5) { - for (MatrixIndexT i = 0; i < dim_; i++) { - if (!(data_[i] >= 0.0)) - KALDI_ERR << "Cannot take square root of negative value " - << data_[i]; - data_[i] = std::sqrt(data_[i]); - } - } else { - for (MatrixIndexT i = 0; i < dim_; i++) { - data_[i] = pow(data_[i], power); - if (data_[i] == HUGE_VAL) { // HUGE_VAL is what errno returns on error. 
- KALDI_ERR << "Could not raise element " << i << " to power " - << power << ": returned value = " << data_[i]; - } - } +void VectorBase::Pow(const VectorBase &v, Real power) { + for (MatrixIndexT i = 0; i < dim_; i++) { + data_[i] = pow(v.data_[i], power); } } #endif @@ -814,17 +802,19 @@ void VectorBase::ApplyAbs() { } template -void VectorBase::ApplyFloor(Real floor_val, MatrixIndexT *floored_count) { +void VectorBase::Floor(const VectorBase &v, Real floor_val, MatrixIndexT *floored_count) { if (floored_count == nullptr) { for (MatrixIndexT i = 0; i < dim_; i++) { - data_[i] = std::max(data_[i], floor_val); + data_[i] = std::max(v.data_[i], floor_val); } } else { MatrixIndexT num_floored = 0; for (MatrixIndexT i = 0; i < dim_; i++) { - if (data_[i] < floor_val) { + if (v.data_[i] < floor_val) { data_[i] = floor_val; num_floored++; + } else { + data_[i] = v.data_[i]; } } *floored_count = num_floored; @@ -832,17 +822,19 @@ void VectorBase::ApplyFloor(Real floor_val, MatrixIndexT *floored_count) { } template -void VectorBase::ApplyCeiling(Real ceil_val, MatrixIndexT *ceiled_count) { +void VectorBase::Ceiling(const VectorBase &v, Real ceil_val, MatrixIndexT *ceiled_count) { if (ceiled_count == nullptr) { for (MatrixIndexT i = 0; i < dim_; i++) { - data_[i] = std::min(data_[i], ceil_val); + data_[i] = std::min(v.data_[i], ceil_val); } } else { MatrixIndexT num_changed = 0; for (MatrixIndexT i = 0; i < dim_; i++) { - if (data_[i] > ceil_val) { + if (v.data_[i] > ceil_val) { data_[i] = ceil_val; num_changed++; + } else { + data_[i] = v.data_[i]; } } *ceiled_count = num_changed; diff --git a/src/matrix/kaldi-vector.h b/src/matrix/kaldi-vector.h index 383d8ca2862..b7f976d4151 100644 --- a/src/matrix/kaldi-vector.h +++ b/src/matrix/kaldi-vector.h @@ -7,6 +7,7 @@ // Wei Shi; // 2015 Guoguo Chen // 2017 Daniel Galvez +// 2019 Yiwen Shao // See ../../COPYING for clarification regarding multiple authors // @@ -119,6 +120,15 @@ class VectorBase { template void CopyFromVec(const CuVectorBase &v); + /// Applies floor to all elements. Returns number of elements + /// floored in floored_count if it is non-null. + void Floor(const VectorBase &v, Real floor_val, MatrixIndexT *floored_count = nullptr); + + /// Applies ceiling to all elements. Returns number of elements + /// changed in ceiled_count if it is non-null. + void Ceiling(const VectorBase &v, Real ceil_val, MatrixIndexT *ceiled_count = nullptr); + + void Pow(const VectorBase &v, Real power); /// Apply natural log to all elements. Throw if any element of /// the vector is negative (but doesn't complain about zero; the @@ -136,11 +146,15 @@ class VectorBase { /// Applies floor to all elements. Returns number of elements /// floored in floored_count if it is non-null. - void ApplyFloor(Real floor_val, MatrixIndexT *floored_count = nullptr); + inline void ApplyFloor(Real floor_val, MatrixIndexT *floored_count = nullptr) { + this->Floor(*this, floor_val, floored_count); + }; /// Applies ceiling to all elements. Returns number of elements /// changed in ceiled_count if it is non-null. - void ApplyCeiling(Real ceil_val, MatrixIndexT *ceiled_count = nullptr); + inline void ApplyCeiling(Real ceil_val, MatrixIndexT *ceiled_count = nullptr) { + this->Ceiling(*this, ceil_val, ceiled_count); + }; /// Applies floor to all elements. Returns number of elements floored. MatrixIndexT ApplyFloor(const VectorBase &floor_vec); @@ -162,7 +176,9 @@ class VectorBase { void Sigmoid(const VectorBase &src); /// Take all elements of vector to a power. 
- void ApplyPow(Real power); + inline void ApplyPow(Real power) { + this->Pow(*this, power); + }; /// Take the absolute value of all elements of a vector to a power. /// Include the sign of the input element if include_sign == true. @@ -246,7 +262,7 @@ class VectorBase { /// Multiplies all elements by this constant. void Scale(Real alpha); - /// Multiplies this vector by lower-triangular marix: *this <-- *this *M + /// Multiplies this vector by lower-triangular matrix: *this <-- *this *M void MulTp(const TpMatrix &M, const MatrixTransposeType trans); /// If trans == kNoTrans, solves M x = b, where b is the value of *this at input @@ -355,7 +371,7 @@ class VectorBase { friend class CuVector; protected: /// Destructor; does not deallocate memory, this is handled by child classes. - /// This destructor is protected so this object so this object can only be + /// This destructor is protected so this object can only be /// deleted via a child. ~VectorBase() {} diff --git a/src/matrix/sparse-matrix.cc b/src/matrix/sparse-matrix.cc index 55d8edeb4b3..68a61e17dc3 100644 --- a/src/matrix/sparse-matrix.cc +++ b/src/matrix/sparse-matrix.cc @@ -808,6 +808,7 @@ void GeneralMatrix::Compress() { void GeneralMatrix::Uncompress() { if (cmat_.NumRows() != 0) { + mat_.Resize(cmat_.NumRows(), cmat_.NumCols(), kUndefined); cmat_.CopyToMat(&mat_); cmat_.Clear(); } diff --git a/src/matrix/tp-matrix.cc b/src/matrix/tp-matrix.cc index f01ee1e8f46..6e34dc643e9 100644 --- a/src/matrix/tp-matrix.cc +++ b/src/matrix/tp-matrix.cc @@ -51,7 +51,7 @@ void TpMatrix::Invert() { // format, so we temporarily put in non-packed format. Matrix tmp(*this); int rows = static_cast(this->num_rows_); - + // ATLAS call. It's really row-major ordering and a lower triangular matrix, // but there is some weirdness with Fortran-style indexing that we need to // take account of, so everything gets swapped. @@ -102,14 +102,13 @@ void TpMatrix::Cholesky(const SpMatrix &orig) { } // d = orig(j, j) - d; d = orig_jdata[j] - d; - + if (d >= 0.0) { // (*this)(j, j) = std::sqrt(d); jdata[j] = std::sqrt(d); } else { - KALDI_WARN << "Cholesky decomposition failed. Maybe matrix " - "is not positive definite. Throwing error"; - throw std::runtime_error("Cholesky decomposition failed."); + KALDI_ERR << "Cholesky decomposition failed. Maybe matrix " + "is not positive definite."; } } } @@ -144,5 +143,3 @@ template class TpMatrix; template class TpMatrix; } // namespace kaldi - - diff --git a/src/matrix/tp-matrix.h b/src/matrix/tp-matrix.h index b215e73b000..e3b08701543 100644 --- a/src/matrix/tp-matrix.h +++ b/src/matrix/tp-matrix.h @@ -45,11 +45,11 @@ class TpMatrix : public PackedMatrix { /// Copy constructor from CUDA TpMatrix /// This is defined in ../cudamatrix/cu-tp-matrix.cc explicit TpMatrix(const CuTpMatrix &cu); - - + + template explicit TpMatrix(const TpMatrix& orig) : PackedMatrix(orig) {} - + Real operator() (MatrixIndexT r, MatrixIndexT c) const { if (static_cast(c) > static_cast(r)) { @@ -74,9 +74,9 @@ class TpMatrix : public PackedMatrix { return *(this->data_ + (r*(r+1)) / 2 + c); // Duplicating code from PackedMatrix.h } - // Note: Cholesky may throw std::runtime_error + // Note: Cholesky may throw KaldiFatalError. void Cholesky(const SpMatrix& orig); - + void Invert(); // Inverts in double precision. @@ -99,7 +99,7 @@ class TpMatrix : public PackedMatrix { /// This is implemented in ../cudamatrix/cu-tp-matrix.cc void CopyFromMat(const CuTpMatrix &other); - + /// CopyFromTp copies another triangular matrix into this one. 
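One practical consequence of replacing the explicit std::runtime_error throws with KALDI_ERR (here and in determinize-lattice-pruned.cc above) is that callers should now catch kaldi::KaldiFatalError, as the updated arpa-lm-compiler-test.cc and the tp-matrix.h comment indicate. A hedged sketch, assuming a normal Kaldi build and that KaldiFatalError derives from std::runtime_error as in current Kaldi:

```cpp
// Catching the error now raised when Cholesky() fails. The matrix below is
// deliberately not positive definite so the decomposition must fail.
#include <iostream>
#include "base/kaldi-common.h"
#include "matrix/sp-matrix.h"
#include "matrix/tp-matrix.h"

int main() {
  using namespace kaldi;
  SpMatrix<BaseFloat> s(2);
  s(0, 0) = 1.0; s(1, 1) = -1.0;  // negative eigenvalue: not positive definite
  TpMatrix<BaseFloat> t(2);
  try {
    t.Cholesky(s);
  } catch (const KaldiFatalError &e) {
    std::cerr << "Cholesky failed as expected: " << e.what() << std::endl;
  }
  return 0;
}
```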
void CopyFromTp(const TpMatrix &other) { PackedMatrix::CopyFromPacked(other); @@ -132,4 +132,3 @@ class TpMatrix : public PackedMatrix { #endif - diff --git a/src/nnet/nnet-activation.h b/src/nnet/nnet-activation.h index 74b0ebad650..ad9acac26bc 100644 --- a/src/nnet/nnet-activation.h +++ b/src/nnet/nnet-activation.h @@ -49,7 +49,7 @@ class Softmax : public Component { void PropagateFnc(const CuMatrixBase &in, CuMatrixBase *out) { // y = e^x_j/sum_j(e^x_j) - out->ApplySoftMaxPerRow(in); + out->SoftMaxPerRow(in); } void BackpropagateFnc(const CuMatrixBase &in, @@ -81,7 +81,7 @@ class HiddenSoftmax : public Component { void PropagateFnc(const CuMatrixBase &in, CuMatrixBase *out) { // y = e^x_j/sum_j(e^x_j) - out->ApplySoftMaxPerRow(in); + out->SoftMaxPerRow(in); } void BackpropagateFnc(const CuMatrixBase &in, @@ -167,7 +167,7 @@ class BlockSoftmax : public Component { CuSubMatrix out_bl = out->ColRange(block_offset[bl], block_dims[bl]); // y = e^x_j/sum_j(e^x_j), - out_bl.ApplySoftMaxPerRow(in_bl); + out_bl.SoftMaxPerRow(in_bl); } } diff --git a/src/nnet/nnet-average-pooling-2d-component.h b/src/nnet/nnet-average-pooling-2d-component.h deleted file mode 100644 index 17ae87f94db..00000000000 --- a/src/nnet/nnet-average-pooling-2d-component.h +++ /dev/null @@ -1,209 +0,0 @@ -// nnet/nnet-average-pooling-2d-component.h - -// Copyright 2014 Brno University of Technology (author: Karel Vesely) -// Johns Hopkins University (author: Sri Harish Mallidi) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - - -#ifndef KALDI_NNET_NNET_AVERAGE_POOLING_2D_COMPONENT_H_ -#define KALDI_NNET_NNET_AVERAGE_POOLING_2D_COMPONENT_H_ - -#include -#include - -#include "nnet/nnet-component.h" -#include "nnet/nnet-utils.h" -#include "cudamatrix/cu-math.h" - -namespace kaldi { -namespace nnet1 { - -/** - * AveragePoolingComponent : - * The input/output matrices are split to submatrices with width 'pool_stride_'. - * The pooling is done over 3rd axis, of the set of 2d matrices. - * Our pooling supports overlaps, overlaps occur when (pool_step_ < pool_size_). 
- */ -class AveragePooling2DComponent : public Component { - public: - AveragePooling2DComponent(int32 dim_in, int32 dim_out): - Component(dim_in, dim_out), - fmap_x_len_(0), fmap_y_len_(0), - pool_x_len_(0), pool_y_len_(0), - pool_x_step_(0), pool_y_step_(0) - { } - ~AveragePooling2DComponent() - { } - - Component* Copy() const { return new AveragePooling2DComponent(*this); } - ComponentType GetType() const { return kAveragePooling2DComponent; } - - void InitData(std::istream &is) { - // parse config - std::string token; - while (is >> std::ws, !is.eof()) { - ReadToken(is, false, &token); - /**/ if (token == "") ReadBasicType(is, false, &fmap_x_len_); - else if (token == "") ReadBasicType(is, false, &fmap_y_len_); - else if (token == "") ReadBasicType(is, false, &pool_x_len_); - else if (token == "") ReadBasicType(is, false, &pool_y_len_); - else if (token == "") ReadBasicType(is, false, &pool_x_step_); - else if (token == "") ReadBasicType(is, false, &pool_y_step_); - else KALDI_ERR << "Unknown token " << token << ", a typo in config?" - << " (FmapXLen|FmapYLen|PoolXLen|PoolYLen|PoolXStep|PoolYStep)"; - } - // check - KALDI_ASSERT(fmap_x_len_ * fmap_y_len_ != 0); - KALDI_ASSERT(pool_x_len_ * pool_y_len_ != 0); - KALDI_ASSERT(pool_x_step_ * pool_y_step_ != 0); - } - - void ReadData(std::istream &is, bool binary) { - // pooling hyperparameters - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &fmap_x_len_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &fmap_y_len_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &pool_x_len_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &pool_y_len_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &pool_x_step_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &pool_y_step_); - - // - // Sanity checks: - // - // input sanity checks - // input_dim_ should be multiple of (fmap_x_len_ * fmap_y_len_) - KALDI_ASSERT(input_dim_ % (fmap_x_len_ * fmap_y_len_) == 0); - int32 num_input_fmaps = input_dim_ / (fmap_x_len_ * fmap_y_len_); - KALDI_LOG << "num_fmaps " << num_input_fmaps; - // check if step is in sync with fmap_len and filt_len - KALDI_ASSERT((fmap_x_len_ - pool_x_len_) % (pool_x_step_) == 0); - KALDI_ASSERT((fmap_y_len_ - pool_y_len_) % (pool_y_step_) == 0); - int32 out_fmap_x_len = (fmap_x_len_ - pool_x_len_)/pool_x_step_ + 1; - int32 out_fmap_y_len = (fmap_y_len_ - pool_y_len_)/pool_y_step_ + 1; - // int32 out_fmap_size = out_fmap_x_len*out_fmap_y_len; - // output sanity checks - KALDI_ASSERT(output_dim_ % (out_fmap_x_len * out_fmap_y_len) == 0); - int32 num_output_fmaps = output_dim_ / (out_fmap_x_len * out_fmap_y_len); - KALDI_ASSERT(num_input_fmaps == num_output_fmaps); - } - - void WriteData(std::ostream &os, bool binary) const { - // pooling hyperparameters - WriteToken(os, binary, ""); - WriteBasicType(os, binary, fmap_x_len_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, fmap_y_len_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, pool_x_len_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, pool_y_len_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, pool_x_step_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, pool_y_step_); - } - - void PropagateFnc(const CuMatrixBase &in, - CuMatrixBase *out) { - // useful dims - int32 num_input_fmaps = input_dim_ / (fmap_x_len_ * fmap_y_len_); - int out_fmap_cnt = 0; - for (int32 m = 0; m < fmap_x_len_-pool_x_len_+1; m = m+pool_x_step_) { - for (int32 n = 0; n < 
fmap_y_len_-pool_y_len_+1; n = n+pool_y_step_) { - int32 st = 0; - st = (m * fmap_y_len_ + n) * num_input_fmaps; - CuSubMatrix pool(out->ColRange(out_fmap_cnt * num_input_fmaps, num_input_fmaps)); - pool.SetZero(); // reset - for (int32 i = 0; i < pool_x_len_; i++) { - for (int32 j = 0; j < pool_y_len_; j++) { - int32 c = 0; - c = st + i * (num_input_fmaps * fmap_y_len_) - + j * num_input_fmaps; - pool.AddMat(1.0, in.ColRange(c, num_input_fmaps)); - } - } - pool.Scale(1.0 / (pool_x_len_ * pool_y_len_)); - out_fmap_cnt++; - } - } - } - - void BackpropagateFnc(const CuMatrixBase &in, - const CuMatrixBase &out, - const CuMatrixBase &out_diff, - CuMatrixBase *in_diff) { - // useful dims - int32 num_input_fmaps = input_dim_ / (fmap_x_len_ * fmap_y_len_); - int32 inp_fmap_size = fmap_x_len_ * fmap_y_len_; - // - // here we note how many diff matrices are summed for each input patch, - std::vector patch_summands(inp_fmap_size, 0); - // this metainfo will be used to divide diff of patches - // used in more than one pool. - // - - in_diff->SetZero(); // reset - int out_fmap_cnt = 0; - for (int32 m = 0; m < fmap_x_len_-pool_x_len_+1; m = m+pool_x_step_) { - for (int32 n = 0; n < fmap_y_len_-pool_y_len_+1; n = n+pool_y_step_) { - int32 st = 0; - st = (m * fmap_y_len_ + n) * num_input_fmaps; - CuSubMatrix src(out_diff.ColRange(out_fmap_cnt * num_input_fmaps, num_input_fmaps)); - for (int32 i = 0; i < pool_x_len_; i++) { - for (int32 j = 0; j < pool_y_len_; j++) { - int32 c = 0; - c = st + i * (num_input_fmaps * fmap_y_len_) - + j * num_input_fmaps; - CuSubMatrix tgt(in_diff->ColRange(c, num_input_fmaps)); - tgt.AddMat(1.0, src); - patch_summands[c / num_input_fmaps] += 1; - } - } - out_fmap_cnt++; - } - } - - // divide diff by average-pooling-dim (derivative of averaging) - in_diff->Scale(1.0 / (pool_x_len_ * pool_y_len_)); - - // divide diff by #summands (compensate for patches used in more pools) - for (int i = 0; i < fmap_x_len_; i++) { - for (int32 j = 0; j < fmap_y_len_; j++) { - int32 c = i * fmap_y_len_ + j; - CuSubMatrix tgt(in_diff->ColRange(c*num_input_fmaps, num_input_fmaps)); - KALDI_ASSERT(patch_summands[c] > 0); // patch at least in one pool - tgt.Scale(1.0 / patch_summands[c]); - } - } - } - - private: - int32 fmap_x_len_, fmap_y_len_, - pool_x_len_, pool_y_len_, - pool_x_step_, pool_y_step_; -}; - -} // namespace nnet1 -} // namespace kaldi - -#endif // KALDI_NNET_NNET_AVERAGE_POOLING_2D_COMPONENT_H_ diff --git a/src/nnet/nnet-component-test.cc b/src/nnet/nnet-component-test.cc index da181bd18f6..0786eb51c15 100644 --- a/src/nnet/nnet-component-test.cc +++ b/src/nnet/nnet-component-test.cc @@ -24,10 +24,7 @@ #include "nnet/nnet-component.h" #include "nnet/nnet-nnet.h" #include "nnet/nnet-convolutional-component.h" -#include "nnet/nnet-convolutional-2d-component.h" #include "nnet/nnet-max-pooling-component.h" -#include "nnet/nnet-max-pooling-2d-component.h" -#include "nnet/nnet-average-pooling-2d-component.h" #include "util/common-utils.h" namespace kaldi { @@ -245,151 +242,6 @@ namespace nnet1 { delete c; } - void UnitTestMaxPooling2DComponent() { /* Implemented by Harish Mallidi */ - // make max-pooling2d component - Component* c = Component::Init( - " 56 18 \ - 4 7 2 3 \ - 1 2" - ); - - // input matrix, - CuMatrix mat_in; - ReadCuMatrixFromString("[ 0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 10 10 \ - 11 11 12 12 13 13 14 14 15 15 16 16 17 17 18 18 19 19 20 20 21 21 \ - 22 22 23 23 24 24 25 25 26 26 27 27 ]", &mat_in); - - // expected output (max values in the patch) - CuMatrix 
mat_out_ref; - ReadCuMatrixFromString("[ 9 9 11 11 13 13 16 16 18 18 \ - 20 20 23 23 25 25 27 27 ]", &mat_out_ref); - - // propagate, - CuMatrix mat_out; - c->Propagate(mat_in, &mat_out); - KALDI_LOG << "mat_out" << mat_out << "mat_out_ref" << mat_out_ref; - AssertEqual(mat_out, mat_out_ref); - - - // locations of max values will be shown - CuMatrix mat_out_diff(mat_out); - ReadCuMatrixFromString( - "[ 0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 ]", &mat_out_diff - ); - - // expected backpropagated values, - CuMatrix mat_in_diff_ref; // hand-computed back-propagated values, - ReadCuMatrixFromString("[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 \ - 0.25 0.25 0 0 1 1 0 0 0 0 0.75 0.75 0 0 1 1 0 0 2.5 2.5 \ - 0 0 0 0 3 3 0 0 3.5 3.5 0 0 8 8 ]", &mat_in_diff_ref - ); - - // backpropagate, - CuMatrix mat_in_diff; - c->Backpropagate(mat_in, mat_out, mat_out_diff, &mat_in_diff); - KALDI_LOG << "mat_in_diff " << mat_in_diff - << " mat_in_diff_ref " << mat_in_diff_ref; - AssertEqual(mat_in_diff, mat_in_diff_ref); - - delete c; - } - - void UnitTestAveragePooling2DComponent() { /* Implemented by Harish Mallidi */ - // make average-pooling2d component - Component* c = Component::Init( - " 56 18 \ - 4 7 2 3 \ - 1 2" - ); - - // input matrix, - CuMatrix mat_in; - ReadCuMatrixFromString("[ 0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 10 10 \ - 11 11 12 12 13 13 14 14 15 15 16 16 17 17 18 18 19 19 20 20 \ - 21 21 22 22 23 23 24 24 25 25 26 26 27 27 ]", &mat_in); - - // expected output (max values in the patch) - CuMatrix mat_out_ref; - ReadCuMatrixFromString("[ 4.5 4.5 6.5 6.5 8.5 8.5 11.5 11.5 13.5 13.5 \ - 15.5 15.5 18.5 18.5 20.5 20.5 22.5 22.5 ]", &mat_out_ref); - - // propagate, - CuMatrix mat_out; - c->Propagate(mat_in, &mat_out); - KALDI_LOG << "mat_out" << mat_out << "mat_out_ref" << mat_out_ref; - AssertEqual(mat_out, mat_out_ref); - - - // locations of max values will be shown - CuMatrix mat_out_diff(mat_out); - ReadCuMatrixFromString("[ 0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 ]", &mat_out_diff); - - // expected backpropagated values, - CuMatrix mat_in_diff_ref; // hand-computed back-propagated values, - ReadCuMatrixFromString("[ 0 0 0 0 0.0833333 0.0833333 0.166667 0.166667 \ - 0.25 0.25 0.333333 0.333333 0.333333 0.333333 0.25 0.25 0.25 0.25 \ - 0.333333 0.333333 0.416667 0.416667 0.5 0.5 0.583333 0.583333 0.583333 \ - 0.583333 0.75 0.75 0.75 0.75 0.833333 0.833333 0.916667 0.916667 1 1 \ - 1.08333 1.08333 1.08333 1.08333 1 1 1 1 1.08333 1.08333 1.16667 1.16667 \ - 1.25 1.25 1.33333 1.33333 1.33333 1.33333 ]", &mat_in_diff_ref - ); - - // backpropagate, - CuMatrix mat_in_diff; - c->Backpropagate(mat_in, mat_out, mat_out_diff, &mat_in_diff); - KALDI_LOG << "mat_in_diff " << mat_in_diff - << " mat_in_diff_ref " << mat_in_diff_ref; - AssertEqual(mat_in_diff, mat_in_diff_ref); - - delete c; - } - - - void UnitTestConvolutional2DComponent() { /* Implemented by Harish Mallidi */ - // Convolutional2D component - Component* c = ReadComponentFromString(" 18 56 \ - 0 0 4 7 \ - 2 3 1 2 1 \ - [ 0 0 1 1 2 2 3 3 4 4 5 5 ; 0 0 1 1 2 2 3 3 4 4 5 5 ] \ - [ 0 0 ]" - ); - - // input matrix - CuMatrix mat_in; - ReadCuMatrixFromString("[ 0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 10 10 \ - 11 11 12 12 13 13 14 14 15 15 16 16 17 17 18 18 19 19 20 20 \ - 21 21 22 22 23 23 24 24 25 25 26 26 27 27 ]", &mat_in); - - CuMatrix mat_out_ref; - ReadCuMatrixFromString("[ 206 206 266 266 326 326 416 416 476 476 536 536 \ - 626 626 686 686 746 746 ]", &mat_out_ref); - - // propagate - CuMatrix mat_out; - c->Propagate(mat_in, &mat_out); - 
KALDI_LOG << "mat_out" << mat_out << "mat_out" << mat_out_ref; - AssertEqual(mat_out, mat_out_ref); - - // prepare mat_out_diff, mat_in_diff_ref, - CuMatrix mat_out_diff; - ReadCuMatrixFromString("[ 0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 ]", - &mat_out_diff); - - CuMatrix mat_in_diff_ref; - ReadCuMatrixFromString("[ 0 0 0 0 0 0 2 2 2 2 4 4 8 8 0 0 3 3 4.5 4.5 8 8 \ - 9.5 9.5 13 13 20 20 9 9 18 18 19.5 19.5 23 23 24.5 24.5 28 28 41 41 \ - 36 36 48 48 51 51 56 56 59 59 64 64 80 80 ]", &mat_in_diff_ref); - - // backpropagate - CuMatrix mat_in_diff; - c->Backpropagate(mat_in, mat_out, mat_out_diff, &mat_in_diff); - KALDI_LOG << "mat_in_diff " << mat_in_diff - << " mat_in_diff_ref " << mat_in_diff_ref; - AssertEqual(mat_in_diff, mat_in_diff_ref); - - delete c; - } - void UnitTestDropoutComponent() { Component* c = ReadComponentFromString(" 100 100 0.7"); // buffers, @@ -434,9 +286,6 @@ int main() { UnitTestConvolutionalComponentUnity(); UnitTestConvolutionalComponent3x3(); UnitTestMaxPoolingComponent(); - UnitTestConvolutional2DComponent(); - UnitTestMaxPooling2DComponent(); - UnitTestAveragePooling2DComponent(); UnitTestDropoutComponent(); // end of unit-tests, if (loop == 0) diff --git a/src/nnet/nnet-component.cc b/src/nnet/nnet-component.cc index 34f988972a0..cf7741e6e57 100644 --- a/src/nnet/nnet-component.cc +++ b/src/nnet/nnet-component.cc @@ -35,10 +35,6 @@ #include "nnet/nnet-average-pooling-component.h" #include "nnet/nnet-max-pooling-component.h" -#include "nnet/nnet-convolutional-2d-component.h" -#include "nnet/nnet-average-pooling-2d-component.h" -#include "nnet/nnet-max-pooling-2d-component.h" - #include "nnet/nnet-lstm-projected.h" #include "nnet/nnet-blstm-projected.h" #include "nnet/nnet-recurrent.h" @@ -56,7 +52,6 @@ const struct Component::key_value Component::kMarkerMap[] = { { Component::kAffineTransform, "" }, { Component::kLinearTransform, "" }, { Component::kConvolutionalComponent, "" }, - { Component::kConvolutional2DComponent, "" }, { Component::kLstmProjected, "" }, { Component::kLstmProjected, "" }, // bwd compat. 
{ Component::kBlstmProjected, "" }, @@ -77,9 +72,7 @@ const struct Component::key_value Component::kMarkerMap[] = { { Component::kRescale, "" }, { Component::kKlHmm, "" }, { Component::kAveragePoolingComponent, "" }, - { Component::kAveragePooling2DComponent, "" }, { Component::kMaxPoolingComponent, "" }, - { Component::kMaxPooling2DComponent, "" }, { Component::kSentenceAveragingComponent, "" }, { Component::kSimpleSentenceAveragingComponent, "" }, { Component::kFramePoolingComponent, "" }, @@ -127,9 +120,6 @@ Component* Component::NewComponentOfType(ComponentType comp_type, case Component::kConvolutionalComponent : ans = new ConvolutionalComponent(input_dim, output_dim); break; - case Component::kConvolutional2DComponent : - ans = new Convolutional2DComponent(input_dim, output_dim); - break; case Component::kLstmProjected : ans = new LstmProjected(input_dim, output_dim); break; @@ -190,15 +180,9 @@ Component* Component::NewComponentOfType(ComponentType comp_type, case Component::kAveragePoolingComponent : ans = new AveragePoolingComponent(input_dim, output_dim); break; - case Component::kAveragePooling2DComponent : - ans = new AveragePooling2DComponent(input_dim, output_dim); - break; case Component::kMaxPoolingComponent : ans = new MaxPoolingComponent(input_dim, output_dim); break; - case Component::kMaxPooling2DComponent : - ans = new MaxPooling2DComponent(input_dim, output_dim); - break; case Component::kFramePoolingComponent : ans = new FramePoolingComponent(input_dim, output_dim); break; diff --git a/src/nnet/nnet-component.h b/src/nnet/nnet-component.h index 2ef56622ca8..0cca2608b21 100644 --- a/src/nnet/nnet-component.h +++ b/src/nnet/nnet-component.h @@ -51,7 +51,6 @@ class Component { kAffineTransform, kLinearTransform, kConvolutionalComponent, - kConvolutional2DComponent, kLstmProjected, kBlstmProjected, kRecurrentComponent, @@ -79,9 +78,7 @@ class Component { kSentenceAveragingComponent, /* deprecated */ kSimpleSentenceAveragingComponent, kAveragePoolingComponent, - kAveragePooling2DComponent, kMaxPoolingComponent, - kMaxPooling2DComponent, kFramePoolingComponent, kParallelComponent, kMultiBasisComponent diff --git a/src/nnet/nnet-convolutional-2d-component.h b/src/nnet/nnet-convolutional-2d-component.h deleted file mode 100644 index 135ce894541..00000000000 --- a/src/nnet/nnet-convolutional-2d-component.h +++ /dev/null @@ -1,495 +0,0 @@ -// nnet/nnet-convolutional-2d-component.h - -// Copyright 2014-2015 Johns Hopkins University (author: Sri Harish Mallidi) -// Brno University of Technology (author: Karel Vesely), -// - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
- - -#ifndef KALDI_NNET_NNET_CONVOLUTIONAL_2D_COMPONENT_H_ -#define KALDI_NNET_NNET_CONVOLUTIONAL_2D_COMPONENT_H_ - -#include -#include - -#include "nnet/nnet-component.h" -#include "nnet/nnet-various.h" -#include "cudamatrix/cu-math.h" - -namespace kaldi { -namespace nnet1 { - -/** - * Convolutional2DComponent implements convolution over 2-axis (frequency and temporal) - * (i.e. frequency axis in case we are the 1st component in NN). - * // We don't do convolution along temporal axis, which simplifies the - * // implementation (and was not helpful for Tara). - * - * We assume the input featrues are spliced, i.e. each frame - * is in fact a set of stacked frames, where we can form patches - * which span over several frequency bands and time axes. - * - * The convolution is done over whole axis with same filters, - * i.e. we don't use separate filters for different 'regions' - * of frequency axis. - * - * In order to have a fast implementations, the filters - * are represented in vectorized form, where each rectangular - * filter corresponds to a row in a matrix, where all filters - * are stored. The features are then re-shaped to a set of matrices, - * where one matrix corresponds to single patch-position, - * where the filters get applied. - * - * The type of convolution is controled by hyperparameters: - * x_patch_dim_,y_patch_dim_ ... temporal and frequency axes sizes of the patch (e.g. (9,9) for 9x9 2D filter) - * x_patch_step_,y_patch_step_ ... temporal and frequencey sizes of shifts in the convolution (e.g. (1,1) 2D filter with 1 step shift in both axes) - * x_patch_stride_,y_patch_stride_ ... dimension of the feature (maps if inside convolutional layer) (e.g. (11,32) for 32-band 11 frame spliced spectrogram patch) - * The type of convolution is controlled by hyperparameters: - * fmap_x_len_, fmap_y_len_ ... dimension of the feature (maps if inside convolutional layer) (e.g. (11,32) for 32-band 11 frame spliced spectrogram patch) - * filt_x_len_, filt_y_len_ ... temporal and frequency sizes of the filters (e.g. (9,9) for 9x9 2D filter) - * filt_x_step_, filt_y_step_ ... temporal and frequency sizes of the filters (e.g. (1,1) for 2D-filter, with 1 step shift in both axes) - * - * - * Due to convolution same weights are used repeateadly, - * the final gradient is average of all position-specific - * gradients. 
- * - */ -class Convolutional2DComponent : public UpdatableComponent { - public: - Convolutional2DComponent(int32 dim_in, int32 dim_out): - UpdatableComponent(dim_in, dim_out), - fmap_x_len_(0), fmap_y_len_(0), - filt_x_len_(0), filt_y_len_(0), - filt_x_step_(0), filt_y_step_(0), - connect_fmap_(0) - { } - - ~Convolutional2DComponent() - { } - - Component* Copy() const { return new Convolutional2DComponent(*this); } - ComponentType GetType() const { return kConvolutional2DComponent; } - - void InitData(std::istream &is) { - // define options - BaseFloat bias_mean = -2.0, bias_range = 2.0, param_stddev = 0.1; - // parse config - std::string token; - while (is >> std::ws, !is.eof()) { - ReadToken(is, false, &token); - /**/ if (token == "") ReadBasicType(is, false, ¶m_stddev); - else if (token == "") ReadBasicType(is, false, &bias_mean); - else if (token == "") ReadBasicType(is, false, &bias_range); - else if (token == "") ReadBasicType(is, false, &fmap_x_len_); - else if (token == "") ReadBasicType(is, false, &fmap_y_len_); - else if (token == "") ReadBasicType(is, false, &filt_x_len_); - else if (token == "") ReadBasicType(is, false, &filt_y_len_); - else if (token == "") ReadBasicType(is, false, &filt_x_step_); - else if (token == "") ReadBasicType(is, false, &filt_y_step_); - else if (token == "") ReadBasicType(is, false, &connect_fmap_); - else if (token == "") ReadBasicType(is, false, &learn_rate_coef_); - else if (token == "") ReadBasicType(is, false, &bias_learn_rate_coef_); - else KALDI_ERR << "Unknown token " << token << ", a typo in config? " - << "(ParamStddev|BiasMean|BiasRange|FmapXLen|FmapYLen|" - "FiltXLen|FiltYLen|FiltXStep|FiltYStep|ConnectFmap|" - "LearnRateCoef|BiasLearnRateCoef)"; - } - - // - // Sanity checks: - // - // input sanity checks - // input_dim_ should be multiple of (fmap_x_len_ * fmap_y_len_) - KALDI_ASSERT(input_dim_ % (fmap_x_len_ * fmap_y_len_) == 0); - int32 num_input_fmaps = input_dim_ / (fmap_x_len_ * fmap_y_len_); - KALDI_LOG << "num_input_fmaps " << num_input_fmaps; - // check if step is in sync with fmap_len and filt_len - KALDI_ASSERT((fmap_x_len_ - filt_x_len_) % (filt_x_step_) == 0); - KALDI_ASSERT((fmap_y_len_ - filt_y_len_) % (filt_y_step_) == 0); - int32 out_fmap_x_len = (fmap_x_len_ - filt_x_len_)/filt_x_step_ + 1; - int32 out_fmap_y_len = (fmap_y_len_ - filt_y_len_)/filt_y_step_ + 1; - // output sanity checks - KALDI_ASSERT(output_dim_ % (out_fmap_x_len * out_fmap_y_len) == 0); - int32 num_output_fmaps = output_dim_ / (out_fmap_x_len * out_fmap_y_len); - KALDI_LOG << "num_output_fmaps " << num_output_fmaps; - int32 num_filters = output_dim_/(out_fmap_x_len*out_fmap_y_len); - KALDI_LOG << "num_filters " << num_filters; - - // - // Initialize trainable parameters, - // - filters_.Resize(num_filters, num_input_fmaps*filt_x_len_*filt_y_len_); - RandGauss(0.0, param_stddev, &filters_); - // - bias_.Resize(num_filters); - RandUniform(bias_mean, bias_range, &bias_); - } - - void ReadData(std::istream &is, bool binary) { - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &learn_rate_coef_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &bias_learn_rate_coef_); - // convolution hyperparameters - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &fmap_x_len_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &fmap_y_len_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &filt_x_len_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &filt_y_len_); - ExpectToken(is, binary, ""); - 
ReadBasicType(is, binary, &filt_x_step_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &filt_y_step_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &connect_fmap_); - - // trainable parameters - ExpectToken(is, binary, ""); - filters_.Read(is, binary); - ExpectToken(is, binary, ""); - bias_.Read(is, binary); - - // - // Sanity checks: - // - // input sanity checks - // input_dim_ should be multiple of (fmap_x_len_ * fmap_y_len_) - KALDI_ASSERT(input_dim_ % (fmap_x_len_ * fmap_y_len_) == 0); - // int32 num_input_fmaps = input_dim_ / (fmap_x_len_ * fmap_y_len_); - // KALDI_LOG << "num_input_fmaps " << num_input_fmaps; - // check if step is in sync with fmap_len and filt_len - KALDI_ASSERT((fmap_x_len_ - filt_x_len_) % (filt_x_step_) == 0); - KALDI_ASSERT((fmap_y_len_ - filt_y_len_) % (filt_y_step_) == 0); - int32 out_fmap_x_len = (fmap_x_len_ - filt_x_len_)/filt_x_step_ + 1; - int32 out_fmap_y_len = (fmap_y_len_ - filt_y_len_)/filt_y_step_ + 1; - - // output sanity checks - KALDI_ASSERT(output_dim_ % (out_fmap_x_len * out_fmap_y_len) == 0); - } - - void WriteData(std::ostream &os, bool binary) const { - WriteToken(os, binary, ""); - WriteBasicType(os, binary, learn_rate_coef_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, bias_learn_rate_coef_); - if (!binary) os << "\n"; - - // convolution hyperparameters - WriteToken(os, binary, ""); - WriteBasicType(os, binary, fmap_x_len_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, fmap_y_len_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, filt_x_len_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, filt_y_len_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, filt_x_step_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, filt_y_step_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, connect_fmap_); - if (!binary) os << "\n"; - - // trainable parameters - WriteToken(os, binary, ""); - if (!binary) os << "\n"; - filters_.Write(os, binary); - WriteToken(os, binary, ""); - if (!binary) os << "\n"; - bias_.Write(os, binary); - } - - int32 NumParams() const { - return filters_.NumRows()*filters_.NumCols() + bias_.Dim(); - } - - void GetGradient(VectorBase* gradient) const { - KALDI_ASSERT(gradient->Dim() == NumParams()); - int32 filters_num_elem = filters_.NumRows() * filters_.NumCols(); - gradient->Range(0, filters_num_elem).CopyRowsFromMat(filters_); - gradient->Range(filters_num_elem, bias_.Dim()).CopyFromVec(bias_); - } - - void GetParams(VectorBase* params) const { - KALDI_ASSERT(params->Dim() == NumParams()); - int32 filters_num_elem = filters_.NumRows() * filters_.NumCols(); - params->Range(0, filters_num_elem).CopyRowsFromMat(filters_); - params->Range(filters_num_elem, bias_.Dim()).CopyFromVec(bias_); - } - - void SetParams(const VectorBase& params) { - KALDI_ASSERT(params.Dim() == NumParams()); - int32 filters_num_elem = filters_.NumRows() * filters_.NumCols(); - filters_.CopyRowsFromVec(params.Range(0, filters_num_elem)); - bias_.CopyFromVec(params.Range(filters_num_elem, bias_.Dim())); - } - - std::string Info() const { - return std::string("\n filters") + MomentStatistics(filters_) + - ", lr-coef " + ToString(learn_rate_coef_) + - "\n bias" + MomentStatistics(bias_) + - ", lr-coef " + ToString(bias_learn_rate_coef_); - } - std::string InfoGradient() const { - return std::string("\n filters_grad") + MomentStatistics(filters_grad_) + - ", lr-coef " + ToString(learn_rate_coef_) + - "\n bias_grad" + MomentStatistics(bias_grad_) + - ", 
lr-coef " + ToString(bias_learn_rate_coef_); - } - - void PropagateFnc(const CuMatrixBase &in, - CuMatrixBase *out) { - // useful dims - int32 num_input_fmaps = input_dim_ / (fmap_x_len_ * fmap_y_len_); - // int32 inp_fmap_size = fmap_x_len_ * fmap_y_len_; - int32 out_fmap_x_len = (fmap_x_len_ - filt_x_len_)/filt_x_step_ + 1; - int32 out_fmap_y_len = (fmap_y_len_ - filt_y_len_)/filt_y_step_ + 1; - int32 out_fmap_size = out_fmap_x_len*out_fmap_y_len; - int32 num_output_fmaps = output_dim_ / (out_fmap_x_len * out_fmap_y_len); - // this is total num_filters, - // so each input_fmap has size num_filters/num_input_fmaps - int32 num_filters = filters_.NumRows(); - KALDI_ASSERT(num_filters == num_output_fmaps); - // int32 filter_size = filt_x_len_*filt_y_len_; - int32 num_frames = in.NumRows(); - - // we will need the buffers - if (vectorized_feature_patches_.size() == 0) { - vectorized_feature_patches_.resize(out_fmap_size); - feature_patch_diffs_.resize(out_fmap_size); - } - - for (int32 p = 0; p < out_fmap_size; p++) { - vectorized_feature_patches_[p].Resize(num_frames, filters_.NumCols()); - } - - // Checked for num_input_fmaps=1, check for num_inp_fmaps>1 - int32 out_fmap_cnt = 0; - for (int32 m = 0; m < fmap_x_len_-filt_x_len_+1; m = m+filt_x_step_) { - for (int32 n = 0; n < fmap_y_len_-filt_y_len_+1; n = n+filt_y_step_) { - std::vector column_mask; - int32 st = 0; - if (connect_fmap_ == 1) { - st = (m * fmap_y_len_ + n) * num_input_fmaps; - } else { - st = m * fmap_y_len_ * num_input_fmaps + n; - } - - for (int32 i = 0; i < filt_x_len_; i++) { - for (int32 j = 0; j < filt_y_len_*num_input_fmaps; j++) { - int32 c = 0; - if (connect_fmap_ == 1) { - c = st + i * (num_input_fmaps*fmap_y_len_) + j; - } else { - c = st + i * (num_input_fmaps * fmap_y_len_) - + (j / num_input_fmaps) - + (j % num_input_fmaps) * fmap_y_len_; - } - column_mask.push_back(c); - } - } - CuArray cu_column_mask(column_mask); - vectorized_feature_patches_[out_fmap_cnt].CopyCols(in, cu_column_mask); - out_fmap_cnt++; - } - } - - for (int32 p = 0; p < out_fmap_size; p++) { - CuSubMatrix tgt(out->ColRange(p*num_filters, num_filters)); - tgt.AddVecToRows(1.0, bias_, 0.0); - tgt.AddMatMat(1.0, vectorized_feature_patches_[p], kNoTrans, filters_, kTrans, 1.0); - } - } - - - void BackpropagateFnc(const CuMatrixBase &in, - const CuMatrixBase &out, - const CuMatrixBase &out_diff, - CuMatrixBase *in_diff) { - // useful dims - int32 num_input_fmaps = input_dim_ / (fmap_x_len_ * fmap_y_len_); - - int32 out_fmap_x_len = (fmap_x_len_ - filt_x_len_)/filt_x_step_ + 1; - int32 out_fmap_y_len = (fmap_y_len_ - filt_y_len_)/filt_y_step_ + 1; - int32 out_fmap_size = out_fmap_x_len * out_fmap_y_len; - int32 num_output_fmaps = output_dim_ / (out_fmap_x_len * out_fmap_y_len); - // this is total num_filters, - // so each input_fmap has num_filters/num_input_fmaps - int32 num_filters = filters_.NumRows(); - KALDI_ASSERT(num_filters == num_output_fmaps); - // int32 filter_size = filt_x_len_*filt_y_len_; - int32 num_frames = in.NumRows(); - - for (int32 p = 0; p < out_fmap_size; p++) { - feature_patch_diffs_[p].Resize(num_frames, filters_.NumCols(), kSetZero); - CuSubMatrix out_diff_patch(out_diff.ColRange(p*num_filters, num_filters)); - feature_patch_diffs_[p].AddMatMat(1.0, out_diff_patch, kNoTrans, filters_, kNoTrans, 0.0); - } - - // compute in_diff_summands_ once - if (in_diff_summands_.Dim() == 0) { - in_diff_summands_.Resize(in_diff->NumCols(), kSetZero); - for (int32 m = 0; m < fmap_x_len_-filt_x_len_+1; m = m+filt_x_step_) { - for (int32 n = 
0; n < fmap_y_len_-filt_y_len_+1; n = n+filt_y_step_) { - int32 st = 0; - if (connect_fmap_ == 1) { - st = (m * fmap_y_len_ + n) * num_input_fmaps; - } else { - st = m * fmap_y_len_ * num_input_fmaps + n; - } - for (int32 i = 0; i < filt_x_len_; i++) { - for (int32 j = 0; j < filt_y_len_*num_input_fmaps; j++) { - int32 c = 0; - if (connect_fmap_ == 1) { - c = st + i * (num_input_fmaps * fmap_y_len_) + j; - } else { - c = st + i * (num_input_fmaps * fmap_y_len_) - + (j / num_input_fmaps) - + (j % num_input_fmaps) * fmap_y_len_; - } - // add 1.0 - in_diff_summands_.Range(c, 1).Add(1.0); - } - } - } - } - in_diff_summands_.InvertElements(); - } - - int32 out_fmap_cnt = 0; - - for (int32 m = 0; m < fmap_x_len_-filt_x_len_+1; m = m+filt_x_step_) { - for (int32 n = 0; n< fmap_y_len_-filt_y_len_+1; n = n+filt_y_step_) { - int32 st = 0; - if (connect_fmap_ == 1) { - st = (m * fmap_y_len_ + n) * num_input_fmaps; - } else { - st = m * fmap_y_len_ * num_input_fmaps + n; - } - - for (int32 i = 0; i < filt_x_len_; i++) { - for (int32 j = 0; j < filt_y_len_*num_input_fmaps; j++) { - int32 c = 0; - if (connect_fmap_ == 1) { - c = st + i *(num_input_fmaps*fmap_y_len_)+j; - } else { - c = st + i * (num_input_fmaps * fmap_y_len_) - + (j / num_input_fmaps) - + (j % num_input_fmaps) * fmap_y_len_; - } - // from which col? - CuMatrix& diff_mat = feature_patch_diffs_[out_fmap_cnt]; - CuSubMatrix src(diff_mat.ColRange(i*filt_y_len_*num_input_fmaps+j, 1)); - // to which col? - CuSubMatrix tgt(in_diff->ColRange(c, 1)); - tgt.AddMat(1.0, src); - } - } - out_fmap_cnt++; - } - } - // compensate for summands - in_diff->MulColsVec(in_diff_summands_); - } - - - void Update(const CuMatrixBase &input, - const CuMatrixBase &diff) { - // useful dims, - int32 out_fmap_x_len = (fmap_x_len_ - filt_x_len_)/filt_x_step_ + 1; - int32 out_fmap_y_len = (fmap_y_len_ - filt_y_len_)/filt_y_step_ + 1; - int32 out_fmap_size = out_fmap_x_len * out_fmap_y_len; - int32 num_output_fmaps = output_dim_ / (out_fmap_x_len * out_fmap_y_len); - - // This is total num_filters, - // each input_fmap has num_filters / num_input_fmaps: - int32 num_filters = filters_.NumRows(); - KALDI_ASSERT(num_filters == num_output_fmaps); - - // we use following hyperparameters from the option class, - const BaseFloat lr = opts_.learn_rate; - - // - // calculate the gradient - // - filters_grad_.Resize(filters_.NumRows(), filters_.NumCols(), kSetZero); - bias_grad_.Resize(filters_.NumRows(), kSetZero); - // - for (int32 p = 0; p < out_fmap_size; p++) { - CuSubMatrix diff_patch(diff.ColRange(p * num_filters, num_filters)); - filters_grad_.AddMatMat(1.0, diff_patch, kTrans, vectorized_feature_patches_[p], kNoTrans, 1.0); - bias_grad_.AddRowSumMat(1.0, diff_patch, 1.0); - } - // scale - filters_grad_.Scale(1.0/num_output_fmaps); - bias_grad_.Scale(1.0/num_output_fmaps); - - // - // update - // - filters_.AddMat(-lr * learn_rate_coef_, filters_grad_); - bias_.AddVec(-lr * bias_learn_rate_coef_, bias_grad_); - } - - private: - /// feature maps dimensions (for input x_ is usually splice - /// and y_ is num of fbanks) shift for 2nd dim of a patch - /// (i.e. 
frame length before splicing), - int32 fmap_x_len_, fmap_y_len_; - - /// 2D filter dimensions, x_ temporal, y_ spectral, - int32 filt_x_len_, filt_y_len_; - - /// 2D shifts along temporal and spectral axis, - int32 filt_x_step_, filt_y_step_; - - int32 connect_fmap_; ///< if connect_fmap_ = 1, then each fmap has num_filt - - CuMatrix filters_; ///< row = vectorized rectangular filter - CuVector bias_; ///< bias for each filter - - CuMatrix filters_grad_; ///< gradient of filters - CuVector bias_grad_; ///< gradient of biases - - /** Buffer of reshaped inputs: - * 1row = vectorized rectangular feature patch, - * 1col = dim over speech frames, - * std::vector-dim = patch-position - */ - std::vector > vectorized_feature_patches_; - - /** Buffer for backpropagation: - * derivatives in the domain of 'vectorized_feature_patches_', - * 1row = vectorized rectangular feature patch, - * 1col = dim over speech frames, - * std::vector-dim = patch-position - */ - std::vector > feature_patch_diffs_; - - /// Auxiliary vector for compensating #summands when backpropagating - CuVector in_diff_summands_; -}; - -} // namespace nnet1 -} // namespace kaldi - -#endif // KALDI_NNET_NNET_CONVOLUTIONAL_2D_COMPONENT_H_ diff --git a/src/nnet/nnet-max-pooling-2d-component.h b/src/nnet/nnet-max-pooling-2d-component.h deleted file mode 100644 index 4a4045ca73d..00000000000 --- a/src/nnet/nnet-max-pooling-2d-component.h +++ /dev/null @@ -1,225 +0,0 @@ -// nnet/nnet-max-pooling-2d-component.h - -// Copyright 2014 Brno University of Technology (author: Karel Vesely), -// Johns Hopkins University (author: Sri Harish Mallidi) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - - -#ifndef KALDI_NNET_NNET_MAX_POOLING_2D_COMPONENT_H_ -#define KALDI_NNET_NNET_MAX_POOLING_2D_COMPONENT_H_ - -#include -#include - -#include "nnet/nnet-component.h" -#include "nnet/nnet-utils.h" -#include "cudamatrix/cu-math.h" - -namespace kaldi { -namespace nnet1 { - -/** - * MaxPoolingComponent : - * The input/output matrices are split to submatrices with width 'pool_stride_'. - * The pooling is done over 3rd axis, of the set of 2d matrices. - * Our pooling supports overlaps, overlaps occur when (pool_step_ < pool_size_). 
- */ -class MaxPooling2DComponent : public Component { - public: - MaxPooling2DComponent(int32 dim_in, int32 dim_out): - Component(dim_in, dim_out), - fmap_x_len_(0), fmap_y_len_(0), - pool_x_len_(0), pool_y_len_(0), - pool_x_step_(0), pool_y_step_(0) - { } - - ~MaxPooling2DComponent() - { } - - Component* Copy() const { return new MaxPooling2DComponent(*this); } - ComponentType GetType() const { return kMaxPooling2DComponent; } - - void InitData(std::istream &is) { - // parse config - std::string token; - while (is >> std::ws, !is.eof()) { - ReadToken(is, false, &token); - /**/ if (token == "") ReadBasicType(is, false, &fmap_x_len_); - else if (token == "") ReadBasicType(is, false, &fmap_y_len_); - else if (token == "") ReadBasicType(is, false, &pool_x_len_); - else if (token == "") ReadBasicType(is, false, &pool_y_len_); - else if (token == "") ReadBasicType(is, false, &pool_x_step_); - else if (token == "") ReadBasicType(is, false, &pool_y_step_); - else KALDI_ERR << "Unknown token " << token << ", a typo in config?" - << " (FmapXLen|FmapYLen|PoolXLen|PoolYLen|PoolXStep|PoolYStep)"; - } - // check - KALDI_ASSERT(fmap_x_len_ * fmap_y_len_ != 0); - KALDI_ASSERT(pool_x_len_ * pool_y_len_ != 0); - KALDI_ASSERT(pool_x_step_ * pool_y_step_ != 0); - } - - void ReadData(std::istream &is, bool binary) { - // pooling hyperparameters - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &fmap_x_len_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &fmap_y_len_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &pool_x_len_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &pool_y_len_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &pool_x_step_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &pool_y_step_); - - // - // Sanity checks: - // - // input sanity checks - // input_dim_ should be multiple of (fmap_x_len_ * fmap_y_len_) - KALDI_ASSERT(input_dim_ % (fmap_x_len_ * fmap_y_len_) == 0); - int32 num_input_fmaps = input_dim_ / (fmap_x_len_ * fmap_y_len_); - KALDI_LOG << "num_fmaps " << num_input_fmaps; - // check if step is in sync with fmap_len and filt_len - KALDI_ASSERT((fmap_x_len_ - pool_x_len_) % (pool_x_step_) == 0); - KALDI_ASSERT((fmap_y_len_ - pool_y_len_) % (pool_y_step_) == 0); - int32 out_fmap_x_len = (fmap_x_len_ - pool_x_len_)/pool_x_step_ + 1; - int32 out_fmap_y_len = (fmap_y_len_ - pool_y_len_)/pool_y_step_ + 1; - // int32 out_fmap_size = out_fmap_x_len*out_fmap_y_len; - // output sanity checks - KALDI_ASSERT(output_dim_ % (out_fmap_x_len * out_fmap_y_len) == 0); - int32 num_output_fmaps = output_dim_ / (out_fmap_x_len * out_fmap_y_len); - KALDI_ASSERT(num_input_fmaps == num_output_fmaps); - } - - void WriteData(std::ostream &os, bool binary) const { - // pooling hyperparameters - WriteToken(os, binary, ""); - WriteBasicType(os, binary, fmap_x_len_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, fmap_y_len_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, pool_x_len_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, pool_y_len_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, pool_x_step_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, pool_y_step_); - } - - void PropagateFnc(const CuMatrixBase &in, - CuMatrixBase *out) { - // useful dims - int32 num_input_fmaps = input_dim_ / (fmap_x_len_ * fmap_y_len_); - int out_fmap_cnt = 0; - for (int32 m = 0; m < fmap_x_len_-pool_x_len_+1; m = m+pool_x_step_) { - for (int32 n = 0; n < fmap_y_len_-pool_y_len_+1; n = 
n+pool_y_step_) { - int32 st = 0; - st = (m * fmap_y_len_ + n) * num_input_fmaps; - CuSubMatrix pool( - out->ColRange(out_fmap_cnt * num_input_fmaps, num_input_fmaps) - ); - pool.Set(-1e20); // reset (large neg value) - for (int32 i = 0; i < pool_x_len_; i++) { - for (int32 j = 0; j < pool_y_len_; j++) { - int32 c = 0; - c = st + i * (num_input_fmaps * fmap_y_len_) - + j * num_input_fmaps; - pool.Max(in.ColRange(c, num_input_fmaps)); - } - } - out_fmap_cnt++; - } - } - } - - void BackpropagateFnc(const CuMatrixBase &in, - const CuMatrixBase &out, - const CuMatrixBase &out_diff, - CuMatrixBase *in_diff) { - // useful dims - int32 num_input_fmaps = input_dim_ / (fmap_x_len_ * fmap_y_len_); - int32 inp_fmap_size = fmap_x_len_ * fmap_y_len_; - - // - // here we note how many diff matrices are summed for each input patch, - std::vector patch_summands(inp_fmap_size, 0); - // this metainfo will be used to divide diff of patches - // used in more than one pool. - // - - in_diff->SetZero(); // reset - - int out_fmap_cnt = 0; - for (int32 m = 0; m < fmap_x_len_-pool_x_len_+1; m = m+pool_x_step_) { - for (int32 n = 0; n < fmap_y_len_-pool_y_len_+1; n = n+pool_y_step_) { - int32 st = 0; - st = (m*fmap_y_len_+n)*num_input_fmaps; - - for (int32 i = 0; i < pool_x_len_; i++) { - for (int32 j = 0; j < pool_y_len_; j++) { - int32 c = 0; - c = st + i * (num_input_fmaps * fmap_y_len_) - + j * num_input_fmaps; - // - CuSubMatrix in_p(in.ColRange(c, num_input_fmaps)); - CuSubMatrix out_p( - out.ColRange(out_fmap_cnt*num_input_fmaps, num_input_fmaps) - ); - // - - CuSubMatrix tgt(in_diff->ColRange(c, num_input_fmaps)); - CuMatrix src( - out_diff.ColRange(out_fmap_cnt*num_input_fmaps, num_input_fmaps) - ); - - CuMatrix mask; - in_p.EqualElementMask(out_p, &mask); - src.MulElements(mask); - tgt.AddMat(1.0, src); - - patch_summands[c/num_input_fmaps] += 1; - } - } - out_fmap_cnt++; - } - } - - // divide diff by #summands (compensate for patches used in more pools), - for (int i = 0; i < fmap_x_len_; i++) { - for (int32 j = 0; j < fmap_y_len_; j++) { - int32 c = i * fmap_y_len_ + j; - CuSubMatrix tgt(in_diff->ColRange(c * num_input_fmaps, num_input_fmaps)); - KALDI_ASSERT(patch_summands[c] > 0); // patch at least in one pool - tgt.Scale(1.0 / patch_summands[c]); - } - } - } - - private: - int32 fmap_x_len_, fmap_y_len_, - pool_x_len_, pool_y_len_, - pool_x_step_, pool_y_step_; -}; - -} // namespace nnet1 -} // namespace kaldi - -#endif // KALDI_NNET_NNET_MAX_POOLING_2D_COMPONENT_H_ diff --git a/src/nnet2/combine-nnet-fast.cc b/src/nnet2/combine-nnet-fast.cc index 02265a5f6ac..7ab2c9caf05 100644 --- a/src/nnet2/combine-nnet-fast.cc +++ b/src/nnet2/combine-nnet-fast.cc @@ -204,7 +204,7 @@ void FastNnetCombiner::CombineNnets(const Vector &scale_params, int32 num_nnets = nnets.size(); KALDI_ASSERT(num_nnets >= 1); int32 num_uc = nnets[0].NumUpdatableComponents(); - KALDI_ASSERT(num_nnets * nnets[0].NumUpdatableComponents()); + KALDI_ASSERT(nnets[0].NumUpdatableComponents() >= 1); *dest = nnets[0]; diff --git a/src/nnet2/combine-nnet.cc b/src/nnet2/combine-nnet.cc index 417db1b84c4..57cc6133c58 100644 --- a/src/nnet2/combine-nnet.cc +++ b/src/nnet2/combine-nnet.cc @@ -31,9 +31,9 @@ static void CombineNnets(const Vector &scale_params, int32 num_nnets = nnets.size(); KALDI_ASSERT(num_nnets >= 1); int32 num_uc = nnets[0].NumUpdatableComponents(); - KALDI_ASSERT(num_nnets * nnets[0].NumUpdatableComponents()); - - + KALDI_ASSERT(nnets[0].NumUpdatableComponents() >= 1); + + *dest = nnets[0]; SubVector 
scale_params0(scale_params, 0, num_uc); dest->ScaleComponents(scale_params0); @@ -59,7 +59,7 @@ static int32 GetInitialModel( for (int32 n = 0; n < num_nnets; n++) { BaseFloat objf = ComputeNnetObjf(nnets[n], validation_set, minibatch_size) / tot_frames; - + if (n == 0 || objf > best_objf) { best_objf = objf; best_n = n; @@ -98,7 +98,7 @@ static void GetInitialScaleParams( num_nnets = static_cast(nnets.size()); if (initial_model < 0 || initial_model > num_nnets) initial_model = GetInitialModel(validation_set, nnets); - + KALDI_ASSERT(initial_model >= 0 && initial_model <= num_nnets); int32 num_uc = nnets[0].NumUpdatableComponents(); @@ -107,7 +107,7 @@ static void GetInitialScaleParams( KALDI_LOG << "Initializing with neural net with index " << initial_model; // At this point we're using the best of the individual neural nets. scale_params->Set(0.0); - + // Set the block of parameters corresponding to the "best" of the // source neural nets to SubVector best_block(*scale_params, num_uc * initial_model, num_uc); @@ -129,14 +129,14 @@ static double ComputeObjfAndGradient( Vector *gradient) { Vector scale_params_float(scale_params); - + Nnet nnet_combined; CombineNnets(scale_params_float, nnets, &nnet_combined); - + Nnet nnet_gradient(nnet_combined); bool is_gradient = true; nnet_gradient.SetZero(is_gradient); - + // note: "ans" is normalized by the total weight of validation frames. int32 batch_size = 1024; double ans = ComputeNnetGradient(nnet_combined, @@ -146,7 +146,7 @@ static double ComputeObjfAndGradient( double tot_frames = validation_set.size(); if (gradient != NULL) { - int32 i = 0; // index into scale_params. + int32 i = 0; // index into scale_params. for (int32 n = 0; n < static_cast(nnets.size()); n++) { for (int32 j = 0; j < nnet_combined.NumComponents(); j++) { const UpdatableComponent *uc = @@ -155,7 +155,7 @@ static double ComputeObjfAndGradient( dynamic_cast(&(nnet_gradient.GetComponent(j))); if (uc != NULL) { double dotprod = uc->DotProduct(*uc_gradient) / tot_frames; - (*gradient)(i) = dotprod; + (*gradient)(i) = dotprod; i++; } } @@ -165,14 +165,14 @@ static double ComputeObjfAndGradient( if (debug) { KALDI_LOG << "Double-checking gradient computation"; - + Vector manual_gradient(scale_params.Dim()); for (int32 i = 0; i < scale_params.Dim(); i++) { double delta = 1.0e-04, fg = fabs((*gradient)(i)); if (fg < 1.0e-07) fg = 1.0e-07; if (fg * delta < 1.0e-05) delta = 1.0e-05 / fg; - + Vector scale_params_temp(scale_params); scale_params_temp(i) += delta; double new_ans = ComputeObjfAndGradient(validation_set, @@ -185,10 +185,10 @@ static double ComputeObjfAndGradient( KALDI_LOG << "Manually computed gradient is " << manual_gradient; KALDI_LOG << "Gradient we computed is " << *gradient; } - + return ans; } - + void CombineNnets(const NnetCombineConfig &combine_config, const std::vector &validation_set, @@ -205,7 +205,7 @@ void CombineNnets(const NnetCombineConfig &combine_config, int32 dim = scale_params.Dim(); KALDI_ASSERT(dim > 0); Vector gradient(dim); - + double objf, initial_objf; LbfgsOptions lbfgs_options; @@ -213,11 +213,11 @@ void CombineNnets(const NnetCombineConfig &combine_config, lbfgs_options.m = dim; // Store the same number of vectors as the dimension // itself, so this is BFGS. 
lbfgs_options.first_step_impr = combine_config.initial_impr; - + OptimizeLbfgs lbfgs(scale_params, lbfgs_options); - - for (int32 i = 0; i < combine_config.num_bfgs_iters; i++) { + + for (int32 i = 0; i < combine_config.num_bfgs_iters; i++) { scale_params.CopyFromVec(lbfgs.GetProposedValue()); objf = ComputeObjfAndGradient(validation_set, scale_params, @@ -227,9 +227,9 @@ void CombineNnets(const NnetCombineConfig &combine_config, KALDI_VLOG(2) << "Iteration " << i << " scale-params = " << scale_params << ", objf = " << objf << ", gradient = " << gradient; - + if (i == 0) initial_objf = objf; - + lbfgs.DoStep(objf, gradient); } @@ -244,10 +244,10 @@ void CombineNnets(const NnetCombineConfig &combine_config, nnets[0].NumUpdatableComponents()); scale_params_mat.CopyRowsFromVec(scale_params_float); KALDI_LOG << "Final scale factors are " << scale_params_mat; - + CombineNnets(scale_params_float, nnets, nnet_out); } - - + + } // namespace nnet2 } // namespace kaldi diff --git a/src/nnet2/nnet-component-test.cc b/src/nnet2/nnet-component-test.cc index 04e476c01bd..5aeaf28cd1e 100644 --- a/src/nnet2/nnet-component-test.cc +++ b/src/nnet2/nnet-component-test.cc @@ -605,7 +605,7 @@ void UnitTestSumGroupComponent() { void UnitTestDctComponent() { - int32 m = 1 + Rand() % 4, n = 1 + Rand() % 4, + int32 m = 3 + Rand() % 4, n = 3 + Rand() % 4, dct_dim = m, dim = m * n; bool reorder = (Rand() % 2 == 0); { @@ -619,12 +619,6 @@ void UnitTestDctComponent() { component.InitFromString(str); UnitTestGenericComponentInternal(component); } - { - const char *str = "dim=10 dct-dim=5 reorder=true dct-keep-dim=1"; - DctComponent component; - component.InitFromString(str); - UnitTestGenericComponentInternal(component); - } { const char *str = "dim=10 dct-dim=5 reorder=true dct-keep-dim=2"; DctComponent component; diff --git a/src/nnet2/nnet-component.cc b/src/nnet2/nnet-component.cc index eafeaceb9fe..f0919acfac8 100644 --- a/src/nnet2/nnet-component.cc +++ b/src/nnet2/nnet-component.cc @@ -909,7 +909,7 @@ void SoftmaxComponent::Propagate(const ChunkInfo &in_info, // for that row, we do // x_i = exp(x_i) / sum_j exp(x_j). - out->ApplySoftMaxPerRow(in); + out->SoftMaxPerRow(in); // This floor on the output helps us deal with // almost-zeros in a way that doesn't lead to overflow. @@ -956,7 +956,7 @@ void LogSoftmaxComponent::Propagate(const ChunkInfo &in_info, // Applies log softmax function to each row of the output. For each row, we do // x_i = x_i - log(sum_j exp(x_j)) - out->ApplyLogSoftMaxPerRow(in); + out->LogSoftMaxPerRow(in); // Just to be consistent with SoftmaxComponent::Propagate() out->ApplyFloor(Log(1.0e-20)); diff --git a/src/nnet2bin/nnet-am-compute.cc b/src/nnet2bin/nnet-am-compute.cc index 32da30b73a5..2b50f7cc656 100644 --- a/src/nnet2bin/nnet-am-compute.cc +++ b/src/nnet2bin/nnet-am-compute.cc @@ -94,7 +94,7 @@ int main(int argc, char *argv[]) { int64 num_done = 0, num_frames = 0; Vector inv_priors(am_nnet.Priors()); - KALDI_ASSERT(inv_priors.Dim() == am_nnet.NumPdfs() && + KALDI_ASSERT((!divide_by_priors || inv_priors.Dim() == am_nnet.NumPdfs()) && "Priors in neural network not set up."); inv_priors.ApplyPow(-1.0); @@ -159,5 +159,3 @@ int main(int argc, char *argv[]) { return -1; } } - - diff --git a/src/nnet3/attention.cc b/src/nnet3/attention.cc index bd8cb6bf85c..ddfddbaf74a 100644 --- a/src/nnet3/attention.cc +++ b/src/nnet3/attention.cc @@ -133,7 +133,7 @@ void AttentionForward(BaseFloat key_scale, // compute the soft-max function. 
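As a point of reference for the ApplySoftMaxPerRow() to SoftMaxPerRow() renames above: per row the operation is x_i = exp(x_i) / sum_j exp(x_j), as the surrounding comments state. A minimal host-side sketch of that row-wise softmax, written in plain C++ rather than the CuMatrix API and intended only as an illustration:

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

// Row-wise softmax reference: y[r][i] = exp(x[r][i]) / sum_j exp(x[r][j]),
// with the usual max-subtraction for numerical stability.
std::vector<std::vector<double>> SoftmaxPerRowRef(
    const std::vector<std::vector<double>> &x) {
  std::vector<std::vector<double>> y(x);
  for (std::vector<double> &row : y) {
    double mx = *std::max_element(row.begin(), row.end());
    double sum = 0.0;
    for (double &v : row) { v = std::exp(v - mx); sum += v; }
    for (double &v : row) v /= sum;  // each row now sums to 1
  }
  return y;
}
```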
Up till this point, 'c' // actually contained what in attention.h we called 'b', which is // the input to the softmax. - c->ApplySoftMaxPerRow(*c); + c->SoftMaxPerRow(*c); // the part of the output that is weighted diff --git a/src/nnet3/decodable-online-looped.cc b/src/nnet3/decodable-online-looped.cc index 2159575df6c..751438606e8 100644 --- a/src/nnet3/decodable-online-looped.cc +++ b/src/nnet3/decodable-online-looped.cc @@ -30,6 +30,7 @@ DecodableNnetLoopedOnlineBase::DecodableNnetLoopedOnlineBase( num_chunks_computed_(0), current_log_post_subsampled_offset_(-1), info_(info), + frame_offset_(0), input_features_(input_features), ivector_features_(ivector_features), computer_(info_.opts.compute_config, info_.computation, @@ -66,7 +67,7 @@ int32 DecodableNnetLoopedOnlineBase::NumFramesReady() const { if (input_finished) { // if the input has finished,... we'll pad with duplicates of the last frame // as needed to get the required right context. - return (features_ready + sf - 1) / sf; + return (features_ready + sf - 1) / sf - frame_offset_; } else { // note: info_.right_context_ includes both the model context and any // extra_right_context_ (but this @@ -78,7 +79,7 @@ int32 DecodableNnetLoopedOnlineBase::NumFramesReady() const { // doesn't need any attention to rounding because info_.frames_per_chunk // is always a multiple of 'sf' (see 'frames_per_chunk = GetChunksize..." // in decodable-simple-looped.cc). - return num_chunks_ready * info_.frames_per_chunk / sf; + return num_chunks_ready * info_.frames_per_chunk / sf - frame_offset_; } } @@ -105,9 +106,14 @@ bool DecodableNnetLoopedOnlineBase::IsLastFrame( return false; int32 sf = info_.opts.frame_subsampling_factor, num_subsampled_frames_ready = (features_ready + sf - 1) / sf; - return (subsampled_frame == num_subsampled_frames_ready - 1); + return (subsampled_frame + frame_offset_ == num_subsampled_frames_ready - 1); } +void DecodableNnetLoopedOnlineBase::SetFrameOffset(int32 frame_offset) { + KALDI_ASSERT(0 <= frame_offset && + frame_offset <= frame_offset_ + NumFramesReady()); + frame_offset_ = frame_offset; +} void DecodableNnetLoopedOnlineBase::AdvanceChunk() { // Prepare the input data for the next chunk of features. @@ -231,6 +237,7 @@ void DecodableNnetLoopedOnlineBase::AdvanceChunk() { BaseFloat DecodableNnetLoopedOnline::LogLikelihood(int32 subsampled_frame, int32 index) { + subsampled_frame += frame_offset_; EnsureFrameIsComputed(subsampled_frame); // note: we index by 'inde return current_log_post_( @@ -241,6 +248,7 @@ BaseFloat DecodableNnetLoopedOnline::LogLikelihood(int32 subsampled_frame, BaseFloat DecodableAmNnetLoopedOnline::LogLikelihood(int32 subsampled_frame, int32 index) { + subsampled_frame += frame_offset_; EnsureFrameIsComputed(subsampled_frame); return current_log_post_( subsampled_frame - current_log_post_subsampled_offset_, diff --git a/src/nnet3/decodable-online-looped.h b/src/nnet3/decodable-online-looped.h index 3041d3c4637..5ed5c0d73a5 100644 --- a/src/nnet3/decodable-online-looped.h +++ b/src/nnet3/decodable-online-looped.h @@ -81,6 +81,17 @@ class DecodableNnetLoopedOnlineBase: public DecodableInterface { return info_.opts.frame_subsampling_factor; } + /// Sets the frame offset value. Frame offset is initialized to 0 when the + /// decodable object is constructed and stays as 0 unless this method is + /// called. This method is useful when we want to reset the decoder state, + /// i.e. call decoder.InitDecoding(), but we want to keep using the same + /// decodable object, e.g. in case of an endpoint. 
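A minimal sketch of how the new frame-offset API is meant to be driven from an endpointing loop; the decoder type and the place this hook gets called from are assumptions, not part of this patch:

```cpp
#include "decoder/lattice-faster-online-decoder.h"
#include "nnet3/decodable-online-looped.h"

// Hypothetical endpoint handler: restart the decoder but keep the decodable,
// advancing its frame offset past everything that was already decoded.
void OnEndpoint(kaldi::nnet3::DecodableAmNnetLoopedOnline *decodable,
                kaldi::LatticeFasterOnlineDecoder *decoder) {
  kaldi::int32 frames_decoded = decoder->NumFramesDecoded();
  decodable->SetFrameOffset(decodable->GetFrameOffset() + frames_decoded);
  decoder->InitDecoding();  // fresh decoder state, same decodable object
}
```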
The frame offset affects + /// the behavior of IsLastFrame(), NumFramesReady() and LogLikelihood() + /// methods. + void SetFrameOffset(int32 frame_offset); + + /// Returns the frame offset value. + int32 GetFrameOffset() const { return frame_offset_; } protected: @@ -111,6 +122,11 @@ class DecodableNnetLoopedOnlineBase: public DecodableInterface { const DecodableNnetSimpleLoopedInfo &info_; + // IsLastFrame(), NumFramesReady() and LogLikelihood() methods take into + // account this offset value. We initialize frame_offset_ as 0 and it stays as + // 0 unless SetFrameOffset() method is called. + int32 frame_offset_; + private: // This function does the computation for the next chunk. It will change diff --git a/src/nnet3/decodable-simple-looped.cc b/src/nnet3/decodable-simple-looped.cc index 0452304cf55..71aa7daaa17 100644 --- a/src/nnet3/decodable-simple-looped.cc +++ b/src/nnet3/decodable-simple-looped.cc @@ -52,7 +52,6 @@ DecodableNnetSimpleLoopedInfo::DecodableNnetSimpleLoopedInfo( Init(opts, &(am_nnet->GetNnet())); } - void DecodableNnetSimpleLoopedInfo::Init( const NnetSimpleLoopedComputationOptions &opts, Nnet *nnet) { @@ -86,10 +85,8 @@ void DecodableNnetSimpleLoopedInfo::Init( CompileLooped(*nnet, opts.optimize_config, request1, request2, request3, &computation); computation.ComputeCudaIndexes(); - if (GetVerboseLevel() >= 3) { - KALDI_VLOG(3) << "Computation is:"; - computation.Print(std::cerr, *nnet); - } + KALDI_VLOG(3) << "Computation is:\n" + << NnetComputationPrintInserter{computation, *nnet}; } diff --git a/src/nnet3/natural-gradient-online.cc b/src/nnet3/natural-gradient-online.cc index 0677e1ca474..a205490ee3f 100644 --- a/src/nnet3/natural-gradient-online.cc +++ b/src/nnet3/natural-gradient-online.cc @@ -119,14 +119,14 @@ void OnlineNaturalGradient::InitDefault(int32 D) { t_ = 0; } -void OnlineNaturalGradient::Init(const CuMatrixBase &R0) { - int32 D = R0.NumCols(); +void OnlineNaturalGradient::Init(const CuMatrixBase &X0) { + int32 D = X0.NumCols(); // for locking reasons it's better to use a different object. OnlineNaturalGradient this_copy(*this); this_copy.InitDefault(D); this_copy.t_ = 1; // Prevent recursion to Init() again. - CuMatrix R0_copy(R0.NumRows(), R0.NumCols(), kUndefined); + CuMatrix X0_copy(X0.NumRows(), X0.NumCols(), kUndefined); // 'num_iters' is number of iterations with the same data from a pseudorandom // start. this is a faster way of starting than doing eigenvalue // decomposition. @@ -134,11 +134,11 @@ void OnlineNaturalGradient::Init(const CuMatrixBase &R0) { // Note: we only do three iterations of initialization if we have enough data // that it's reasonably possible to estimate the subspace of dimension // this_copy.rank_. If we don't have more than that many rows in our initial - // minibatch R0, we just do one iteration... this gives us almost exactly - // (barring small effects due to epsilon_ > 0) the row subspace of R0 after + // minibatch X0, we just do one iteration... this gives us almost exactly + // (barring small effects due to epsilon_ > 0) the row subspace of X0 after // one iteration anyway. int32 num_init_iters; - if (R0.NumRows() <= this_copy.rank_) + if (X0.NumRows() <= this_copy.rank_) num_init_iters = 1; else num_init_iters = 3; @@ -147,8 +147,8 @@ void OnlineNaturalGradient::Init(const CuMatrixBase &R0) { // initialize. 
for (int32 i = 0; i < num_init_iters; i++) { BaseFloat scale; - R0_copy.CopyFromMat(R0); - this_copy.PreconditionDirections(&R0_copy, &scale); + X0_copy.CopyFromMat(X0); + this_copy.PreconditionDirections(&X0_copy, &scale); } rank_ = this_copy.rank_; W_t_.Swap(&this_copy.W_t_); @@ -197,7 +197,7 @@ void OnlineNaturalGradient::PreconditionDirections( t_ += 1; } -void OnlineNaturalGradient::ReorthogonalizeXt1( +void OnlineNaturalGradient::ReorthogonalizeRt1( const VectorBase &d_t1, BaseFloat rho_t1, CuMatrixBase *W_t1, @@ -214,7 +214,7 @@ void OnlineNaturalGradient::ReorthogonalizeXt1( ComputeEt(d_t1, beta_t1, &e_t1, &sqrt_e_t1, &inv_sqrt_e_t1); temp_O->SymAddMat2(1.0, *W_t1, kNoTrans, 0.0); - // O_t = E_t^{-0.5} W_t W_t^T E_t^{-0.5} + // O_{t+1} = E_{t+1}^{-0.5} W_{t+1} W_{t+1}^T E_{t+1}^{-0.5} Matrix O_mat(*temp_O); SpMatrix O(O_mat, kTakeLower); for (int32 i = 0; i < R; i++) { @@ -439,7 +439,7 @@ void OnlineNaturalGradient::PreconditionDirectionsInternal( if (self_debug_) { KALDI_WARN << "Reorthogonalizing."; } - ReorthogonalizeXt1(d_t1, + ReorthogonalizeRt1(d_t1, rho_t1, &W_t1, &J_t, @@ -510,7 +510,7 @@ void OnlineNaturalGradient::ComputeWt1(int32 N, // B_t = J_t + (1-\eta)/(\eta/N) (D_t + \rho_t I) W_t J_t->AddDiagVecMat(1.0, w_t_coeff_gpu, W_t, kNoTrans, 1.0); - // A_t = (\eta/N) E_{t+1}^{0.5} C_t^{-0.5} U_t^T E_t^{-0.5} B_t + // A_t = (\eta/N) E_{t+1}^{0.5} C_t^{-0.5} U_t^T E_t^{-0.5} Matrix A_t(U_t, kTrans); for (int32 i = 0; i < R; i++) { BaseFloat i_factor = (eta / N) * sqrt_e_t1(i) * inv_sqrt_c_t(i); diff --git a/src/nnet3/natural-gradient-online.h b/src/nnet3/natural-gradient-online.h index a68ad9bbb53..77be28a19d4 100644 --- a/src/nnet3/natural-gradient-online.h +++ b/src/nnet3/natural-gradient-online.h @@ -375,8 +375,8 @@ namespace nnet3 { * Initialization * Now, a note on what we do on time t = 0, i.e. for the first minibatch. We - initialize X_0 to the top R eigenvectors of 1/N X_0 X_0^T, where N is the - minibatch size (num-rows of R0). If L is the corresponding RxR diagonal + initialize R_0 to the top R eigenvectors of 1/N X_0 X_0^T, where N is the + minibatch size (num-rows of X0). If L is the corresponding RxR diagonal matrix of eigenvalues, then we will set D_0 = L - \rho_0 I. We set \rho_0 to ensure that tr(F_0) = 1/N tr(X_0 X_0^T), @@ -457,7 +457,7 @@ class OnlineNaturalGradient { not. */ - void PreconditionDirections(CuMatrixBase *R, + void PreconditionDirections(CuMatrixBase *X, BaseFloat *scale); @@ -515,7 +515,7 @@ class OnlineNaturalGradient { // This function is called if C_t has high condition number; it makes sure // that R_{t+1} is orthogonal. See the section in the extended comment above // on "keeping R_t orthogonal". - void ReorthogonalizeXt1(const VectorBase &d_t1, + void ReorthogonalizeRt1(const VectorBase &d_t1, BaseFloat rho_t1, CuMatrixBase *W_t1, CuMatrixBase *temp_W, diff --git a/src/nnet3/nnet-analyze.cc b/src/nnet3/nnet-analyze.cc index 584a7c19ab8..a3696403eba 100644 --- a/src/nnet3/nnet-analyze.cc +++ b/src/nnet3/nnet-analyze.cc @@ -880,7 +880,7 @@ void ComputationChecker::CheckComputationIndexes() const { KALDI_ERR << "Backprop input needed but not supplied."; if ((properties & kBackpropNeedsOutput) && c.arg4 == 0) KALDI_ERR << "Backprop output needed but not supplied."; - if (c.arg6 == 0 && !(properties && kUpdatableComponent)) { + if (c.arg6 == 0 && !(properties & kUpdatableComponent)) { // note: we could perhaps make this just a warning, // or optimize it away somehow. 
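The change just above from '!(properties && kUpdatableComponent)' to '!(properties & kUpdatableComponent)' is a genuine bug fix: component properties form a bit mask, so membership must be tested with bitwise AND. A standalone illustration (the flag values mirror those in nnet-component-itf.h and are shown here only for illustration):

```cpp
#include <cstdio>

// With '&&' the test is true for any non-empty property set, so the
// "not updatable" branch could never be reached; '&' tests the single flag.
enum { kSimpleComponent = 0x001, kUpdatableComponent = 0x002 };

int main() {
  int properties = kSimpleComponent;  // a non-updatable component
  std::printf("bitwise  &: %d\n", (properties & kUpdatableComponent) != 0);   // 0, correct
  std::printf("logical &&: %d\n", (properties && kUpdatableComponent) != 0);  // 1, the old bug
  return 0;
}
```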
KALDI_ERR << "Backprop is done but has no effect."; diff --git a/src/nnet3/nnet-batch-compute.cc b/src/nnet3/nnet-batch-compute.cc index 5da55d0f70d..9d71a021f05 100644 --- a/src/nnet3/nnet-batch-compute.cc +++ b/src/nnet3/nnet-batch-compute.cc @@ -393,44 +393,105 @@ void NnetBatchComputer::FormatInputs( ivector_dim = tasks[0]->ivector.Dim(), num_tasks = tasks.size(); KALDI_ASSERT(num_tasks > 0 && num_tasks <= minibatch_size); + + // destination matrix + input->Resize(minibatch_size * num_input_frames, input_dim, + kUndefined); + +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + + std::vector inputs(num_tasks); + std::vector outputs(num_tasks); + std::vector ldi(num_tasks), ldo(num_tasks); + std::vector num_rows(num_tasks), num_cols(num_tasks); + + // compute matrix descriptions for each copy + for (int32 n = 0; n < num_tasks; n++) { + const CuMatrix &input_mat = tasks[n]->input; + CuSubMatrix output_mat = input->RowRange( + n * num_input_frames, num_input_frames); + + // create matrix batch description arrays + num_rows[n] = num_input_frames; + num_cols[n] = input_dim; + outputs[n] = output_mat.Data(); + inputs[n] = input_mat.Data(); + ldo[n] = output_mat.Stride(); + ldi[n] = input_mat.Stride(); + } - // We first aggregate the input frames and i-vectors in matrices on the CPU, - // and then transfer them to the GPU. Later on we'll change this code to - // used pinned memory. - Matrix input_cpu(num_tasks * num_input_frames, input_dim, - kUndefined); - + // execute batched copy + cuda_batched_copy_mats(num_tasks, &num_rows[0], &num_cols[0], &inputs[0], + &ldi[0], &outputs[0], &ldo[0]); - for (int32 n = 0; n < num_tasks; n++) { - SubMatrix input_part(input_cpu, + } else +#endif + { + for (int32 n = 0; n < num_tasks; n++) { + CuSubMatrix input_part(*input, n * num_input_frames, num_input_frames, 0, input_dim); - input_part.CopyFromMat(tasks[n]->input); - } - input->Resize(minibatch_size * num_input_frames, input_dim, - kUndefined); - input->RowRange(0, num_tasks * num_input_frames).CopyFromMat(input_cpu); - if (num_tasks < minibatch_size) { - // The following will make things easier to debug if something fails, but - // shouldn't be strictly necessary. - // the -1 means 'take all remaining rows'. - input->RowRange(num_tasks * num_input_frames, - (minibatch_size - num_tasks) * num_input_frames).SetZero(); + input_part.CopyFromMat(tasks[n]->input); + } } - if (ivector_dim != 0) { - Matrix ivectors_cpu(num_tasks, ivector_dim, kUndefined); - for (int32 n = 0; n < num_tasks; n++) - ivectors_cpu.Row(n).CopyFromVec(tasks[n]->ivector); - - ivector->Resize(minibatch_size, ivector_dim, kUndefined); - ivector->RowRange(0, num_tasks).CopyFromMat(ivectors_cpu); - + if (GetVerboseLevel() >=2 ) { if (num_tasks < minibatch_size) { // The following will make things easier to debug if something fails, but // shouldn't be strictly necessary. // the -1 means 'take all remaining rows'. - ivector->RowRange(num_tasks, minibatch_size - num_tasks).SetZero(); + input->RowRange(num_tasks * num_input_frames, + (minibatch_size - num_tasks) * num_input_frames).SetZero(); + } + } + + if (ivector_dim != 0) { + ivector->Resize(minibatch_size, ivector_dim, kUndefined); + +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + + // using the batched matrix copy routine for this. This isn't + // extremely efficient but the kernel takes a minimal amount of + // time so making a batched vector copy is not worth the effort. 
+ std::vector inputs(num_tasks); + std::vector outputs(num_tasks); + std::vector ldi(num_tasks), ldo(num_tasks); + std::vector num_rows(num_tasks), num_cols(num_tasks); + + // compute source pointers for each input + for (int32 n = 0; n < num_tasks; n++) { + const CuVector &input_vec = tasks[n]->ivector; + CuSubVector output_vec = ivector->Row(n); + // create matrix batch description arrays + num_rows[n] = 1; + num_cols[n] = ivector_dim; + outputs[n] = output_vec.Data(); + inputs[n] = input_vec.Data(); + ldo[n] = 1; + ldi[n] = 1; + } + + // execute batched copy + cuda_batched_copy_mats(num_tasks, &num_rows[0], &num_cols[0], &inputs[0], &ldi[0], + &outputs[0], &ldo[0]); + + } else +#endif + { + for (int32 n = 0; n < num_tasks; n++) { + ivector->Row(n).CopyFromVec(tasks[n]->ivector); + } + } + + if (GetVerboseLevel() >= 2) { + if (num_tasks < minibatch_size) { + // The following will make things easier to debug if something fails, but + // shouldn't be strictly necessary. + // the -1 means 'take all remaining rows'. + ivector->RowRange(num_tasks, minibatch_size - num_tasks).SetZero(); + } } } } @@ -444,42 +505,97 @@ void NnetBatchComputer::FormatOutputs( num_tasks = tasks.size(); bool did_output_to_gpu = false; - // Note: it may not be optimal to do so many individual calls to copy the - // output to CPU; we'd have to test that, as I'm not sure how much the latency - // of a GPU call is. On the other hand, the downsides of one big call are - // that we'd have to make another copy in CPU memory; and also we might not be - // able to take advantage if not all frames of the output are used. - - // Also, we should probably used pinned memory. - // We don't bother zeroing frames of the output that are unused, but you could - // un-comment the commented lines of code below to do so. - for (int32 n = 0; n < num_tasks; n++) { - NnetInferenceTask *task = tasks[n]; - - int32 left_unused = task->num_initial_unused_output_frames, - used = task->num_used_output_frames; - // int32 right_unused = num_output_frames - used - left_unused; - - if (task->output_to_cpu) { - task->output_cpu.Resize(num_output_frames, output_dim, - kUndefined); - // if (left_unused > 0) - // task->output_cpu.RowRange(0, left_unused).SetZero(); - task->output_cpu.RowRange(left_unused, used).CopyFromMat( - output.RowRange(n * num_output_frames + left_unused, used)); - // if (right_unused > 0) - // task->output_cpu.RowRange(0, left_unused + used, right_unused).SetZero(); - } else { - did_output_to_gpu = true; - task->output.Resize(num_output_frames, output_dim, - kUndefined); - // if (left_unused > 0) - // task->output.RowRange(0, left_unused).SetZero(); - task->output.RowRange(left_unused, used).CopyFromMat( - output.RowRange(n * num_output_frames + left_unused, used)); - // if (right_unused > 0) - // task->output.RowRange(0, left_unused + used, right_unused).SetZero(); + // un-comment the commented lines of code below to do so and add equivalent + // calls to the cuda version. 
+ +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + + std::vector inputs(num_tasks); + std::vector outputs(num_tasks); + std::vector ldi(num_tasks), ldo(num_tasks); + std::vector num_rows(num_tasks), num_cols(num_tasks); + + int b=0; // batch counter + for (int32 n = 0; n < num_tasks; n++) { + NnetInferenceTask *task = tasks[n]; + + int32 left_unused = task->num_initial_unused_output_frames, + used = task->num_used_output_frames; + // int32 right_unused = num_output_frames - used - left_unused; + + // TODO do we really expect different tasks to output CPU or GPU? + // This adds a bit of code complexity. Perhaps output_to_cpu should + // be a property of the batch computer and not the tasks + if (task->output_to_cpu) { + task->output_cpu.Resize(num_output_frames, output_dim, + kUndefined); + // if (left_unused > 0) + // task->output_cpu.RowRange(0, left_unused).SetZero(); + task->output_cpu.RowRange(left_unused, used).CopyFromMat( + output.RowRange(n * num_output_frames + left_unused, used)); + // if (right_unused > 0) + // task->output_cpu.RowRange( + // 0, left_unused + used, right_unused).SetZero(); + + } else { + did_output_to_gpu = true; + task->output.Resize(num_output_frames, output_dim, + kUndefined); + + CuSubMatrix output_mat = task->output.RowRange( + left_unused, used); + const CuSubMatrix input_mat = output.RowRange( + n * num_output_frames + left_unused, used); + + // create matrix batch description arrays + num_rows[b] = output_mat.NumRows(); + num_cols[b] = output_mat.NumCols(); + outputs[b] = output_mat.Data(); + inputs[b] = input_mat.Data(); + ldo[b] = output_mat.Stride(); + ldi[b] = input_mat.Stride(); + b++; // increase batch count + } + } + + // execute batched copy + cuda_batched_copy_mats(b, &num_rows[0], &num_cols[0], &inputs[0], &ldi[0], + &outputs[0], &ldo[0]); + + } else +#endif + { + //TODO i don't think all of these paths are actually possible. We should simplify this. + //Is it possible to output_to_gpu with HAVE_CUDA == 0 or when the device is disabled? + for (int32 n = 0; n < num_tasks; n++) { + NnetInferenceTask *task = tasks[n]; + + int32 left_unused = task->num_initial_unused_output_frames, + used = task->num_used_output_frames; + // int32 right_unused = num_output_frames - used - left_unused; + + if (task->output_to_cpu) { + task->output_cpu.Resize(num_output_frames, output_dim, + kUndefined); + // if (left_unused > 0) + // task->output_cpu.RowRange(0, left_unused).SetZero(); + task->output_cpu.RowRange(left_unused, used).CopyFromMat( + output.RowRange(n * num_output_frames + left_unused, used)); + // if (right_unused > 0) + // task->output_cpu.RowRange(0, left_unused + used, right_unused).SetZero(); + } else { + did_output_to_gpu = true; + task->output.Resize(num_output_frames, output_dim, + kUndefined); + // if (left_unused > 0) + // task->output.RowRange(0, left_unused).SetZero(); + task->output.RowRange(left_unused, used).CopyFromMat( + output.RowRange(n * num_output_frames + left_unused, used)); + // if (right_unused > 0) + // task->output.RowRange(0, left_unused + used, right_unused).SetZero(); + } } } // The output of this function will likely be consumed by another thread. 
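The FormatInputs() and FormatOutputs() changes above replace the per-task CPU staging loops with one batched device copy. Reading the call sites, the batched primitive copies, for each batch element b, a num_rows[b] x num_cols[b] block from inputs[b] (row stride ldi[b]) to outputs[b] (row stride ldo[b]); a host-side reference of those assumed semantics, for illustration only:

```cpp
#include <cstdint>

// Host-side reference for the batched copy: one (rows x cols) strided block
// copy per batch element; cuda_batched_copy_mats() is assumed to do the same
// work in a single kernel launch.
void BatchedCopyMatsRef(int32_t num_mats, const int32_t *num_rows,
                        const int32_t *num_cols, const float **inputs,
                        const int32_t *ldi, float **outputs,
                        const int32_t *ldo) {
  for (int32_t b = 0; b < num_mats; b++)
    for (int32_t r = 0; r < num_rows[b]; r++)
      for (int32_t c = 0; c < num_cols[b]; c++)
        outputs[b][r * ldo[b] + c] = inputs[b][r * ldi[b] + c];
}
```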
@@ -550,7 +666,6 @@ bool NnetBatchComputer::Compute(bool allow_partial_minibatch) { minfo->tot_num_tasks += static_cast(tasks.size()); minfo->seconds_taken += tim.Elapsed(); - SynchronizeGpu(); for (size_t i = 0; i < tasks.size(); i++) @@ -653,7 +768,7 @@ void GetOutputFrameInfoForTasks( void AddOnlineIvectorsToTasks( const NnetBatchComputerOptions &opts, - const Matrix &online_ivectors, + const CuMatrix &online_ivectors, int32 online_ivector_period, std::vector *tasks) { int32 f = opts.frame_subsampling_factor, @@ -704,7 +819,7 @@ void AddOnlineIvectorsToTasks( static void SplitInputToTasks(const NnetBatchComputerOptions &opts, int32 nnet_left_context, int32 nnet_right_context, - const Matrix &input, + const CuMatrix &input, std::vector *tasks) { int32 num_input_frames = input.NumRows(), f = opts.frame_subsampling_factor, @@ -716,6 +831,7 @@ static void SplitInputToTasks(const NnetBatchComputerOptions &opts, opts.extra_right_context : opts.extra_right_context_final), num_tasks = tasks->size(); + for (int32 i = 0; i < num_tasks; i++) { NnetInferenceTask &task = (*tasks)[i]; // begin_output_t and end_output_t are the subsampled frame indexes at @@ -755,27 +871,50 @@ static void SplitInputToTasks(const NnetBatchComputerOptions &opts, task.input.Resize(end_input_t_padded - begin_input_t_padded, input.NumCols(), kUndefined); - // the 't' value below is in the numbering of 'input'. - for (int32 t = begin_input_t_padded; t < end_input_t_padded; t++) { - int32 t_clipped = t; - if (t_clipped < 0) t_clipped = 0; - if (t_clipped >= num_input_frames) t_clipped = num_input_frames - 1; - SubVector dest(task.input, - t - begin_input_t_padded), - src(input, t_clipped); - dest.CopyFromVec(src); - } + + // Copy from intput into task input with clamping + task.input.CopyRangeFromMatClamped(input, begin_input_t_padded, + end_input_t_padded, 0, num_input_frames-1); } } } // namespace utterance_splitting - void NnetBatchComputer::SplitUtteranceIntoTasks( bool output_to_cpu, const Matrix &input, - const Vector *ivector, - const Matrix *online_ivectors, + const Vector *h_ivector, + const Matrix *h_online_ivectors, + int32 online_ivector_period, + std::vector *tasks) { + + // Inputs are expected to be in device memory. 
+ // create temporary device arrays and copy + // inputs into them + CuMatrix cu_input(input); + CuVector cu_ivector, *ivector = NULL; + CuMatrix cu_online_ivectors, *online_ivectors = NULL; + + if (h_ivector!=NULL) { + cu_ivector.Resize(h_ivector->Dim(), kUndefined); + cu_ivector.CopyFromVec(*h_ivector); + ivector = &cu_ivector; + } + if (h_online_ivectors!=NULL) { + cu_online_ivectors.Resize(h_online_ivectors->NumRows(), h_online_ivectors->NumCols(), kUndefined); + cu_online_ivectors.CopyFromMat(*h_online_ivectors); + online_ivectors = &cu_online_ivectors; + } + + SplitUtteranceIntoTasks(output_to_cpu, cu_input, ivector, + online_ivectors, online_ivector_period, tasks); +} + +void NnetBatchComputer::SplitUtteranceIntoTasks( + bool output_to_cpu, + const CuMatrix &input, + const CuVector *ivector, + const CuMatrix *online_ivectors, int32 online_ivector_period, std::vector *tasks) { using namespace utterance_splitting; @@ -811,10 +950,47 @@ void NnetBatchComputer::SplitUtteranceIntoTasks( SplitInputToTasks(opts_, nnet_left_context_, nnet_right_context_, input, tasks); + if (ivector != NULL) { KALDI_ASSERT(online_ivectors == NULL); - for (size_t i = 0; i < tasks->size(); i++) - (*tasks)[i].ivector = *ivector; + +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + int32_t num_tasks = tasks->size(); + + std::vector inputs(num_tasks); + std::vector outputs(num_tasks); + std::vector ldi(num_tasks), ldo(num_tasks); + std::vector num_rows(num_tasks), num_cols(num_tasks); + + int b=0; // batch counter + + for (size_t i = 0; i < tasks->size(); i++) { + CuVector &output_vec = (*tasks)[i].ivector; + const CuVector &input_vec = *ivector; + + output_vec.Resize(input_vec.Dim(), kUndefined); + + // create matrix batch description arrays + num_rows[b] = 1; + num_cols[b] = output_vec.Dim(); + outputs[b] = output_vec.Data(); + inputs[b] = input_vec.Data(); + ldo[b] = 0; + ldi[b] = 0; + b++; // increase batch count + } + + // execute batched copy + cuda_batched_copy_mats(b, &num_rows[0], &num_cols[0], &inputs[0], &ldi[0], + &outputs[0], &ldo[0]); + } else +#endif + { + for (size_t i = 0; i < tasks->size(); i++) + (*tasks)[i].ivector = *ivector; + } + } else if (online_ivectors != NULL) { AddOnlineIvectorsToTasks(opts_, *online_ivectors, online_ivector_period, tasks); @@ -863,6 +1039,85 @@ void MergeTaskOutput( } KALDI_ASSERT(cur_output_frame == num_output_frames); } +void MergeTaskOutput( + const std::vector &tasks, + CuMatrix *output) { + int32 num_tasks = tasks.size(), + num_output_frames = 0, + output_dim = -1; + for (int32 i = 0; i < num_tasks; i++) { + const NnetInferenceTask &task = tasks[i]; + num_output_frames += task.num_used_output_frames; + if (i == 0) { + output_dim = (task.output_to_cpu ? 
+ task.output_cpu.NumCols() : + task.output.NumCols()); + } + } + KALDI_ASSERT(num_output_frames != 0 && output_dim != 0); + int32 cur_output_frame = 0; + output->Resize(num_output_frames, output_dim, kUndefined); + +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + + std::vector inputs(num_tasks); + std::vector outputs(num_tasks); + std::vector ldi(num_tasks), ldo(num_tasks); + std::vector num_rows(num_tasks), num_cols(num_tasks); + + int b=0; // batch counter + for (int32 i = 0; i < num_tasks; i++) { + const NnetInferenceTask &task = tasks[i]; + int32 skip = task.num_initial_unused_output_frames, + num_used = task.num_used_output_frames; + KALDI_ASSERT(cur_output_frame == task.first_used_output_frame_index); + if (task.output_to_cpu) { + output->RowRange(cur_output_frame, num_used).CopyFromMat( + task.output_cpu.RowRange(skip, num_used)); + } else { + CuSubMatrix output_mat = + output->RowRange(cur_output_frame, num_used); + const CuSubMatrix input_mat = + task.output.RowRange(skip, num_used); + + // create matrix batch description arrays + num_rows[b] = output_mat.NumRows(); + num_cols[b] = output_mat.NumCols(); + outputs[b] = output_mat.Data(); + inputs[b] = input_mat.Data(); + ldo[b] = output_mat.Stride(); + ldi[b] = input_mat.Stride(); + b++; // increase batch count + } + cur_output_frame += num_used; + } + + // execute batched copy + cuda_batched_copy_mats(b, &num_rows[0], &num_cols[0], &inputs[0], &ldi[0], + &outputs[0], &ldo[0]); + + } else +#endif + { + for (int32 i = 0; i < num_tasks; i++) { + const NnetInferenceTask &task = tasks[i]; + int32 skip = task.num_initial_unused_output_frames, + num_used = task.num_used_output_frames; + KALDI_ASSERT(cur_output_frame == task.first_used_output_frame_index); + if (task.output_to_cpu) { + output->RowRange(cur_output_frame, num_used).CopyFromMat( + task.output_cpu.RowRange(skip, num_used)); + } else { + output->RowRange(cur_output_frame, num_used).CopyFromMat( + task.output.RowRange(skip, num_used)); + } + cur_output_frame += num_used; + } + } + + KALDI_ASSERT(cur_output_frame == num_output_frames); +} NnetBatchInference::NnetBatchInference( diff --git a/src/nnet3/nnet-batch-compute.h b/src/nnet3/nnet-batch-compute.h index 9861a28976c..a29973761d9 100644 --- a/src/nnet3/nnet-batch-compute.h +++ b/src/nnet3/nnet-batch-compute.h @@ -60,7 +60,7 @@ struct NnetInferenceTask { // the lowest t value was originally nonzero in the 'natural' numbering, this // just means we conceptually shift the 't' values; the only real constraint // is that the 't' values are contiguous. - Matrix input; + CuMatrix input; // The index of the first output frame (in the shifted numbering where the // first output frame is numbered zero. This will typically be less than one, @@ -113,7 +113,7 @@ struct NnetInferenceTask { bool is_irregular; // The i-vector for this chunk, if this network accepts i-vector inputs. - Vector ivector; + CuVector ivector; // A priority (higher is more urgent); may be either sign. May be updated // after this object is provided to class NnetBatchComputer. 
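Both MergeTaskOutput() overloads rely on the same bookkeeping: each task contributes rows [num_initial_unused_output_frames, num_initial_unused_output_frames + num_used_output_frames) of its own output, and those rows must land at first_used_output_frame_index of the merged matrix, so consecutive tasks tile the utterance contiguously; that is what the KALDI_ASSERT checks. A small stand-alone sketch of that invariant, with a stripped-down stand-in for NnetInferenceTask (names of the struct and helper are illustrative):

```c++
#include <cassert>
#include <cstddef>
#include <vector>

// Minimal stand-in for the three frame-bookkeeping fields used when merging.
struct TaskFrameInfo {
  int first_used_output_frame_index;     // row offset in the merged output
  int num_initial_unused_output_frames;  // rows to skip at the task's start
  int num_used_output_frames;            // rows this task contributes
};

// Returns the number of rows the merged matrix needs, while checking that the
// tasks cover the utterance contiguously and in order -- the same invariant
// the real MergeTaskOutput() asserts for every task.
int TotalMergedFrames(const std::vector<TaskFrameInfo> &tasks) {
  int cur_output_frame = 0;
  for (size_t i = 0; i < tasks.size(); i++) {
    assert(tasks[i].first_used_output_frame_index == cur_output_frame);
    cur_output_frame += tasks[i].num_used_output_frames;
  }
  return cur_output_frame;
}
```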
@@ -193,6 +193,9 @@ struct NnetBatchComputerOptions: public NnetSimpleComputationOptions { void MergeTaskOutput( const std::vector &tasks, Matrix *output); +void MergeTaskOutput( + const std::vector &tasks, + CuMatrix *output); /** This class does neural net inference in a way that is optimized for GPU use: @@ -266,6 +269,13 @@ class NnetBatchComputer { const Matrix *online_ivectors, int32 online_ivector_period, std::vector *tasks); + void SplitUtteranceIntoTasks( + bool output_to_cpu, + const CuMatrix &input, + const CuVector *ivector, + const CuMatrix *online_ivectors, + int32 online_ivector_period, + std::vector *tasks); const NnetBatchComputerOptions &GetOptions() { return opts_; } diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index a798cb597f5..d9562887817 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -33,6 +33,7 @@ NnetChainTrainer::NnetChainTrainer(const NnetChainTrainingOptions &opts, compiler_(*nnet, opts_.nnet_config.optimize_config, opts_.nnet_config.compiler_config), num_minibatches_processed_(0), + max_change_stats_(*nnet), srand_seed_(RandInt(0, 100000)) { if (opts.nnet_config.zero_component_stats) ZeroComponentStats(nnet); @@ -41,9 +42,6 @@ NnetChainTrainer::NnetChainTrainer(const NnetChainTrainingOptions &opts, opts.nnet_config.backstitch_training_interval > 0); delta_nnet_ = nnet_->Copy(); ScaleNnet(0.0, delta_nnet_); - const int32 num_updatable = NumUpdatableComponents(*delta_nnet_); - num_max_change_per_component_applied_.resize(num_updatable, 0); - num_max_change_global_applied_ = 0; if (opts.nnet_config.read_cache != "") { bool binary; @@ -111,17 +109,19 @@ void NnetChainTrainer::TrainInternal(const NnetChainExample &eg, this->ProcessOutputs(false, eg, &computer); computer.Run(); - // If relevant, add in the part of the gradient that comes from L2 - // regularization. + // If relevant, add in the part of the gradient that comes from + // parameter-level L2 regularization. ApplyL2Regularization(*nnet_, GetNumNvalues(eg.inputs, false) * nnet_config.l2_regularize_factor, delta_nnet_); // Updates the parameters of nnet - bool success = UpdateNnetWithMaxChange(*delta_nnet_, - nnet_config.max_param_change, 1.0, 1.0 - nnet_config.momentum, nnet_, - &num_max_change_per_component_applied_, &num_max_change_global_applied_); + bool success = UpdateNnetWithMaxChange( + *delta_nnet_, + nnet_config.max_param_change, + 1.0, 1.0 - nnet_config.momentum, nnet_, + &max_change_stats_); // Scale down the batchnorm stats (keeps them fresh... this affects what // happens when we use the model with batchnorm test-mode set). 
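For reference, the quantity that max_change_stats_ is now counting: roughly speaking, UpdateNnetWithMaxChange() caps the 2-norm of each updatable component's parameter change (and of the global change), and the stats record how often that cap actually kicked in, which MaxChangeStats::Print() later reports as a percentage of minibatches. A simplified sketch of the per-component rule; this is a paraphrase of the behaviour, not Kaldi's implementation, and the function name is illustrative:

```c++
typedef float BaseFloat;  // assumption: single-precision build

// 'delta_norm' is the 2-norm of one component's proposed parameter change
// (already including the overall scale); 'max_change' is that component's
// limit.  Returns the factor the change is multiplied by, and counts
// enforcement so it can later be printed as "enforced X% of the time".
BaseFloat ApplyComponentMaxChange(BaseFloat delta_norm, BaseFloat max_change,
                                  int *num_times_enforced) {
  if (max_change <= 0.0 || delta_norm <= max_change)
    return 1.0;                    // within the limit: apply unchanged
  ++*num_times_enforced;           // limited: remember it for the stats
  return max_change / delta_norm;  // shrink the change so its norm == max_change
}
```

Note that the new wrapper added later in this patch increments num_minibatches_processed on every UpdateNnetWithMaxChange() call, which is presumably why the backstitch correction factor in the old PrintMaxChangeStats() denominator is no longer needed.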
@@ -176,9 +176,10 @@ void NnetChainTrainer::TrainInternalBackstitch(const NnetChainExample &eg, } // Updates the parameters of nnet - UpdateNnetWithMaxChange(*delta_nnet_, - nnet_config.max_param_change, max_change_scale, scale_adding, nnet_, - &num_max_change_per_component_applied_, &num_max_change_global_applied_); + UpdateNnetWithMaxChange( + *delta_nnet_, nnet_config.max_param_change, + max_change_scale, scale_adding, nnet_, + &max_change_stats_); if (is_backstitch_step1) { // The following will only do something if we have a LinearComponent or @@ -276,41 +277,10 @@ bool NnetChainTrainer::PrintTotalStats() const { const ObjectiveFunctionInfo &info = iter->second; ans = info.PrintTotalStats(name) || ans; } - PrintMaxChangeStats(); + max_change_stats_.Print(*nnet_); return ans; } -void NnetChainTrainer::PrintMaxChangeStats() const { - KALDI_ASSERT(delta_nnet_ != NULL); - const NnetTrainerOptions &nnet_config = opts_.nnet_config; - int32 i = 0; - for (int32 c = 0; c < delta_nnet_->NumComponents(); c++) { - Component *comp = delta_nnet_->GetComponent(c); - if (comp->Properties() & kUpdatableComponent) { - UpdatableComponent *uc = dynamic_cast(comp); - if (uc == NULL) - KALDI_ERR << "Updatable component does not inherit from class " - << "UpdatableComponent; change this code."; - if (num_max_change_per_component_applied_[i] > 0) - KALDI_LOG << "For " << delta_nnet_->GetComponentName(c) - << ", per-component max-change was enforced " - << (100.0 * num_max_change_per_component_applied_[i]) / - (num_minibatches_processed_ * - (nnet_config.backstitch_training_scale == 0.0 ? 1.0 : - 1.0 + 1.0 / nnet_config.backstitch_training_interval)) - << " \% of the time."; - i++; - } - } - if (num_max_change_global_applied_ > 0) - KALDI_LOG << "The global max-change was enforced " - << (100.0 * num_max_change_global_applied_) / - (num_minibatches_processed_ * - (nnet_config.backstitch_training_scale == 0.0 ? 1.0 : - 1.0 + 1.0 / nnet_config.backstitch_training_interval)) - << " \% of the time."; -} - NnetChainTrainer::~NnetChainTrainer() { if (opts_.nnet_config.write_cache != "") { Output ko(opts_.nnet_config.write_cache, opts_.nnet_config.binary_write_cache); diff --git a/src/nnet3/nnet-chain-training.h b/src/nnet3/nnet-chain-training.h index 5bf6a3f6fce..bc5143491ac 100644 --- a/src/nnet3/nnet-chain-training.h +++ b/src/nnet3/nnet-chain-training.h @@ -64,10 +64,6 @@ class NnetChainTrainer { // Prints out the final stats, and return true if there was a nonzero count. bool PrintTotalStats() const; - // Prints out the max-change stats (if nonzero): the percentage of time that - // per-component max-change and global max-change were enforced. - void PrintMaxChangeStats() const; - ~NnetChainTrainer(); private: // The internal function for doing one step of conventional SGD training. @@ -88,11 +84,8 @@ class NnetChainTrainer { chain::DenominatorGraph den_graph_; Nnet *nnet_; - Nnet *delta_nnet_; // Only used if momentum != 0.0 or max-param-change != - // 0.0. nnet representing accumulated parameter-change - // (we'd call this gradient_nnet_, but due to - // natural-gradient update, it's better to consider it as - // a delta-parameter nnet. + Nnet *delta_nnet_; // stores the change to the parameters on each training + // iteration. CachingOptimizingCompiler compiler_; // This code supports multiple output layers, even though in the @@ -101,8 +94,7 @@ class NnetChainTrainer { int32 num_minibatches_processed_; // stats for max-change. 
- std::vector num_max_change_per_component_applied_; - int32 num_max_change_global_applied_; + MaxChangeStats max_change_stats_; unordered_map objf_info_; diff --git a/src/nnet3/nnet-computation.h b/src/nnet3/nnet-computation.h index 97d8b9045ea..a3571eeb532 100644 --- a/src/nnet3/nnet-computation.h +++ b/src/nnet3/nnet-computation.h @@ -514,17 +514,22 @@ struct NnetComputation { NnetComputation(): need_model_derivative(false) { } }; - - - -// This operator is to print out the NnetComputation in a human-readable way, for -// debugging purposes. -// We don't give Read and Write functions to struct NnetComputation, because we -// don't anticipate needing to write it to disk. -std::ostream &operator << (std::ostream &os, - NnetComputation &computation); - - +// A helper class equipped with the stream insertion operator<< to print out +// the NnetComputation in a human-readable way, with NnetComputation::Print(), +// for debugging purposes, e.g.: +// KALDI_VLOG(3) << NnetComputationPrintInserter{mycomputation, mynet}; +struct NnetComputationPrintInserter { + const NnetComputation& computation; + const Nnet& nnet; + void Print(std::ostream& os) const { + computation.Print(os, nnet); + } + friend inline std::ostream &operator <<(std::ostream &os, + NnetComputationPrintInserter xhis) { + xhis.Print(os); + return os; + } +}; } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-compute.cc b/src/nnet3/nnet-compute.cc index 7ee7d7df717..b5052c71759 100644 --- a/src/nnet3/nnet-compute.cc +++ b/src/nnet3/nnet-compute.cc @@ -491,8 +491,10 @@ void NnetComputer::GetPointers(int32 indexes_multi_index, for (int32 i = 0; i < size; i += 30 + RandInt(0, 9)) { // Do a pseudo-random spot check that the row-indexes are not out of range. int32 submatrix_index = pairs[i].first, row = pairs[i].second; - CuSubMatrix m = GetSubMatrix(submatrix_index); - KALDI_ASSERT(row >= 0 && row < m.NumRows() && num_cols == m.NumCols()); + if (submatrix_index != -1) { + CuSubMatrix m = GetSubMatrix(submatrix_index); + KALDI_ASSERT(row >= 0 && row < m.NumRows() && num_cols == m.NumCols()); + } } #endif pointers->CopyFromVec(vec); diff --git a/src/nnet3/nnet-example-utils.cc b/src/nnet3/nnet-example-utils.cc index cc5fe3cc050..15004092eaa 100644 --- a/src/nnet3/nnet-example-utils.cc +++ b/src/nnet3/nnet-example-utils.cc @@ -214,8 +214,8 @@ void GetComputationRequest(const Nnet &nnet, const NnetIo &io = eg.io[i]; const std::string &name = io.name; int32 node_index = nnet.GetNodeIndex(name); - if (node_index == -1 && - !nnet.IsInputNode(node_index) && !nnet.IsOutputNode(node_index)) + if (node_index == -1 || + (!nnet.IsInputNode(node_index) && !nnet.IsOutputNode(node_index))) KALDI_ERR << "Nnet example has input or output named '" << name << "', but no such input or output node is in the network."; diff --git a/src/nnet3/nnet-parse-test.cc b/src/nnet3/nnet-parse-test.cc index babdbbdcb0e..5ae4917dba6 100644 --- a/src/nnet3/nnet-parse-test.cc +++ b/src/nnet3/nnet-parse-test.cc @@ -23,193 +23,6 @@ namespace kaldi { namespace nnet3 { -void UnitTestConfigLineParse() { - std::string str; - { - ConfigLine cfl; - str = "a-b xx=yyy foo=bar baz=123 ba=1:2"; - bool status = cfl.ParseLine(str); - KALDI_ASSERT(status && cfl.FirstToken() == "a-b"); - - KALDI_ASSERT(cfl.HasUnusedValues()); - std::string str_value; - KALDI_ASSERT(cfl.GetValue("xx", &str_value)); - KALDI_ASSERT(str_value == "yyy"); - KALDI_ASSERT(cfl.HasUnusedValues()); - KALDI_ASSERT(cfl.GetValue("foo", &str_value)); - KALDI_ASSERT(str_value == "bar"); - 
KALDI_ASSERT(cfl.HasUnusedValues()); - KALDI_ASSERT(!cfl.GetValue("xy", &str_value)); - KALDI_ASSERT(cfl.GetValue("baz", &str_value)); - KALDI_ASSERT(str_value == "123"); - - std::vector int_values; - KALDI_ASSERT(!cfl.GetValue("xx", &int_values)); - KALDI_ASSERT(cfl.GetValue("baz", &int_values)); - KALDI_ASSERT(cfl.HasUnusedValues()); - KALDI_ASSERT(int_values.size() == 1 && int_values[0] == 123); - KALDI_ASSERT(cfl.GetValue("ba", &int_values)); - KALDI_ASSERT(int_values.size() == 2 && int_values[0] == 1 && int_values[1] == 2); - KALDI_ASSERT(!cfl.HasUnusedValues()); - } - - { - ConfigLine cfl; - str = "a-b baz=x y z pp = qq ab =cd ac= bd"; - KALDI_ASSERT(!cfl.ParseLine(str)); - } - { - ConfigLine cfl; - str = "a-b baz=x y z pp = qq ab=cd ac=bd"; - KALDI_ASSERT(!cfl.ParseLine(str)); - } - { - ConfigLine cfl; - str = "foo-bar"; - KALDI_ASSERT(cfl.ParseLine(str)); - } - { - ConfigLine cfl; - str = "foo-bar a=b c d f=g"; - std::string value; - KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == "foo-bar" && - cfl.GetValue("a", &value) && value == "b c d" && - cfl.GetValue("f", &value) && value == "g" && - !cfl.HasUnusedValues()); - } - { - ConfigLine cfl; - str = "zzz a=b baz"; - KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == "zzz" && - cfl.UnusedValues() == "a=b baz"); - } - { - ConfigLine cfl; - str = "xxx a=b baz "; - KALDI_ASSERT(cfl.ParseLine(str) && cfl.UnusedValues() == "a=b baz"); - } - { - ConfigLine cfl; - str = "xxx a=b =c"; - KALDI_ASSERT(!cfl.ParseLine(str)); - } - { - ConfigLine cfl; - str = "xxx baz='x y z' pp=qq ab=cd ac=bd"; - KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == "xxx"); - std::string str_value; - KALDI_ASSERT(cfl.GetValue("baz", &str_value)); - KALDI_ASSERT(str_value == "x y z"); - KALDI_ASSERT(cfl.GetValue("pp", &str_value)); - KALDI_ASSERT(str_value == "qq"); - KALDI_ASSERT(cfl.UnusedValues() == "ab=cd ac=bd"); - KALDI_ASSERT(cfl.GetValue("ab", &str_value)); - KALDI_ASSERT(str_value == "cd"); - KALDI_ASSERT(cfl.UnusedValues() == "ac=bd"); - KALDI_ASSERT(cfl.HasUnusedValues()); - KALDI_ASSERT(cfl.GetValue("ac", &str_value)); - KALDI_ASSERT(str_value == "bd"); - KALDI_ASSERT(!cfl.HasUnusedValues()); - } - - { - ConfigLine cfl; - str = "x baz= pp = qq flag=t "; - KALDI_ASSERT(!cfl.ParseLine(str)); - } - { - ConfigLine cfl; - str = " x baz= pp=qq flag=t "; - KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == "x"); - - std::string str_value; - KALDI_ASSERT(cfl.GetValue("baz", &str_value)); - KALDI_ASSERT(str_value == ""); - KALDI_ASSERT(cfl.GetValue("pp", &str_value)); - KALDI_ASSERT(str_value == "qq"); - KALDI_ASSERT(cfl.HasUnusedValues()); - KALDI_ASSERT(cfl.GetValue("flag", &str_value)); - KALDI_ASSERT(str_value == "t"); - KALDI_ASSERT(!cfl.HasUnusedValues()); - - bool bool_value = false; - KALDI_ASSERT(cfl.GetValue("flag", &bool_value)); - KALDI_ASSERT(bool_value); - } - - { - ConfigLine cfl; - str = "xx _baz=a -pp=qq"; - KALDI_ASSERT(!cfl.ParseLine(str)); - } - { - ConfigLine cfl; - str = "xx 0baz=a pp=qq"; - KALDI_ASSERT(!cfl.ParseLine(str)); - } - { - ConfigLine cfl; - str = "xx -baz=a pp=qq"; - KALDI_ASSERT(!cfl.ParseLine(str)); - } - { - ConfigLine cfl; - str = "xx _baz'=a pp=qq"; - KALDI_ASSERT(!cfl.ParseLine(str)); - } - { - ConfigLine cfl; - str = " baz=g"; - KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == ""); - bool flag; - KALDI_ASSERT(!cfl.GetValue("baz", &flag)); - } - { - ConfigLine cfl; - str = "xx _baz1=a pp=qq"; - KALDI_ASSERT(cfl.ParseLine(str)); - - std::string str_value; - KALDI_ASSERT(cfl.GetValue("_baz1", 
&str_value)); - } -} - -void UnitTestReadConfig() { - std::string str = "a-b alpha=aa beta=\"b b\"# String test\n" - "a-b beta2='b c' beta3=bd # \n" - "a-b gamma=1:2:3:4 # Int Vector test\n" - " a-b de1ta=f # Bool + Integer in key Comment test delta=t \n" - "a-b _epsilon=-1 # Int Vector test _epsilon=1 \n" - "a-b zet-_a=0.15 theta=1.1# Float, -, _ test\n" - "a-b quoted='a b c' # quoted string\n" - "a-b quoted2=\"d e 'a b=c' f\" # string quoted with double quotes"; - - std::istringstream is(str); - std::vector lines; - ReadConfigLines(is, &lines); - KALDI_ASSERT(lines.size() == 8); - - ConfigLine cfl; - for (size_t i = 0; i < lines.size(); i++) { - KALDI_ASSERT(cfl.ParseLine(lines[i]) && cfl.FirstToken() == "a-b"); - if (i == 1) { - KALDI_ASSERT(cfl.GetValue("beta2", &str) && str == "b c"); - } - if (i == 4) { - KALDI_ASSERT(cfl.GetValue("_epsilon", &str) && str == "-1"); - } - if (i == 5) { - BaseFloat float_val = 0; - KALDI_ASSERT(cfl.GetValue("zet-_a", &float_val) && ApproxEqual(float_val, 0.15)); - } - if (i == 6) { - KALDI_ASSERT(cfl.GetValue("quoted", &str) && str == "a b c"); - } - if (i == 7) { - KALDI_ASSERT(cfl.GetValue("quoted2", &str) && str == "d e 'a b=c' f"); - } - } -} void UnitTestDescriptorTokenize() { std::vector lines; @@ -281,8 +94,6 @@ int main() { using namespace kaldi; using namespace kaldi::nnet3; - UnitTestConfigLineParse(); - UnitTestReadConfig(); UnitTestDescriptorTokenize(); UnitTestSummarizeVector(); UnitTestNameMatchesPattern(); diff --git a/src/nnet3/nnet-parse.cc b/src/nnet3/nnet-parse.cc index a51bba21484..17dec23e7c1 100644 --- a/src/nnet3/nnet-parse.cc +++ b/src/nnet3/nnet-parse.cc @@ -27,353 +27,6 @@ namespace kaldi { namespace nnet3 { - -bool ConfigLine::ParseLine(const std::string &line) { - data_.clear(); - whole_line_ = line; - if (line.size() == 0) return false; // Empty line - size_t pos = 0, size = line.size(); - while (isspace(line[pos]) && pos < size) pos++; - if (pos == size) - return false; // whitespace-only line - size_t first_token_start_pos = pos; - // first get first_token_. - while (!isspace(line[pos]) && pos < size) { - if (line[pos] == '=') { - // If the first block of non-whitespace looks like "foo-bar=...", - // then we ignore it: there is no initial token, and FirstToken() - // is empty. - pos = first_token_start_pos; - break; - } - pos++; - } - first_token_ = std::string(line, first_token_start_pos, pos - first_token_start_pos); - // first_token_ is expected to be either empty or something like - // "component-node", which actually is a slightly more restrictive set of - // strings than IsValidName() checks for this is a convenient way to check it. - if (!first_token_.empty() && !IsValidName(first_token_)) - return false; - - while (pos < size) { - if (isspace(line[pos])) { - pos++; - continue; - } - - // OK, at this point we know that we are pointing at nonspace. - size_t next_equals_sign = line.find_first_of("=", pos); - if (next_equals_sign == pos || next_equals_sign == std::string::npos) { - // we're looking for something like 'key=value'. If there is no equals sign, - // or it's not preceded by something, it's a parsing failure. - return false; - } - std::string key(line, pos, next_equals_sign - pos); - if (!IsValidName(key)) return false; - - // handle any quotes. we support key='blah blah' or key="foo bar". - // no escaping is supported. 
- if (line[next_equals_sign+1] == '\'' || line[next_equals_sign+1] == '"') { - char my_quote = line[next_equals_sign+1]; - size_t next_quote = line.find_first_of(my_quote, next_equals_sign + 2); - if (next_quote == std::string::npos) { // no matching quote was found. - KALDI_WARN << "No matching quote for " << my_quote << " in config line '" - << line << "'"; - return false; - } else { - std::string value(line, next_equals_sign + 2, - next_quote - next_equals_sign - 2); - data_.insert(std::make_pair(key, std::make_pair(value, false))); - pos = next_quote + 1; - continue; - } - } else { - // we want to be able to parse something like "... input=Offset(a, -1) foo=bar": - // in general, config values with spaces in them, even without quoting. - - size_t next_next_equals_sign = line.find_first_of("=", next_equals_sign + 1), - terminating_space = size; - - if (next_next_equals_sign != std::string::npos) { // found a later equals sign. - size_t preceding_space = line.find_last_of(" \t", next_next_equals_sign); - if (preceding_space != std::string::npos && - preceding_space > next_equals_sign) - terminating_space = preceding_space; - } - while (isspace(line[terminating_space - 1]) && terminating_space > 0) - terminating_space--; - - std::string value(line, next_equals_sign + 1, - terminating_space - (next_equals_sign + 1)); - data_.insert(std::make_pair(key, std::make_pair(value, false))); - pos = terminating_space; - } - } - return true; -} - -bool ConfigLine::GetValue(const std::string &key, std::string *value) { - KALDI_ASSERT(value != NULL); - std::map >::iterator it = data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - *value = (it->second).first; - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::GetValue(const std::string &key, BaseFloat *value) { - KALDI_ASSERT(value != NULL); - std::map >::iterator it = data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - if (!ConvertStringToReal((it->second).first, value)) - return false; - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::GetValue(const std::string &key, int32 *value) { - KALDI_ASSERT(value != NULL); - std::map >::iterator it = data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - if (!ConvertStringToInteger((it->second).first, value)) - return false; - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::GetValue(const std::string &key, std::vector *value) { - KALDI_ASSERT(value != NULL); - value->clear(); - std::map >::iterator it = data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - if (!SplitStringToIntegers((it->second).first, ":,", true, value)) { - // KALDI_WARN << "Bad option " << (it->second).first; - return false; - } - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::GetValue(const std::string &key, bool *value) { - KALDI_ASSERT(value != NULL); - std::map >::iterator it = data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - if ((it->second).first.size() == 0) return false; - switch (((it->second).first)[0]) { - case 'F': - case 'f': - *value = false; - break; - case 'T': - case 't': - *value = true; - break; - default: - return false; - } - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::HasUnusedValues() const { - std::map >::const_iterator it = data_.begin(); - for (; it != data_.end(); ++it) { - 
if (!(it->second).second) return true; - } - return false; -} - -std::string ConfigLine::UnusedValues() const { - std::string unused_str; - std::map >::const_iterator it = data_.begin(); - for (; it != data_.end(); ++it) { - if (!(it->second).second) { - if (unused_str == "") - unused_str = it->first + "=" + (it->second).first; - else - unused_str += " " + it->first + "=" + (it->second).first; - } - } - return unused_str; -} - -// This is like ExpectToken but for two tokens, and it -// will either accept token1 and then token2, or just token2. -// This is useful in Read functions where the first token -// may already have been consumed. -void ExpectOneOrTwoTokens(std::istream &is, bool binary, - const std::string &token1, - const std::string &token2) { - KALDI_ASSERT(token1 != token2); - std::string temp; - ReadToken(is, binary, &temp); - if (temp == token1) { - ExpectToken(is, binary, token2); - } else { - if (temp != token2) { - KALDI_ERR << "Expecting token " << token1 << " or " << token2 - << " but got " << temp; - } - } -} - -// static -bool ParseFromString(const std::string &name, std::string *string, - int32 *param) { - std::vector split_string; - SplitStringToVector(*string, " \t", true, - &split_string); - std::string name_equals = name + "="; // the name and then the equals sign. - size_t len = name_equals.length(); - - for (size_t i = 0; i < split_string.size(); i++) { - if (split_string[i].compare(0, len, name_equals) == 0) { - if (!ConvertStringToInteger(split_string[i].substr(len), param)) - KALDI_ERR << "Bad option " << split_string[i]; - *string = ""; - // Set "string" to all the pieces but the one we used. - for (size_t j = 0; j < split_string.size(); j++) { - if (j != i) { - if (!string->empty()) *string += " "; - *string += split_string[j]; - } - } - return true; - } - } - return false; -} - -bool ParseFromString(const std::string &name, std::string *string, - bool *param) { - std::vector split_string; - SplitStringToVector(*string, " \t", true, - &split_string); - std::string name_equals = name + "="; // the name and then the equals sign. - size_t len = name_equals.length(); - - for (size_t i = 0; i < split_string.size(); i++) { - if (split_string[i].compare(0, len, name_equals) == 0) { - std::string b = split_string[i].substr(len); - if (b.empty()) - KALDI_ERR << "Bad option " << split_string[i]; - if (b[0] == 'f' || b[0] == 'F') *param = false; - else if (b[0] == 't' || b[0] == 'T') *param = true; - else - KALDI_ERR << "Bad option " << split_string[i]; - *string = ""; - // Set "string" to all the pieces but the one we used. - for (size_t j = 0; j < split_string.size(); j++) { - if (j != i) { - if (!string->empty()) *string += " "; - *string += split_string[j]; - } - } - return true; - } - } - return false; -} - -bool ParseFromString(const std::string &name, std::string *string, - BaseFloat *param) { - std::vector split_string; - SplitStringToVector(*string, " \t", true, - &split_string); - std::string name_equals = name + "="; // the name and then the equals sign. - size_t len = name_equals.length(); - - for (size_t i = 0; i < split_string.size(); i++) { - if (split_string[i].compare(0, len, name_equals) == 0) { - if (!ConvertStringToReal(split_string[i].substr(len), param)) - KALDI_ERR << "Bad option " << split_string[i]; - *string = ""; - // Set "string" to all the pieces but the one we used. 
- for (size_t j = 0; j < split_string.size(); j++) { - if (j != i) { - if (!string->empty()) *string += " "; - *string += split_string[j]; - } - } - return true; - } - } - return false; -} - -bool ParseFromString(const std::string &name, std::string *string, - std::string *param) { - std::vector split_string; - SplitStringToVector(*string, " \t", true, - &split_string); - std::string name_equals = name + "="; // the name and then the equals sign. - size_t len = name_equals.length(); - - for (size_t i = 0; i < split_string.size(); i++) { - if (split_string[i].compare(0, len, name_equals) == 0) { - *param = split_string[i].substr(len); - - // Set "string" to all the pieces but the one we used. - *string = ""; - for (size_t j = 0; j < split_string.size(); j++) { - if (j != i) { - if (!string->empty()) *string += " "; - *string += split_string[j]; - } - } - return true; - } - } - return false; -} - -bool ParseFromString(const std::string &name, std::string *string, - std::vector *param) { - std::vector split_string; - SplitStringToVector(*string, " \t", true, - &split_string); - std::string name_equals = name + "="; // the name and then the equals sign. - size_t len = name_equals.length(); - - for (size_t i = 0; i < split_string.size(); i++) { - if (split_string[i].compare(0, len, name_equals) == 0) { - if (!SplitStringToIntegers(split_string[i].substr(len), ":,", - false, param)) - KALDI_ERR << "Bad option " << split_string[i]; - *string = ""; - // Set "string" to all the pieces but the one we used. - for (size_t j = 0; j < split_string.size(); j++) { - if (j != i) { - if (!string->empty()) *string += " "; - *string += split_string[j]; - } - } - return true; - } - } - return false; -} - bool DescriptorTokenize(const std::string &input, std::vector *tokens) { KALDI_ASSERT(tokens != NULL); @@ -422,32 +75,6 @@ bool DescriptorTokenize(const std::string &input, return true; } -bool IsValidName(const std::string &name) { - if (name.size() == 0) return false; - for (size_t i = 0; i < name.size(); i++) { - if (i == 0 && !isalpha(name[i]) && name[i] != '_') - return false; - if (!isalnum(name[i]) && name[i] != '_' && name[i] != '-' && name[i] != '.') - return false; - } - return true; -} - -void ReadConfigLines(std::istream &is, - std::vector *lines) { - KALDI_ASSERT(lines != NULL); - std::string line; - while (std::getline(is, line)) { - if (line.size() == 0) continue; - size_t start = line.find_first_not_of(" \t"); - size_t end = line.find_first_of('#'); - if (start == std::string::npos || start == end) continue; - end = line.find_last_not_of(" \t", end - 1); - KALDI_ASSERT(end >= start); - lines->push_back(line.substr(start, end - start + 1)); - } -} - std::string ErrorContext(std::istream &is) { if (!is.good()) return "end of line"; char buf[21]; diff --git a/src/nnet3/nnet-parse.h b/src/nnet3/nnet-parse.h index a073a54f7e0..0fc19d51f6c 100644 --- a/src/nnet3/nnet-parse.h +++ b/src/nnet3/nnet-parse.h @@ -26,103 +26,6 @@ namespace kaldi { namespace nnet3 { -/** - This class is responsible for parsing input like - hi-there xx=yyy a=b c empty= f-oo=Append(bar, sss) ba_z=123 bing='a b c' baz="a b c d='a b' e" - and giving you access to the fields, in this case - - FirstToken() == "hi-there", and key->value pairs: - - xx->yyy, a->"b c", empty->"", f-oo->"Append(bar, sss)", ba_z->"123", - bing->"a b c", baz->"a b c d='a b' e" - - The first token is optional, if the line started with a key-value pair then - FirstValue() will be empty. 
- - Note: it can parse value fields with space inside them only if they are free of the '=' - character. If values are going to contain the '=' character, you need to quote them - with either single or double quotes. - - Key values may contain -_a-zA-Z0-9, but must begin with a-zA-Z_. - */ -class ConfigLine { - public: - // Tries to parse the line as a config-file line. Returns false - // if it could not for some reason, e.g. parsing failure. In most cases - // prints no warnings; the user should do this. Does not expect comments. - bool ParseLine(const std::string &line); - - // the GetValue functions are overloaded for various types. They return true - // if the key exists with value that can be converted to that type, and false - // otherwise. They also mark the key-value pair as having been read. It is - // not an error to read values twice. - bool GetValue(const std::string &key, std::string *value); - bool GetValue(const std::string &key, BaseFloat *value); - bool GetValue(const std::string &key, int32 *value); - // Values may be separated by ":" or by ",". - bool GetValue(const std::string &key, std::vector *value); - bool GetValue(const std::string &key, bool *value); - - bool HasUnusedValues() const; - /// returns e.g. foo=bar xxx=yyy if foo and xxx were not consumed by one - /// of the GetValue() functions. - std::string UnusedValues() const; - - const std::string &FirstToken() const { return first_token_; } - - const std::string WholeLine() { return whole_line_; } - // use default assignment operator and copy constructor. - private: - std::string whole_line_; - // the first token of the line, e.g. if line is - // foo-bar baz=bing - // then first_token_ would be "foo-bar". - std::string first_token_; - - // data_ maps from key to (value, is-this-value-consumed?). - std::map > data_; - -}; - -// Note: the ParseFromString functions are to be removed after we switch over to -// using the ConfigLine mechanism. - - -/// \file nnet-parse.h -/// This header contains a few parsing-related functions that are used -/// while reading parsing neural network files and config files. - -/// Function used in Init routines. Suppose name=="foo", if "string" has a -/// field like foo=12, this function will set "param" to 12 and remove that -/// element from "string". It returns true if the parameter was read. -bool ParseFromString(const std::string &name, std::string *string, - int32 *param); - -/// This version of ParseFromString is for parameters of type BaseFloat. -bool ParseFromString(const std::string &name, std::string *string, - BaseFloat *param); - -/// This version of ParseFromString is for parameters of type bool, which can -/// appear as any string beginning with f, F, t or T. -bool ParseFromString(const std::string &name, std::string *string, - bool *param); - -/// This version of ParseFromString is for parsing strings. (these -/// should not contain space). -bool ParseFromString(const std::string &name, std::string *string, - std::string *param); - -/// This version of ParseFromString handles colon-separated or comma-separated -/// lists of integers. -bool ParseFromString(const std::string &name, std::string *string, - std::vector *param); - -/// This function is like ExpectToken but for two tokens, and it will either -/// accept token1 and then token2, or just token2. This is useful in Read -/// functions where the first token may already have been consumed. 
-void ExpectOneOrTwoTokens(std::istream &is, bool binary, - const std::string &token1, - const std::string &token2); /** This function tokenizes input when parsing Descriptor configuration @@ -142,32 +45,6 @@ void ExpectOneOrTwoTokens(std::istream &is, bool binary, bool DescriptorTokenize(const std::string &input, std::vector *tokens); -/// Returns true if 'name' would be a valid name for a component or node in a -/// Nnet. This is a nonempty string beginning with A-Za-z_, and containing only -/// '-', '_', '.', A-Z, a-z, or 0-9. -bool IsValidName(const std::string &name); - - -/** - This function reads in a config file and *appends* its contents to a vector of - lines; it is responsible for removing comments (anything after '#') and - stripping out any lines that contain only whitespace after comment removal. - */ -void ReadConfigLines(std::istream &is, - std::vector *lines); - - -/** - This function converts config-lines from a simple sequence of strings - as output by ReadConfigLines(), into a sequence of first-tokens and - name-value pairs. The general format is: - "command-type bar=baz xx=yyy" - etc., although there are subtleties as to what exactly is allowed, see - documentation for class ConfigLine for details. - This function will die if there was a parsing failure. - */ -void ParseConfigLines(const std::vector &lines, - std::vector *config_lines); /* Returns true if name 'name' matches pattern 'pattern'. The pattern diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index e8c99494b06..53c8d46578b 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -3548,7 +3548,7 @@ void* SoftmaxComponent::Propagate(const ComponentPrecomputedIndexes *indexes, // Apply softmax function to each row of the output... // for that row, we do // x_i = exp(x_i) / sum_j exp(x_j). - out->ApplySoftMaxPerRow(in); + out->SoftMaxPerRow(in); // This floor on the output helps us deal with // almost-zeros in a way that doesn't lead to overflow. @@ -3601,7 +3601,7 @@ void* LogSoftmaxComponent::Propagate(const ComponentPrecomputedIndexes *indexes, CuMatrixBase *out) const { // Applies log softmax function to each row of the output. For each row, we do // x_i = x_i - log(sum_j exp(x_j)) - out->ApplyLogSoftMaxPerRow(in); + out->LogSoftMaxPerRow(in); return NULL; } @@ -4068,13 +4068,13 @@ bool CompositeComponent::IsUpdatable() const { int32 CompositeComponent::InputDim() const { KALDI_ASSERT(!components_.empty()); return components_.front()->InputDim(); -}; +} // virtual int32 CompositeComponent::OutputDim() const { KALDI_ASSERT(!components_.empty()); return components_.back()->OutputDim(); -}; +} // virtual int32 CompositeComponent::Properties() const { @@ -4096,7 +4096,7 @@ int32 CompositeComponent::Properties() const { if (last_component_properties & kStoresStats) ans |= kBackpropNeedsOutput; return ans; -}; +} MatrixStrideType CompositeComponent::GetStrideType(int32 i) const { @@ -4319,7 +4319,7 @@ void CompositeComponent::Backprop(const std::string &debug_info, // optimization; other propagates might also be skippable. 
int32 properties = components_[num_components - 2]->Properties(), next_properties = components_[num_components - 1]->Properties(); - if (!(properties & (kBackpropNeedsOutput || kUsesMemo)) && + if (!(properties & (kBackpropNeedsOutput | kUsesMemo)) && !(next_properties & kBackpropNeedsInput)) { num_components_to_propagate--; } diff --git a/src/nnet3/nnet-training.cc b/src/nnet3/nnet-training.cc index 0acaa5c2008..b4563c7a2c3 100644 --- a/src/nnet3/nnet-training.cc +++ b/src/nnet3/nnet-training.cc @@ -30,6 +30,7 @@ NnetTrainer::NnetTrainer(const NnetTrainerOptions &config, nnet_(nnet), compiler_(*nnet, config_.optimize_config, config_.compiler_config), num_minibatches_processed_(0), + max_change_stats_(*nnet), srand_seed_(RandInt(0, 100000)) { if (config.zero_component_stats) ZeroComponentStats(nnet); @@ -38,9 +39,6 @@ NnetTrainer::NnetTrainer(const NnetTrainerOptions &config, config.backstitch_training_interval > 0); delta_nnet_ = nnet_->Copy(); ScaleNnet(0.0, delta_nnet_); - const int32 num_updatable = NumUpdatableComponents(*delta_nnet_); - num_max_change_per_component_applied_.resize(num_updatable, 0); - num_max_change_global_applied_ = 0; if (config_.read_cache != "") { bool binary; @@ -111,9 +109,9 @@ void NnetTrainer::TrainInternal(const NnetExample &eg, delta_nnet_); // Update the parameters of nnet - bool success = UpdateNnetWithMaxChange(*delta_nnet_, config_.max_param_change, - 1.0, 1.0 - config_.momentum, nnet_, - &num_max_change_per_component_applied_, &num_max_change_global_applied_); + bool success = UpdateNnetWithMaxChange( + *delta_nnet_, config_.max_param_change, + 1.0, 1.0 - config_.momentum, nnet_, &max_change_stats_); // Scale down the batchnorm stats (keeps them fresh... this affects what // happens when we use the model with batchnorm test-mode set). @@ -167,9 +165,10 @@ void NnetTrainer::TrainInternalBackstitch(const NnetExample &eg, } // Updates the parameters of nnet - UpdateNnetWithMaxChange(*delta_nnet_, config_.max_param_change, + UpdateNnetWithMaxChange( + *delta_nnet_, config_.max_param_change, max_change_scale, scale_adding, nnet_, - &num_max_change_per_component_applied_, &num_max_change_global_applied_); + &max_change_stats_); if (is_backstitch_step1) { // The following will only do something if we have a LinearComponent or @@ -236,40 +235,10 @@ bool NnetTrainer::PrintTotalStats() const { bool ok = info.PrintTotalStats(name); ans = ans || ok; } - PrintMaxChangeStats(); + max_change_stats_.Print(*nnet_); return ans; } -void NnetTrainer::PrintMaxChangeStats() const { - KALDI_ASSERT(delta_nnet_ != NULL); - int32 i = 0; - for (int32 c = 0; c < delta_nnet_->NumComponents(); c++) { - Component *comp = delta_nnet_->GetComponent(c); - if (comp->Properties() & kUpdatableComponent) { - UpdatableComponent *uc = dynamic_cast(comp); - if (uc == NULL) - KALDI_ERR << "Updatable component does not inherit from class " - << "UpdatableComponent; change this code."; - if (num_max_change_per_component_applied_[i] > 0) - KALDI_LOG << "For " << delta_nnet_->GetComponentName(c) - << ", per-component max-change was enforced " - << (100.0 * num_max_change_per_component_applied_[i]) / - (num_minibatches_processed_ * - (config_.backstitch_training_scale == 0.0 ? 1.0 : - 1.0 + 1.0 / config_.backstitch_training_interval)) - << " \% of the time."; - i++; - } - } - if (num_max_change_global_applied_ > 0) - KALDI_LOG << "The global max-change was enforced " - << (100.0 * num_max_change_global_applied_) / - (num_minibatches_processed_ * - (config_.backstitch_training_scale == 0.0 ? 
1.0 : - 1.0 + 1.0 / config_.backstitch_training_interval)) - << " \% of the time."; -} - void ObjectiveFunctionInfo::UpdateStats( const std::string &output_name, int32 minibatches_per_phase, diff --git a/src/nnet3/nnet-training.h b/src/nnet3/nnet-training.h index fffc621930a..64ec7abc58e 100644 --- a/src/nnet3/nnet-training.h +++ b/src/nnet3/nnet-training.h @@ -26,6 +26,7 @@ #include "nnet3/nnet-compute.h" #include "nnet3/nnet-optimize.h" #include "nnet3/nnet-example-utils.h" +#include "nnet3/nnet-utils.h" namespace kaldi { namespace nnet3 { @@ -187,10 +188,6 @@ class NnetTrainer { // Prints out the final stats, and return true if there was a nonzero count. bool PrintTotalStats() const; - // Prints out the max-change stats (if nonzero): the percentage of time that - // per-component max-change and global max-change were enforced. - void PrintMaxChangeStats() const; - ~NnetTrainer(); private: // The internal function for doing one step of conventional SGD training. @@ -220,8 +217,7 @@ class NnetTrainer { int32 num_minibatches_processed_; // stats for max-change. - std::vector num_max_change_per_component_applied_; - int32 num_max_change_global_applied_; + MaxChangeStats max_change_stats_; unordered_map objf_info_; diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index e020f8fc6a7..5ab9126f0b5 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -630,13 +630,37 @@ void FindOrphanNodes(const Nnet &nnet, std::vector *nodes) { } +// Parameters used in applying SVD: +// 1. Energy threshold : For each Affine weights layer in the original baseline nnet3 model, +// we perform SVD based factoring of the weights matrix of the layer, +// into a singular values (left diagonal) matrix, and two Eigen matrices. +// +// SVD : Wx = UEV, U,V are Eigen matrices, and E is the singularity matrix) +// +// We take the center matrix E, and consider only the Singular values which contribute +// to (Energy-threshold) times the total Energy of Singularity parameters. +// These Singularity parameters are actually sorted in descending order and lower +// values are pruned out until the Total energy (Sum of squares) of the pruned set +// of parameters is just above (Energy-threshold * Total init energy). The values which +// are pruned away are replaced with 0 in the Singularity matrix +// and the Weights matrix after SVD is derived with shrinked dimensions. +// +// 2. Shrinkage-threshold : If the Shrinkage ratio of the SVD refactored Weights matrix +// is higher than Shrinkage-threshold for any of the Tdnn layers, +// the SVD process is aborted for that particular Affine weights layer. +// + // this class implements the internals of the edit directive 'apply-svd'. 
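To make the two thresholds described above concrete: with energy-threshold, the bottleneck dimension is chosen as the smallest k whose leading squared singular values reach that fraction of the total energy, and the layer is then only refactored if k*(input_dim+output_dim)/(input_dim*output_dim) stays below shrinkage-threshold. For example, singular values (4, 2, 1, 0.5) have squared sum 21.25; with energy-threshold=0.95 the target is about 20.19 and the running sums are 16, 20, 21, so k=3. A stand-alone sketch of that selection rule (plain C++ mirroring the GetReducedDimension() helper below, not the SvdApplier code itself; names here are illustrative):

```c++
#include <cstddef>
#include <vector>

typedef float BaseFloat;  // assumption: single-precision build

// Smallest bottleneck dim whose leading squared singular values reach
// energy_threshold of the total energy; 's' must be sorted in descending
// order (as after SortSvd()).
int ChooseBottleneckDim(const std::vector<BaseFloat> &s,
                        BaseFloat energy_threshold) {
  BaseFloat total = 0.0;
  for (size_t i = 0; i < s.size(); i++)
    total += s[i] * s[i];
  BaseFloat target = energy_threshold * total, partial = 0.0;
  size_t k = 0;
  while (k < s.size()) {
    partial += s[k] * s[k];
    k++;
    if (partial >= target) break;  // enough energy retained
  }
  return static_cast<int>(k);      // e.g. 3 for s = {4, 2, 1, 0.5}, threshold 0.95
}
```

The shrinkage check then compares k*(input_dim+output_dim) against shrinkage-threshold times input_dim*output_dim and skips the layer when the factored form would not actually be small enough.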
class SvdApplier { public: SvdApplier(const std::string component_name_pattern, int32 bottleneck_dim, + BaseFloat energy_threshold, + BaseFloat shrinkage_threshold, Nnet *nnet): nnet_(nnet), bottleneck_dim_(bottleneck_dim), + energy_threshold_(energy_threshold), + shrinkage_threshold_(shrinkage_threshold), component_name_pattern_(component_name_pattern) { } void ApplySvd() { DecomposeComponents(); @@ -673,43 +697,70 @@ class SvdApplier { << " -> " << output_dim; continue; } - size_t n = modified_component_info_.size(); - modification_index_[c] = n; - modified_component_info_.resize(n + 1); - ModifiedComponentInfo &info = modified_component_info_[n]; - info.component_index = c; - info.component_name = component_name; Component *component_a = NULL, *component_b = NULL; - info.component_name_a = component_name + "_a"; - info.component_name_b = component_name + "_b"; - if (nnet_->GetComponentIndex(info.component_name_a) >= 0) - KALDI_ERR << "Neural network already has a component named " - << info.component_name_a; - if (nnet_->GetComponentIndex(info.component_name_b) >= 0) - KALDI_ERR << "Neural network already has a component named " - << info.component_name_b; - DecomposeComponent(component_name, *affine, &component_a, &component_b); - info.component_a_index = nnet_->AddComponent(info.component_name_a, - component_a); - info.component_b_index = nnet_->AddComponent(info.component_name_b, - component_b); + if (DecomposeComponent(component_name, *affine, &component_a, &component_b)) { + size_t n = modified_component_info_.size(); + modification_index_[c] = n; + modified_component_info_.resize(n + 1); + ModifiedComponentInfo &info = modified_component_info_[n]; + info.component_index = c; + info.component_name = component_name; + info.component_name_a = component_name + "_a"; + info.component_name_b = component_name + "_b"; + if (nnet_->GetComponentIndex(info.component_name_a) >= 0) + KALDI_ERR << "Neural network already has a component named " + << info.component_name_a; + if (nnet_->GetComponentIndex(info.component_name_b) >= 0) + KALDI_ERR << "Neural network already has a component named " + << info.component_name_b; + info.component_a_index = nnet_->AddComponent(info.component_name_a, + component_a); + info.component_b_index = nnet_->AddComponent(info.component_name_b, + component_b); + } } } KALDI_LOG << "Converted " << modified_component_info_.size() << " components to FixedAffineComponent."; } - void DecomposeComponent(const std::string &component_name, + // This function finds the minimum index of + // the Descending order sorted [input_vector], + // over a range of indices from [lower] to [upper] index, + // for which the sum of elements upto the found min. index is greater + // than [min_val]. + // We add one to this index to return the reduced dimension value. + + int32 GetReducedDimension(const Vector &input_vector, + int32 lower, + int32 upper, + BaseFloat min_val) { + BaseFloat sum = 0; + int32 i = 0; + for (i = lower; i <= upper; i++) { + sum = sum + input_vector(i); + if (sum >= min_val) break; + } + return (i+1); + } + +// Here we perform SVD based refactorig of an input Affine component. +// After applying SVD , we sort the Singularity values in descending order, +// and take the subset of values which contribute to energy_threshold times +// total original sum of squared singular values, and then refactor the Affine +// component using only these selected singular values, thus making the bottleneck +// dim of the refactored Affine layer equal to the no. 
of Singular values selected. +// This function returs false if the shrinkage ratio of the total no. of parameters, +// after the above SVD based refactoring, is greater than shrinkage threshold. +// + bool DecomposeComponent(const std::string &component_name, const AffineComponent &affine, Component **component_a_out, Component **component_b_out) { int32 input_dim = affine.InputDim(), output_dim = affine.OutputDim(); Matrix linear_params(affine.LinearParams()); Vector bias_params(affine.BiasParams()); - - int32 bottleneck_dim = bottleneck_dim_, - middle_dim = std::min(input_dim, output_dim); - KALDI_ASSERT(bottleneck_dim < middle_dim); + int32 middle_dim = std::min(input_dim, output_dim); // note: 'linear_params' is of dimension output_dim by input_dim. Vector s(middle_dim); @@ -718,15 +769,40 @@ class SvdApplier { linear_params.Svd(&s, &B, &A); // make sure the singular values are sorted from greatest to least value. SortSvd(&s, &B, &A); - BaseFloat s_sum_orig = s.Sum(); - s.Resize(bottleneck_dim, kCopyData); - A.Resize(bottleneck_dim, input_dim, kCopyData); - B.Resize(output_dim, bottleneck_dim, kCopyData); - BaseFloat s_sum_reduced = s.Sum(); + Vector s2(s.Dim()); + s2.AddVec2(1.0, s); + BaseFloat s2_sum_orig = s2.Sum(); + KALDI_ASSERT(energy_threshold_ < 1); + KALDI_ASSERT(shrinkage_threshold_ < 1); + if (energy_threshold_ > 0) { + BaseFloat min_singular_sum = energy_threshold_ * s2_sum_orig; + bottleneck_dim_ = GetReducedDimension(s2, 0, s2.Dim()-1, min_singular_sum); + } + SubVector this_part(s2, 0, bottleneck_dim_); + BaseFloat s2_sum_reduced = this_part.Sum(); + BaseFloat shrinkage_ratio = + static_cast(bottleneck_dim_ * (input_dim+output_dim)) + / static_cast(input_dim * output_dim); + if (shrinkage_ratio > shrinkage_threshold_) { + KALDI_LOG << "Shrinkage ratio " << shrinkage_ratio + << " greater than threshold : " << shrinkage_threshold_ + << " Skipping SVD for this layer."; + return false; + } + + s.Resize(bottleneck_dim_, kCopyData); + A.Resize(bottleneck_dim_, input_dim, kCopyData); + B.Resize(output_dim, bottleneck_dim_, kCopyData); + KALDI_LOG << "For component " << component_name + << " singular value squared sum changed by " + << (s2_sum_orig - s2_sum_reduced) + << " (from " << s2_sum_orig << " to " << s2_sum_reduced << ")"; KALDI_LOG << "For component " << component_name - << " singular value sum changed by " - << (s_sum_orig - s_sum_reduced) - << " (from " << s_sum_orig << " to " << s_sum_reduced << ")"; + << " dimension reduced from " + << " (" << input_dim << "," << output_dim << ")" + << " to [(" << input_dim << "," << bottleneck_dim_ + << "), (" << bottleneck_dim_ << "," << output_dim <<")]"; + KALDI_LOG << "shrinkage ratio : " << shrinkage_ratio; // we'll divide the singular values equally between the two // parameter matrices. @@ -745,23 +821,22 @@ class SvdApplier { component_b->SetUpdatableConfigs(affine); *component_a_out = component_a; *component_b_out = component_b; + return true; } // This function modifies the topology of the neural network, splitting // up the components we're modifying into two parts. // Suppose we have something like: // component-node name=some_node component=some_component input= + // nodes_to_modify will be a list of component-node indexes that we + // need to split into two. These will be nodes like + // component-node name=component_node_name component=component_name input=xxx + // where 'component_name' is one of the components that we're splitting. 
+ // node_names_modified is nnet_->node_names_ except with, for the nodes that + // we are splitting in two, "some_node_name" replaced with + // "some_node_name_b" (the second of the two split nodes). void ModifyTopology() { - // nodes_to_split will be a list of component-node indexes that we - // need to split into two. These will be nodes like - // component-node name=component_node_name component=component_name input=xxx - // where 'component_name' is one of the components that we're splitting. std::set nodes_to_modify; - - - // node_names_modified is nnet_->node_names_ except with, for the nodes that - // we are splitting in two, "some_node_name" replaced with - // "some_node_name_b" (the second of the two split nodes). std::vector node_names_orig = nnet_->GetNodeNames(), node_names_modified = node_names_orig; @@ -881,6 +956,8 @@ class SvdApplier { Nnet *nnet_; int32 bottleneck_dim_; + BaseFloat energy_threshold_; + BaseFloat shrinkage_threshold_; std::string component_name_pattern_; }; @@ -1313,13 +1390,21 @@ void ReadEditConfig(std::istream &edit_config_is, Nnet *nnet) { } else if (directive == "apply-svd") { std::string name_pattern; int32 bottleneck_dim = -1; - if (!config_line.GetValue("name", &name_pattern) || - !config_line.GetValue("bottleneck-dim", &bottleneck_dim)) - KALDI_ERR << "Edit directive apply-svd requires 'name' and " - "'bottleneck-dim' to be specified."; - if (bottleneck_dim <= 0) - KALDI_ERR << "Bottleneck-dim must be positive in apply-svd command."; - SvdApplier applier(name_pattern, bottleneck_dim, nnet); + BaseFloat energy_threshold = -1; + BaseFloat shrinkage_threshold = 1.0; + config_line.GetValue("bottleneck-dim", &bottleneck_dim); + config_line.GetValue("energy-threshold", &energy_threshold); + config_line.GetValue("shrinkage-threshold", &shrinkage_threshold); + if (!config_line.GetValue("name", &name_pattern)) + KALDI_ERR << "Edit directive apply-svd requires 'name' to be specified."; + if (bottleneck_dim <= 0 && energy_threshold <=0) + KALDI_ERR << "Either Bottleneck-dim or energy-threshold " + "must be set in apply-svd command. " + "Range of possible values is (0 1]"; + SvdApplier applier(name_pattern, bottleneck_dim, + energy_threshold, + shrinkage_threshold, + nnet); applier.ApplySvd(); } else if (directive == "reduce-rank") { std::string name_pattern; @@ -1655,7 +1740,6 @@ class ModelCollapser { component_index2); } - /** Tries to produce a component that's equivalent to running the component 'component_index2' with input given by 'component_index1'. 
This handles @@ -2173,5 +2257,47 @@ void ApplyL2Regularization(const Nnet &nnet, } +bool UpdateNnetWithMaxChange(const Nnet &delta_nnet, + BaseFloat max_param_change, + BaseFloat max_change_scale, + BaseFloat scale, Nnet *nnet, + MaxChangeStats *stats) { + bool ans = UpdateNnetWithMaxChange( + delta_nnet, max_param_change, max_change_scale, + scale, nnet, + &(stats->num_max_change_per_component_applied), + &(stats->num_max_change_global_applied)); + stats->num_minibatches_processed++; + return ans; +} + + +void MaxChangeStats::Print(const Nnet &nnet) const { + int32 i = 0; + for (int32 c = 0; c < nnet.NumComponents(); c++) { + const Component *comp = nnet.GetComponent(c); + if (comp->Properties() & kUpdatableComponent) { + const UpdatableComponent *uc = dynamic_cast( + comp); + if (uc == NULL) + KALDI_ERR << "Updatable component does not inherit from class " + << "UpdatableComponent; change this code."; + if (num_max_change_per_component_applied[i] > 0) + KALDI_LOG << "For " << nnet.GetComponentName(c) + << ", per-component max-change was enforced " + << ((100.0 * num_max_change_per_component_applied[i]) / + num_minibatches_processed) + << " \% of the time."; + i++; + } + } + if (num_max_change_global_applied > 0) + KALDI_LOG << "The global max-change was enforced " + << ((100.0 * num_max_change_global_applied) / + num_minibatches_processed) + << " \% of the time."; +} + + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h index 787bd228a38..08307fc766d 100644 --- a/src/nnet3/nnet-utils.h +++ b/src/nnet3/nnet-utils.h @@ -300,13 +300,18 @@ void CollapseModel(const CollapseModelConfig &config, DropoutMaskComponent or GeneralDropoutComponent whose names match the given (e.g. lstm*). defaults to "*". - apply-svd name= bottleneck-dim= + apply-svd name= bottleneck-dim= energy-threshold= shrinkage-threshold= Locates all components with names matching , which are type AffineComponent or child classes thereof. If is less than the minimum of the (input or output) dimension of the component, - it does SVD on the components' parameters, retaining only the alrgest + it does SVD on the components' parameters, retaining only the largest singular values, replacing these components with sequences of two components, of types LinearComponent and NaturalGradientAffineComponent. + Instead we can set the filtering criterion for the Singular values as energy-threshold, + and retain those values which contribute to energy-threshold times the total energy of + the original singular values. A particular SVD factored component is left unshrinked, + if the shrinkage ratio of the total no. of its parameters, + after the SVD based refactoring, is greater than shrinkage threshold. See also 'reduce-rank'. reduce-rank name= rank= @@ -377,6 +382,17 @@ bool UpdateNnetWithMaxChange(const Nnet &delta_nnet, num_max_change_per_component_applied, int32 *num_max_change_global_applied); +struct MaxChangeStats; + +// This overloaded version of UpdateNnetWithMaxChange() is a convenience +// wrapper for when you have a MaxChangeStats object to keep track +// of how many times the max-change was applied. See documentation above. 
+bool UpdateNnetWithMaxChange(const Nnet &delta_nnet, + BaseFloat max_param_change, + BaseFloat max_change_scale, + BaseFloat scale, Nnet *nnet, + MaxChangeStats *stats); + /** This function is used as part of the regular training workflow, prior to @@ -513,6 +529,24 @@ int32 GetNumNvalues(const std::vector &io_vec, bool exhaustive); +struct MaxChangeStats { + int32 num_max_change_global_applied; + int32 num_minibatches_processed; + std::vector num_max_change_per_component_applied; + + MaxChangeStats(const Nnet &nnet): + num_max_change_global_applied(0), + num_minibatches_processed(0), + num_max_change_per_component_applied(NumUpdatableComponents(nnet), 0) { } + + // Prints the max-change stats. Usually will be called at the end + // of the program. The nnet is only needed for structural information, + // to work out the component names. + void Print(const Nnet &nnet) const; +}; + + + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3bin/nnet3-am-copy.cc b/src/nnet3bin/nnet3-am-copy.cc index 2230ae77c00..c820814db24 100644 --- a/src/nnet3bin/nnet3-am-copy.cc +++ b/src/nnet3bin/nnet3-am-copy.cc @@ -116,7 +116,6 @@ int main(int argc, char *argv[]) { if (!nnet_config.empty()) { Input ki(nnet_config); am_nnet.GetNnet().ReadConfig(ki.Stream()); - am_nnet.SetContext(); } if(convert_repeated_to_block) @@ -137,6 +136,9 @@ int main(int argc, char *argv[]) { ReadEditConfig(is, &(am_nnet.GetNnet())); } + am_nnet.SetContext(); // in case we used the config or edits-config or + // edits options + if (scale != 1.0) ScaleNnet(scale, &(am_nnet.GetNnet())); diff --git a/src/nnet3bin/nnet3-compute-batch.cc b/src/nnet3bin/nnet3-compute-batch.cc index b0001c96f57..5d4b9b1db48 100644 --- a/src/nnet3bin/nnet3-compute-batch.cc +++ b/src/nnet3bin/nnet3-compute-batch.cc @@ -80,6 +80,10 @@ int main(int argc, char *argv[]) { "priors stored with the model (in this case, " "a .mdl file is expected as input)."); +#if HAVE_CUDA==1 + CuDevice::RegisterDeviceOptions(&po); +#endif + po.Read(argc, argv); if (po.NumArgs() != 3) { diff --git a/src/nnet3bin/nnet3-compute.cc b/src/nnet3bin/nnet3-compute.cc index 45fde99a4f5..cf133025aae 100644 --- a/src/nnet3bin/nnet3-compute.cc +++ b/src/nnet3bin/nnet3-compute.cc @@ -78,6 +78,10 @@ int main(int argc, char *argv[]) { "priors stored with the model (in this case, " "a .mdl file is expected as input)."); +#if HAVE_CUDA==1 + CuDevice::RegisterDeviceOptions(&po); +#endif + po.Read(argc, argv); if (po.NumArgs() != 3) { diff --git a/src/nnet3bin/nnet3-latgen-faster-batch.cc b/src/nnet3bin/nnet3-latgen-faster-batch.cc index fad2d5ed356..ec52cff9776 100644 --- a/src/nnet3bin/nnet3-latgen-faster-batch.cc +++ b/src/nnet3bin/nnet3-latgen-faster-batch.cc @@ -108,6 +108,10 @@ int main(int argc, char *argv[]) { po.Register("use-gpu", &use_gpu, "yes|no|optional|wait, only has effect if compiled with CUDA"); +#if HAVE_CUDA==1 + CuDevice::RegisterDeviceOptions(&po); +#endif + po.Read(argc, argv); if (po.NumArgs() != 4) { diff --git a/src/nnet3bin/nnet3-xvector-compute.cc b/src/nnet3bin/nnet3-xvector-compute.cc index a4bc89a7def..e327681cf9b 100644 --- a/src/nnet3bin/nnet3-xvector-compute.cc +++ b/src/nnet3bin/nnet3-xvector-compute.cc @@ -113,6 +113,10 @@ int main(int argc, char *argv[]) { po.Register("pad-input", &pad_input, "If true, duplicate the first and " "last frames of the input features as required to equal min-chunk-size."); +#if HAVE_CUDA==1 + CuDevice::RegisterDeviceOptions(&po); +#endif + po.Read(argc, argv); if (po.NumArgs() != 3) { diff --git 
a/src/nnetbin/cuda-gpu-available.cc b/src/nnetbin/cuda-gpu-available.cc index 390468d3046..2036ea82056 100644 --- a/src/nnetbin/cuda-gpu-available.cc +++ b/src/nnetbin/cuda-gpu-available.cc @@ -35,19 +35,29 @@ using namespace kaldi; void TestGpuComputation() { CuMatrix m(100,100); m.SetRandn(); - m.ApplySoftMaxPerRow(m); + m.SoftMaxPerRow(m); } #endif int main(int argc, char *argv[]) try { + + /* only for Doxygen documentation, never shown in command line */ + const char *usage = + "Test if there is a GPU available, and if the GPU setup is correct.\n" + "A GPU is acquired and a small computation is done\n" + "(generating a random matrix and computing softmax for its rows).\n" + "\n" + "exit-code: 0 = success, 1 = compiled without GPU support, -1 = error\n" + "\n" + "Usage: cuda-gpu-available\n"; + char hostname[100] = "UNKNOWN-HOSTNAME"; #if !defined(_MSC_VER) && !defined(__CYGWIN__) if (gethostname(hostname, 100)) { KALDI_WARN << "Cannot get hostname, " << strerror(errno); } #endif - KALDI_LOG << std::endl << std::endl - << "### IS CUDA GPU AVAILABLE? '" << hostname << "' ###"; + KALDI_LOG << "\n\n### IS CUDA GPU AVAILABLE? '" << hostname << "' ###"; #if HAVE_CUDA == 1 CuDevice::Instantiate().SelectGpuId("yes"); fprintf(stderr, "### HURRAY, WE GOT A CUDA GPU FOR COMPUTATION!!! ##\n\n"); @@ -76,9 +86,9 @@ int main(int argc, char *argv[]) try { return 0; #else std::cerr - << "### CUDA WAS NOT COMPILED IN! ###" << std::endl + << "### CUDA WAS NOT COMPILED IN! ###\n" << "To support CUDA, you must run 'configure' on a machine " - << "that has the CUDA compiler 'nvcc' available."; + << "that has the CUDA compiler 'nvcc' available.\n"; return 1; #endif } catch (const std::exception &e) { @@ -95,4 +105,3 @@ int main(int argc, char *argv[]) try { << "### - You should see your GPU (burnt GPUs may disappear from the list until reboot),"; return -1; } - diff --git a/src/nnetbin/nnet-train-multistream.cc b/src/nnetbin/nnet-train-multistream.cc index 133c49e02a5..0667aa865bf 100644 --- a/src/nnetbin/nnet-train-multistream.cc +++ b/src/nnetbin/nnet-train-multistream.cc @@ -46,6 +46,7 @@ bool ReadData(SequentialBaseFloatMatrixReader& feature_reader, for ( ; !feature_reader.Done(); feature_reader.Next()) { // Do we have targets? const std::string& utt = feature_reader.Key(); + KALDI_VLOG(3) << "Reading: " << utt; if (!target_reader.HasKey(utt)) { KALDI_WARN << utt << ", missing targets"; (*num_no_tgt_mat)++; @@ -216,6 +217,7 @@ int main(int argc, char *argv[]) { Mse mse(loss_opts); Timer time; + double time_gpu = 0; KALDI_LOG << (crossvalidate ? "CROSS-VALIDATION" : "TRAINING") << " STARTED"; @@ -227,6 +229,7 @@ int main(int argc, char *argv[]) { std::vector > feats_utt(num_streams); std::vector labels_utt(num_streams); std::vector > weights_utt(num_streams); + std::vector cursor_utt(num_streams); // 0 initialized, std::vector new_utt_flags(num_streams); CuMatrix feats_transf, nnet_out, obj_diff; @@ -238,7 +241,7 @@ int main(int argc, char *argv[]) { new_utt_flags.assign(num_streams, 0); // set new-utterance flags to zero, for (int s = 0; s < num_streams; s++) { // Need a new utterance for stream 's'? - if (feats_utt[s].NumRows() == 0) { + if (cursor_utt[s] >= feats_utt[s].NumRows()) { Matrix feats; Posterior targets; Vector weights; @@ -249,7 +252,9 @@ int main(int argc, char *argv[]) { &num_no_tgt_mat, &num_other_error)) { // input transform may contain splicing, + Timer t; nnet_transf.Feedforward(CuMatrix(feats), &feats_transf); + time_gpu += t.Elapsed(); /* Here we could do the 'targets_delay', BUT... 
* It is better to do it by a component! @@ -262,17 +267,25 @@ int main(int argc, char *argv[]) { feats_utt[s] = Matrix(feats_transf); labels_utt[s] = targets; weights_utt[s] = weights; + cursor_utt[s] = 0; new_utt_flags[s] = 1; } } } - // end the training after processing all the frames, - size_t frames_to_go = 0; + // End the training when 1st stream is empty + // (this avoids over-adaptation to last utterances), + size_t inactive_streams = 0; for (int32 s = 0; s < num_streams; s++) { - frames_to_go += feats_utt[s].NumRows(); + if (feats_utt[s].NumRows() - cursor_utt[s] <= 0) { + inactive_streams += 1; + } + } + if (inactive_streams >= 1) { + KALDI_LOG << "No more data to re-fill one of the streams, end of the training!"; + KALDI_LOG << "(remaining stubs of data are discarded, don't overtrain on them)"; + break; } - if (frames_to_go == 0) break; // number of frames we'll pack as the streams, std::vector frame_num_utt; @@ -291,78 +304,74 @@ int main(int argc, char *argv[]) { weight_host.Resize(n_streams * batch_size, kSetZero); frame_num_utt.resize(n_streams, 0); - // we'll slice at most 'batch_size' frames, + // we slice at the 'cursor' at most 'batch_size' frames, for (int32 s = 0; s < n_streams; s++) { - int32 num_rows = feats_utt[s].NumRows(); + int32 num_rows = std::max(0, feats_utt[s].NumRows() - cursor_utt[s]); frame_num_utt[s] = std::min(batch_size, num_rows); } // pack the data, { for (int32 s = 0; s < n_streams; s++) { - const Matrix& mat_tmp = feats_utt[s]; - for (int32 r = 0; r < frame_num_utt[s]; r++) { - feat_mat_host.Row(r*n_streams + s).CopyFromVec(mat_tmp.Row(r)); + if (frame_num_utt[s] > 0) { + auto mat_tmp = feats_utt[s].RowRange(cursor_utt[s], frame_num_utt[s]); + for (int32 r = 0; r < frame_num_utt[s]; r++) { + feat_mat_host.Row(r*n_streams + s).CopyFromVec(mat_tmp.Row(r)); + } } } for (int32 s = 0; s < n_streams; s++) { - const Posterior& target_tmp = labels_utt[s]; for (int32 r = 0; r < frame_num_utt[s]; r++) { - target_host[r*n_streams + s] = target_tmp[r]; + target_host[r*n_streams + s] = labels_utt[s][cursor_utt[s] + r]; } } // padded frames will keep initial zero-weight, for (int32 s = 0; s < n_streams; s++) { - const Vector& weight_tmp = weights_utt[s]; - for (int32 r = 0; r < frame_num_utt[s]; r++) { - weight_host(r*n_streams + s) = weight_tmp(r); + if (frame_num_utt[s] > 0) { + auto weight_tmp = weights_utt[s].Range(cursor_utt[s], frame_num_utt[s]); + for (int32 r = 0; r < frame_num_utt[s]; r++) { + weight_host(r*n_streams + s) = weight_tmp(r); + } } } } - // remove the data we just packed, - { - for (int32 s = 0; s < n_streams; s++) { - // feats, - Matrix& m = feats_utt[s]; - if (m.NumRows() == frame_num_utt[s]) { - feats_utt[s].Resize(0,0); // we packed last chunk, - } else { - feats_utt[s] = Matrix( - m.RowRange(frame_num_utt[s], m.NumRows() - frame_num_utt[s]) - ); - } - // labels, - Posterior& post = labels_utt[s]; - post.erase(post.begin(), post.begin() + frame_num_utt[s]); - // weights, - Vector& w = weights_utt[s]; - if (w.Dim() == frame_num_utt[s]) { - weights_utt[s].Resize(0); // we packed last chunk, - } else { - weights_utt[s] = Vector( - w.Range(frame_num_utt[s], w.Dim() - frame_num_utt[s]) - ); - } - } + // advance the cursors, + for (int32 s = 0; s < n_streams; s++) { + cursor_utt[s] += frame_num_utt[s]; } } // pass the info about padding, nnet.SetSeqLengths(frame_num_utt); - // Show the 'utt' lengths in the VLOG[2], - if (GetVerboseLevel() >= 2) { - std::ostringstream os; - os << "[ "; - for (size_t i = 0; i < frame_num_utt.size(); i++) { - os 
<< frame_num_utt[i] << " "; + + // Show debug info, + if (GetVerboseLevel() >= 4) { + // cursors in the feature_matrices, + { + std::ostringstream os; + os << "[ "; + for (size_t i = 0; i < cursor_utt.size(); i++) { + os << cursor_utt[i] << " "; + } + os << "]"; + KALDI_LOG << "cursor_utt[" << cursor_utt.size() << "]" << os.str(); + } + // frames in the mini-batch, + { + std::ostringstream os; + os << "[ "; + for (size_t i = 0; i < frame_num_utt.size(); i++) { + os << frame_num_utt[i] << " "; + } + os << "]"; + KALDI_LOG << "frame_num_utt[" << frame_num_utt.size() << "]" << os.str(); } - os << "]"; - KALDI_LOG << "frame_num_utt[" << frame_num_utt.size() << "]" << os.str(); } + Timer t; // with new utterance we reset the history, nnet.ResetStreams(new_utt_flags); @@ -383,6 +392,7 @@ int main(int argc, char *argv[]) { // back-propagate, and do the update, nnet.Backpropagate(obj_diff, NULL); } + time_gpu += t.Elapsed(); // 1st minibatch : show what happens in network, if (total_frames == 0) { @@ -438,7 +448,8 @@ int main(int argc, char *argv[]) { << num_other_error << " with other errors. " << "[" << (crossvalidate ? "CROSS-VALIDATION" : "TRAINING") << ", " << time.Elapsed() / 60 << " min, processing " - << total_frames / time.Elapsed() << " frames per sec.]"; + << total_frames / time.Elapsed() << " frames per sec, " + << "GPU_time " << 100.*time_gpu/time.Elapsed() << "% ]"; if (objective_function == "xent") { KALDI_LOG << xent.Report(); diff --git a/src/online/online-audio-source.cc b/src/online/online-audio-source.cc index 7b3c31682aa..5998be0690f 100644 --- a/src/online/online-audio-source.cc +++ b/src/online/online-audio-source.cc @@ -72,18 +72,18 @@ OnlinePaSource::OnlinePaSource(const uint32 timeout, &pa_ringbuf_, sizeof(SampleType), rb_size_ / sizeof(SampleType), ring_buffer_); if (rbs != 0) - throw runtime_error("Unexpected PortAudio ring buffer init error"); + KALDI_ERR << "PortAudio ring buffer init error"; PaError paerr = Pa_Initialize(); if (paerr != paNoError) - throw runtime_error("PortAudio initialization error"); + KALDI_ERR << "PortAudio initialization error"; // Monophone, 16-bit input hardcoded KALDI_ASSERT(sizeof(SampleType) == 2 && "The current OnlinePaSource code assumes 16-bit input"); paerr = Pa_OpenDefaultStream(&pa_stream_, 1, 0, paInt16, sample_rate_, 0, PaCallback, this); if (paerr != paNoError) - throw runtime_error("PortAudio failed to open the default stream"); + KALDI_ERR << "PortAudio failed to open the default stream"; } @@ -103,7 +103,7 @@ bool OnlinePaSource::Read(Vector *data) { if (!pa_started_) { // start stream the first time Read() is called PaError paerr = Pa_StartStream(pa_stream_); if (paerr != paNoError) - throw std::runtime_error("Error while trying to open PortAudio stream"); + KALDI_ERR << "Error while trying to open PortAudio stream"; pa_started_ = true; } Timer timer; diff --git a/src/online/online-audio-source.h b/src/online/online-audio-source.h index d880660d24f..64153e9cd52 100644 --- a/src/online/online-audio-source.h +++ b/src/online/online-audio-source.h @@ -42,7 +42,7 @@ class OnlineAudioSourceItf { // The function returns true if there may be more data, and false if it // knows we are at the end of the stream. // In case an unexpected and unrecoverable error occurs the function throws - // an exception of type std::runtime_error (e.g. by using KALDI_ERR macro). + // an exception of type KaldiFatalError (by using KALDI_ERR macro). // // NOTE: The older version of this interface had a second paramater - "timeout". 
// We decided to remove it, because we don't envision usage scenarios, diff --git a/src/online/online-feat-input.h b/src/online/online-feat-input.h index b730a373ac0..e433c386212 100644 --- a/src/online/online-feat-input.h +++ b/src/online/online-feat-input.h @@ -31,6 +31,7 @@ #include "online-audio-source.h" #include "feat/feature-functions.h" +#include "feat/feature-window.h" namespace kaldi { @@ -275,7 +276,8 @@ class OnlineFeInput : public OnlineFeatInputItf { // "frame_size" - frame extraction window size in audio samples // "frame_shift" - feature frame width in audio samples OnlineFeInput(OnlineAudioSourceItf *au_src, E *fe, - const int32 frame_size, const int32 frame_shift); + const int32 frame_size, const int32 frame_shift, + const bool snip_edges = true); virtual int32 Dim() const { return extractor_->Dim(); } @@ -287,15 +289,26 @@ class OnlineFeInput : public OnlineFeatInputItf { const int32 frame_size_; const int32 frame_shift_; Vector wave_; // the samples to be passed for extraction + Vector wave_remainder_; // the samples remained from the previous + // feature batch + FrameExtractionOptions frame_opts_; KALDI_DISALLOW_COPY_AND_ASSIGN(OnlineFeInput); }; template OnlineFeInput::OnlineFeInput(OnlineAudioSourceItf *au_src, E *fe, - int32 frame_size, int32 frame_shift) + int32 frame_size, int32 frame_shift, + bool snip_edges) : source_(au_src), extractor_(fe), - frame_size_(frame_size), frame_shift_(frame_shift) {} + frame_size_(frame_size), frame_shift_(frame_shift) { + // we need a FrameExtractionOptions to call NumFrames() + // 1000 is just a fake sample rate which equates ms and samples + frame_opts_.samp_freq = 1000; + frame_opts_.frame_shift_ms = frame_shift; + frame_opts_.frame_length_ms = frame_size; + frame_opts_.snip_edges = snip_edges; +} template bool OnlineFeInput::Compute(Matrix *output) { @@ -311,11 +324,26 @@ OnlineFeInput::Compute(Matrix *output) { bool ans = source_->Read(&read_samples); + Vector all_samples(wave_remainder_.Dim() + read_samples.Dim()); + all_samples.Range(0, wave_remainder_.Dim()).CopyFromVec(wave_remainder_); + all_samples.Range(wave_remainder_.Dim(), read_samples.Dim()). + CopyFromVec(read_samples); + // Extract the features - if (read_samples.Dim() >= frame_size_) { - extractor_->Compute(read_samples, 1.0, output); + if (all_samples.Dim() >= frame_size_) { + // extract waveform remainder before calling Compute() + int32 num_frames = NumFrames(all_samples.Dim(), frame_opts_); + // offset is the amount at the start that has been extracted. + int32 offset = num_frames * frame_shift_; + int32 remaining_len = all_samples.Dim() - offset; + wave_remainder_.Resize(remaining_len); + KALDI_ASSERT(remaining_len >= 0); + if (remaining_len > 0) + wave_remainder_.CopyFromVec(SubVector(all_samples, offset, remaining_len)); + extractor_->Compute(all_samples, 1.0, output); } else { output->Resize(0, 0); + wave_remainder_ = all_samples; } return ans; diff --git a/src/online2/online-feature-pipeline.h b/src/online2/online-feature-pipeline.h index f89cbbbb898..fab1be3cb27 100644 --- a/src/online2/online-feature-pipeline.h +++ b/src/online2/online-feature-pipeline.h @@ -166,7 +166,7 @@ class OnlineFeaturePipeline: public OnlineFeatureInterface { // This is supplied for debug purposes. void GetAsMatrix(Matrix *feats); - + void FreezeCmvn(); // stop it from moving further (do this when you start // using fMLLR). This will crash if NumFramesReady() == 0. 
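
The `OnlineFeInput` change above accumulates leftover audio in `wave_remainder_` so that samples which do not yet fill a complete frame are carried into the next `Compute()` call rather than dropped; it also sets a nominal sample rate of 1000 in `FrameExtractionOptions` so that frame sizes given in samples can double as milliseconds when calling `NumFrames()`. The following is a minimal standalone sketch of that remainder bookkeeping only, with illustrative names and snip-edges framing assumed; it is not the Kaldi classes or API.

```cpp
// Sketch of carrying a sample remainder between feature-extraction calls.
// Hypothetical names; mirrors the arithmetic in OnlineFeInput::Compute() above.
#include <cstdint>
#include <cstdio>
#include <vector>

struct FrameSplit {
  std::vector<int16_t> to_extract;  // full buffer handed to the extractor
  std::vector<int16_t> remainder;   // samples carried over to the next call
};

FrameSplit SplitIntoFrames(const std::vector<int16_t> &remainder,
                           const std::vector<int16_t> &fresh,
                           int frame_size, int frame_shift) {
  // Prepend the remainder of the previous call to the freshly read samples.
  std::vector<int16_t> all(remainder);
  all.insert(all.end(), fresh.begin(), fresh.end());

  FrameSplit out;
  if (static_cast<int>(all.size()) < frame_size) {
    out.remainder = all;  // not enough data for even one frame yet
    return out;
  }
  // Number of complete frames with snip_edges == true.
  int num_frames = 1 + (static_cast<int>(all.size()) - frame_size) / frame_shift;
  // The first num_frames * frame_shift samples are consumed; everything from
  // that offset onward is where the next frame would start, so keep it.
  int offset = num_frames * frame_shift;
  out.to_extract.assign(all.begin(), all.end());
  out.remainder.assign(all.begin() + offset, all.end());
  return out;
}

int main() {
  std::vector<int16_t> remainder;       // empty at the start of the stream
  std::vector<int16_t> chunk(1000, 0);  // pretend we just read 1000 samples
  FrameSplit s = SplitIntoFrames(remainder, chunk, /*frame_size=*/400,
                                 /*frame_shift=*/160);
  // 1 + (1000 - 400) / 160 = 4 complete frames; 4 * 160 = 640 samples consumed,
  // so 360 samples are carried over to the next call.
  std::printf("extract from %zu samples, remainder %zu\n",
              s.to_extract.size(), s.remainder.size());
  return 0;
}
```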
diff --git a/src/online2/online-ivector-feature.cc b/src/online2/online-ivector-feature.cc index 2042fbb8b80..3642a9aeaff 100644 --- a/src/online2/online-ivector-feature.cc +++ b/src/online2/online-ivector-feature.cc @@ -195,6 +195,9 @@ void OnlineIvectorFeature::UpdateStatsForFrames( // Remove duplicates of frames. MergePairVectorSumming(&frame_weights); + if (frame_weights.empty()) + return; + int32 num_frames = static_cast(frame_weights.size()); int32 feat_dim = lda_normalized_->Dim(); Matrix feats(num_frames, feat_dim, kUndefined), diff --git a/src/online2/online-nnet2-feature-pipeline.cc b/src/online2/online-nnet2-feature-pipeline.cc index 510c401fba2..c495c9fc8ef 100644 --- a/src/online2/online-nnet2-feature-pipeline.cc +++ b/src/online2/online-nnet2-feature-pipeline.cc @@ -128,6 +128,21 @@ void OnlineNnet2FeaturePipeline::GetFrame(int32 frame, return final_feature_->GetFrame(frame, feat); } +void OnlineNnet2FeaturePipeline::UpdateFrameWeights( + const std::vector > &delta_weights, + int32 frame_offset) { + if (frame_offset == 0) { + IvectorFeature()->UpdateFrameWeights(delta_weights); + } else { + std::vector > offset_delta_weights; + for (size_t i = 0; i < delta_weights.size(); i++) { + offset_delta_weights.push_back(std::make_pair( + delta_weights[i].first + frame_offset, delta_weights[i].second)); + } + IvectorFeature()->UpdateFrameWeights(offset_delta_weights); + } +} + void OnlineNnet2FeaturePipeline::SetAdaptationState( const OnlineIvectorExtractorAdaptationState &adaptation_state) { if (info_.use_ivectors) { diff --git a/src/online2/online-nnet2-feature-pipeline.h b/src/online2/online-nnet2-feature-pipeline.h index d8f933a090d..2e3fbf7bd78 100644 --- a/src/online2/online-nnet2-feature-pipeline.h +++ b/src/online2/online-nnet2-feature-pipeline.h @@ -196,6 +196,20 @@ class OnlineNnet2FeaturePipeline: public OnlineFeatureInterface { virtual int32 NumFramesReady() const; virtual void GetFrame(int32 frame, VectorBase *feat); + /// If you are downweighting silence, you can call + /// OnlineSilenceWeighting::GetDeltaWeights and supply the output to this + /// class using UpdateFrameWeights(). The reason why this call happens + /// outside this class, rather than this class pulling in the data weights, + /// relates to multi-threaded operation and also from not wanting this class + /// to have excessive dependencies. + /// + /// You must either always call this as soon as new data becomes available, + /// ideally just after calling AcceptWaveform(), or never call it for the + /// lifetime of this object. + void UpdateFrameWeights( + const std::vector > &delta_weights, + int32 frame_offset = 0); + /// Set the adaptation state to a particular value, e.g. reflecting previous /// utterances of the same speaker; this will generally be called after /// Copy(). @@ -228,18 +242,25 @@ class OnlineNnet2FeaturePipeline: public OnlineFeatureInterface { /// rescoring the lattices, this may not be much of an issue. void InputFinished(); - // This function returns the ivector-extracting part of the feature pipeline - // (or NULL if iVectors are not being used); the pointer is owned here and not - // given to the caller. This function is used in nnet3, and also in the - // silence-weighting code used to exclude silence from the iVector estimation. + // This function returns the iVector-extracting part of the feature pipeline + // (or NULL if iVectors are not being used); the pointer ownership is retained + // by this object and not transferred to the caller. 
This function is used in + // nnet3, and also in the silence-weighting code used to exclude silence from + // the iVector estimation. OnlineIvectorFeature *IvectorFeature() { return ivector_feature_; } + // A const accessor for the iVector extractor. Returns NULL if iVectors are + // not being used. + const OnlineIvectorFeature *IvectorFeature() const { + return ivector_feature_; + } + // This function returns the part of the feature pipeline that would be given // as the primary (non-iVector) input to the neural network in nnet3 // applications. - OnlineFeatureInterface *InputFeature() { + OnlineFeatureInterface *InputFeature() { return feature_plus_optional_pitch_; } diff --git a/src/online2/online-nnet3-decoding.cc b/src/online2/online-nnet3-decoding.cc index fbe0c2bed7b..1a6e43f1723 100644 --- a/src/online2/online-nnet3-decoding.cc +++ b/src/online2/online-nnet3-decoding.cc @@ -41,6 +41,12 @@ SingleUtteranceNnet3DecoderTpl::SingleUtteranceNnet3DecoderTpl( decoder_.InitDecoding(); } +template +void SingleUtteranceNnet3DecoderTpl::InitDecoding(int32 frame_offset) { + decoder_.InitDecoding(); + decodable_.SetFrameOffset(frame_offset); +} + template void SingleUtteranceNnet3DecoderTpl::AdvanceDecoding() { decoder_.AdvanceDecoding(&decodable_); @@ -56,7 +62,6 @@ int32 SingleUtteranceNnet3DecoderTpl::NumFramesDecoded() const { return decoder_.NumFramesDecoded(); } - template void SingleUtteranceNnet3DecoderTpl::GetLattice(bool end_of_utterance, CompactLattice *clat) const { diff --git a/src/online2/online-nnet3-decoding.h b/src/online2/online-nnet3-decoding.h index 568c0b6a0b3..9adf77fcb56 100644 --- a/src/online2/online-nnet3-decoding.h +++ b/src/online2/online-nnet3-decoding.h @@ -60,7 +60,13 @@ class SingleUtteranceNnet3DecoderTpl { const FST &fst, OnlineNnet2FeaturePipeline *features); - /// advance the decoding as far as we can. + /// Initializes the decoding and sets the frame offset of the underlying + /// decodable object. This method is called by the constructor. You can also + /// call this method when you want to reset the decoder state, but want to + /// keep using the same decodable object, e.g. in case of an endpoint. + void InitDecoding(int32 frame_offset = 0); + + /// Advances the decoding as far as we can. void AdvanceDecoding(); /// Finalizes the decoding. 
Cleans up and prunes remaining tokens, so the diff --git a/src/online2bin/Makefile b/src/online2bin/Makefile index 8792cc5b11a..28c135eb950 100644 --- a/src/online2bin/Makefile +++ b/src/online2bin/Makefile @@ -11,7 +11,8 @@ BINFILES = online2-wav-gmm-latgen-faster apply-cmvn-online \ online2-wav-nnet2-latgen-faster ivector-extract-online2 \ online2-wav-dump-features ivector-randomize \ online2-wav-nnet2-am-compute online2-wav-nnet2-latgen-threaded \ - online2-wav-nnet3-latgen-faster online2-wav-nnet3-latgen-grammar + online2-wav-nnet3-latgen-faster online2-wav-nnet3-latgen-grammar \ + online2-tcp-nnet3-decode-faster OBJFILES = diff --git a/src/online2bin/online2-tcp-nnet3-decode-faster.cc b/src/online2bin/online2-tcp-nnet3-decode-faster.cc new file mode 100644 index 00000000000..f060ba7bdd2 --- /dev/null +++ b/src/online2bin/online2-tcp-nnet3-decode-faster.cc @@ -0,0 +1,490 @@ +// online2bin/online2-tcp-nnet3-decode-faster.cc + +// Copyright 2014 Johns Hopkins University (author: Daniel Povey) +// 2016 Api.ai (Author: Ilya Platonov) +// 2018 Polish-Japanese Academy of Information Technology (Author: Danijel Korzinek) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
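
The new `online2-tcp-nnet3-decode-faster.cc` below combines the `InitDecoding(frame_offset)` method added above with endpoint detection: after each endpoint the decoder state is reset, but a running frame offset keeps the reported begin/end times continuous over the whole TCP connection. A minimal, self-contained sketch of that offset bookkeeping follows; `FakeDecoder`, the segment lengths, and the frame shift are made-up illustrations, not the Kaldi classes.

```cpp
// Sketch of the frame-offset bookkeeping used across endpoints: decoding
// restarts at frame 0 internally for each segment, while the offset keeps
// the emitted timestamps monotonic for the whole connection.
#include <cstdio>

struct FakeDecoder {
  int frames_decoded = 0;
  void InitDecoding() { frames_decoded = 0; }   // reset per segment
  void DecodeSome(int n) { frames_decoded += n; }
};

int main() {
  FakeDecoder decoder;
  int frame_offset = 0;                    // frames already reported
  const double frame_shift_sec = 0.03;     // e.g. 10 ms shift * subsampling 3
  int segment_lengths[] = {120, 80, 200};  // frames decoded before each endpoint

  for (int n : segment_lengths) {
    decoder.InitDecoding();                // new segment after an endpoint
    decoder.DecodeSome(n);
    int t_beg = frame_offset;
    frame_offset += decoder.frames_decoded;
    int t_end = frame_offset;
    std::printf("segment: %.2f %.2f\n",
                t_beg * frame_shift_sec, t_end * frame_shift_sec);
  }
  return 0;
}
```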
+ +#include "feat/wave-reader.h" +#include "online2/online-nnet3-decoding.h" +#include "online2/online-nnet2-feature-pipeline.h" +#include "online2/onlinebin-util.h" +#include "online2/online-timing.h" +#include "online2/online-endpoint.h" +#include "fstext/fstext-lib.h" +#include "lat/lattice-functions.h" +#include "util/kaldi-thread.h" +#include "nnet3/nnet-utils.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace kaldi { + +class TcpServer { + public: + explicit TcpServer(int read_timeout); + ~TcpServer(); + + bool Listen(int32 port); // start listening on a given port + int32 Accept(); // accept a client and return its descriptor + + bool ReadChunk(size_t len); // get more data and return false if end-of-stream + + Vector GetChunk(); // get the data read by above method + + bool Write(const std::string &msg); // write to accepted client + bool WriteLn(const std::string &msg, const std::string &eol = "\n"); // write line to accepted client + + void Disconnect(); + + private: + struct ::sockaddr_in h_addr_; + int32 server_desc_, client_desc_; + int16 *samp_buf_; + size_t buf_len_, has_read_; + pollfd client_set_[1]; + int read_timeout_; +}; + +std::string LatticeToString(const Lattice &lat, const fst::SymbolTable &word_syms) { + LatticeWeight weight; + std::vector alignment; + std::vector words; + GetLinearSymbolSequence(lat, &alignment, &words, &weight); + + std::ostringstream msg; + for (size_t i = 0; i < words.size(); i++) { + std::string s = word_syms.Find(words[i]); + if (s.empty()) { + KALDI_WARN << "Word-id " << words[i] << " not in symbol table."; + msg << "<#" << std::to_string(i) << "> "; + } else + msg << s << " "; + } + return msg.str(); +} + +std::string GetTimeString(int32 t_beg, int32 t_end, BaseFloat time_unit) { + char buffer[100]; + double t_beg2 = t_beg * time_unit; + double t_end2 = t_end * time_unit; + snprintf(buffer, 100, "%.2f %.2f", t_beg2, t_end2); + return std::string(buffer); +} + +int32 GetLatticeTimeSpan(const Lattice& lat) { + std::vector times; + LatticeStateTimes(lat, ×); + return times.back(); +} + +std::string LatticeToString(const CompactLattice &clat, const fst::SymbolTable &word_syms) { + if (clat.NumStates() == 0) { + KALDI_WARN << "Empty lattice."; + return ""; + } + CompactLattice best_path_clat; + CompactLatticeShortestPath(clat, &best_path_clat); + + Lattice best_path_lat; + ConvertLattice(best_path_clat, &best_path_lat); + return LatticeToString(best_path_lat, word_syms); +} +} + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace fst; + + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "Reads in audio from a network socket and performs online\n" + "decoding with neural nets (nnet3 setup), with iVector-based\n" + "speaker adaptation and endpointing.\n" + "Note: some configuration values and inputs are set via config\n" + "files whose filenames are passed as options\n" + "\n" + "Usage: online2-tcp-nnet3-decode-faster [options] " + " \n"; + + ParseOptions po(usage); + + + // feature_opts includes configuration for the iVector adaptation, + // as well as the basic features. 
+ OnlineNnet2FeaturePipelineConfig feature_opts; + nnet3::NnetSimpleLoopedComputationOptions decodable_opts; + LatticeFasterDecoderConfig decoder_opts; + OnlineEndpointConfig endpoint_opts; + + BaseFloat chunk_length_secs = 0.18; + BaseFloat output_period = 1; + BaseFloat samp_freq = 16000.0; + int port_num = 5050; + int read_timeout = 3; + bool produce_time = false; + + po.Register("samp-freq", &samp_freq, + "Sampling frequency of the input signal (coded as 16-bit slinear)."); + po.Register("chunk-length", &chunk_length_secs, + "Length of chunk size in seconds, that we process."); + po.Register("output-period", &output_period, + "How often in seconds, do we check for changes in output."); + po.Register("num-threads-startup", &g_num_threads, + "Number of threads used when initializing iVector extractor."); + po.Register("read-timeout", &read_timeout, + "Number of seconds of timout for TCP audio data to appear on the stream. Use -1 for blocking."); + po.Register("port-num", &port_num, + "Port number the server will listen on."); + po.Register("produce-time", &produce_time, + "Prepend begin/end times between endpoints (e.g. '5.46 6.81 ', in seconds)"); + + feature_opts.Register(&po); + decodable_opts.Register(&po); + decoder_opts.Register(&po); + endpoint_opts.Register(&po); + + po.Read(argc, argv); + + if (po.NumArgs() != 3) { + po.PrintUsage(); + return 1; + } + + std::string nnet3_rxfilename = po.GetArg(1), + fst_rxfilename = po.GetArg(2), + word_syms_filename = po.GetArg(3); + + OnlineNnet2FeaturePipelineInfo feature_info(feature_opts); + + BaseFloat frame_shift = feature_info.FrameShiftInSeconds(); + int32 frame_subsampling = decodable_opts.frame_subsampling_factor; + + KALDI_VLOG(1) << "Loading AM..."; + + TransitionModel trans_model; + nnet3::AmNnetSimple am_nnet; + { + bool binary; + Input ki(nnet3_rxfilename, &binary); + trans_model.Read(ki.Stream(), binary); + am_nnet.Read(ki.Stream(), binary); + SetBatchnormTestMode(true, &(am_nnet.GetNnet())); + SetDropoutTestMode(true, &(am_nnet.GetNnet())); + nnet3::CollapseModel(nnet3::CollapseModelConfig(), &(am_nnet.GetNnet())); + } + + // this object contains precomputed stuff that is used by all decodable + // objects. It takes a pointer to am_nnet because if it has iVectors it has + // to modify the nnet to accept iVectors at intervals. 
+ nnet3::DecodableNnetSimpleLoopedInfo decodable_info(decodable_opts, + &am_nnet); + + KALDI_VLOG(1) << "Loading FST..."; + + fst::Fst *decode_fst = ReadFstKaldiGeneric(fst_rxfilename); + + fst::SymbolTable *word_syms = NULL; + if (!word_syms_filename.empty()) + if (!(word_syms = fst::SymbolTable::ReadText(word_syms_filename))) + KALDI_ERR << "Could not read symbol table from file " + << word_syms_filename; + + signal(SIGPIPE, SIG_IGN); // ignore SIGPIPE to avoid crashing when socket forcefully disconnected + + TcpServer server(read_timeout); + + server.Listen(port_num); + + while (true) { + + server.Accept(); + + int32 samp_count = 0;// this is used for output refresh rate + size_t chunk_len = static_cast(chunk_length_secs * samp_freq); + int32 check_period = static_cast(samp_freq * output_period); + int32 check_count = check_period; + + int32 frame_offset = 0; + + bool eos = false; + + OnlineNnet2FeaturePipeline feature_pipeline(feature_info); + SingleUtteranceNnet3Decoder decoder(decoder_opts, trans_model, + decodable_info, + *decode_fst, &feature_pipeline); + + while (!eos) { + + decoder.InitDecoding(frame_offset); + OnlineSilenceWeighting silence_weighting( + trans_model, + feature_info.silence_weighting_config, + decodable_opts.frame_subsampling_factor); + std::vector> delta_weights; + + while (true) { + eos = !server.ReadChunk(chunk_len); + + if (eos) { + feature_pipeline.InputFinished(); + decoder.AdvanceDecoding(); + decoder.FinalizeDecoding(); + frame_offset += decoder.NumFramesDecoded(); + if (decoder.NumFramesDecoded() > 0) { + CompactLattice lat; + decoder.GetLattice(true, &lat); + std::string msg = LatticeToString(lat, *word_syms); + + // get time-span from previous endpoint to end of audio, + if (produce_time) { + int32 t_beg = frame_offset - decoder.NumFramesDecoded(); + int32 t_end = frame_offset; + msg = GetTimeString(t_beg, t_end, frame_shift * frame_subsampling) + " " + msg; + } + + KALDI_VLOG(1) << "EndOfAudio, sending message: " << msg; + server.WriteLn(msg); + } else + server.Write("\n"); + server.Disconnect(); + break; + } + + Vector wave_part = server.GetChunk(); + feature_pipeline.AcceptWaveform(samp_freq, wave_part); + samp_count += chunk_len; + + if (silence_weighting.Active() && + feature_pipeline.IvectorFeature() != NULL) { + silence_weighting.ComputeCurrentTraceback(decoder.Decoder()); + silence_weighting.GetDeltaWeights(feature_pipeline.NumFramesReady(), + &delta_weights); + feature_pipeline.UpdateFrameWeights(delta_weights, + frame_offset * decodable_opts.frame_subsampling_factor); + } + + decoder.AdvanceDecoding(); + + if (samp_count > check_count) { + if (decoder.NumFramesDecoded() > 0) { + Lattice lat; + decoder.GetBestPath(false, &lat); + TopSort(&lat); // for LatticeStateTimes(), + std::string msg = LatticeToString(lat, *word_syms); + + // get time-span after previous endpoint, + if (produce_time) { + int32 t_beg = frame_offset; + int32 t_end = frame_offset + GetLatticeTimeSpan(lat); + msg = GetTimeString(t_beg, t_end, frame_shift * frame_subsampling) + " " + msg; + } + + KALDI_VLOG(1) << "Temporary transcript: " << msg; + server.WriteLn(msg, "\r"); + } + check_count += check_period; + } + + if (decoder.EndpointDetected(endpoint_opts)) { + decoder.FinalizeDecoding(); + frame_offset += decoder.NumFramesDecoded(); + CompactLattice lat; + decoder.GetLattice(true, &lat); + std::string msg = LatticeToString(lat, *word_syms); + + // get time-span between endpoints, + if (produce_time) { + int32 t_beg = frame_offset - decoder.NumFramesDecoded(); + int32 t_end 
= frame_offset; + msg = GetTimeString(t_beg, t_end, frame_shift * frame_subsampling) + " " + msg; + } + + KALDI_VLOG(1) << "Endpoint, sending message: " << msg; + server.WriteLn(msg); + break; // while (true) + } + } + } + } + } catch (const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} // main() + + +namespace kaldi { +TcpServer::TcpServer(int read_timeout) { + server_desc_ = -1; + client_desc_ = -1; + samp_buf_ = NULL; + buf_len_ = 0; + read_timeout_ = 1000 * read_timeout; +} + +bool TcpServer::Listen(int32 port) { + h_addr_.sin_addr.s_addr = INADDR_ANY; + h_addr_.sin_port = htons(port); + h_addr_.sin_family = AF_INET; + + server_desc_ = socket(AF_INET, SOCK_STREAM, 0); + + if (server_desc_ == -1) { + KALDI_ERR << "Cannot create TCP socket!"; + return false; + } + + int32 flag = 1; + int32 len = sizeof(int32); + if (setsockopt(server_desc_, SOL_SOCKET, SO_REUSEADDR, &flag, len) == -1) { + KALDI_ERR << "Cannot set socket options!"; + return false; + } + + if (bind(server_desc_, (struct sockaddr *) &h_addr_, sizeof(h_addr_)) == -1) { + KALDI_ERR << "Cannot bind to port: " << port << " (is it taken?)"; + return false; + } + + if (listen(server_desc_, 1) == -1) { + KALDI_ERR << "Cannot listen on port!"; + return false; + } + + KALDI_LOG << "TcpServer: Listening on port: " << port; + + return true; + +} + +TcpServer::~TcpServer() { + Disconnect(); + if (server_desc_ != -1) + close(server_desc_); + delete[] samp_buf_; +} + +int32 TcpServer::Accept() { + KALDI_LOG << "Waiting for client..."; + + socklen_t len; + + len = sizeof(struct sockaddr); + client_desc_ = accept(server_desc_, (struct sockaddr *) &h_addr_, &len); + + struct sockaddr_storage addr; + char ipstr[20]; + + len = sizeof addr; + getpeername(client_desc_, (struct sockaddr *) &addr, &len); + + struct sockaddr_in *s = (struct sockaddr_in *) &addr; + inet_ntop(AF_INET, &s->sin_addr, ipstr, sizeof ipstr); + + client_set_[0].fd = client_desc_; + client_set_[0].events = POLLIN; + + KALDI_LOG << "Accepted connection from: " << ipstr; + + return client_desc_; +} + +bool TcpServer::ReadChunk(size_t len) { + if (buf_len_ != len) { + buf_len_ = len; + delete[] samp_buf_; + samp_buf_ = new int16[len]; + } + + ssize_t ret; + int poll_ret; + size_t to_read = len; + has_read_ = 0; + while (to_read > 0) { + poll_ret = poll(client_set_, 1, read_timeout_); + if (poll_ret == 0) { + KALDI_WARN << "Socket timeout! Disconnecting..."; + break; + } + if (client_set_[0].revents != POLLIN) { + KALDI_WARN << "Socket error! 
Disconnecting..."; + break; + } + ret = read(client_desc_, static_cast(samp_buf_ + has_read_), to_read * sizeof(int16)); + if (ret <= 0) { + KALDI_WARN << "Stream over..."; + break; + } + to_read -= ret / sizeof(int16); + has_read_ += ret / sizeof(int16); + } + + return has_read_ > 0; +} + +Vector TcpServer::GetChunk() { + Vector buf; + + buf.Resize(static_cast(has_read_)); + + for (int i = 0; i < has_read_; i++) + buf(i) = static_cast(samp_buf_[i]); + + return buf; +} + +bool TcpServer::Write(const std::string &msg) { + + const char *p = msg.c_str(); + size_t to_write = msg.size(); + size_t wrote = 0; + while (to_write > 0) { + ssize_t ret = write(client_desc_, static_cast(p + wrote), to_write); + if (ret <= 0) + return false; + + to_write -= ret; + wrote += ret; + } + + return true; +} + +bool TcpServer::WriteLn(const std::string &msg, const std::string &eol) { + if (Write(msg)) + return Write(eol); + else return false; +} + +void TcpServer::Disconnect() { + if (client_desc_ != -1) { + close(client_desc_); + client_desc_ = -1; + } +} +} // namespace kaldi diff --git a/src/onlinebin/online-audio-client.cc b/src/onlinebin/online-audio-client.cc index 241aee426cc..577204b65e7 100644 --- a/src/onlinebin/online-audio-client.cc +++ b/src/onlinebin/online-audio-client.cc @@ -85,7 +85,7 @@ int main(int argc, char** argv) { int32 client_desc = socket(AF_INET, SOCK_STREAM, 0); if (client_desc == -1) { - std::cerr << "ERROR: couldn't create socket!" << std::endl; + std::cerr << "ERROR: couldn't create socket!\n"; return -1; } @@ -96,8 +96,8 @@ int main(int argc, char** argv) { if (addr == INADDR_NONE) { hp = gethostbyname(server_addr_str.c_str()); if (hp == NULL) { - std::cerr << "ERROR: couldn't resolve host string: " << server_addr_str - << std::endl; + std::cerr << "ERROR: couldn't resolve host string: " + << server_addr_str << '\n'; close(client_desc); return -1; } @@ -110,13 +110,13 @@ int main(int argc, char** argv) { server.sin_family = AF_INET; server.sin_port = htons(server_port); if (::connect(client_desc, (struct sockaddr*) &server, sizeof(server))) { - std::cerr << "ERROR: couldn't connect to server!" 
<< std::endl; + std::cerr << "ERROR: couldn't connect to server!\n"; close(client_desc); return -1; } KALDI_VLOG(2) << "Connected to KALDI server at host " << server_addr_str - << " port " << server_port << std::endl; + << " port " << server_port; char* pack_buffer = new char[packet_size]; @@ -124,7 +124,7 @@ int main(int argc, char** argv) { for (; !reader.Done(); reader.Next()) { std::string wav_key = reader.Key(); - KALDI_VLOG(2) << "File: " << wav_key << std::endl; + KALDI_VLOG(2) << "File: " << wav_key; const WaveData &wav_data = reader.Value(); @@ -257,8 +257,7 @@ int main(int argc, char** argv) { { float speed = total_input_dur / total_reco_dur; - KALDI_VLOG(2) << "Recognized (" << speed << "xRT): " << reco_output - << std::endl; + KALDI_VLOG(2) << "Recognized (" << speed << "xRT): " << reco_output; } if (htk) { @@ -266,7 +265,8 @@ int main(int argc, char** argv) { std::ofstream htk_file(name.c_str()); for (size_t i = 0; i < results.size(); i++) htk_file << (int) (results[i].start * 10000000) << " " - << (int) (results[i].end * 10000000) << " " << results[i].word << std::endl; + << (int) (results[i].end * 10000000) << " " + << results[i].word << "\n"; htk_file.close(); } @@ -309,12 +309,13 @@ int main(int argc, char** argv) { std::string name = wav_key + ".vtt"; std::ofstream vtt_file(name.c_str()); - vtt_file << "WEBVTT FILE" << std::endl << std::endl; + vtt_file << "WEBVTT FILE\n\n"; for (size_t i = 0; i < subtitles.size(); i++) - vtt_file << (i + 1) << std::endl << TimeToTimecode(subtitles[i].start) - << " --> " << TimeToTimecode(subtitles[i].end) << std::endl - << subtitles[i].word << std::endl << std::endl; + vtt_file << (i + 1) << "\n" + << TimeToTimecode(subtitles[i].start) << " --> " + << TimeToTimecode(subtitles[i].end) << "\n" + << subtitles[i].word << "\n\n"; vtt_file.close(); } diff --git a/src/onlinebin/online-gmm-decode-faster.cc b/src/onlinebin/online-gmm-decode-faster.cc index 8ad86a489d4..46904dbc59e 100644 --- a/src/onlinebin/online-gmm-decode-faster.cc +++ b/src/onlinebin/online-gmm-decode-faster.cc @@ -70,7 +70,7 @@ int main(int argc, char *argv[]) { OnlineFeatureMatrixOptions feature_reading_opts; decoder_opts.Register(&po, true); feature_reading_opts.Register(&po); - + po.Register("left-context", &left_context, "Number of frames of left context"); po.Register("right-context", &right_context, "Number of frames of right context"); po.Register("acoustic-scale", &acoustic_scale, @@ -86,7 +86,7 @@ int main(int argc, char *argv[]) { po.PrintUsage(); return 1; } - + std::string model_rxfilename = po.GetArg(1), fst_rxfilename = po.GetArg(2), word_syms_filename = po.GetArg(3), @@ -151,7 +151,7 @@ int main(int argc, char *argv[]) { opts.order = kDeltaOrder; feat_transform = new OnlineDeltaInput(opts, &cmn_input); } - + // feature_reading_opts contains number of retries, batch size. OnlineFeatureMatrix feature_matrix(feature_reading_opts, feat_transform); @@ -200,4 +200,4 @@ int main(int argc, char *argv[]) { return -1; } #endif -} // main() +} // main() diff --git a/src/probe/README.slow_expf b/src/probe/README.slow_expf index 00c9ce5be09..c20386b8137 100644 --- a/src/probe/README.slow_expf +++ b/src/probe/README.slow_expf @@ -1,5 +1,6 @@ -On some machines, expf() turns out to be very slow: much slower than its double precision counterpart exp(). -Probably this is concerned with the version of glibc. +On some machines, expf() turns out to be very slow: much slower than its double +precision counterpart exp(). Probably this is concerned with the version of +glibc. 
Here are a couple of examples: @@ -21,5 +22,7 @@ configuration$ ./exp-test exp() time: 0.0028439 expf() time: 0.00713329 -If slow behaviour is detected, then KALDI_NO_EXPF macro will be used, and the Exp() wrapper in base/kaldi-math.h will use exp() even for single precision floats. -The behaviour of expf() is considered to be slow if it is slower than exp() by at least 10%. \ No newline at end of file +If slow behaviour is detected, then KALDI_NO_EXPF macro will be used, and the +Exp() wrapper in base/kaldi-math.h will use exp() even for single precision +floats. The behaviour of expf() is considered to be slow if it is slower than +exp() by at least 10%. diff --git a/src/probe/exp-test.cc b/src/probe/exp-test.cc index 1fd8a64c6a6..d6cc76d4ce2 100644 --- a/src/probe/exp-test.cc +++ b/src/probe/exp-test.cc @@ -17,35 +17,52 @@ // See the Apache 2 License for the specific language governing permissions and // limitations under the License. +// Read Makefile.slow_expf. This test must be compiled with -O0. + #include #include #include "base/timer.h" -#define SAMPLE 100000 +int main() { + int test_iter = 300000; + + // Make sure that the CPU bumps its clock to full speed: run the first loop + // without timing. Then increase the sample iteration count exponentially + // until the loop takes at least 10ms. We run this loop 1/4 of the number of + // actual test iterations and call both exp() and expf(), so that the overall + // test run will take 20 to 60 ms, to ensure a sensibly measurable result. + for (bool first = true; ; first=false) { + kaldi::Timer timer; + for(int i = 0; i < test_iter; i += 4) { + (void)exp((double)(i & 0x0F)); + (void)expf((double)(i & 0x0F)); + } + double time = timer.Elapsed(); + if (first) continue; + if (time > 0.01) break; + test_iter *= 3; + } -int main() { - float dummy = 0.0; kaldi::Timer exp_timer; - for(int i = 0; i < SAMPLE; ++i) { - dummy += exp((double)(i % 10)); + for(int i = 0; i < test_iter; ++i) { + (void)exp((double)(i & 0x0F)); } double exp_time = exp_timer.Elapsed(); kaldi::Timer expf_timer; - for(int i = 0; i < SAMPLE; ++i) { - dummy += expf((double)(i % 10)); + for(int i = 0; i < test_iter; ++i) { + (void)expf((double)(i & 0x0F)); } double expf_time = expf_timer.Elapsed(); - - // Often exp() and expf() perform very similarly, - // so we will replace expf() by exp() only if there is at least 10% difference - if (expf_time < exp_time * 1.1) { + + double ratio = expf_time / exp_time; + if (ratio < 1.1) { + // Often exp() and expf() perform very similarly, so we will replace expf() + // by exp() only if there is at least 10% difference. return 0; - } else { - std::cerr << "exp() time: " << exp_time << std::endl; - std::cerr << "expf() time: " << expf_time << std::endl; - return 1; } - - std::cerr << dummy << std::endl; // No complaint about the unused variable + + std::cerr << ("WARNING: slow expf() detected. 
expf() is slower than exp() " + "by the factor of ") << ratio << "\n"; + return 1; } diff --git a/src/rnnlm/rnnlm-core-training.cc b/src/rnnlm/rnnlm-core-training.cc index 5a1ae97895f..d1a01f7ef66 100644 --- a/src/rnnlm/rnnlm-core-training.cc +++ b/src/rnnlm/rnnlm-core-training.cc @@ -302,7 +302,7 @@ void RnnlmCoreTrainer::PrintMaxChangeStats() const { << ", per-component max-change was enforced " << ((100.0 * num_max_change_per_component_applied_[i]) / num_minibatches_processed_) - << "\% of the time."; + << "% of the time."; i++; } } @@ -312,7 +312,7 @@ void RnnlmCoreTrainer::PrintMaxChangeStats() const { (num_minibatches_processed_ * (config_.backstitch_training_scale == 0.0 ? 1.0 : 1.0 + 1.0 / config_.backstitch_training_interval)) - << "\% of the time."; + << "% of the time."; } void RnnlmCoreTrainer::ProcessOutput( diff --git a/src/rnnlm/rnnlm-embedding-training.cc b/src/rnnlm/rnnlm-embedding-training.cc index c4238c7356a..0b5916b6bba 100644 --- a/src/rnnlm/rnnlm-embedding-training.cc +++ b/src/rnnlm/rnnlm-embedding-training.cc @@ -117,9 +117,9 @@ void RnnlmEmbeddingTrainer::TrainBackstitch( bool is_backstitch_step1, CuMatrixBase *embedding_deriv) { - // backstitch training is incompatible with momentum > 0 + // backstitch training is incompatible with momentum > 0 KALDI_ASSERT(config_.momentum == 0.0); - + // If relevant, do the following: // "embedding_deriv += - 2 * l2_regularize * embedding_mat_" // This is an approximate to the regular l2 regularization (add l2 regularization @@ -130,7 +130,7 @@ void RnnlmEmbeddingTrainer::TrainBackstitch( embedding_deriv->AddMat(1.0 / (1.0 + config_.backstitch_training_scale) * l2_term, *embedding_mat_); } - } + } BaseFloat scale = 1.0; if (config_.use_natural_gradient) { @@ -213,7 +213,7 @@ void RnnlmEmbeddingTrainer::Train( } void RnnlmEmbeddingTrainer::TrainBackstitch( - bool is_backstitch_step1, + bool is_backstitch_step1, const CuArrayBase &active_words, CuMatrixBase *embedding_deriv) { @@ -232,7 +232,7 @@ void RnnlmEmbeddingTrainer::TrainBackstitch( embedding_deriv->AddRows(l2_term / (1.0 + config_.backstitch_training_scale), *embedding_mat_, active_words); } - } + } BaseFloat scale = 1.0; if (config_.use_natural_gradient) { if (is_backstitch_step1) preconditioner_.Freeze(true); @@ -273,7 +273,7 @@ void RnnlmEmbeddingTrainer::PrintStats() { (num_minibatches_ * (config_.backstitch_training_scale == 0.0 ? 
1.0 : 1.0 + 1.0 / config_.backstitch_training_interval)) - << " \% of the time."; + << " % of the time."; Matrix delta_embedding_mat(*embedding_mat_); delta_embedding_mat.AddMat(-1.0, initial_embedding_mat_); diff --git a/src/tfrnnlm/Makefile b/src/tfrnnlm/Makefile index db2b840b959..3dc8d584210 100644 --- a/src/tfrnnlm/Makefile +++ b/src/tfrnnlm/Makefile @@ -16,11 +16,13 @@ TENSORFLOW = ../../tools/tensorflow all: -EXTRA_CXXFLAGS = -Wno-sign-compare -I$(TENSORFLOW)/bazel-tensorflow/external/protobuf/src \ +EXTRA_CXXFLAGS = -Wno-sign-compare \ + -I$(TENSORFLOW)/bazel-tensorflow/external/protobuf_archive/src \ -I$(TENSORFLOW)/bazel-genfiles -I$(TENSORFLOW) \ -I$(TENSORFLOW)/tensorflow/contrib/makefile/downloads/eigen \ -I$(TENSORFLOW)/tensorflow/contrib/makefile/downloads/nsync/public \ - -I$(TENSORFLOW)/tensorflow/contrib/makefile/downloads/protobuf/src + -I$(TENSORFLOW)/tensorflow/contrib/makefile/downloads/protobuf/src \ + -I${TENSORFLOW}/tensorflow/contrib/makefile/downloads/absl OBJFILES = tensorflow-rnnlm.o @@ -29,7 +31,7 @@ TESTFILES = LIBNAME = kaldi-tensorflow-rnnlm ADDLIBS = ../lm/kaldi-lm.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \ - ../base/kaldi-base.a + ../base/kaldi-base.a LDLIBS += -lz -ldl -fPIC -lrt LDLIBS += -L$(TENSORFLOW)/bazel-bin/tensorflow -ltensorflow_cc -ltensorflow_framework diff --git a/src/tfrnnlmbin/Makefile b/src/tfrnnlmbin/Makefile index 4beeeb0d594..77fe58c088c 100644 --- a/src/tfrnnlmbin/Makefile +++ b/src/tfrnnlmbin/Makefile @@ -14,11 +14,13 @@ TENSORFLOW = $(shell pwd)/../../tools/tensorflow all: -EXTRA_CXXFLAGS = -Wno-sign-compare -I$(TENSORFLOW)/bazel-tensorflow/external/protobuf/src \ +EXTRA_CXXFLAGS = -Wno-sign-compare \ + -I$(TENSORFLOW)/bazel-tensorflow/external/protobuf_archive/src \ -I$(TENSORFLOW)/bazel-genfiles -I$(TENSORFLOW) \ -I$(TENSORFLOW)/tensorflow/contrib/makefile/downloads/eigen \ -I$(TENSORFLOW)/tensorflow/contrib/makefile/downloads/nsync/public \ - -I$(TENSORFLOW)/tensorflow/contrib/makefile/downloads/protobuf/src + -I$(TENSORFLOW)/tensorflow/contrib/makefile/downloads/protobuf/src \ + -I${TENSORFLOW}/tensorflow/contrib/makefile/downloads/absl include ../kaldi.mk BINFILES = lattice-lmrescore-tf-rnnlm lattice-lmrescore-tf-rnnlm-pruned @@ -30,11 +32,11 @@ TESTFILES = ADDLIBS = ../lat/kaldi-lat.a ../lm/kaldi-lm.a ../fstext/kaldi-fstext.a \ ../hmm/kaldi-hmm.a ../tree/kaldi-tree.a ../util/kaldi-util.a \ ../matrix/kaldi-matrix.a ../base/kaldi-base.a \ - ../tfrnnlm/kaldi-tensorflow-rnnlm.a + ../tfrnnlm/kaldi-tensorflow-rnnlm.a LDLIBS += -lz -ldl -fPIC -lrt LDLIBS += -L$(TENSORFLOW)/bazel-bin/tensorflow -ltensorflow_cc -ltensorflow_framework -LDFLAGS += -Wl,-rpath=$(shell pwd)/../../tools/tensorflow/bazel-bin/tensorflow/ +LDFLAGS += -Wl,-rpath,$(TENSORFLOW)/bazel-bin/tensorflow/ include ../makefiles/default_rules.mk diff --git a/src/tree/build-tree-questions.h b/src/tree/build-tree-questions.h index a6bcfdd500b..22f12d62912 100644 --- a/src/tree/build-tree-questions.h +++ b/src/tree/build-tree-questions.h @@ -52,7 +52,7 @@ struct QuestionsForKey { // Configuration class associated with a particular ke std::vector > initial_questions; RefineClustersOptions refine_opts; // if refine_opts.max_iter == 0, // we just pick from the initial questions. - + QuestionsForKey(int32 num_iters = 5): refine_opts(num_iters, 2) { // refine_cfg with 5 iters and top-n = 2 (this is no restriction because // RefineClusters called with 2 clusters; would get set to that anyway as @@ -102,7 +102,9 @@ class Questions { // careful, this is a class. 
KALDI_ASSERT(keys_out != NULL); CopyMapKeysToVector(key_idx_, keys_out); } - const bool HasQuestionsForKey(EventKeyType key) const { return (key_idx_.count(key) != 0); } + bool HasQuestionsForKey(EventKeyType key) const { + return (key_idx_.count(key) != 0); + } ~Questions() { kaldi::DeletePointers(&key_options_); } diff --git a/src/tree/build-tree-utils.cc b/src/tree/build-tree-utils.cc index 4c9be833185..254d7ec36d8 100644 --- a/src/tree/build-tree-utils.cc +++ b/src/tree/build-tree-utils.cc @@ -400,7 +400,7 @@ BaseFloat FindBestSplitForKey(const BuildTreeStatsType &stats, for (size_t i = 0;i < assignments.size();i++) if (assignments[i] == 1) yes_set.push_back(i); } *yes_set_out = yes_set; - + DeletePointers(&clusters); #ifdef KALDI_PARANOID { // Check the "ans" is correct. @@ -763,10 +763,9 @@ EventMap *GetToLengthMap(const BuildTreeStatsType &stats, int32 P, std::vector stats_by_phone; try { SplitStatsByKey(stats, P, &stats_by_phone); - } catch(const std::runtime_error &err) { - KALDI_ERR << "Caught exception in GetToLengthMap: you seem " - "to have provided invalid stats [no central-phone " - "key]. Message was: " << err.what(); + } catch(const KaldiFatalError &) { + KALDI_ERR << + "You seem to have provided invalid stats [no central-phone key]."; } std::map phone_to_length; for (size_t p = 0; p < stats_by_phone.size(); p++) { @@ -774,10 +773,9 @@ EventMap *GetToLengthMap(const BuildTreeStatsType &stats, int32 P, std::vector stats_by_length; try { SplitStatsByKey(stats_by_phone[p], kPdfClass, &stats_by_length); - } catch(const std::runtime_error &err) { - KALDI_ERR << "Caught exception in GetToLengthMap: you seem " - "to have provided invalid stats [no position " - "key]. Message was: " << err.what(); + } catch(const KaldiFatalError &) { + KALDI_ERR << + "You seem to have provided invalid stats [no position key]."; } size_t length = stats_by_length.size(); for (size_t i = 0; i < length; i++) { @@ -868,7 +866,7 @@ EventMap *ClusterEventMapToNClustersRestrictedByMap( int32 *num_removed_ptr) { std::vector split_stats; SplitStatsByMap(stats, e_restrict, &split_stats); - + if (num_clusters_required < split_stats.size()) { KALDI_WARN << "num-clusters-required is less than size of map. Not doing anything."; if (num_removed_ptr) *num_removed_ptr = 0; @@ -904,10 +902,10 @@ EventMap *ClusterEventMapToNClustersRestrictedByMap( if (j > max_index) max_index = j; } } - + normalizer += SumClusterableNormalizer(summed_stats_contiguous[i]); - } else { - // Even if split_stats[i] is empty, a cluster will be assigned to + } else { + // Even if split_stats[i] is empty, a cluster will be assigned to // that. To compensate, we decrease the num-clusters required. num_non_empty_clusters_required--; } @@ -919,7 +917,7 @@ EventMap *ClusterEventMapToNClustersRestrictedByMap( if (num_non_empty_clusters_required > num_non_empty_clusters) { KALDI_WARN << "Cannot get required num-clusters " << num_clusters_required << " as number of non-empty clusters required is larger than " - << " number of non-empty clusters: " << num_non_empty_clusters_required + << " number of non-empty clusters: " << num_non_empty_clusters_required << " > " << num_non_empty_clusters; if (num_removed_ptr) *num_removed_ptr = 0; return e_in.Copy(); @@ -929,7 +927,7 @@ EventMap *ClusterEventMapToNClustersRestrictedByMap( BaseFloat change = ClusterBottomUpCompartmentalized( summed_stats_contiguous, std::numeric_limits::infinity(), - num_non_empty_clusters_required, + num_non_empty_clusters_required, NULL, // don't need clusters out. 
&assignments); // this algorithm is quadratic, so might be quite slow. @@ -1052,7 +1050,7 @@ EventMap *GetStubMap(int32 P, // Do a split. Recurse. size_t half_sz = phone_sets.size() / 2; std::vector >::const_iterator half_phones = - phone_sets.begin() + half_sz; + phone_sets.begin() + half_sz; std::vector::const_iterator half_share = share_roots.begin() + half_sz; std::vector > phone_sets_1, phone_sets_2; @@ -1127,4 +1125,3 @@ bool ConvertStats(int32 oldN, int32 oldP, int32 newN, int32 newP, } // end namespace kaldi - diff --git a/src/tree/cluster-utils-test.cc b/src/tree/cluster-utils-test.cc index fd5d9690939..8eee3fb5505 100644 --- a/src/tree/cluster-utils-test.cc +++ b/src/tree/cluster-utils-test.cc @@ -97,10 +97,11 @@ static void TestObjfPlus() { AssertEqual(a.Objf(), (BaseFloat)0.0); AssertEqual(b.Objf(), (BaseFloat)0.0); AssertEqual( a.ObjfPlus(b), -0.5 * (1.0-2.5)*(1.0-2.5)); // 0.5 because half-distance, squared = 1/4, times two points... - KALDI_LOG << "Non-binary Output: "<<'\n'; - a.Write(KALDI_LOG, false); - KALDI_LOG << "Binary Output: "<<'\n'; - a.Write(KALDI_LOG, true); + KALDI_LOG << "Non-binary Output:"; + a.Write(std::cerr, false); + std::cerr << "\nBinary Output:\n"; + a.Write(std::cerr, true); + std::cerr << "\n"; } static void TestObjfMinus() { @@ -395,7 +396,7 @@ static void TestClusterKMeansVector() { std::vector points; for (size_t j = 0; j < n_clust; j++) { size_t n_points = 1 + Rand() % 5; - + Vector clust_center(dim); clust_center.SetRandn(); for (size_t k = 0; k < n_points; k++) { @@ -573,5 +574,3 @@ int main() { TestClusterBottomUp(); TestRefineClusters(); } - - diff --git a/src/tree/event-map.cc b/src/tree/event-map.cc index f5b84e68d64..1f2581b4751 100644 --- a/src/tree/event-map.cc +++ b/src/tree/event-map.cc @@ -289,7 +289,7 @@ void EventMap::Check(const std::vector > // static member of EventMap. bool EventMap::Lookup(const EventType &event, EventKeyType key, EventValueType *ans) { - // this assumes the the "event" array is sorted (e.g. on the KeyType value; + // this assumes that the "event" array is sorted (e.g. on the KeyType value; // just doing std::sort will do this) and has no duplicate values with the same // key. call Check() to verify this. #ifdef KALDI_PARANOID diff --git a/src/util/edit-distance-inl.h b/src/util/edit-distance-inl.h index c1d1682804c..3304b27d0bf 100644 --- a/src/util/edit-distance-inl.h +++ b/src/util/edit-distance-inl.h @@ -35,8 +35,8 @@ int32 LevenshteinEditDistance(const std::vector &a, // elements a_0 ... a_{M-1} and b_0 ... b_{N-1}. // We are computing the recursion // E(m, n) = min( E(m-1, n-1) + (1-delta(a_{m-1}, b_{n-1})), - // E(m-1, n), - // E(m, n-1) ). + // E(m-1, n) + 1, + // E(m, n-1) + 1). // where E(m, n) is defined for m = 0..M and n = 0..N and out-of- // bounds quantities are considered to be infinity (i.e. the // recursion does not visit them). diff --git a/src/util/hash-list-inl.h b/src/util/hash-list-inl.h index 3fe16182b82..da6165af784 100644 --- a/src/util/hash-list-inl.h +++ b/src/util/hash-list-inl.h @@ -121,15 +121,24 @@ HashList::~HashList() { } } - template -void HashList::Insert(I key, T val) { +inline typename HashList::Elem* HashList::Insert(I key, T val) { size_t index = (static_cast(key) % hash_size_); HashBucket &bucket = buckets_[index]; + // Check the element is existing or not. + if (bucket.last_elem != NULL) { + Elem *head = (bucket.prev_bucket == static_cast(-1) ? 
+ list_head_ : + buckets_[bucket.prev_bucket].last_elem->tail), + *tail = bucket.last_elem->tail; + for (Elem *e = head; e != tail; e = e->tail) + if (e->key == key) return e; + } + + // This is a new element. Insert it. Elem *elem = New(); elem->key = key; elem->val = val; - if (bucket.last_elem == NULL) { // Unoccupied bucket. Insert at // head of bucket list (which is tail of regular list, they go in // opposite directions). @@ -152,6 +161,7 @@ void HashList::Insert(I key, T val) { bucket.last_elem->tail = elem; bucket.last_elem = elem; } + return elem; } template diff --git a/src/util/hash-list.h b/src/util/hash-list.h index 67257d053cd..9ae0043f050 100644 --- a/src/util/hash-list.h +++ b/src/util/hash-list.h @@ -86,14 +86,12 @@ template class HashList { /// is free to modify the "val" element. inline Elem *Find(I key); - /// Insert inserts a new element into the hashtable/stored list. By calling - /// this, - /// the user asserts that it is not already present (e.g. Find was called and - /// returned NULL). With current code, calling this if an element already - /// exists will result in duplicate elements in the structure, and Find() - /// will find the first one that was added. - /// [but we don't guarantee this behavior]. - inline void Insert(I key, T val); + /// Insert inserts a new element into the hashtable/stored list. + /// Because element keys in a hashtable are unique, this operation checks + /// whether each inserted element has a key equivalent to the one of an + /// element already in the hashtable. If so, the element is not inserted, + /// returning an pointer to this existing element. + inline Elem *Insert(I key, T val); /// Insert inserts another element with same key into the hashtable/ /// stored list. diff --git a/src/util/kaldi-io.cc b/src/util/kaldi-io.cc index 493a335f2db..96cd8fa1041 100644 --- a/src/util/kaldi-io.cc +++ b/src/util/kaldi-io.cc @@ -58,7 +58,7 @@ typedef basic_pipebuf PipebufType; namespace kaldi { -std::string PrintableRxfilename(std::string rxfilename) { +std::string PrintableRxfilename(const std::string &rxfilename) { if (rxfilename == "" || rxfilename == "-") { return "standard input"; } else { @@ -70,12 +70,12 @@ std::string PrintableRxfilename(std::string rxfilename) { } -std::string PrintableWxfilename(std::string wxfilename) { +std::string PrintableWxfilename(const std::string &wxfilename) { if (wxfilename == "" || wxfilename == "-") { return "standard output"; } else { // If this call to Escape later causes compilation issues, - // just replace it with "return rxfilename"; it's only a + // just replace it with "return wxfilename"; it's only a // pretty-printing issue. return ParseOptions::Escape(wxfilename); } diff --git a/src/util/kaldi-io.h b/src/util/kaldi-io.h index cf25b6deeb3..b7f166980fb 100644 --- a/src/util/kaldi-io.h +++ b/src/util/kaldi-io.h @@ -59,13 +59,13 @@ class InputImplBase; // Forward decl; defined in a .cc file // Documents\\boo" // (whatever the actual file-system interprets) // (2) Standard output: "" or "-" -// (3) A pipe: e.g. "gunzip -c /tmp/abc.gz |" +// (3) A pipe: e.g. "| gzip -c > /tmp/abc.gz" // // // A "rxfilename" is an extended filename for reading. It can take four forms: // (1) An actual filename, whatever the file-system can read, e.g. "/my/file". // (2) Standard input: "" or "-" -// (3) A pipe: e.g. "| gzip -c > /tmp/abc.gz" +// (3) A pipe: e.g. 
"gunzip -c /tmp/abc.gz |" // (4) An offset into a file, e.g.: "/mnt/blah/data/1.ark:24871" // [these are created by the Table and TableWriter classes; I may also write // a program that creates them for arbitrary files] @@ -99,7 +99,7 @@ enum OutputType { /// |. /// - kFileOutput: Normal filenames /// - kStandardOutput: The empty string or "-", interpreted as standard output -/// - kPipeOutput: pipes, e.g. "gunzip -c some_file.gz |" +/// - kPipeOutput: pipes, e.g. "| gzip -c > /tmp/abc.gz" OutputType ClassifyWxfilename(const std::string &wxfilename); enum InputType { @@ -116,7 +116,7 @@ enum InputType { /// with trailing |. /// - kFileInput: normal filenames /// - kStandardInput: the empty string or "-" -/// - kPipeInput: e.g. "| gzip -c > blah.gz" +/// - kPipeInput: e.g. "gunzip -c /tmp/abc.gz |" /// - kOffsetFileInput: offsets into files, e.g. /some/filename:12970 InputType ClassifyRxfilename(const std::string &rxfilename); @@ -182,7 +182,7 @@ class Output { // Input interprets four kinds of filenames: // (1) Normal filenames // (2) The empty string or "-", interpreted as standard output -// (3) Pipes, e.g. "| gzip -c > some_file.gz" +// (3) A pipe: e.g. "gunzip -c /tmp/abc.gz |" // (4) Offsets into [real] files, e.g. "/my/filename:12049" // The last one has no correspondence in Output. @@ -264,12 +264,12 @@ template inline void WriteKaldiObject(const C &c, /// PrintableRxfilename turns the rxfilename into a more human-readable /// form for error reporting, i.e. it does quoting and escaping and /// replaces "" or "-" with "standard input". -std::string PrintableRxfilename(std::string rxfilename); +std::string PrintableRxfilename(const std::string &rxfilename); -/// PrintableWxfilename turns the filename into a more human-readable +/// PrintableWxfilename turns the wxfilename into a more human-readable /// form for error reporting, i.e. it does quoting and escaping and /// replaces "" or "-" with "standard output". -std::string PrintableWxfilename(std::string wxfilename); +std::string PrintableWxfilename(const std::string &wxfilename); /// @} diff --git a/src/util/kaldi-pipebuf.h b/src/util/kaldi-pipebuf.h index 9b83cdccc3d..61034ac2757 100644 --- a/src/util/kaldi-pipebuf.h +++ b/src/util/kaldi-pipebuf.h @@ -82,7 +82,6 @@ class basic_pipebuf : public std::basic_filebuf { }; // class basic_pipebuf #endif // _MSC_VER -}; // namespace kaldi +} // namespace kaldi #endif // KALDI_UTIL_KALDI_PIPEBUF_H_ - diff --git a/src/util/kaldi-table-inl.h b/src/util/kaldi-table-inl.h index 465f800b26c..6aca2f137e3 100644 --- a/src/util/kaldi-table-inl.h +++ b/src/util/kaldi-table-inl.h @@ -48,7 +48,7 @@ template class SequentialTableReaderImplBase { // called on a just-allocated object. virtual bool Open(const std::string &rxfilename) = 0; // Done() should be called on a successfully opened, not-closed object. - // only throws if called a the wrong time (i.e. code error). + // only throws if called at the wrong time (i.e. code error). virtual bool Done() const = 0; // Returns true if the reader is open [i.e. Open() succeeded and // the user has not called Close()] @@ -1152,7 +1152,7 @@ class TableWriterScriptImpl: public TableWriterImplBase { &script_rxfilename_, &opts_); KALDI_ASSERT(ws == kScriptWspecifier); // or wrongly called. - KALDI_ASSERT(script_.empty()); // no way it could be nonempty at this poin. + KALDI_ASSERT(script_.empty()); // no way it could be nonempty at this point. 
if (!ReadScriptFile(script_rxfilename_, true, // print any warnings diff --git a/src/util/kaldi-table-test.cc b/src/util/kaldi-table-test.cc index b23ba63a18a..572e7a7ccec 100644 --- a/src/util/kaldi-table-test.cc +++ b/src/util/kaldi-table-test.cc @@ -90,20 +90,20 @@ void UnitTestReadScriptFile() { void UnitTestClassifyWspecifier() { { - std::string a = "b,ark:foo|"; + std::string a = "b,ark:|foo"; std::string ark = "x", scp = "y"; WspecifierOptions opts; WspecifierType ans = ClassifyWspecifier(a, &ark, &scp, &opts); - KALDI_ASSERT(ans == kArchiveWspecifier && ark == "foo|" && scp == "" && + KALDI_ASSERT(ans == kArchiveWspecifier && ark == "|foo" && scp == "" && opts.binary == true); } { - std::string a = "t,ark:foo|"; + std::string a = "t,ark:|foo"; std::string ark = "x", scp = "y"; WspecifierOptions opts; WspecifierType ans = ClassifyWspecifier(a, &ark, &scp, &opts); - KALDI_ASSERT(ans == kArchiveWspecifier && ark == "foo|" && scp == "" && + KALDI_ASSERT(ans == kArchiveWspecifier && ark == "|foo" && scp == "" && opts.binary == false); } diff --git a/src/util/kaldi-table.cc b/src/util/kaldi-table.cc index 99ddafb2017..1aeceb2bb7d 100644 --- a/src/util/kaldi-table.cc +++ b/src/util/kaldi-table.cc @@ -223,7 +223,7 @@ WspecifierType ClassifyWspecifier(const std::string &wspecifier, RspecifierType ClassifyRspecifier(const std::string &rspecifier, - std::string *wxfilename, + std::string *rxfilename, RspecifierOptions *opts) { // Examples // ark:rxfilename -> kArchiveRspecifier @@ -247,7 +247,7 @@ RspecifierType ClassifyRspecifier(const std::string &rspecifier, // Improperly formed Rspecifiers will be classified as kNoRspecifier. - if (wxfilename) wxfilename->clear(); + if (rxfilename) rxfilename->clear(); if (opts != NULL) *opts = RspecifierOptions(); // Make sure all the defaults are as in the @@ -308,8 +308,8 @@ RspecifierType ClassifyRspecifier(const std::string &rspecifier, } } if ((rs == kArchiveRspecifier || rs == kScriptRspecifier) - && wxfilename != NULL) - *wxfilename = after_colon; + && rxfilename != NULL) + *rxfilename = after_colon; return rs; } diff --git a/src/util/kaldi-table.h b/src/util/kaldi-table.h index e3a80b2743b..6865cea14ec 100644 --- a/src/util/kaldi-table.h +++ b/src/util/kaldi-table.h @@ -67,7 +67,7 @@ typedef std::vector KeyList; // t means text mode. // b means binary mode. // f means flush the stream after writing each entry. -// (nf means don't flush, and isn't very useful as the default is to flush). +// (nf means don't flush, and the default is not to flush). // p means permissive mode, when writing to an "scp" file only: will ignore // missing scp entries, i.e. won't write anything for those files but will // return success status). @@ -79,7 +79,7 @@ typedef std::vector KeyList; // ark,b:- // // The meanings of rxfilename and wxfilename are as described in -// kaldi-stream.h (they are filenames but include pipes, stdin/stdout +// kaldi-io.h (they are filenames but include pipes, stdin/stdout // and so on; filename is a regular filename. // @@ -100,7 +100,7 @@ typedef std::vector KeyList; // key filename:12407 // where the number is the byte offset into the file. // In this case we restrict the archive-filename to be an actual filename, -// as we can't see a situtation where an extended filename would make sense +// as we can't see a situation where an extended filename would make sense // for this (we can't fseek() in pipes). 
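The HashList::Insert() change earlier in this patch (hash-list-inl.h / hash-list.h) turns insertion into find-or-insert: if the key is already present, nothing is inserted and a pointer to the existing element is returned, with its value left untouched. The contract is roughly that of std::unordered_map::emplace(); a standard-library analogy (not Kaldi code) for illustration:

```cpp
#include <cassert>
#include <string>
#include <unordered_map>

// std::unordered_map::emplace() mirrors the new HashList::Insert() contract:
// on a duplicate key nothing is inserted, and you get back a handle to the
// element that was already there.
int main() {
  std::unordered_map<int, std::string> table;
  auto first = table.emplace(42, "original");
  assert(first.second);                         // inserted
  auto second = table.emplace(42, "duplicate");
  assert(!second.second);                       // key existed; not inserted
  assert(second.first->second == "original");   // existing value is returned
  return 0;
}
```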
enum WspecifierType { @@ -236,7 +236,7 @@ class RandomAccessTableReader { RandomAccessTableReader(): impl_(NULL) { } - // This constructor equivalent to default constructor + "open", but + // This constructor is equivalent to default constructor + "open", but // throws on error. explicit RandomAccessTableReader(const std::string &rspecifier); @@ -315,7 +315,7 @@ class SequentialTableReader { // Return reference to the current value. It's only valid to call this if // Done() returned false. The reference is valid till next call to this - // object. If will throw if you are reading an scp file, did not specify the + // object. It will throw if you are reading an scp file, did not specify the // "permissive" (p) option and the file cannot be read. [The permissive // option makes it behave as if that key does not even exist, if the // corresponding file cannot be read.] You probably wouldn't want to catch @@ -383,8 +383,7 @@ class TableWriter { // Returns true if open for writing. bool IsOpen() const; - // Write the object. Throws std::runtime_error on error (via the - // KALDI_ERR macro) + // Write the object. Throws KaldiFatalError on error via the KALDI_ERR macro. inline void Write(const std::string &key, const T &value) const; diff --git a/src/util/parse-options.cc b/src/util/parse-options.cc index 2f75cb655f9..667d9e91c94 100644 --- a/src/util/parse-options.cc +++ b/src/util/parse-options.cc @@ -323,14 +323,7 @@ int ParseOptions::Read(int argc, const char *const argv[]) { #else const char *c = strrchr(argv[0], '/'); #endif - if (c == NULL) - c = argv[0]; - else - c++; - char *program_name = new char[strlen(c)+1]; - strcpy(program_name, c); - delete [] g_program_name; - g_program_name = program_name; + SetProgramName(c == NULL ? argv[0] : c + 1); } // first pass: look for config parameter, look for priority for (i = 1; i < argc; i++) { diff --git a/src/util/simple-io-funcs.cc b/src/util/simple-io-funcs.cc index 3d770dfff99..cb732a10a6d 100644 --- a/src/util/simple-io-funcs.cc +++ b/src/util/simple-io-funcs.cc @@ -21,7 +21,7 @@ namespace kaldi { -bool WriteIntegerVectorSimple(std::string wxfilename, +bool WriteIntegerVectorSimple(const std::string &wxfilename, const std::vector &list) { kaldi::Output ko; // false, false is: text-mode, no Kaldi header. @@ -30,7 +30,8 @@ bool WriteIntegerVectorSimple(std::string wxfilename, return ko.Close(); } -bool ReadIntegerVectorSimple(std::string rxfilename, std::vector *list) { +bool ReadIntegerVectorSimple(const std::string &rxfilename, + std::vector *list) { kaldi::Input ki; if (!ki.OpenTextMode(rxfilename)) return false; std::istream &is = ki.Stream(); @@ -42,7 +43,7 @@ bool ReadIntegerVectorSimple(std::string rxfilename, std::vector *list) { return is.eof(); // should be eof, or junk at end of file. } -bool WriteIntegerVectorVectorSimple(std::string wxfilename, +bool WriteIntegerVectorVectorSimple(const std::string &wxfilename, const std::vector > &list) { kaldi::Output ko; // false, false is: text-mode, no Kaldi header. 
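As a usage sketch tying together the Table documentation above and the corrected pipe conventions in kaldi-io.h: the wxfilename part of a wspecifier may start with '|' (data is written into a command), and the rxfilename part of an rspecifier may end with '|' (data is read out of a command). Int32Writer and SequentialInt32Reader are assumed here to be the usual convenience typedefs from util/table-types.h, which this patch does not touch.

```cpp
#include <iostream>
#include <string>
#include "util/table-types.h"

void TableIoExample() {
  {
    // "ark,t:" = text-mode archive; the wxfilename part is a pipe into gzip,
    // so it begins with '|'.
    kaldi::Int32Writer writer("ark,t:| gzip -c > /tmp/counts.ark.gz");
    writer.Write("utt1", 42);
    writer.Write("utt2", 7);
  }  // the writer's destructor closes the archive (and the pipe).

  // The rxfilename part is a pipe that produces the data, so it ends with '|'.
  kaldi::SequentialInt32Reader reader("ark:gunzip -c /tmp/counts.ark.gz |");
  for (; !reader.Done(); reader.Next())
    std::cout << reader.Key() << " -> " << reader.Value() << "\n";
}
```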
@@ -58,7 +59,7 @@ bool WriteIntegerVectorVectorSimple(std::string wxfilename, return ko.Close(); } -bool ReadIntegerVectorVectorSimple(std::string rxfilename, +bool ReadIntegerVectorVectorSimple(const std::string &rxfilename, std::vector > *list) { kaldi::Input ki; if (!ki.OpenTextMode(rxfilename)) return false; diff --git a/src/util/simple-io-funcs.h b/src/util/simple-io-funcs.h index 58445356e02..30b90acb399 100644 --- a/src/util/simple-io-funcs.h +++ b/src/util/simple-io-funcs.h @@ -35,13 +35,14 @@ namespace kaldi { /// WriteToList attempts to write this list of integers, one per line, /// to the given file, in text format. /// returns true if succeeded. -bool WriteIntegerVectorSimple(std::string wxfilename, +bool WriteIntegerVectorSimple(const std::string &wxfilename, const std::vector &v); /// ReadFromList attempts to read this list of integers, one per line, /// from the given file, in text format. /// returns true if succeeded. -bool ReadIntegerVectorSimple(std::string rxfilename, std::vector *v); +bool ReadIntegerVectorSimple(const std::string &rxfilename, + std::vector *v); // This is a file format like: // 1 2 @@ -49,10 +50,10 @@ bool ReadIntegerVectorSimple(std::string rxfilename, std::vector *v); // // 4 5 6 // etc. -bool WriteIntegerVectorVectorSimple(std::string wxfilename, +bool WriteIntegerVectorVectorSimple(const std::string &wxfilename, const std::vector > &v); -bool ReadIntegerVectorVectorSimple(std::string rxfilename, +bool ReadIntegerVectorVectorSimple(const std::string &rxfilename, std::vector > *v); diff --git a/src/util/stl-utils.h b/src/util/stl-utils.h index a1506f557a7..647073a2215 100644 --- a/src/util/stl-utils.h +++ b/src/util/stl-utils.h @@ -97,8 +97,6 @@ void CopySetToVector(const std::set &s, std::vector *v) { template void CopySetToVector(const unordered_set &s, std::vector *v) { - // adds members of s to v, in sorted order from lowest to highest - // (because the set was in sorted order). 
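A round-trip sketch for the simple-io helpers whose signatures are changed above to take const std::string&. The vector element type is int32 in Kaldi's headers; the template arguments are missing from the flattened diff text, so treat that detail as an assumption here.

```cpp
#include <cassert>
#include <vector>
#include "base/kaldi-types.h"
#include "util/simple-io-funcs.h"

// Write a small integer vector in text form and read it back; both helpers
// report failure by returning false rather than throwing.
void SimpleIoRoundTrip() {
  std::vector<kaldi::int32> v, v2;
  v.push_back(1);
  v.push_back(2);
  v.push_back(3);
  bool ok = kaldi::WriteIntegerVectorSimple("/tmp/ints.txt", v);
  assert(ok);
  ok = kaldi::ReadIntegerVectorSimple("/tmp/ints.txt", &v2);
  assert(ok && v2 == v);
}
```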
KALDI_ASSERT(v != NULL); v->resize(s.size()); typename unordered_set::const_iterator siter = s.begin(), send = s.end(); diff --git a/src/util/text-utils-test.cc b/src/util/text-utils-test.cc index 5bfe4cb24d0..3b58f4f1dd1 100644 --- a/src/util/text-utils-test.cc +++ b/src/util/text-utils-test.cc @@ -2,6 +2,7 @@ // Copyright 2009-2011 Microsoft Corporation // 2017 Johns Hopkins University (author: Daniel Povey) +// 2015 Vimal Manohar (Johns Hopkins University) // See ../../COPYING for clarification regarding multiple authors // @@ -324,6 +325,193 @@ void TestStringsApproxEqual() { KALDI_ASSERT(!StringsApproxEqual("x 1.0 y", "x 1.0001 y", 4)); } +void UnitTestConfigLineParse() { + std::string str; + { + ConfigLine cfl; + str = "a-b xx=yyy foo=bar baz=123 ba=1:2"; + bool status = cfl.ParseLine(str); + KALDI_ASSERT(status && cfl.FirstToken() == "a-b"); + + KALDI_ASSERT(cfl.HasUnusedValues()); + std::string str_value; + KALDI_ASSERT(cfl.GetValue("xx", &str_value)); + KALDI_ASSERT(str_value == "yyy"); + KALDI_ASSERT(cfl.HasUnusedValues()); + KALDI_ASSERT(cfl.GetValue("foo", &str_value)); + KALDI_ASSERT(str_value == "bar"); + KALDI_ASSERT(cfl.HasUnusedValues()); + KALDI_ASSERT(!cfl.GetValue("xy", &str_value)); + KALDI_ASSERT(cfl.GetValue("baz", &str_value)); + KALDI_ASSERT(str_value == "123"); + + std::vector int_values; + KALDI_ASSERT(!cfl.GetValue("xx", &int_values)); + KALDI_ASSERT(cfl.GetValue("baz", &int_values)); + KALDI_ASSERT(cfl.HasUnusedValues()); + KALDI_ASSERT(int_values.size() == 1 && int_values[0] == 123); + KALDI_ASSERT(cfl.GetValue("ba", &int_values)); + KALDI_ASSERT(int_values.size() == 2 && int_values[0] == 1 && int_values[1] == 2); + KALDI_ASSERT(!cfl.HasUnusedValues()); + } + + { + ConfigLine cfl; + str = "a-b baz=x y z pp = qq ab =cd ac= bd"; + KALDI_ASSERT(!cfl.ParseLine(str)); + } + { + ConfigLine cfl; + str = "a-b baz=x y z pp = qq ab=cd ac=bd"; + KALDI_ASSERT(!cfl.ParseLine(str)); + } + { + ConfigLine cfl; + str = "foo-bar"; + KALDI_ASSERT(cfl.ParseLine(str)); + } + { + ConfigLine cfl; + str = "foo-bar a=b c d f=g"; + std::string value; + KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == "foo-bar" && + cfl.GetValue("a", &value) && value == "b c d" && + cfl.GetValue("f", &value) && value == "g" && + !cfl.HasUnusedValues()); + } + { + ConfigLine cfl; + str = "zzz a=b baz"; + KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == "zzz" && + cfl.UnusedValues() == "a=b baz"); + } + { + ConfigLine cfl; + str = "xxx a=b baz "; + KALDI_ASSERT(cfl.ParseLine(str) && cfl.UnusedValues() == "a=b baz"); + } + { + ConfigLine cfl; + str = "xxx a=b =c"; + KALDI_ASSERT(!cfl.ParseLine(str)); + } + { + ConfigLine cfl; + str = "xxx baz='x y z' pp=qq ab=cd ac=bd"; + KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == "xxx"); + std::string str_value; + KALDI_ASSERT(cfl.GetValue("baz", &str_value)); + KALDI_ASSERT(str_value == "x y z"); + KALDI_ASSERT(cfl.GetValue("pp", &str_value)); + KALDI_ASSERT(str_value == "qq"); + KALDI_ASSERT(cfl.UnusedValues() == "ab=cd ac=bd"); + KALDI_ASSERT(cfl.GetValue("ab", &str_value)); + KALDI_ASSERT(str_value == "cd"); + KALDI_ASSERT(cfl.UnusedValues() == "ac=bd"); + KALDI_ASSERT(cfl.HasUnusedValues()); + KALDI_ASSERT(cfl.GetValue("ac", &str_value)); + KALDI_ASSERT(str_value == "bd"); + KALDI_ASSERT(!cfl.HasUnusedValues()); + } + + { + ConfigLine cfl; + str = "x baz= pp = qq flag=t "; + KALDI_ASSERT(!cfl.ParseLine(str)); + } + { + ConfigLine cfl; + str = " x baz= pp=qq flag=t "; + KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == "x"); + + 
std::string str_value; + KALDI_ASSERT(cfl.GetValue("baz", &str_value)); + KALDI_ASSERT(str_value == ""); + KALDI_ASSERT(cfl.GetValue("pp", &str_value)); + KALDI_ASSERT(str_value == "qq"); + KALDI_ASSERT(cfl.HasUnusedValues()); + KALDI_ASSERT(cfl.GetValue("flag", &str_value)); + KALDI_ASSERT(str_value == "t"); + KALDI_ASSERT(!cfl.HasUnusedValues()); + + bool bool_value = false; + KALDI_ASSERT(cfl.GetValue("flag", &bool_value)); + KALDI_ASSERT(bool_value); + } + + { + ConfigLine cfl; + str = "xx _baz=a -pp=qq"; + KALDI_ASSERT(!cfl.ParseLine(str)); + } + { + ConfigLine cfl; + str = "xx 0baz=a pp=qq"; + KALDI_ASSERT(!cfl.ParseLine(str)); + } + { + ConfigLine cfl; + str = "xx -baz=a pp=qq"; + KALDI_ASSERT(!cfl.ParseLine(str)); + } + { + ConfigLine cfl; + str = "xx _baz'=a pp=qq"; + KALDI_ASSERT(!cfl.ParseLine(str)); + } + { + ConfigLine cfl; + str = " baz=g"; + KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == ""); + bool flag; + KALDI_ASSERT(!cfl.GetValue("baz", &flag)); + } + { + ConfigLine cfl; + str = "xx _baz1=a pp=qq"; + KALDI_ASSERT(cfl.ParseLine(str)); + + std::string str_value; + KALDI_ASSERT(cfl.GetValue("_baz1", &str_value)); + } +} + +void UnitTestReadConfig() { + std::string str = "a-b alpha=aa beta=\"b b\"# String test\n" + "a-b beta2='b c' beta3=bd # \n" + "a-b gamma=1:2:3:4 # Int Vector test\n" + " a-b de1ta=f # Bool + Integer in key Comment test delta=t \n" + "a-b _epsilon=-1 # Int Vector test _epsilon=1 \n" + "a-b zet-_a=0.15 theta=1.1# Float, -, _ test\n" + "a-b quoted='a b c' # quoted string\n" + "a-b quoted2=\"d e 'a b=c' f\" # string quoted with double quotes"; + + std::istringstream is(str); + std::vector lines; + ReadConfigLines(is, &lines); + KALDI_ASSERT(lines.size() == 8); + + ConfigLine cfl; + for (size_t i = 0; i < lines.size(); i++) { + KALDI_ASSERT(cfl.ParseLine(lines[i]) && cfl.FirstToken() == "a-b"); + if (i == 1) { + KALDI_ASSERT(cfl.GetValue("beta2", &str) && str == "b c"); + } + if (i == 4) { + KALDI_ASSERT(cfl.GetValue("_epsilon", &str) && str == "-1"); + } + if (i == 5) { + BaseFloat float_val = 0; + KALDI_ASSERT(cfl.GetValue("zet-_a", &float_val) && ApproxEqual(float_val, 0.15)); + } + if (i == 6) { + KALDI_ASSERT(cfl.GetValue("quoted", &str) && str == "a b c"); + } + if (i == 7) { + KALDI_ASSERT(cfl.GetValue("quoted2", &str) && str == "d e 'a b=c' f"); + } + } +} } // end namespace kaldi @@ -344,5 +532,7 @@ int main() { TestNan(); TestInf(); TestInf(); + UnitTestConfigLineParse(); + UnitTestReadConfig(); std::cout << "Test OK\n"; } diff --git a/src/util/text-utils.cc b/src/util/text-utils.cc index 200e3ad9327..bbf38ecc5cc 100644 --- a/src/util/text-utils.cc +++ b/src/util/text-utils.cc @@ -340,4 +340,252 @@ bool StringsApproxEqual(const std::string &a, } +bool ConfigLine::ParseLine(const std::string &line) { + data_.clear(); + whole_line_ = line; + if (line.size() == 0) return false; // Empty line + size_t pos = 0, size = line.size(); + while (isspace(line[pos]) && pos < size) pos++; + if (pos == size) + return false; // whitespace-only line + size_t first_token_start_pos = pos; + // first get first_token_. + while (!isspace(line[pos]) && pos < size) { + if (line[pos] == '=') { + // If the first block of non-whitespace looks like "foo-bar=...", + // then we ignore it: there is no initial token, and FirstToken() + // is empty. 
+ pos = first_token_start_pos; + break; + } + pos++; + } + first_token_ = std::string(line, first_token_start_pos, pos - first_token_start_pos); + // first_token_ is expected to be either empty or something like + // "component-node", which actually is a slightly more restrictive set of + // strings than IsValidName() checks for this is a convenient way to check it. + if (!first_token_.empty() && !IsValidName(first_token_)) + return false; + + while (pos < size) { + if (isspace(line[pos])) { + pos++; + continue; + } + + // OK, at this point we know that we are pointing at nonspace. + size_t next_equals_sign = line.find_first_of("=", pos); + if (next_equals_sign == pos || next_equals_sign == std::string::npos) { + // we're looking for something like 'key=value'. If there is no equals sign, + // or it's not preceded by something, it's a parsing failure. + return false; + } + std::string key(line, pos, next_equals_sign - pos); + if (!IsValidName(key)) return false; + + // handle any quotes. we support key='blah blah' or key="foo bar". + // no escaping is supported. + if (line[next_equals_sign+1] == '\'' || line[next_equals_sign+1] == '"') { + char my_quote = line[next_equals_sign+1]; + size_t next_quote = line.find_first_of(my_quote, next_equals_sign + 2); + if (next_quote == std::string::npos) { // no matching quote was found. + KALDI_WARN << "No matching quote for " << my_quote << " in config line '" + << line << "'"; + return false; + } else { + std::string value(line, next_equals_sign + 2, + next_quote - next_equals_sign - 2); + data_.insert(std::make_pair(key, std::make_pair(value, false))); + pos = next_quote + 1; + continue; + } + } else { + // we want to be able to parse something like "... input=Offset(a, -1) foo=bar": + // in general, config values with spaces in them, even without quoting. + + size_t next_next_equals_sign = line.find_first_of("=", next_equals_sign + 1), + terminating_space = size; + + if (next_next_equals_sign != std::string::npos) { // found a later equals sign. 
+ size_t preceding_space = line.find_last_of(" \t", next_next_equals_sign); + if (preceding_space != std::string::npos && + preceding_space > next_equals_sign) + terminating_space = preceding_space; + } + while (isspace(line[terminating_space - 1]) && terminating_space > 0) + terminating_space--; + + std::string value(line, next_equals_sign + 1, + terminating_space - (next_equals_sign + 1)); + data_.insert(std::make_pair(key, std::make_pair(value, false))); + pos = terminating_space; + } + } + return true; +} + +bool ConfigLine::GetValue(const std::string &key, std::string *value) { + KALDI_ASSERT(value != NULL); + std::map >::iterator it = data_.begin(); + for (; it != data_.end(); ++it) { + if (it->first == key) { + *value = (it->second).first; + (it->second).second = true; + return true; + } + } + return false; +} + +bool ConfigLine::GetValue(const std::string &key, BaseFloat *value) { + KALDI_ASSERT(value != NULL); + std::map >::iterator it = data_.begin(); + for (; it != data_.end(); ++it) { + if (it->first == key) { + if (!ConvertStringToReal((it->second).first, value)) + return false; + (it->second).second = true; + return true; + } + } + return false; +} + +bool ConfigLine::GetValue(const std::string &key, int32 *value) { + KALDI_ASSERT(value != NULL); + std::map >::iterator it = data_.begin(); + for (; it != data_.end(); ++it) { + if (it->first == key) { + if (!ConvertStringToInteger((it->second).first, value)) + return false; + (it->second).second = true; + return true; + } + } + return false; +} + +bool ConfigLine::GetValue(const std::string &key, std::vector *value) { + KALDI_ASSERT(value != NULL); + value->clear(); + std::map >::iterator it = data_.begin(); + for (; it != data_.end(); ++it) { + if (it->first == key) { + if (!SplitStringToIntegers((it->second).first, ":,", true, value)) { + // KALDI_WARN << "Bad option " << (it->second).first; + return false; + } + (it->second).second = true; + return true; + } + } + return false; +} + +bool ConfigLine::GetValue(const std::string &key, bool *value) { + KALDI_ASSERT(value != NULL); + std::map >::iterator it = data_.begin(); + for (; it != data_.end(); ++it) { + if (it->first == key) { + if ((it->second).first.size() == 0) return false; + switch (((it->second).first)[0]) { + case 'F': + case 'f': + *value = false; + break; + case 'T': + case 't': + *value = true; + break; + default: + return false; + } + (it->second).second = true; + return true; + } + } + return false; +} + +bool ConfigLine::HasUnusedValues() const { + std::map >::const_iterator it = data_.begin(); + for (; it != data_.end(); ++it) { + if (!(it->second).second) return true; + } + return false; +} + +std::string ConfigLine::UnusedValues() const { + std::string unused_str; + std::map >::const_iterator it = data_.begin(); + for (; it != data_.end(); ++it) { + if (!(it->second).second) { + if (unused_str == "") + unused_str = it->first + "=" + (it->second).first; + else + unused_str += " " + it->first + "=" + (it->second).first; + } + } + return unused_str; +} + +// This is like ExpectToken but for two tokens, and it +// will either accept token1 and then token2, or just token2. +// This is useful in Read functions where the first token +// may already have been consumed. 
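Since ConfigLine is a new public utility, a short usage sketch of the interface implemented above may help; the behaviour mirrors what UnitTestConfigLineParse checks (quoted values keep their spaces, "1:2" parses as an integer vector, and values are marked as consumed when read).

```cpp
#include <cassert>
#include <string>
#include <vector>
#include "base/kaldi-types.h"
#include "util/text-utils.h"

void ConfigLineExample() {
  kaldi::ConfigLine cfl;
  bool ok = cfl.ParseLine("a-b xx=yyy baz='x y z' ba=1:2");
  assert(ok && cfl.FirstToken() == "a-b");

  std::string s;
  assert(cfl.GetValue("baz", &s) && s == "x y z");   // quoted value, spaces kept

  std::vector<kaldi::int32> ints;
  assert(cfl.GetValue("ba", &ints) && ints.size() == 2);  // "1:2" -> {1, 2}

  assert(cfl.HasUnusedValues());             // "xx" has not been read yet
  assert(cfl.UnusedValues() == "xx=yyy");
  assert(cfl.GetValue("xx", &s) && s == "yyy");
  assert(!cfl.HasUnusedValues());            // everything has now been consumed
}
```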
+void ExpectOneOrTwoTokens(std::istream &is, bool binary, + const std::string &token1, + const std::string &token2) { + KALDI_ASSERT(token1 != token2); + std::string temp; + ReadToken(is, binary, &temp); + if (temp == token1) { + ExpectToken(is, binary, token2); + } else { + if (temp != token2) { + KALDI_ERR << "Expecting token " << token1 << " or " << token2 + << " but got " << temp; + } + } +} + + +bool IsValidName(const std::string &name) { + if (name.size() == 0) return false; + for (size_t i = 0; i < name.size(); i++) { + if (i == 0 && !isalpha(name[i]) && name[i] != '_') + return false; + if (!isalnum(name[i]) && name[i] != '_' && name[i] != '-' && name[i] != '.') + return false; + } + return true; +} + +void ReadConfigLines(std::istream &is, + std::vector *lines) { + KALDI_ASSERT(lines != NULL); + std::string line; + while (std::getline(is, line)) { + if (line.size() == 0) continue; + size_t start = line.find_first_not_of(" \t"); + size_t end = line.find_first_of('#'); + if (start == std::string::npos || start == end) continue; + end = line.find_last_not_of(" \t", end - 1); + KALDI_ASSERT(end >= start); + lines->push_back(line.substr(start, end - start + 1)); + } +} + +void ParseConfigLines(const std::vector &lines, + std::vector *config_lines) { + config_lines->resize(lines.size()); + for (size_t i = 0; i < lines.size(); i++) { + bool ret = (*config_lines)[i].ParseLine(lines[i]); + if (!ret) { + KALDI_ERR << "Error parsing config line: " << lines[i]; + } + } +} + + } // end namespace kaldi diff --git a/src/util/text-utils.h b/src/util/text-utils.h index 7bc20957672..02f4bf483fc 100644 --- a/src/util/text-utils.h +++ b/src/util/text-utils.h @@ -183,6 +183,98 @@ bool StringsApproxEqual(const std::string &a, const std::string &b, int32 decimal_places_check = 2); +/** + This class is responsible for parsing input like + hi-there xx=yyy a=b c empty= f-oo=Append(bar, sss) ba_z=123 bing='a b c' baz="a b c d='a b' e" + and giving you access to the fields, in this case + + FirstToken() == "hi-there", and key->value pairs: + + xx->yyy, a->"b c", empty->"", f-oo->"Append(bar, sss)", ba_z->"123", + bing->"a b c", baz->"a b c d='a b' e" + + The first token is optional, if the line started with a key-value pair then + FirstValue() will be empty. + + Note: it can parse value fields with space inside them only if they are free of the '=' + character. If values are going to contain the '=' character, you need to quote them + with either single or double quotes. + + Key values may contain -_a-zA-Z0-9, but must begin with a-zA-Z_. + */ +class ConfigLine { + public: + // Tries to parse the line as a config-file line. Returns false + // if it could not for some reason, e.g. parsing failure. In most cases + // prints no warnings; the user should do this. Does not expect comments. + bool ParseLine(const std::string &line); + + // the GetValue functions are overloaded for various types. They return true + // if the key exists with value that can be converted to that type, and false + // otherwise. They also mark the key-value pair as having been read. It is + // not an error to read values twice. + bool GetValue(const std::string &key, std::string *value); + bool GetValue(const std::string &key, BaseFloat *value); + bool GetValue(const std::string &key, int32 *value); + // Values may be separated by ":" or by ",". + bool GetValue(const std::string &key, std::vector *value); + bool GetValue(const std::string &key, bool *value); + + bool HasUnusedValues() const; + /// returns e.g. 
foo=bar xxx=yyy if foo and xxx were not consumed by one + /// of the GetValue() functions. + std::string UnusedValues() const; + + const std::string &FirstToken() const { return first_token_; } + + const std::string WholeLine() { return whole_line_; } + // use default assignment operator and copy constructor. + private: + std::string whole_line_; + // the first token of the line, e.g. if line is + // foo-bar baz=bing + // then first_token_ would be "foo-bar". + std::string first_token_; + + // data_ maps from key to (value, is-this-value-consumed?). + std::map > data_; + +}; + +/// This function is like ExpectToken but for two tokens, and it will either +/// accept token1 and then token2, or just token2. This is useful in Read +/// functions where the first token may already have been consumed. +void ExpectOneOrTwoTokens(std::istream &is, bool binary, + const std::string &token1, + const std::string &token2); + + +/** + This function reads in a config file and *appends* its contents to a vector of + lines; it is responsible for removing comments (anything after '#') and + stripping out any lines that contain only whitespace after comment removal. + */ +void ReadConfigLines(std::istream &is, + std::vector *lines); + + +/** + This function converts config-lines from a simple sequence of strings + as output by ReadConfigLines(), into a sequence of first-tokens and + name-value pairs. The general format is: + "command-type bar=baz xx=yyy" + etc., although there are subtleties as to what exactly is allowed, see + documentation for class ConfigLine for details. + This function will die if there was a parsing failure. + */ +void ParseConfigLines(const std::vector &lines, + std::vector *config_lines); + + +/// Returns true if 'name' would be a valid name for a component or node in a +/// nnet3Nnet. This is a nonempty string beginning with A-Za-z_, and containing only +/// '-', '_', '.', A-Z, a-z, or 0-9. +bool IsValidName(const std::string &name); } // namespace kaldi diff --git a/tools/Makefile b/tools/Makefile index 094a9b608d3..e690df3da88 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -1,14 +1,15 @@ # SHELL += -x -CXX = g++ -CC = gcc # used for sph2pipe -# CXX = clang++ # Uncomment these lines -# CC = clang # to build with Clang. +CXX ?= g++ +CC ?= gcc # used for sph2pipe +# CXX = clang++ # Uncomment these lines... +# CC = clang # ...to build with Clang. # Note: OpenFst requires a relatively recent C++ compiler with C++11 support, # e.g. g++ >= 4.7, Apple clang >= 5.0 or LLVM clang >= 3.3. OPENFST_VERSION ?= 1.6.7 CUB_VERSION ?= 1.8.0 +OPENBLAS_VERSION ?= 0.3.5 # Default features configured for OpenFST; can be overridden in the make command line. OPENFST_CONFIGURE ?= --enable-static --enable-shared --enable-far --enable-ngram-fsts @@ -129,31 +130,6 @@ sph2pipe_v2.5.tar.gz: wget -T 10 -t 3 http://www.openslr.org/resources/3/sph2pipe_v2.5.tar.gz || \ wget --no-check-certificate -T 10 https://sourceforge.net/projects/kaldi/files/sph2pipe_v2.5.tar.gz -openblas: openblas_compiled - -.PHONY: openblas_compiled - -fortran_opt = $(shell gcc -v 2>&1 | perl -e '$$x = join(" ", ); if($$x =~ m/target=\S+64\S+/) { print "BINARY=64"; }') - - -# note: you can uncomment the line that has USE_THREAD=1 and comment the line -# that has USE_THREAD=0 if you want Open Blas to use multiple threads. then -# you could set, for example, OPENBLAS_NUM_THREADS=2 in your path.sh so that the -# runtime knows how many threads to use. Note: if you ever get the error -# "Program is Terminated. 
Because you tried to allocate too many memory -# regions.", this is because OpenBLAS has a fixed buffer size controlled by the -# Makefile option NUM_THREADS; I believe this limits the product of number of -# program threads that are calling BLAS by the shell variable -# OPENBLAS_NUM_THREADS. In that case it might help to increase the NUM_THREADS -# option. -openblas_compiled: - echo "Note: see tools/Makefile for options regarding OpenBLAS compilation" - -git clone https://github.com/xianyi/OpenBLAS.git - -cd OpenBLAS; git pull - cd OpenBLAS; sed 's:# FCOMMON_OPT = -frecursive:FCOMMON_OPT = -frecursive:' < Makefile.rule >tmp && mv tmp Makefile.rule - # $(MAKE) PREFIX=`pwd`/OpenBLAS/install FC=gfortran $(fortran_opt) DEBUG=1 USE_THREAD=1 NUM_THREADS=64 -C OpenBLAS all install - $(MAKE) PREFIX=`pwd`/OpenBLAS/install FC=gfortran $(fortran_opt) DEBUG=1 USE_THREAD=0 -C OpenBLAS all install - .PHONY: cub cub: @@ -161,3 +137,14 @@ cub: unzip -oq cub-$(CUB_VERSION).zip rm -f cub ln -s cub-$(CUB_VERSION) cub + +# OpenBLAS is not compiled by default. Run 'make -j openblas' in this directory to build. +.PHONY: openblas +openblas: + @-rm -rf OpenBLAS xianyi-OpenBLAS-* + wget -t3 -nv -O- $$( \ + wget -qO- 'https://api.github.com/repos/xianyi/OpenBLAS/releases/tags/v$(OPENBLAS_VERSION)' | \ + python -c 'import sys,json;print(json.load(sys.stdin)["tarball_url"])') | \ + tar xzf - + mv xianyi-OpenBLAS-* OpenBLAS + $(MAKE) PREFIX=$$(pwd)/OpenBLAS/install USE_THREAD=0 -C OpenBLAS all install diff --git a/tools/config/common_path.sh b/tools/config/common_path.sh index 9a7ae2d9b29..fc941c52c17 100644 --- a/tools/config/common_path.sh +++ b/tools/config/common_path.sh @@ -22,4 +22,5 @@ ${KALDI_ROOT}/src/rnnlmbin:\ ${KALDI_ROOT}/src/sgmm2bin:\ ${KALDI_ROOT}/src/sgmmbin:\ ${KALDI_ROOT}/src/tfrnnlmbin:\ +${KALDI_ROOT}/src/cudadecoderbin:\ $PATH diff --git a/tools/extras/check_dependencies.sh b/tools/extras/check_dependencies.sh index 1b63c4c99d9..e133961e0a3 100755 --- a/tools/extras/check_dependencies.sh +++ b/tools/extras/check_dependencies.sh @@ -10,48 +10,45 @@ debian_packages= opensuse_packages= function add_packages { - redhat_packages="$redhat_packages $1"; - debian_packages="$debian_packages $2"; - opensuse_packages="$opensuse_packages $3"; + redhat_packages="$redhat_packages $1" + debian_packages="$debian_packages ${2:-$1}" + opensuse_packages="$opensuse_packages ${3:-$1}" } -if ! which which >&/dev/null; then - echo "$0: which is not installed." - add_packages which debianutils which -fi +function have { type -t "$1" >/dev/null; } -COMPILER_VER_INFO=$($CXX --version 2>/dev/null) -case $COMPILER_VER_INFO in +compiler_ver_info=$($CXX --version 2>/dev/null) +case $compiler_ver_info in "") - echo "$0: $CXX is not installed." + echo "$0: Compiler '$CXX' is not installed." echo "$0: You need g++ >= 4.8.3, Apple Xcode >= 5.0 or clang >= 3.3." - add_packages gcc-c++ g++ gcc-c++ + add_packages gcc-c++ g++ status=1 ;; "g++ "* ) - GCC_VER=$($CXX -dumpversion) - GCC_VER_NUM=$(echo $GCC_VER | sed 's/\./ /g' | xargs printf "%d%02d%02d") - if [ $GCC_VER_NUM -lt 40803 ]; then - echo "$0: $CXX (g++-$GCC_VER) is not supported." + gcc_ver=$($CXX -dumpversion) + gcc_ver_num=$(echo $gcc_ver | sed 's/\./ /g' | xargs printf "%d%02d%02d") + if [ $gcc_ver_num -lt 40803 ]; then + echo "$0: Compiler '$CXX' (g++-$gcc_ver) is not supported." echo "$0: You need g++ >= 4.8.3, Apple clang >= 5.0 or LLVM clang >= 3.3." 
status=1 fi ;; "Apple LLVM "* ) # See https://gist.github.com/yamaya/2924292 - CLANG_VER=$(echo $COMPILER_VER_INFO | grep version | sed "s/.*version \([0-9\.]*\).*/\1/") - CLANG_VER_NUM=$(echo $COMPILER_VER_INFO | grep version | sed "s/.*clang-\([0-9]*\).*/\1/") - if [ $CLANG_VER_NUM -lt 500 ]; then - echo "$0: $CXX (Apple clang-$CLANG_VER) is not supported." + clang_ver=$(echo $compiler_ver_info | grep version | sed "s/.*version \([0-9\.]*\).*/\1/") + clang_ver_num=$(echo $compiler_ver_info | grep version | sed "s/.*clang-\([0-9]*\).*/\1/") + if [ $clang_ver_num -lt 500 ]; then + echo "$0: Compiler '$CXX' (Apple clang-$clang_ver) is not supported." echo "$0: You need g++ >= 4.8.3, Apple clang >= 5.0 or LLVM clang >= 3.3." status=1 fi ;; "clang "* ) - CLANG_VER=$(echo $COMPILER_VER_INFO | grep version | sed "s/.*version \([0-9\.]*\).*/\1/") - CLANG_VER_NUM=$(echo $CLANG_VER | sed 's/\./ /g' | xargs printf "%d%02d") - if [ $CLANG_VER_NUM -lt 303 ]; then - echo "$0: $CXX (LLVM clang-$CLANG_VER) is not supported." + clang_ver=$(echo $compiler_ver_info | grep version | sed "s/.*version \([0-9\.]*\).*/\1/") + clang_ver_num=$(echo $clang_ver | sed 's/\./ /g' | xargs printf "%d%02d") + if [ $clang_ver_num -lt 303 ]; then + echo "$0: Compiler '$CXX' (LLVM clang-$clang_ver) is not supported." echo "$0: You need g++ >= 4.8.3, Apple clang >= 5.0 or LLVM clang >= 3.3." status=1 fi @@ -61,53 +58,55 @@ case $COMPILER_VER_INFO in ;; esac -if ! echo "#include " | $CXX -E - >&/dev/null; then +# Cannot check this without a compiler. +if have "$CXX" && ! echo "#include " | $CXX -E - >&/dev/null; then echo "$0: zlib is not installed." - add_packages zlib-devel zlib1g-dev zlib-devel + add_packages zlib-devel zlib1g-dev fi for f in make automake autoconf patch grep bzip2 gzip unzip wget git sox; do - if ! which $f >&/dev/null; then + if ! have $f; then echo "$0: $f is not installed." - add_packages $f $f $f + add_packages $f fi done -if ! which libtoolize >&/dev/null && ! which glibtoolize >&/dev/null; then +if ! have libtoolize && ! have glibtoolize; then echo "$0: neither libtoolize nor glibtoolize is installed" - add_packages libtool libtool libtool + add_packages libtool fi -if ! which svn >&/dev/null; then +if ! have svn; then echo "$0: subversion is not installed" - add_packages subversion subversion subversion + add_packages subversion fi -if ! which awk >&/dev/null; then +if ! have awk; then echo "$0: awk is not installed" - add_packages gawk gawk gawk + add_packages gawk fi pythonok=true -if ! which python2.7 >&/dev/null; then +if ! have python2.7; then echo "$0: python2.7 is not installed" - add_packages python2.7 python2.7 + add_packages python2.7 pythonok=false fi -if ! which python3 >&/dev/null; then +if ! have python3; then echo "$0: python3 is not installed" - add_packages python3 python3 + add_packages python3 pythonok=false fi ( #Use a subshell so that sourcing env.sh does not have an influence on the rest of the script [ -f ./env.sh ] && . ./env.sh -if $pythonok && ! which python2 >&/dev/null; then +if $pythonok && ! have python2; then mkdir -p $PWD/python - echo "$0: python2.7 is installed, but the python2 binary does not exist. Creating a symlink and adding this to tools/env.sh" - ln -s $(which python2.7) $PWD/python/python2 + echo "$0: python2.7 is installed, but the python2 binary does not exist." 
\ + "Creating a symlink and adding this to tools/env.sh" + ln -s $(command -v python2.7) $PWD/python/python2 echo "export PATH=$PWD/python:\${PATH}" >> env.sh fi @@ -115,83 +114,96 @@ if [[ -f $PWD/python/.use_default_python && -f $PWD/python/python ]]; then rm $PWD/python/python fi -if $pythonok && which python >&/dev/null && [[ ! -f $PWD/python/.use_default_python ]]; then - version=`python 2>&1 --version | awk '{print $2}' ` +if $pythonok && have python && [[ ! -f $PWD/python/.use_default_python ]]; then + version=$(python 2>&1 --version | awk '{print $2}') if [[ $version != "2.7"* ]] ; then - echo "$0: WARNING python 2.7 is not the default python. We fixed this by adding a correct symlink more prominently on the path." - echo "$0: If you really want to use python $version as default, add an empty file $PWD/python/.use_default_python and run this script again." + echo "$0: WARNING python 2.7 is not the default python. We fixed this by" \ + "adding a correct symlink more prominently on the path." + echo " ... If you really want to use python $version as default, add an" \ + "empty file $PWD/python/.use_default_python and run this script again." mkdir -p $PWD/python - ln -s $(which python2.7) $PWD/python/python + ln -s $(command -v python2.7) $PWD/python/python echo "export PATH=$PWD/python:\${PATH}" >> env.sh fi fi ) -printed=false - -if which apt-get >&/dev/null && ! which zypper >/dev/null; then - # if we're using apt-get [but we're not OpenSuse, which uses zypper as the - # primary installer, but sometimes installs apt-get for some compatibility - # reason without it really working]... - if [ ! -z "$debian_packages" ]; then - echo "$0: we recommend that you run (our best guess):" - echo " sudo apt-get install $debian_packages" - printed=true - status=1 - fi - if ! dpkg -l | grep -E 'libatlas3gf|libatlas3-base' >/dev/null; then - echo "You should probably do: " - echo " sudo apt-get install libatlas3-base" - printed=true - fi -elif which yum >&/dev/null; then - if [ ! -z "$redhat_packages" ]; then - echo "$0: we recommend that you run (our best guess):" - echo " sudo yum install $redhat_packages" - printed=true - status=1 - fi - if ! rpm -qa| grep atlas >/dev/null; then - echo "You should probably do something like: " - echo "sudo yum install atlas.x86_64" - printed=true - fi -elif which zypper >&/dev/null; then - if [ ! -z "$opensuse_packages" ]; then - echo "$0: we recommend that you run (our best guess):" - echo " sudo zypper install $opensuse_packages" - printed=true - status=1 - fi - if ! zypper search -i | grep -E 'libatlas3|libatlas3-devel' >/dev/null; then - echo "You should probably do: " - echo "sudo zypper install libatlas3-devel" - printed=true +mathlib_missing=false +case $(uname -m) in + x86_64) # Suggest MKL on an Intel64 system (configure does not like i?86 hosts). + # We do not know if compiler exists at this point, so double-check the + # well-known mkl.h file location. The compiler test would still find it if + # installed in an alternative location (this is unlikely). + if [ ! -f /opt/intel/mkl/include/mkl.h ] && + ! echo '#include ' | $CXX -I /opt/intel/mkl/include -E - >&/dev/null; then + if [[ $(uname) == Linux ]]; then + echo "$0: Intel MKL is not installed. Run extras/install_mkl.sh to install it." + else + echo "$0: Intel MKL is not installed. Download the installer package for your + ... system from: https://software.intel.com/mkl/choose-download." + fi + mathlib_missing=true + fi + ;; + *) # Suggest OpenBLAS on other hardware. + if [ ! 
-f $(pwd)/OpenBLAS/install/include/openblas_config.h ] && + ! echo '#include ' | + $CXX -I $(pwd)/OpenBLAS/install/include -E - >&/dev/null; then + echo "$0: OpenBLAS not detected. Run extras/install_openblas.sh + ... to compile it for your platform, or configure with --openblas-root= if you + ... have it installed in a location we could not guess. Note that packaged + ... library may be significantly slower and/or older than the one the above + ... would build." + mathlib_missing=true + fi + ;; +esac +$mathlib_missing && + echo "\ + ... You can also use other matrix algebra libraries. For information, see: + ... http://kaldi-asr.org/doc/matrixwrap.html" + +# Report missing programs and libraries. +if [ -n "$debian_packages" ]; then + install_pkg_command=$( + # Guess package manager from user's distribution type. Use a subshell + # because we are potentially importing a lot of dirt here. + eval $(grep 2>/dev/null ^ID /etc/os-release) 2>/dev/null + for rune in ${ID-} ${ID_LIKE-}; do + # The case '(pattern)' syntax is necessary in subshell for bash 3.x. + case $rune in + (rhel|centos|redhat) echo "yum install $redhat_packages"; break;; + (fedora) echo "dnx install $redhat_packages"; break;; + (suse) echo "zypper install $opensuse_packages"; break;; + (debian) echo "apt-get install $debian_packages"; break;; + esac + done + ) + + # Print the suggestion to install missing packages. + if [ -n "$install_pkg_command" ]; then + echo "$0: Some prerequisites are missing; install them using the command:" + echo " sudo" $install_pkg_command + else + echo "$0: The following prerequisites are missing; install them first:" + echo " " $debian_packages fi -fi - -if [ ! -z "$debian_packages" ]; then - # If the list of packages to be installed is nonempty, - # we'll exit with error status. Check this outside of - # checking for yum or apt-get, as we want it to exit with - # error even if we're not on Debian or red hat. status=1 fi - if [ $(pwd | wc -w) -gt 1 ]; then echo "*** $0: Warning: Kaldi scripts will fail if the directory name contains a space." echo "*** (it's OK if you just want to compile a few tools -> disable this check)." - status=1; + status=1 fi -if which grep >&/dev/null && pwd | grep -E 'JOB|LMWT' >/dev/null; then +if pwd | grep -E 'JOB|LMWT' >/dev/null; then echo "*** $0: Kaldi scripts will fail if the directory name contains" echo "*** either of the strings 'JOB' or 'LMWT'." - status=1; + status=1 fi -if ! $printed && [ $status -eq 0 ]; then +if ! $mathlib_missing && [ $status -eq 0 ]; then echo "$0: all OK." fi diff --git a/tools/extras/install_diarization_VBHMM.sh b/tools/extras/install_diarization_VBHMM.sh new file mode 100644 index 00000000000..31da88871d7 --- /dev/null +++ b/tools/extras/install_diarization_VBHMM.sh @@ -0,0 +1,21 @@ +#!/bin/bash +set -u +set -e + + +# Make sure we are in the tools/ directory. +if [ `basename $PWD` == extras ]; then + cd .. +fi + +! [ `basename $PWD` == tools ] && \ + echo "You must call this script from the tools/ directory" && exit 1; + +# We download the original VB HMM scripts of the Brno University of Technology. +# numexpr is a required dependency for speeding up the VB_diarization. +if [ ! 
-d VB_diarization ]; then + git clone https://github.com/GoVivaceInc/VB_diarization + cp VB_diarization/VB_diarization.py ../egs/callhome_diarization/v1/diarization/ +fi + +pip install numexpr diff --git a/tools/extras/install_mkl.sh b/tools/extras/install_mkl.sh new file mode 100755 index 00000000000..fe2ea7bdb65 --- /dev/null +++ b/tools/extras/install_mkl.sh @@ -0,0 +1,265 @@ +#!/bin/bash + +# Intel MKL is now freely available even for commercial use. This script +# attempts to install the MKL package automatically from Intel's repository. +# +# For manual repository setup instructions, see: +# https://software.intel.com/articles/installing-intel-free-libs-and-python-yum-repo +# https://software.intel.com/articles/installing-intel-free-libs-and-python-apt-repo +# +# For other package managers, or non-Linux platforms, see: +# https://software.intel.com/mkl/choose-download + +set -o pipefail + +default_package=intel-mkl-64bit-2019.2-057 + +yum_repo='https://yum.repos.intel.com/mkl/setup/intel-mkl.repo' +apt_repo='https://apt.repos.intel.com/mkl' +intel_key_url='https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB' + +Usage () { + cat >&2 <] + +Checks if MKL is present on the system, and/or attempts to install it. + +If is not provided, ${default_package} will be installed. + +Intel packages are installed under the /opt/intel directory. You should be root +to install MKL into this directory; run this script using the sudo command. + +Options: + -s - Skip check for MKL being already present. + -p -- Force type of package management. Use only + if automatic detection fails, as instructed. + -h - Show this message. + +Environment: + CC The C compiler to use for MKL check. If not set, uses 'cc'. +EOF + exit 2 +} + +Fatal () { echo "$0: $@"; exit 1; } + +Have () { type -t "$1" >/dev/null; } + +# Option values. +skip_cc= +distro= + +while getopts ":hksp:" opt; do + case ${opt} in + h) Usage ;; + s) skip_cc=yes ;; + p) case $OPTARG in + suse|redhat|debian|fedora) distro=$OPTARG ;; + *) Fatal "invalid value -p '${OPTARG}'. " \ + "Allowed: 'suse', 'redhat', 'debian' or 'fedora'." + esac ;; + \?) echo >&2 "$0: invalid option -${OPTARG}."; Usage ;; + esac +done +shift $((OPTIND-1)) + +orig_arg_package=${1-''} +package=${1:-$default_package} + +# Check that we are actually on Linux, otherwise give a helpful reference. +[[ $(uname) == Linux ]] || Fatal "\ +This script can be used on Linux only, and your system is $(uname). + +Installer packages for Mac and Windows are available for download from Intel: +https://software.intel.com/mkl/choose-download" + +# Test if MKL is already installed on the system. +if [[ ! $skip_cc ]]; then + : ${CC:=cc} + Have "$CC" || Fatal "\ +C compiler $CC not found. + +You can skip the check for MKL presence by invoking this script with the '-s' +option to this script, but you will need a functional compiler anyway, so we +recommend that you install it first." + + mkl_version=$($CC -E -I /opt/intel/mkl/include - <<< \ + '#include + __INTEL_MKL__.__INTEL_MKL_MINOR__.__INTEL_MKL_UPDATE__' 2>/dev/null | + tail -n 1 ) || mkl_version= + mkl_version=${mkl_version// /} + + [[ $mkl_version ]] && Fatal "\ +MKL version $mkl_version is already installed. + +You can skip the check for MKL presence by invoking this script with the '-s' +option and proceed with automated installation, but we highly discourage +this. 
This script will register Intel repositories with your system, and it +seems that they have been already registered, or MKL has been installed some +other way. + +You should use your package manager to check which MKL package is already +installed. Note that Intel packages register the latest installed version of +the library as the default. If your installed version is older than +$package, it makes sense to upgrade." +fi + +# Try to determine which package manager the distro uses, unless overridden. +if [[ ! $distro ]]; then + dist_vars=$(cat /etc/os-release 2>/dev/null) + eval "$dist_vars" + for rune in $CPE_NAME $ID $ID_LIKE; do + case "$rune" in + cpe:/o:fedoraproject:fedora:2[01]) distro=redhat; break;; # Use yum. + rhel|centos) distro=redhat; break;; + redhat|suse|fedora|debian) distro=$rune; break;; + esac + done + + # Certain old distributions do not have /etc/os-release. We are unlikely to + # encounter these in the wild, but just in case. + # NOTE: Do not try to guess Fedora specifically here! Fedora 20 and below + # detect as redhat, and this is good, because they use yum by default. + [[ ! $distro && -f /etc/redhat-release ]] && distro=redhat + [[ ! $distro && -f /etc/SuSE-release ]] && distro=suse + [[ ! $distro && -f /etc/debian_release ]] && distro=debian + + [[ ! $distro ]] && Fatal "\ +Unable to determine package management style. + +Invoke this script with the option '-p