From d605149ec4fd84536f7a1a08f63200c00798cc8e Mon Sep 17 00:00:00 2001 From: LvHang Date: Mon, 16 Apr 2018 18:04:47 -0400 Subject: [PATCH 1/2] merge raw data codes fix some bugs about raw waveform codes add make_raw_feats script add ApplyMinMaxToWeights fix the scripts to use max/min_param_value On/Off batchnorm in fft component add a nnet3/get_egs_old.sh backup small fix small fix2 --- egs/multi_en/s5/cmd.sh | 8 +- egs/multi_en/s5/conf/queue_no_k20.conf | 16 + egs/multi_en/s5/conf/raw_no_mvn.conf | 7 + egs/wsj/s5/steps/libs/common.py | 51 ++ egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py | 1 + egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py | 6 +- .../s5/steps/libs/nnet3/xconfig/raw_signal.py | 552 ++++++++++++++++++ egs/wsj/s5/steps/make_raw_feats.sh | 129 ++++ egs/wsj/s5/steps/nnet3/chain/get_egs_old.sh | 428 ++++++++++++++ egs/wsj/s5/steps/nnet3/chain/train.py | 27 + src/featbin/Makefile | 2 +- src/featbin/compute-raw-frame-feats.cc | 122 ++++ src/nnet3/nnet-chain-training.cc | 3 + src/nnet3/nnet-component-itf.cc | 46 ++ src/nnet3/nnet-component-itf.h | 15 +- src/nnet3/nnet-convolutional-component.h | 1 + src/nnet3/nnet-general-component.h | 2 + src/nnet3/nnet-simple-component.cc | 449 +++++++++++++- src/nnet3/nnet-simple-component.h | 155 ++++- src/nnet3/nnet-utils.cc | 16 + src/nnet3/nnet-utils.h | 9 + 21 files changed, 2036 insertions(+), 9 deletions(-) create mode 100644 egs/multi_en/s5/conf/queue_no_k20.conf create mode 100644 egs/multi_en/s5/conf/raw_no_mvn.conf create mode 100644 egs/wsj/s5/steps/libs/nnet3/xconfig/raw_signal.py create mode 100755 egs/wsj/s5/steps/make_raw_feats.sh create mode 100755 egs/wsj/s5/steps/nnet3/chain/get_egs_old.sh create mode 100644 src/featbin/compute-raw-frame-feats.cc diff --git a/egs/multi_en/s5/cmd.sh b/egs/multi_en/s5/cmd.sh index ed453ff8416..4f3b12aa700 100755 --- a/egs/multi_en/s5/cmd.sh +++ b/egs/multi_en/s5/cmd.sh @@ -10,7 +10,7 @@ # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
-export train_cmd="queue.pl --mem 2G"
-export decode_cmd="queue.pl --mem 4G"
-export mkgraph_cmd="queue.pl --mem 8G"
-export cuda_cmd="queue.pl --gpu 1"
+export train_cmd="queue.pl --mem 2G --config conf/queue_no_k20.conf --allow-k10-k20 true"
+export decode_cmd="queue.pl --mem 4G --config conf/queue_no_k20.conf --allow-k10-k20 true"
+export mkgraph_cmd="queue.pl --mem 8G --config conf/queue_no_k20.conf --allow-k10-k20 true"
+export cuda_cmd="queue.pl --gpu 1 --config conf/queue_no_k20.conf --allow-k10-k20 true"
diff --git a/egs/multi_en/s5/conf/queue_no_k20.conf b/egs/multi_en/s5/conf/queue_no_k20.conf
new file mode 100644
index 00000000000..e8d19a24ef7
--- /dev/null
+++ b/egs/multi_en/s5/conf/queue_no_k20.conf
@@ -0,0 +1,16 @@
+# Default configuration
+command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -l 'hostname=!a08*&!a09*&!a10*&!c04*&!b18*&!b19*&!b20*'
+option mem=* -l mem_free=$0,ram_free=$0
+option mem=0          # Do not add anything to qsub_opts
+option num_threads=* -pe smp $0
+option num_threads=1  # Do not add anything to qsub_opts
+option max_jobs_run=* -tc $0
+default gpu=0
+option gpu=0 -q all.q
+option gpu=* -l gpu=$0 -q g.q
+default allow_k20=true
+option allow_k20=true
+option allow_k20=false -l 'hostname=!g01*&!g02*&!b06*'
+default allow_k10_k20=true
+option allow_k10_k20=true
+option allow_k10_k20=false -l 'hostname=!b0*&!b10*&!g01*&!g02'
diff --git a/egs/multi_en/s5/conf/raw_no_mvn.conf b/egs/multi_en/s5/conf/raw_no_mvn.conf
new file mode 100644
index 00000000000..90fe4eed9d4
--- /dev/null
+++ b/egs/multi_en/s5/conf/raw_no_mvn.conf
@@ -0,0 +1,7 @@
+# configs for raw wav features
+--sample-frequency=8000
+--remove-dc-offset=false
+--loudness-equalize=false
+--remove-global-dc-offset=false
+--snip-edges=false
+--dither=1
diff --git a/egs/wsj/s5/steps/libs/common.py b/egs/wsj/s5/steps/libs/common.py
index 1e8e2ced6ce..5dffde08145 100644
--- a/egs/wsj/s5/steps/libs/common.py
+++ b/egs/wsj/s5/steps/libs/common.py
@@ -17,6 +17,7 @@
 import subprocess
 import sys
 import threading
+import numpy as np

 logger = logging.getLogger(__name__)
 logger.addHandler(logging.NullHandler())
@@ -517,3 +518,53 @@ def write_idct_matrix(feat_dim, cepstral_lifter, file_path):
         for k in range(0, feat_dim):
             idct_matrix[k].append(0)
     write_kaldi_matrix(file_path, idct_matrix)
+
+
+def compute_sin_cos_transform_matrix(K, N, compute_cosine=True, add_bias=False, half_range=False):
+    assert(K <= N)
+    n_range = (N/2 if half_range is True else N)
+    matrix = [[0] * (K + (1 if add_bias else 0)) for i in range(n_range)]
+    if compute_cosine:
+        for k in range(0, K):
+            for n in range(0, n_range):
+                matrix[n][k] = math.cos(2 * math.pi / float(N) * n * k)
+    else:
+        for k in range(0, K):
+            for n in range(0, n_range):
+                matrix[n][k] = -1.0 * math.sin(2 * math.pi / float(N) * n * k)
+    return matrix
+
+def write_sin_cos_transform_matrix(feat_dim, fft_dim, file_path, compute_cosine=True, add_bias=False, half_range=False):
+    # generate the discrete sine or cosine transform and write it to the file.
+    transform_matrix = compute_sin_cos_transform_matrix(feat_dim, fft_dim,
+        compute_cosine=compute_cosine, add_bias=add_bias, half_range=half_range)
+    write_kaldi_matrix(file_path, transform_matrix)
+
+def write_negate_vector(fft_dim, file_path):
+    scale_vec = [[-1.0] * fft_dim]
+    write_kaldi_matrix(file_path, scale_vec)
+
+# This function computes a transform applying mean-subtraction -> pre-emphasis
+# -> windowing, which can be used at the beginning of the network.
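+# A minimal usage sketch (dims are illustrative, not required): for
+# 80-sample frames,
+#   compute_and_write_preprocess_transform(0.97, 80, 'configs/preprocess.mat')
+# writes an 80 x 81 matrix (the extra column is a zero bias term) equal to
+# window * preemphasis * mean_subtraction, so a single FixedAffineComponent
+# can apply all three steps to each raw frame.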
+def compute_and_write_preprocess_transform(preemph, dim, file_path):
+    preemph_mat = [[0] * dim for i in range(dim)]
+    mean_subtract_mat = [[-1.0/dim] * dim for i in range(dim)]
+    window_mat = [[0] * dim for i in range(dim)]
+    preemph_mat[0][0] = 1.0 - 1.0 * preemph
+    for i in range(dim):
+        if (i > 0):
+            preemph_mat[i][i-1] = -1.0 * preemph
+            preemph_mat[i][i] = 1.0
+        mean_subtract_mat[i][i] = 1.0 - 1.0/dim
+        if (i == 0):
+            i_fl = float(i+1)
+        elif (i == (dim-1)):
+            i_fl = float(dim-2.0)
+        else:
+            i_fl = float(i)
+        window_mat[i][i] = (0.5 - 0.5 * math.cos(2 * math.pi * i_fl / float(dim)))**0.85
+    tot_mat_tmp = np.dot(preemph_mat, mean_subtract_mat)
+    tot_mat = np.dot(window_mat, tot_mat_tmp)
+    bias = np.zeros((dim, 1))
+    biased_mat = np.c_[tot_mat, bias]
+    write_kaldi_matrix(file_path, biased_mat)
diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py
index 32d6e87eba1..0a82ee44615 100644
--- a/egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py
+++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py
@@ -10,3 +10,4 @@
 from .gru import *
 from .stats_layer import *
 from .trivial_layers import *
+from .raw_signal import *
diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py
index a3e3e970311..cccd29ad53d 100644
--- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py
+++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py
@@ -71,7 +71,11 @@
     'renorm-component': xlayers.XconfigRenormComponent,
     'batchnorm-component': xlayers.XconfigBatchnormComponent,
     'no-op-component': xlayers.XconfigNoOpComponent,
-    'linear-component': xlayers.XconfigLinearComponent
+    'linear-component': xlayers.XconfigLinearComponent,
+    'preprocess-fft-abs-norm-lognorm-affine-log-layer': xlayers.XconfigFftFilterLayer,
+    'preprocess-fft-abs-lognorm-affine-log-layer': xlayers.XconfigFftFilterLayer,
+    'preprocess-fft-abs-log-layer': xlayers.XconfigFftFilterLayer,
+    'preprocess-tconv-abs-log-nin-affine-layer': xlayers.XconfigTimeDomainLayer
 }

 # Turn a config line and a list of previous layers into
diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/raw_signal.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/raw_signal.py
new file mode 100644
index 00000000000..d2cffe04cc7
--- /dev/null
+++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/raw_signal.py
@@ -0,0 +1,552 @@
+# Copyright 2017 Pegah Ghahremani
+# Apache 2.0.
+
+""" This module contains layer types for processing raw waveform frames.
+"""
+
+from __future__ import print_function
+import math
+import re
+import sys
+from libs.nnet3.xconfig.basic_layers import XconfigLayerBase
+
+# This class is used for frequency-domain filter learning.
+# It parses lines like
+#   'preprocess-fft-abs-lognorm-affine-log-layer num-filters=100 l2-regularize=0.001'
+# The stages are:
+# preprocess : applies mean subtraction, pre-emphasis and windowing to the
+#              input frames.
+# fft : computes the real and imaginary parts of the discrete Fourier
+#       transform, using fixed sine and cosine transforms.
+# abs : computes the magnitude of the complex FFT output.
+# lognorm : normalizes the input in log-space using batchnorm followed by a
+#           per-element scale and offset.
+# affine : filterbank learned using AffineComponent.
+
+class XconfigFftFilterLayer(XconfigLayerBase):
+    def __init__(self, first_token, key_to_value, prev_names = None):
+        # Here we just list some likely combinations; you can add any
+        # combinations you want to use to this list.
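+        # Example of the dimensions involved (numbers are illustrative only):
+        # with input dim 80 (10ms of 8kHz audio), fft_dim = 2**(79).bit_length()
+        # = 128, the abs stage keeps fft_dim/2 = 64 magnitude bins, and the
+        # affine stage maps them to num-filters (default 100) learned filters.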
+        assert first_token in ['preprocess-fft-abs-lognorm-affine-log-layer',
+                               'preprocess-fft-abs-norm-lognorm-affine-log-layer',
+                               'preprocess-fft-abs-norm-affine-log-layer',
+                               'preprocess-fft-abs-log-layer']
+        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)
+
+    def set_default_configs(self):
+        # l2-regularize, min-param-value and max-param-value affect the
+        # affine (filterbank) layer.
+        self.config = {'input': '[-1]',
+                       'dim': -1,
+                       'max-change': 0.75,
+                       'target-rms': 1.0,
+                       'learning-rate-factor': 1.0,
+                       'max-param-value': 1.0,
+                       'min-param-value': 0.0,
+                       'l2-regularize': 0.005,
+                       'write-init-config': True,
+                       'num-filters': 100,
+                       'sin-transform-file': '',
+                       'cos-transform-file': '',
+                       'scale': 1.0,
+                       'half-fft-range': False}
+
+    def check_configs(self):
+        if self.config['target-rms'] < 0.0:
+            raise RuntimeError("target-rms has invalid value {0}"
+                               .format(self.config['target-rms']))
+        if self.config['learning-rate-factor'] <= 0.0:
+            raise RuntimeError("learning-rate-factor has invalid value {0}"
+                               .format(self.config['learning-rate-factor']))
+        if self.config['max-param-value'] < self.config['min-param-value']:
+            raise RuntimeError("max-param-value {0} should be larger than "
+                               "min-param-value {1}."
+                               "".format(self.config['max-param-value'],
+                                         self.config['min-param-value']))
+
+        if self.config['sin-transform-file'] == '':
+            raise RuntimeError("sin-transform-file must be set.")
+
+        if self.config['cos-transform-file'] == '':
+            raise RuntimeError("cos-transform-file must be set.")
+
+    def output_name(self, auxiliary_output=None):
+        assert auxiliary_output == None
+
+        split_layer_name = self.layer_type.split('-')
+        assert split_layer_name[-1] == 'layer'
+        last_nonlinearity = split_layer_name[-2]
+        return '{0}.{1}'.format(self.name, last_nonlinearity)
+
+
+    def output_dim(self):
+        split_layer_name = self.layer_type.split('-')
+        if 'affine' in split_layer_name:
+            output_dim = self.config['num-filters']
+            if 'norm' in split_layer_name:
+                output_dim = output_dim + 1
+        else:
+            input_dim = self.descriptors['input']['dim']
+            fft_dim = (2**(input_dim-1).bit_length())
+            half_fft_range = self.config['half-fft-range']
+            output_dim = (fft_dim/2 if half_fft_range is True else fft_dim)
+        return output_dim
+
+    def get_full_config(self):
+        ans = []
+        config_lines = self._generate_config()
+
+        for line in config_lines:
+            if len(line) == 2:
+                # an ('init'|'ref'|'final', line) tuple already exists in the
+                # line; these lines correspond to the fft stage, which uses
+                # FixedAffineComponent.
+                assert(line[0] == 'init' or line[0] == 'ref' or line[0] == 'final')
+                ans.append(line)
+            else:
+                for config_name in ['ref', 'final']:
+                    ans.append((config_name, line))
+        return ans
+
+    def _generate_config(self):
+        split_layer_name = self.layer_type.split('-')
+        assert split_layer_name[-1] == 'layer'
+        nonlinearities = split_layer_name[:-1]
+
+        # by 'descriptor_final_string' we mean a string that can appear in
+        # config-files, i.e. it contains the 'final' names of nodes.
+        input_desc = self.descriptors['input']['final-string']
+        input_dim = self.descriptors['input']['dim']
+
+        # the child classes e.g.
tdnn might want to process the input + # before adding the other components + + return self._add_components(input_desc, input_dim, nonlinearities) + + def _add_components(self, input_desc, input_dim, nonlinearities): + dim = self.config['dim'] + min_param_value = self.config['min-param-value'] + max_param_value = self.config['max-param-value'] + target_rms = self.config['target-rms'] + max_change = self.config['max-change'] + #ng_affine_options = self.config['ng-affine-options'] + learning_rate_factor= self.config['learning-rate-factor'] + learning_rate_option=('learning-rate-factor={0}'.format(learning_rate_factor) + if learning_rate_factor != 1.0 else '') + cos_file = self.config['cos-transform-file'] + sin_file = self.config['sin-transform-file'] + num_filters = self.config['num-filters'] + l2_regularize = self.config['l2-regularize'] + half_fft_range = self.config['half-fft-range'] + fft_dim = (2**(input_dim-1).bit_length()) + cur_dim = input_dim + cur_node = input_desc + scale = self.config['scale'] + configs = [] + for nonlinearity in nonlinearities: + if nonlinearity == 'preprocess': + configs.append('component name={0}.preprocess type=ShiftInputComponent ' + 'input-dim={1} output-dim={1} dither=0.0 max-shift=0.0 ' + 'preprocess=true'.format(self.name, cur_dim)) + + configs.append('component-node name={0}.preprocess ' + 'component={0}.preprocess input={1}' + ''.format(self.name, cur_node)) + cur_node = '{0}.preprocess'.format(self.name) + + elif nonlinearity == 'fft': + #if self.config['write-init-config']: + # line = ('output-node name=output input={0}' + # ''.format(input_desc)) + # configs.append(('init', line)) + output_dim = (fft_dim/2 if half_fft_range is True else fft_dim) + line = ('component name={0}.cosine type=FixedAffineComponent ' + 'matrix={1}' + ''.format(self.name, cos_file)) + configs.append(('final', line)) + + line = ('component name={0}.cosine type=FixedAffineComponent ' + 'input-dim={1} output-dim={2}' + ''.format(self.name, cur_dim, output_dim)) + configs.append(('ref', line)) + + line = ('component-node name={0}.cosine component={0}.cosine ' + 'input={1}'.format(self.name, cur_node)) + configs.append(('final', line)) + configs.append(('ref', line)) + + line = ('component name={0}.sine type=FixedAffineComponent ' + 'matrix={1}'.format(self.name, sin_file)) + configs.append(('final', line)) + + line = ('component name={0}.sine type=FixedAffineComponent ' + 'input-dim={1} output-dim={2}' + ''.format(self.name, cur_dim, output_dim)) + configs.append(('ref', line)) + + line = ('component-node name={0}.sine component={0}.sine ' + 'input={1}'.format(self.name, cur_node)) + configs.append(('final', line)) + configs.append(('ref', line)) + + cur_node = [] + if half_fft_range: + cur_node.append('{0}.cosine'.format(self.name)) + cur_node.append('{0}.sine'.format(self.name)) + else: + configs.append('dim-range-node name={0}.sine.half input-node={0}.sine ' + 'dim-offset=0 dim={1}'.format(self.name, fft_dim/2)) + configs.append('dim-range-node name={0}.cosine.half input-node={0}.cosine ' + 'dim-offset=0 dim={1}'.format(self.name, fft_dim/2)) + cur_node.append('{0}.cosine.half'.format(self.name)) + cur_node.append('{0}.sine.half'.format(self.name)) + cur_dim = fft_dim / 2 + elif nonlinearity == 'abs2': + assert(len(cur_node) == 2 and + cur_node[0] == '{0}.cosine'.format(self.name) and + cur_node[1] == '{0}.sine'.format(self.name)) + configs.append('component name={0}.cos.sqr type=ElementwiseProductComponent ' + 'input-dim={1} output-dim={2}' + ''.format(self.name, cur_dim * 
2, cur_dim))
+                configs.append('component-node name={0}.cos.sqr component={0}.cos.sqr '
+                               'input=Append({1},{1})'
+                               ''.format(self.name, cur_node[0]))
+
+                configs.append('component name={0}.sin.sqr type=ElementwiseProductComponent '
+                               'input-dim={1} output-dim={2}'
+                               ''.format(self.name, cur_dim * 2, cur_dim))
+                configs.append('component-node name={0}.sin.sqr component={0}.sin.sqr '
+                               'input=Append({1},{1})'
+                               ''.format(self.name, cur_node[1]))
+                configs.append('component name={0}.abs type=NoOpComponent dim={1}'
+                               ''.format(self.name, cur_dim))
+                configs.append('component-node name={0}.abs component={0}.abs '
+                               'input=Sum({0}.sin.sqr, {0}.cos.sqr)'
+                               ''.format(self.name))
+                cur_node = '{0}.abs'.format(self.name)
+
+            elif nonlinearity == 'abs':
+                assert(len(cur_node) == 2 and
+                       cur_node[0] == '{0}.cosine'.format(self.name) and
+                       cur_node[1] == '{0}.sine'.format(self.name))
+                permute_vec = []
+                for i in range(fft_dim/2):
+                    permute_vec.append(i)
+                    permute_vec.append(i+fft_dim/2)
+                permute_vec_str = ','.join([str(x) for x in permute_vec])
+                configs.append('component name={0}.permute type=PermuteComponent '
+                               'column-map={1}'.format(self.name, permute_vec_str))
+                configs.append('component-node name={0}.permute component={0}.permute '
+                               'input=Append({1},{2})'
+                               ''.format(self.name, cur_node[0], cur_node[1]))
+
+                configs.append('component name={0}.abs type=PnormComponent '
+                               'input-dim={1} output-dim={2}'
+                               ''.format(self.name, fft_dim, fft_dim/2))
+                configs.append('component-node name={0}.abs component={0}.abs '
+                               'input={0}.permute'.format(self.name))
+                cur_node = '{0}.abs'.format(self.name)
+                cur_dim = fft_dim / 2
+            elif nonlinearity == 'norm':
+                assert(isinstance(cur_node, str))
+                configs.append('component name={0}.norm type=NormalizeComponent '
+                               'dim={1} target-rms=1.0 add-log-stddev=true '.format(self.name, cur_dim))
+                configs.append('component-node name={0}.norm component={0}.norm '
+                               'input={1}'.format(self.name, cur_node))
+                configs.append('dim-range-node name={0}.norm.no.energy input-node={0}.norm '
+                               'dim-offset=0 dim={1}'.format(self.name, cur_dim))
+                configs.append('dim-range-node name={0}.norm.energy input-node={0}.norm '
+                               'dim-offset={1} dim=1'.format(self.name, cur_dim))
+                cur_node = '{0}.norm.no.energy'.format(self.name)
+                cur_dim = fft_dim / 2
+            elif nonlinearity == 'lognorm':
+                assert(isinstance(cur_node, str))
+                configs.append('component name={0}.norm.log type=LogComponent '
+                               'dim={1} log-floor=1e-4 additive-offset=false '
+                               ''.format(self.name, cur_dim))
+                configs.append('component-node name={0}.norm.log component={0}.norm.log '
+                               'input={1}'.format(self.name, cur_node))
+                configs.append('component name={0}.norm.batch type=BatchNormComponent '
+                               'dim={1} target-rms={2} '
+                               ''.format(self.name, cur_dim, target_rms))
+                configs.append('component-node name={0}.norm.batch '
+                               'component={0}.norm.batch '
+                               'input={0}.norm.log'.format(self.name))
+                configs.append('component name={0}.norm.so type=ScaleAndOffsetComponent '
+                               'dim={1} max-change=0.5 scale={2}'
+                               ''.format(self.name, cur_dim, scale))
+                configs.append('component-node name={0}.norm.so component={0}.norm.so '
+                               'input={0}.norm.batch '.format(self.name))
+                configs.append('component name={0}.norm.exp type=ExpComponent dim={1} '
+                               ''.format(self.name, cur_dim))
+                configs.append('component-node name={0}.norm.exp component={0}.norm.exp '
+                               'input={0}.norm.so'.format(self.name))
+                cur_node = '{0}.norm.exp'.format(self.name)
+                cur_dim = fft_dim / 2
+
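+                # Net effect of the lognorm stage, as a sketch: if batchnorm
+                # has per-bin mean m and stddev s, and ScaleAndOffsetComponent
+                # learns scale a and offset b, then (up to the target-rms
+                # factor) y = exp(a*(log(x)-m)/s + b) = exp(b)*(x/exp(m))**(a/s),
+                # i.e. a learned per-bin power-law compression of the spectrum.
+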
+            elif nonlinearity == 'lognorm2':
+                configs.append("component name={0}.lognorm type=CompositeComponent "
+                               "num-components=4 "
+                               "component1='type=LogComponent dim={1} log-floor=1e-4 additive-offset=false' "
+                               "component2='type=BatchNormComponent dim={1} target-rms={2}' "
+                               "component3='type=ScaleAndOffsetComponent dim={1} max-change=0.5' "
+                               "component4='type=ExpComponent dim={1}' "
+                               "".format(self.name, cur_dim, target_rms))
+                configs.append('component-node name={0}.lognorm '
+                               'component={0}.lognorm input={1}'
+                               ''.format(self.name, cur_node))
+
+                cur_node = '{0}.lognorm'.format(self.name)
+                cur_dim = fft_dim / 2
+
+            elif nonlinearity == 'affine':
+                configs.append('component name={0}.filterbank type=AffineComponent '
+                               'input-dim={1} output-dim={2} max-change={3} '
+                               'min-param-value={4} max-param-value={5} '
+                               'bias-stddev=0.0 l2-regularize={6}'
+                               ''.format(self.name, cur_dim, num_filters, max_change,
+                                         min_param_value, max_param_value,
+                                         l2_regularize))
+                configs.append('component-node name={0}.filterbank '
+                               'component={0}.filterbank input={1}'
+                               ''.format(self.name, cur_node))
+                cur_node = '{0}.filterbank'.format(self.name)
+                cur_dim = num_filters
+            elif nonlinearity == 'log':
+                configs.append('component name={0}.log type=LogComponent '
+                               'log-floor=1e-4 additive-offset=false dim={1}'
+                               ''.format(self.name, cur_dim))
+
+                if 'norm' in nonlinearities:
+                    configs.append('component-node name={0}.log0 '
+                                   'component={0}.log input={1}'
+                                   ''.format(self.name, cur_node))
+                    configs.append('component name={0}.log.sum type=NoOpComponent '
+                                   'dim={1}'.format(self.name, cur_dim+1))
+                    configs.append('component-node name={0}.log component={0}.log.sum '
+                                   'input=Append({0}.log0, {0}.norm.energy)'
+                                   ''.format(self.name))
+                    cur_dim = fft_dim / 2 + 1
+                else:
+                    configs.append('component-node name={0}.log '
+                                   'component={0}.log input={1}'
+                                   ''.format(self.name, cur_node))
+                    cur_dim = fft_dim / 2
+                cur_node = '{0}.log'.format(self.name)
+
+            else:
+                raise RuntimeError("Unknown nonlinearity type: {0}"
+                                   "".format(nonlinearity))
+        return configs
+
+class XconfigTimeDomainLayer(XconfigLayerBase):
+    def __init__(self, first_token, key_to_value, prev_names = None):
+        assert first_token in ['preprocess-tconv-abs-log-nin-affine-layer']
+        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)
+
+    def set_default_configs(self):
+        self.config = {'input': '[-1]',
+                       'dim': -1,
+                       'frame-dim': 80,
+                       'max-change': 0.75,
+                       'num-filters': 100,
+                       'log-floor': 0.0001,
+                       'nin-mid-dim': 75,
+                       'nin-forward-dim': 500,
+                       'sub-frames-per-frame': 8,
+                       'frames-left-context': 1,
+                       'frames-right-context': 0,
+                       'max-shift': 0.2}
+
+
+    def check_configs(self):
+        if self.config['frames-left-context'] < 0:
+            raise RuntimeError("frames-left-context should be >= 0, got {0}."
+                               "".format(self.config['frames-left-context']))
+        if self.config['frames-right-context'] < 0:
+            raise RuntimeError("frames-right-context should be >= 0, got {0}."
+                               "".format(self.config['frames-right-context']))
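+
+    # Shapes, for orientation (numbers are the defaults, not requirements):
+    # with frame-dim=80 and sub-frames-per-frame=8 the convolution advances
+    # filt-x-step = 80/8 = 10 samples at a time, so each 80-sample frame
+    # yields 8 sub-frame outputs per filter before pooling across context.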
+
+
+    def output_name(self, auxiliary_output=None):
+        assert auxiliary_output == None
+
+        split_layer_name = self.layer_type.split('-')
+        assert split_layer_name[-1] == 'layer'
+        last_nonlinearity = split_layer_name[-2]
+        if last_nonlinearity == 'affine':
+            return '{0}.post.forward'.format(self.name)
+
+    def output_dim(self):
+        split_layer_name = self.layer_type.split('-')
+        assert split_layer_name[-2] == 'affine'
+        return self.config['nin-forward-dim']
+
+    def get_full_config(self):
+        ans = []
+        config_lines = self._generate_config()
+
+        for line in config_lines:
+            if len(line) == 2:
+                # an ('init'|'ref'|'final', line) tuple already exists in the
+                # line; these correspond to components with fixed parameters.
+                assert(line[0] == 'init' or line[0] == 'ref' or line[0] == 'final')
+                ans.append(line)
+            else:
+                for config_name in ['ref', 'final']:
+                    ans.append((config_name, line))
+        return ans
+
+    def _generate_config(self):
+        split_layer_name = self.layer_type.split('-')
+        assert split_layer_name[-1] == 'layer'
+        nonlinearities = split_layer_name[:-1]
+
+        # by 'descriptor_final_string' we mean a string that can appear in
+        # config-files, i.e. it contains the 'final' names of nodes.
+        input_desc = self.descriptors['input']['final-string']
+        input_dim = self.descriptors['input']['dim']
+
+        # the child classes e.g. tdnn might want to process the input
+        # before adding the other components
+        return self._add_components(input_desc, input_dim, nonlinearities)
+
+
+    def _add_components(self, input_desc, input_dim, nonlinearities):
+        dim = self.config['dim']
+        frame_dim = self.config['frame-dim']
+        max_change = self.config['max-change']
+        nin_mid_dim = self.config['nin-mid-dim']
+        pool_left_context = self.config['frames-left-context']
+        pool_right_context = self.config['frames-right-context']
+        nin_forward_dim = self.config['nin-forward-dim']
+        log_floor = self.config['log-floor']
+        num_filters = self.config['num-filters']
+        samples_per_sub_frame = frame_dim / self.config['sub-frames-per-frame']
+        filter_step = samples_per_sub_frame
+        filter_dim = input_dim - (frame_dim if 'preprocess' in nonlinearities else 0) - frame_dim + filter_step
+        cur_node = input_desc
+        cur_dim = input_dim
+        configs = []
+        for nonlinearity in nonlinearities:
+            if nonlinearity == 'preprocess':
+                configs.append('component name={0}.preprocess type=ShiftInputComponent '
+                               'input-dim={1} output-dim={2} dither=0.0 max-shift={3} '
+                               'preprocess=true '.format(self.name, cur_dim,
+                                                         cur_dim - frame_dim,
+                                                         self.config['max-shift']))
+
+                configs.append('component-node name={0}.preprocess '
+                               'component={0}.preprocess input={1}'
+                               ''.format(self.name, cur_node))
+                cur_node = '{0}.preprocess'.format(self.name)
+                cur_dim = cur_dim - frame_dim
+
+            elif nonlinearity == 'tconv':
+                # add ConvolutionComponent and PermuteComponent
+                configs.append('component name={0}.tconv type=ConvolutionComponent '
+                               'input-x-dim={1} input-y-dim=1 input-z-dim=1 '
+                               'filt-x-dim={2} filt-y-dim=1 filt-x-step={3} '
+                               'filt-y-step=1 num-filters={4} '
+                               'input-vectorization-order=zyx param-stddev={5} '
+                               'bias-stddev=0.01 max-change={6}'
+                               ''.format(self.name, cur_dim, filter_dim,
+                                         filter_step, num_filters,
+                                         0.9 / (filter_dim**0.5),
+                                         max_change))
+
+                configs.append('component-node name={0}.tconv '
+                               'component={0}.tconv input={1}'
+                               ''.format(self.name, cur_node))
+
+                # add PermuteComponent and append the filter outputs.
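+                # Tiny example of the column map (illustrative sizes): with
+                # num-filters=2 and conv_output_dim=3 the map is [0,2,4,1,3,5],
+                # i.e. the appended activations are regrouped so that all
+                # time-copies of filter 0 come first, then those of filter 1.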
+ conv_output_dim = self.config['sub-frames-per-frame'] * (pool_left_context + pool_right_context + 1) + permute_vec = [] + for i in range(num_filters): + for j in range(conv_output_dim): + permute_vec.append(i+j*num_filters) + permute_vec_str = ','.join([str(x) for x in permute_vec]) + configs.append('component name={0}.permute type=PermuteComponent ' + 'column-map={1}' + ''.format(self.name, permute_vec_str)) + append_str = ','.join(['Offset({0}.tconv,{1})'.format(self.name, x) for x in range(-1*pool_left_context, pool_right_context+1)]) + configs.append('component-node name={0}.permute ' + 'component={0}.permute input=Append({1})' + ''.format(self.name, append_str)) + + cur_node = '{0}.permute'.format(self.name) + cur_dim = num_filters * conv_output_dim + + elif nonlinearity == 'abs': + configs.append('component name={0}.abs type=PnormComponent ' + 'input-dim={1} output-dim={1}' + ''.format(self.name, cur_dim)) + configs.append('component-node name={0}.abs component={0}.abs ' + 'input={1}'.format(self.name, cur_node)) + + cur_node = '{0}.abs'.format(self.name) + cur_dim = cur_dim + + elif nonlinearity == 'log': + configs.append('component name={0}.log type=LogComponent ' + 'dim={1} log-floor={2} additive-offset=false ' + ''.format(self.name, cur_dim, log_floor)) + configs.append('component-node name={0}.log component={0}.log ' + 'input={1}'.format(self.name, cur_node)) + + cur_node = '{0}.log'.format(self.name) + cur_dim = cur_dim + + elif nonlinearity == 'nin': + configs.append("component name={0}.nin type=CompositeComponent " + "num-components=4 " + "component1='type=RectifiedLinearComponent dim={1} self-repair-scale=1e-05' " + "component2='type=NaturalGradientRepeatedAffineComponent input-dim={1} output-dim={2} num-repeats={3} param-stddev={4} bias-stddev=0' " + "component3='type=RectifiedLinearComponent dim={2} self-repair-scale=1e-05' " + "component4='type=NaturalGradientRepeatedAffineComponent input-dim={2} output-dim={1} num-repeats={3} param-stddev={5} bias-mean=0.1 bias-stddev=0 ' " + "".format(self.name, cur_dim, nin_mid_dim * num_filters, + num_filters, 2.0 / (cur_dim**0.5), + 2.0 / (nin_mid_dim * num_filters)**0.5)) + + configs.append('component-node name={0}.nin component={0}.nin ' + 'input={1}' + ''.format(self.name, cur_node)) + configs.append("component name={0}.post.nin type=CompositeComponent " + "num-components=2 component1='type=RectifiedLinearComponent dim={1} self-repair-scale=1e-05' " + "component2='type=NormalizeComponent dim={1} add-log-stddev=true '" + "".format(self.name, cur_dim)) + configs.append('component-node name={0}.post.nin component={0}.post.nin input={0}.nin' + ''.format(self.name)) + + cur_node= '{0}.post.nin'.format(self.name) + cur_dim = cur_dim + 1 + + elif nonlinearity == 'affine': + configs.append('component name={0}.forward.nin type=NaturalGradientAffineComponent ' + 'input-dim={1} output-dim={2} bias-stddev=0' + ''.format(self.name, cur_dim, nin_forward_dim)) + configs.append('component-node name={0}.forward.nin component={0}.forward.nin ' + 'input={1}'.format(self.name, cur_node)) + configs.append("component name={0}.post.forward type=CompositeComponent num-components=2 " + "component1='type=RectifiedLinearComponent dim={1} self-repair-scale=1e-05' " + "component2='type=NormalizeComponent dim={1}'" + "".format(self.name, nin_forward_dim)) + configs.append('component-node name={0}.post.forward component={0}.post.forward ' + 'input={0}.forward.nin'.format(self.name, cur_node)) + + cur_node = '{0}.post.forward'.format(self.name) + cur_dim = 
nin_forward_dim
+
+            else:
+                raise RuntimeError("Unknown nonlinearity type: {0}"
+                                   "".format(nonlinearity))
+        return configs
diff --git a/egs/wsj/s5/steps/make_raw_feats.sh b/egs/wsj/s5/steps/make_raw_feats.sh
new file mode 100755
index 00000000000..34d6e066626
--- /dev/null
+++ b/egs/wsj/s5/steps/make_raw_feats.sh
@@ -0,0 +1,129 @@
+#!/bin/bash
+
+# Copyright 2016 Pegah Ghahremani
+# Apache 2.0
+# To be run from .. (one directory up from here)
+# see ../run.sh for example
+
+# Begin configuration section.
+nj=4
+cmd=run.pl
+compress=true
+compress_method=3
+raw_config=conf/raw.conf
+write_utt2num_frames=false  # if true, writes utt2num_frames
+# End configuration section.

+echo "$0 $@"  # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+if [ $# -lt 1 ] || [ $# -gt 3 ]; then
+  echo "Usage: $0 [options] <data-dir> [<log-dir> [<feat-dir>]]";
+  echo "e.g.: $0 data/train exp/make_raw_feats/train raw_feats"
+  echo "options: "
+  echo "  --nj <nj>                                        # number of parallel jobs"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  exit 1;
+fi
+
+data=$1
+if [ $# -ge 2 ]; then
+  logdir=$2
+else
+  logdir=$data/log
+fi
+if [ $# -ge 3 ]; then
+  featdir=$3
+else
+  featdir=$data/data
+fi
+
+# make $featdir an absolute pathname.
+featdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $featdir ${PWD}`
+
+# use "name" as part of name of the archive.
+name=`basename $data`
+scp=$data/wav.scp
+
+mkdir -p $featdir || exit 1;
+mkdir -p $logdir || exit 1;
+
+if [ -f $data/feats.scp ]; then
+  mkdir -p $data/.backup
+  echo "$0: moving $data/feats.scp to $data/.backup"
+  mv $data/feats.scp $data/.backup
+fi
+
+required="$scp $raw_config"
+
+for f in $required; do
+  if [ ! -f $f ]; then
+    echo "$0: no such file $f"
+    exit 1;
+  fi
+done
+
+utils/validate_data_dir.sh --no-text --no-feats $data || exit 1;
+
+for n in $(seq $nj); do
+  # the next command does nothing unless $featdir/storage/ exists, see
+  # utils/create_data_link.pl for more info.
+  utils/create_data_link.pl $featdir/raw_wav_feat_$name.$n.ark
+done
+
+if $write_utt2num_frames; then
+  write_num_frames_opt="--write-num-frames=ark,t:$logdir/utt2num_frames.JOB"
+else
+  write_num_frames_opt=
+fi
+
+if [ -f $data/segments ]; then
+  echo "$0 [info]: segments file exists: using that."
+
+  split_segments=""
+  for n in $(seq $nj); do
+    split_segments="$split_segments $logdir/segments.$n"
+  done
+
+  utils/split_scp.pl $data/segments $split_segments || exit 1;
+  rm $logdir/.error 2>/dev/null
+
+  $cmd JOB=1:$nj $logdir/make_raw_feats_${name}.JOB.log \
+    extract-segments scp,p:$scp $logdir/segments.JOB ark:- \| \
+    compute-raw-frame-feats --config=$raw_config ark:- ark:- \| \
+    copy-feats $write_num_frames_opt \
+      --compress=$compress --compression-method=$compress_method ark:- \
+      ark,scp:$featdir/raw_wav_feat_$name.JOB.ark,$featdir/raw_wav_feat_$name.JOB.scp \
+    || exit 1;
+
+else
+  echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance."
+  split_scps=""
+  for n in $(seq $nj); do
+    split_scps="$split_scps $logdir/wav_${name}.$n.scp"
+  done
+
+  utils/split_scp.pl $scp $split_scps || exit 1;
+
+
+  # add ,p to the input rspecifier so that we can just skip over
+  # utterances that have bad wave data.
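+  # For a quick sanity check on a single recording, the same pipeline can be
+  # run by hand (paths here are illustrative):
+  #   head -1 data/train/wav.scp | compute-raw-frame-feats \
+  #     --config=conf/raw_no_mvn.conf scp,p:- ark,t:- | head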
+
+  $cmd JOB=1:$nj $logdir/make_raw_feats_${name}.JOB.log \
+    compute-raw-frame-feats --config=$raw_config \
+      scp,p:$logdir/wav_${name}.JOB.scp ark:- \| \
+    copy-feats $write_num_frames_opt --compress=$compress \
+      --compression-method=$compress_method ark:- \
+      ark,scp:$featdir/raw_wav_feat_$name.JOB.ark,$featdir/raw_wav_feat_$name.JOB.scp \
+    || exit 1;
+fi
+
+# concatenate the .scp files together.
+for n in $(seq $nj); do
+  cat $featdir/raw_wav_feat_$name.$n.scp || exit 1;
+done > $data/feats.scp
+
+rm $logdir/wav_${name}.*.scp $logdir/segments.* 2>/dev/null
+echo "$0: Succeeded generating raw waveform features for $data"
diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs_old.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs_old.sh
new file mode 100755
index 00000000000..daf1ad2f9ec
--- /dev/null
+++ b/egs/wsj/s5/steps/nnet3/chain/get_egs_old.sh
@@ -0,0 +1,428 @@
+#!/bin/bash
+
+# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
+#
+# This script, which will generally be called from other neural-net training
+# scripts, extracts the training examples used to train the 'chain' system
+# (and also the validation examples used for diagnostics), and puts them in
+# separate archives.
+#
+# This script dumps egs with many frames of labels, controlled by the
+# frames_per_eg config variable (default: 25), plus left and right context.
+# Because CTC training involves alignment of data, we can't meaningfully train
+# frame by frame.  The supervision approach involves the time alignment, though--
+# it is just applied in a loose way, where each symbol can appear in the
+# frame-range that it was in in the alignment, extended by a certain margin.
+#
+
+
+# Begin configuration section.
+cmd=run.pl
+frames_per_eg=25   # number of feature frames per example (not counting added
+                   # context).  more->less disk space and less time preparing
+                   # egs, but more I/O during training.
+frames_overlap_per_eg=0  # number of supervised frames of overlap that we aim for per eg.
+                   # can be useful to avoid wasted data if you're using --left-deriv-truncate
+                   # and --right-deriv-truncate.
+frame_subsampling_factor=3  # frames-per-second of features we train on divided
+                   # by frames-per-second at output of chain model
+alignment_subsampling_factor=3  # frames-per-second of input alignments divided
+                   # by frames-per-second at output of chain model
+left_context=4     # amount of left-context per eg (i.e. extra frames of input features
+                   # not present in the output supervision).
+right_context=4    # amount of right-context per eg.
+left_context_initial=-1   # if >=0, left-context for first chunk of an utterance
+right_context_final=-1    # if >=0, right-context for last chunk of an utterance
+compress=true      # set this to false to disable compression (e.g. if you want to see whether
+                   # results are affected).
+
+num_utts_subset=300      # number of utterances in validation and training
+                         # subsets used for shrinkage and diagnostics.
+num_valid_egs_combine=0  # number of validation examples for combination weights at the very end.
+num_train_egs_combine=1000  # number of train examples for the above.
+num_egs_diagnostic=400   # number of egs for the "compute_prob" jobs
+frames_per_iter=400000   # each iteration of training, see this many frames per
+                         # job, measured at the sampling rate of the features
+                         # used.  This is just a guideline; it will pick a number
+                         # that divides the number of samples in the entire data.
+
+right_tolerance=   # CTC right tolerance == max label delay.
+left_tolerance=
+
+transform_dir=     # If supplied, overrides latdir as the place to find fMLLR transforms
+
+stage=0
+nj=15         # This should be set to the maximum number of jobs you are
+              # comfortable to run in parallel; you can increase it if your disk
+              # speed is greater and you have more machines.
+max_shuffle_jobs_run=50  # the shuffle jobs now include the nnet3-chain-normalize-egs command,
+                         # which is fairly CPU intensive, so we can run quite a few at once
+                         # without overloading the disks.
+srand=0     # rand seed for nnet3-chain-get-egs, nnet3-chain-copy-egs and nnet3-chain-shuffle-egs
+online_ivector_dir=  # can be used if we are including speaker information as iVectors.
+cmvn_opts=  # can be used for specifying CMVN options, if feature type is not lda (if lda,
+            # it doesn't make sense to use different options than were used as input to the
+            # LDA transform).  This is used to turn off CMVN in the online-nnet experiments.
+
+echo "$0 $@"  # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+
+if [ $# != 4 ]; then
+  echo "Usage: $0 [opts] <data> <chain-dir> <lattice-dir> <egs-dir>"
+  echo " e.g.: $0 data/train exp/tri4_nnet exp/tri3_lats exp/tri4_nnet/egs"
+  echo ""
+  echo "From <chain-dir>, 0.trans_mdl (the transition-model), tree (the tree)"
+  echo "and normalization.fst (the normalization FST, derived from the denominator FST)"
+  echo "are read."
+  echo ""
+  echo "Main options (for others, see top of script file)"
+  echo "  --config <config-file>                  # config file containing options"
+  echo "  --nj <nj>                               # The maximum number of jobs you want to run in"
+  echo "                                          # parallel (increase this only if you have good disk and"
+  echo "                                          # network speed).  default=15"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --frames-per-iter <#samples;400000>     # Number of frames of data to process per iteration, per"
+  echo "                                          # process."
+  echo "  --frame-subsampling-factor <factor;3>   # factor by which num-frames at nnet output is reduced"
+  echo "  --frames-per-eg <frames;25>             # number of supervised frames per eg on disk"
+  echo "  --frames-overlap-per-eg <frames;0>      # number of supervised frames of overlap between egs"
+  echo "  --left-context <width;4>                # Number of frames on left side to append for feature input"
+  echo "  --right-context <width;4>               # Number of frames on right side to append for feature input"
+  echo "  --left-context-initial <width;-1>       # If >= 0, left-context for first chunk of an utterance"
+  echo "  --right-context-final <width;-1>        # If >= 0, right-context for last chunk of an utterance"
+  echo "  --num-egs-diagnostic <#frames;4000>     # Number of egs used in computing (train,valid) diagnostics"
+  echo "  --num-valid-egs-combine <#frames;10000> # Number of egs used in getting combination weights at the"
+  echo "                                          # very end."
+  echo "  --stage <stage|0>                       # Used to run a partially-completed training process from somewhere in"
+  echo "                                          # the middle."
+
+  exit 1;
+fi
+
+data=$1
+chaindir=$2
+latdir=$3
+dir=$4
+
+# Check some files.
+[ ! -z "$online_ivector_dir" ] && \
+  extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"
+
+for f in $data/feats.scp $latdir/lat.1.gz $latdir/final.mdl \
+         $chaindir/{0.trans_mdl,tree,normalization.fst} $extra_files; do
+  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
+done
+
+sdata=$data/split$nj
+utils/split_data.sh $data $nj
+
+mkdir -p $dir/log $dir/info
+
+num_lat_jobs=$(cat $latdir/num_jobs) || exit 1;
+
+# Get list of validation utterances.
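+# (An utterance passes the length filter below only if it can supply at least
+# one whole example: duration / frame_shift >= frames_per_eg; e.g. with 10ms
+# frames and the default frames_per_eg=25, it must be at least 0.25s long.)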
+ +frame_shift=$(utils/data/get_frame_shift.sh $data) || exit 1 +utils/data/get_utt2dur.sh $data + +cat $data/utt2dur | \ + awk -v min_len=$frames_per_eg -v fs=$frame_shift '{if ($2 * 1/fs >= min_len) print $1}' | \ + utils/shuffle_list.pl | head -$num_utts_subset > $dir/valid_uttlist || exit 1; + +len_uttlist=`wc -l $dir/valid_uttlist | awk '{print $1}'` +if [ $len_uttlist -lt $num_utts_subset ]; then + echo "Number of utterances which have length at least $frames_per_eg is really low. Please check your data." && exit 1; +fi + +if [ -f $data/utt2uniq ]; then # this matters if you use data augmentation. + # because of this stage we can again have utts with lengths less than + # frames_per_eg + echo "File $data/utt2uniq exists, so augmenting valid_uttlist to" + echo "include all perturbed versions of the same 'real' utterances." + mv $dir/valid_uttlist $dir/valid_uttlist.tmp + utils/utt2spk_to_spk2utt.pl $data/utt2uniq > $dir/uniq2utt + cat $dir/valid_uttlist.tmp | utils/apply_map.pl $data/utt2uniq | \ + sort | uniq | utils/apply_map.pl $dir/uniq2utt | \ + awk '{for(n=1;n<=NF;n++) print $n;}' | sort > $dir/valid_uttlist + rm $dir/uniq2utt $dir/valid_uttlist.tmp +fi + +cat $data/utt2dur | \ + awk -v min_len=$frames_per_eg -v fs=$frame_shift '{if ($2 * 1/fs >= min_len) print $1}' | \ + utils/filter_scp.pl --exclude $dir/valid_uttlist | \ + utils/shuffle_list.pl | head -$num_utts_subset > $dir/train_subset_uttlist || exit 1; +len_uttlist=`wc -l $dir/train_subset_uttlist | awk '{print $1}'` +if [ $len_uttlist -lt $num_utts_subset ]; then + echo "Number of utterances which have length at least $frames_per_eg is really low. Please check your data." && exit 1; +fi + +[ -z "$transform_dir" ] && transform_dir=$latdir + +# because we'll need the features with a different number of jobs than $latdir, +# copy to ark,scp. +if [ -f $transform_dir/raw_trans.1 ]; then + echo "$0: using raw transforms from $transform_dir" + if [ $stage -le 0 ]; then + $cmd $dir/log/copy_transforms.log \ + copy-feats "ark:cat $transform_dir/raw_trans.* |" "ark,scp:$dir/trans.ark,$dir/trans.scp" + fi +fi + +## Set up features. +echo "$0: feature type is raw" +###feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- |" +###valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" +###train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" +feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | copy-feats scp:- ark:- |" +valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | copy-feats scp:- ark:- |" +train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | copy-feats scp:- ark:- |" + +echo $cmvn_opts >$dir/cmvn_opts # caution: the top-level nnet training script should copy this to its own dir now. + +if [ -f $dir/trans.scp ]; then + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/trans.scp ark:- ark:- |" + valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" + train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" +fi + +if [ ! 
-z "$online_ivector_dir" ]; then + ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; + echo $ivector_dim > $dir/info/ivector_dim + steps/nnet2/get_ivector_id.sh $online_ivector_dir > $dir/info/final.ie.id || exit 1 + ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; + ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" +else + ivector_opts="" + echo 0 >$dir/info/ivector_dim +fi + +if [ $stage -le 1 ]; then + echo "$0: working out number of frames of training data" + num_frames=$(steps/nnet2/get_num_frames.sh $data) + echo $num_frames > $dir/info/num_frames + echo "$0: working out feature dim" + feats_one="$(echo $feats | sed s/JOB/1/g)" + if ! feat_dim=$(feat-to-dim "$feats_one" - 2>/dev/null); then + echo "Command failed (getting feature dim): feat-to-dim \"$feats_one\"" + exit 1 + fi + echo $feat_dim > $dir/info/feat_dim +else + num_frames=$(cat $dir/info/num_frames) || exit 1; + feat_dim=$(cat $dir/info/feat_dim) || exit 1; +fi + +# the + 1 is to round up, not down... we assume it doesn't divide exactly. +num_archives=$[$num_frames/$frames_per_iter+1] + +# We may have to first create a smaller number of larger archives, with number +# $num_archives_intermediate, if $num_archives is more than the maximum number +# of open filehandles that the system allows per process (ulimit -n). +# This sometimes gives a misleading answer as GridEngine sometimes changes the +# limit, so we limit it to 512. +max_open_filehandles=$(ulimit -n) || exit 1 +[ $max_open_filehandles -gt 512 ] && max_open_filehandles=512 +num_archives_intermediate=$num_archives +archives_multiple=1 +while [ $[$num_archives_intermediate+4] -gt $max_open_filehandles ]; do + archives_multiple=$[$archives_multiple+1] + num_archives_intermediate=$[$num_archives/$archives_multiple] || exit 1; +done +# now make sure num_archives is an exact multiple of archives_multiple. +num_archives=$[$archives_multiple*$num_archives_intermediate] || exit 1; + +echo $num_archives >$dir/info/num_archives +echo $frames_per_eg >$dir/info/frames_per_eg +# Work out the number of egs per archive +egs_per_archive=$[$num_frames/($frames_per_eg*$num_archives)] || exit 1; +! [ $egs_per_archive -le $frames_per_iter ] && \ + echo "$0: script error: egs_per_archive=$egs_per_archive not <= frames_per_iter=$frames_per_iter" \ + && exit 1; + +echo $egs_per_archive > $dir/info/egs_per_archive + +echo "$0: creating $num_archives archives, each with $egs_per_archive egs, with" +echo "$0: $frames_per_eg labels per example, and (left,right) context = ($left_context,$right_context)" +if [ $left_context_initial -ge 0 ] || [ $right_context_final -ge 0 ]; then + echo "$0: ... and (left-context-initial,right-context-final) = ($left_context_initial,$right_context_final)" +fi + + +if [ -e $dir/storage ]; then + # Make soft links to storage directories, if distributing this way.. See + # utils/create_split_dir.pl. 
+ echo "$0: creating data links" + utils/create_data_link.pl $(for x in $(seq $num_archives); do echo $dir/cegs.$x.ark; done) + for x in $(seq $num_archives_intermediate); do + utils/create_data_link.pl $(for y in $(seq $nj); do echo $dir/cegs_orig.$y.$x.ark; done) + done +fi + +if [ $stage -le 2 ]; then + echo "$0: copying training lattices" + + $cmd --max-jobs-run 6 JOB=1:$num_lat_jobs $dir/log/lattice_copy.JOB.log \ + lattice-copy "ark:gunzip -c $latdir/lat.JOB.gz|" ark,scp:$dir/lat.JOB.ark,$dir/lat.JOB.scp || exit 1; + + for id in $(seq $num_lat_jobs); do cat $dir/lat.$id.scp; done > $dir/lat.scp +fi + + +egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress" +[ $left_context_initial -ge 0 ] && egs_opts="$egs_opts --left-context-initial=$left_context_initial" +[ $right_context_final -ge 0 ] && egs_opts="$egs_opts --right-context-final=$right_context_final" + + +chain_supervision_all_opts="--lattice-input=true --frame-subsampling-factor=$alignment_subsampling_factor" +[ ! -z $right_tolerance ] && \ + chain_supervision_all_opts="$chain_supervision_all_opts --right-tolerance=$right_tolerance" + +[ ! -z $left_tolerance ] && \ + chain_supervision_all_opts="$chain_supervision_all_opts --left-tolerance=$left_tolerance" + +echo $left_context > $dir/info/left_context +echo $right_context > $dir/info/right_context +echo $left_context_initial > $dir/info/left_context_initial +echo $right_context_final > $dir/info/right_context_final + +if [ $stage -le 3 ]; then + echo "$0: Getting validation and training subset examples." + rm $dir/.error 2>/dev/null + echo "$0: ... extracting validation and training-subset alignments." + + # do the filtering just once, as lat.scp may be long. + utils/filter_scp.pl <(cat $dir/valid_uttlist $dir/train_subset_uttlist) \ + <$dir/lat.scp >$dir/lat_special.scp + + $cmd $dir/log/create_valid_subset.log \ + utils/filter_scp.pl $dir/valid_uttlist $dir/lat_special.scp \| \ + lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ + chain-get-supervision $chain_supervision_all_opts $chaindir/tree $chaindir/0.trans_mdl \ + ark:- ark:- \| \ + nnet3-chain-get-egs $ivector_opts --srand=$srand \ + $egs_opts $chaindir/normalization.fst \ + "$valid_feats" ark,s,cs:- "ark:$dir/valid_all.cegs" || touch $dir/.error & + $cmd $dir/log/create_train_subset.log \ + utils/filter_scp.pl $dir/train_subset_uttlist $dir/lat_special.scp \| \ + lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ + chain-get-supervision $chain_supervision_all_opts \ + $chaindir/tree $chaindir/0.trans_mdl ark:- ark:- \| \ + nnet3-chain-get-egs $ivector_opts --srand=$srand \ + $egs_opts $chaindir/normalization.fst \ + "$train_subset_feats" ark,s,cs:- "ark:$dir/train_subset_all.cegs" || touch $dir/.error & + wait; + [ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1 + echo "... Getting subsets of validation examples for diagnostics and combination." 
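+  # (valid_combine and train_combine are merged into combine.cegs below and
+  #  used when combining models at the end of training; the *_diagnostic.cegs
+  #  subsets feed the per-iteration compute_prob diagnostics.)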
+ $cmd $dir/log/create_valid_subset_combine.log \ + nnet3-chain-subset-egs --n=$num_valid_egs_combine ark:$dir/valid_all.cegs \ + ark:$dir/valid_combine.cegs || touch $dir/.error & + $cmd $dir/log/create_valid_subset_diagnostic.log \ + nnet3-chain-subset-egs --n=$num_egs_diagnostic ark:$dir/valid_all.cegs \ + ark:$dir/valid_diagnostic.cegs || touch $dir/.error & + + $cmd $dir/log/create_train_subset_combine.log \ + nnet3-chain-subset-egs --n=$num_train_egs_combine ark:$dir/train_subset_all.cegs \ + ark:$dir/train_combine.cegs || touch $dir/.error & + $cmd $dir/log/create_train_subset_diagnostic.log \ + nnet3-chain-subset-egs --n=$num_egs_diagnostic ark:$dir/train_subset_all.cegs \ + ark:$dir/train_diagnostic.cegs || touch $dir/.error & + wait + sleep 5 # wait for file system to sync. + cat $dir/valid_combine.cegs $dir/train_combine.cegs > $dir/combine.cegs + + for f in $dir/{combine,train_diagnostic,valid_diagnostic}.cegs; do + [ ! -s $f ] && echo "No examples in file $f" && exit 1; + done + rm $dir/valid_all.cegs $dir/train_subset_all.cegs $dir/{train,valid}_combine.cegs +fi + +if [ $stage -le 4 ]; then + # create cegs_orig.*.*.ark; the first index goes to $nj, + # the second to $num_archives_intermediate. + + egs_list= + for n in $(seq $num_archives_intermediate); do + egs_list="$egs_list ark:$dir/cegs_orig.JOB.$n.ark" + done + echo "$0: Generating training examples on disk" + + # The examples will go round-robin to egs_list. Note: we omit the + # 'normalization.fst' argument while creating temporary egs: the phase of egs + # preparation that involves the normalization FST is quite CPU-intensive and + # it's more convenient to do it later, in the 'shuffle' stage. Otherwise to + # make it efficient we need to use a large 'nj', like 40, and in that case + # there can be too many small files to deal with, because the total number of + # files is the product of 'nj' by 'num_archives_intermediate', which might be + # quite large. + $cmd JOB=1:$nj $dir/log/get_egs.JOB.log \ + utils/filter_scp.pl $sdata/JOB/utt2spk $dir/lat.scp \| \ + lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ + chain-get-supervision $chain_supervision_all_opts \ + $chaindir/tree $chaindir/0.trans_mdl ark:- ark:- \| \ + nnet3-chain-get-egs $ivector_opts --srand=\$[JOB+$srand] $egs_opts \ + --num-frames-overlap=$frames_overlap_per_eg \ + "$feats" ark,s,cs:- ark:- \| \ + nnet3-chain-copy-egs --random=true --srand=\$[JOB+$srand] ark:- $egs_list || exit 1; +fi + +if [ $stage -le 5 ]; then + echo "$0: recombining and shuffling order of archives on disk" + # combine all the "egs_orig.*.JOB.scp" (over the $nj splits of the data) and + # shuffle the order, writing to the egs.JOB.ark + + # the input is a concatenation over the input jobs. + egs_list= + for n in $(seq $nj); do + egs_list="$egs_list $dir/cegs_orig.$n.JOB.ark" + done + + if [ $archives_multiple == 1 ]; then # normal case. + $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ + nnet3-chain-normalize-egs $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \ + nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] ark:- ark:$dir/cegs.JOB.ark || exit 1; + else + # we need to shuffle the 'intermediate archives' and then split into the + # final archives. 
we create soft links to manage this splitting, because + # otherwise managing the output names is quite difficult (and we don't want + # to submit separate queue jobs for each intermediate archive, because then + # the --max-jobs-run option is hard to enforce). + output_archives="$(for y in $(seq $archives_multiple); do echo ark:$dir/cegs.JOB.$y.ark; done)" + for x in $(seq $num_archives_intermediate); do + for y in $(seq $archives_multiple); do + archive_index=$[($x-1)*$archives_multiple+$y] + # egs.intermediate_archive.{1,2,...}.ark will point to egs.archive.ark + ln -sf cegs.$archive_index.ark $dir/cegs.$x.$y.ark || exit 1 + done + done + $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ + nnet3-chain-normalize-egs $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \ + nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] ark:- ark:- \| \ + nnet3-chain-copy-egs ark:- $output_archives || exit 1; + fi +fi + +if [ $stage -le 6 ]; then + echo "$0: removing temporary archives" + ( + cd $dir + for f in $(ls -l . | grep 'cegs_orig' | awk '{ X=NF-1; Y=NF-2; if ($X == "->") print $Y, $NF; }'); do rm $f; done + # the next statement removes them if we weren't using the soft links to a + # 'storage' directory. + rm cegs_orig.*.ark 2>/dev/null + ) + if [ $archives_multiple -gt 1 ]; then + # there are some extra soft links that we should delete. + for f in $dir/cegs.*.*.ark; do rm $f; done + fi + echo "$0: removing temporary lattices" + rm $dir/lat.* + echo "$0: removing temporary alignments and transforms" + # Ignore errors below because trans.* might not exist. + rm $dir/{ali,trans}.{ark,scp} 2>/dev/null + +fi + +echo "$0: Finished preparing training examples" diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index cf2a1a42b66..77ca29bb0c7 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -99,6 +99,17 @@ def get_args(): dest='left_deriv_truncate', default=None, help="Deprecated. Kept for back compatibility") + parser.add_argument("--chain.fft-feat-dim", type=int, + dest='fft_feat_dim', default=0, + help="""If nonzero, the cosine and sine transformation + with dim fft_feat_dim and closest 2-power of that is + generated as configs/{cos,sin}_transform.mat. 
+                        """)
+    parser.add_argument("--chain.l1-regularize", type=float,
+                        dest='l1_regularize', default=0.0,
+                        help="""Weight of the regularization function, which is
+                        the l1-norm of the fft transform of the convolution
+                        filters in the network.""")

     # trainer options
     parser.add_argument("--trainer.input-model", type=str,
@@ -427,6 +438,22 @@ def train(args, run_opts):
                 rand_prune=args.rand_prune,
                 use_multitask_egs=use_multitask_egs)

+    if (args.l1_regularize != 0) or (args.fft_feat_dim != 0):
+        feat_dim = args.fft_feat_dim
+        add_bias = True
+        num_fft_bins = (2**(feat_dim-1).bit_length())
+        common_lib.write_sin_cos_transform_matrix(feat_dim, num_fft_bins,
+            "{0}/configs/cos_transform.mat".format(args.dir),
+            compute_cosine=True, add_bias=add_bias, half_range=True)
+        common_lib.write_sin_cos_transform_matrix(feat_dim, num_fft_bins,
+            "{0}/configs/sin_transform.mat".format(args.dir),
+            compute_cosine=False, add_bias=add_bias, half_range=True)
+        common_lib.write_negate_vector(num_fft_bins,
+            "{0}/configs/negate.vec".format(args.dir))
+        preemph = 0.97
+        common_lib.compute_and_write_preprocess_transform(preemph, feat_dim,
+            "{0}/configs/preprocess.mat".format(args.dir))
+
     if (args.stage <= -1):
         logger.info("Preparing the initial acoustic model.")
         chain_lib.prepare_initial_acoustic_model(args.dir, run_opts,
diff --git a/src/featbin/Makefile b/src/featbin/Makefile
index 8e72d0f744c..86ff1b0d79c 100644
--- a/src/featbin/Makefile
+++ b/src/featbin/Makefile
@@ -17,7 +17,7 @@ BINFILES = add-deltas add-deltas-sdc append-post-to-feats \
            process-kaldi-pitch-feats process-pitch-feats \
            select-feats shift-feats splice-feats subsample-feats \
            subset-feats transform-feats wav-copy wav-reverberate \
-           wav-to-duration
+           wav-to-duration compute-raw-frame-feats

 OBJFILES =
diff --git a/src/featbin/compute-raw-frame-feats.cc b/src/featbin/compute-raw-frame-feats.cc
new file mode 100644
index 00000000000..95e489ca59c
--- /dev/null
+++ b/src/featbin/compute-raw-frame-feats.cc
@@ -0,0 +1,122 @@
+// featbin/compute-raw-frame-feats.cc
+
+// Copyright 2015 Pegah Ghahremani
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "feat/wave-reader.h"
+#include "feat/feature-functions.h"
+#include "feat/feature-common.h"
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    const char *usage =
+        "Create raw-waveform feature files: starting from wave input, write for\n"
+        "each recording a matrix whose rows are consecutive fixed-duration,\n"
+        "non-overlapping chunks of raw samples.  Some post-processing can be\n"
+        "applied to the output.\n"
+        "Usage: compute-raw-frame-feats [options...] <wav-rspecifier> <feats-wspecifier>\n";
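+    // Example invocation (paths are illustrative, not from this patch):
+    //   compute-raw-frame-feats --config=conf/raw_no_mvn.conf \
+    //     scp:data/train/wav.scp ark:- | feat-to-dim ark:- -
+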
\n"; + + ParseOptions po(usage); + FrameExtractionOptions raw_opts; + raw_opts.frame_shift_ms = 10.0; + raw_opts.frame_length_ms = 10.0; + raw_opts.window_type = "rectangular"; + raw_opts.round_to_power_of_two = false; + raw_opts.remove_dc_offset = false; + raw_opts.preemph_coeff = 0.0; + raw_opts.dither = 0.0; + // Register raw frame extraction options + raw_opts.Register(&po); + bool remove_dc = true, + loudness_equalize = true; + BaseFloat low_rms =0.2, high_rms = 0.2, + scale_wav = 0.0; + po.Register("remove-global-dc-offset", &remove_dc, "If true, subtract mean from waveform on each wave"); + po.Register("loudness-equalize", &loudness_equalize, "If true, variance-normalization " + "is applied on output-wave"); + po.Register("low-rms", &low_rms, "The lowest variance of output-wave, where the variance" + "is randomly set between [low-rms, high-rms], and " + " the loudness of wave is equal to this randomly chosen rms."); + po.Register("high-rms", &high_rms, "The highest variance of output-wave, where the variance" + "is randomly set between [low-rms, high-rms], and " + " the loudness of wave is equal to this randomly chosen rms."); + po.Register("scale-wav", &scale_wav, "If non-zero, the raw waveform scaled using that."); + po.Read(argc, argv); + if (po.NumArgs() != 2) { + po.PrintUsage(); + exit(1); + } + std::string wav_rspecifier = po.GetArg(1); + std::string output_wspecifier = po.GetArg(2); + + SequentialTableReader wave_reader(wav_rspecifier); + BaseFloatMatrixWriter feat_writer(output_wspecifier); + + + int32 num_done = 0, num_err = 0; + for (; !wave_reader.Done(); wave_reader.Next()) { + std::string utt = wave_reader.Key(); + const WaveData &wave_data = wave_reader.Value(); + + // The channel is not configurable and it is better to extract it + // on command line using sox or... + int32 num_chan = wave_data.Data().NumRows(); + if (num_chan != 1) + KALDI_WARN << "You have data with " + << num_chan << " channels; defaulting to zero"; + + SubVector waveform(wave_data.Data(), 0); + Vector input(waveform.Dim()); + input.CopyRowFromMat(wave_data.Data(), 0); + BaseFloat mean = waveform.Sum() / waveform.Dim(); + // compute variance + input.Add(-mean); + BaseFloat variance = std::pow(VecVec(input, input) / waveform.Dim(), 0.5); + + // remove DC offset + if (remove_dc) + waveform.Add(-1.0 * mean); + + // apply variance normalization + BaseFloat target_rms = low_rms + RandUniform() * (high_rms - low_rms); + if (loudness_equalize && variance != 0) + waveform.Scale(target_rms * 1.0 / variance); + + if (scale_wav != 0.0) + waveform.Scale(scale_wav); + + Matrix raw_mat; + try { + FeatureWindowFunction window_function(raw_opts); + int32 rows_out = NumFrames(waveform.Dim(), raw_opts), + cols_out = raw_opts.WindowSize(); + raw_mat.Resize(rows_out, cols_out); + for (int32 frame = 0; frame < rows_out; frame++) { + Vector raw_feat(cols_out); + ExtractWindow(0, waveform, frame, raw_opts, + window_function, &raw_feat); + raw_mat.CopyRowFromVec(raw_feat, frame); + + } + //ComputeAndProcessRawSignal(raw_opts, waveform, &raw_feats); + } catch (...) { + KALDI_WARN << "Failed to extract raw-feats for utterance " + << utt; + num_err++; + continue; + } + + feat_writer.Write(utt, raw_mat); + if (num_done % 50 == 0 && num_done != 0) + KALDI_VLOG(2) << "Processed " << num_done << " utterances"; + num_done++; + } + KALDI_LOG << " Done " << num_done << " utterances, " << num_err + << " with errors."; + return (num_done != 0 ? 
diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc
index 2ec2699ec97..7ecf01aa54f 100644
--- a/src/nnet3/nnet-chain-training.cc
+++ b/src/nnet3/nnet-chain-training.cc
@@ -202,6 +202,9 @@ void NnetChainTrainer::TrainInternal(const NnetChainExample &eg,
                            nnet_config.max_param_change,
                            1.0, 1.0 - nnet_config.momentum, nnet_,
                            &num_max_change_per_component_applied_, &num_max_change_global_applied_);
+  // Apply any configured min/max parameter constraints (e.g. max(W, 0) to
+  // impose positivity on AffineComponent weights).
+  PositiveUpdatableWeights(nnet_);
+
   // Scale down the batchnorm stats (keeps them fresh... this affects what
   // happens when we use the model with batchnorm test-mode set).
   ScaleBatchnormStats(nnet_config.batchnorm_stats_scale, nnet_);
diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc
index c73f3fb921d..2d05f6acc61 100644
--- a/src/nnet3/nnet-component-itf.cc
+++ b/src/nnet3/nnet-component-itf.cc
@@ -174,6 +174,12 @@ Component* Component::NewComponentOfType(const std::string &component_type) {
     ans = new RestrictedAttentionComponent();
   } else if (component_type == "SumBlockComponent") {
     ans = new SumBlockComponent();
+  } else if (component_type == "ShiftInputComponent") {
+    ans = new ShiftInputComponent();
+  } else if (component_type == "LogComponent") {
+    ans = new LogComponent();
+  } else if (component_type == "ExpComponent") {
+    ans = new ExpComponent();
   } else if (component_type == "ScaleAndOffsetComponent") {
     ans = new ScaleAndOffsetComponent();
   }
@@ -218,6 +224,8 @@ UpdatableComponent::UpdatableComponent(const UpdatableComponent &other):
     learning_rate_factor_(other.learning_rate_factor_),
     l2_regularize_(other.l2_regularize_),
     is_gradient_(other.is_gradient_),
+    min_param_value_(other.min_param_value_),
+    max_param_value_(other.max_param_value_),
     max_change_(other.max_change_) { }

@@ -228,6 +236,8 @@ void UpdatableComponent::SetUpdatableConfigs(
   l2_regularize_ = other.l2_regularize_;
   is_gradient_ = other.is_gradient_;
   max_change_ = other.max_change_;
+  max_param_value_ = other.max_param_value_;
+  min_param_value_ = other.min_param_value_;
 }

 // If these defaults are changed, the defaults in the constructor that
@@ -244,6 +254,13 @@ void UpdatableComponent::InitLearningRatesFromConfig(ConfigLine *cfl) {
   if (learning_rate_ < 0.0 || learning_rate_factor_ < 0.0 ||
       max_change_ < 0.0 || l2_regularize_ < 0.0)
     KALDI_ERR << "Bad initializer " << cfl->WholeLine();
+  BaseFloat min_param_value = std::numeric_limits<BaseFloat>::lowest(),
+      max_param_value = std::numeric_limits<BaseFloat>::max();
+  cfl->GetValue("min-param-value", &min_param_value);
+  cfl->GetValue("max-param-value", &max_param_value);
+  KALDI_ASSERT(min_param_value < max_param_value);
+  min_param_value_ = min_param_value;
+  max_param_value_ = max_param_value;
 }

@@ -282,6 +299,21 @@ std::string UpdatableComponent::ReadUpdatableCommon(std::istream &is,
   } else {
     l2_regularize_ = 0.0;
   }
+
+  if (token == "<MaxParamValue>") {
+    ReadBasicType(is, binary, &max_param_value_);
+    ReadToken(is, binary, &token);
+  } else {
+    max_param_value_ = std::numeric_limits<BaseFloat>::max();
+  }
+
+  if (token == "<MinParamValue>") {
+    ReadBasicType(is, binary, &min_param_value_);
+    ReadToken(is, binary, &token);
+  } else {
+    min_param_value_ = std::numeric_limits<BaseFloat>::lowest();
+  }
+
   if (token == "<LearningRate>") {
     ReadBasicType(is, binary, &learning_rate_);
     return "";
   }
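A note on the read/write pattern above: optional fields are written only when they differ from their defaults, and the reader looks at the next token to decide whether a field is present, which keeps older models readable. A hedged, standalone sketch of the idea (plain iostreams, not the Kaldi I/O API):

    #include <iostream>
    #include <limits>
    #include <sstream>
    #include <string>

    int main() {
      float max_param = 1.5f;  // non-default, so it gets written
      std::ostringstream os;
      if (max_param < std::numeric_limits<float>::max())
        os << "<MaxParamValue> " << max_param << " ";
      os << "<LearningRate> " << 0.001f << " ";

      std::istringstream is(os.str());
      std::string token;
      float max_in = std::numeric_limits<float>::max(), lrate = 0.0f;
      is >> token;
      if (token == "<MaxParamValue>") {  // field present: consume, advance
        is >> max_in >> token;
      }                                  // else: keep the default
      if (token == "<LearningRate>") is >> lrate;
      std::cout << "max=" << max_in << " lrate=" << lrate << "\n";
      return 0;
    }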
@@ -312,6 +344,17 @@ void UpdatableComponent::WriteUpdatableCommon(std::ostream &os,
     WriteToken(os, binary, "<L2Regularize>");
     WriteBasicType(os, binary, l2_regularize_);
   }
+
+  if (max_param_value_ < std::numeric_limits<BaseFloat>::max()) {
+    WriteToken(os, binary, "<MaxParamValue>");
+    WriteBasicType(os, binary, max_param_value_);
+  }
+
+  if (min_param_value_ > std::numeric_limits<BaseFloat>::lowest()) {
+    WriteToken(os, binary, "<MinParamValue>");
+    WriteBasicType(os, binary, min_param_value_);
+  }
+
   WriteToken(os, binary, "<LearningRate>");
   WriteBasicType(os, binary, learning_rate_);
 }
@@ -330,6 +373,9 @@ std::string UpdatableComponent::Info() const {
     stream << ", learning-rate-factor=" << learning_rate_factor_;
   if (max_change_ > 0.0)
     stream << ", max-change=" << max_change_;
+
+  stream << ", max-param-value=" << max_param_value_
+         << ", min-param-value=" << min_param_value_;
   return stream.str();
 }
diff --git a/src/nnet3/nnet-component-itf.h b/src/nnet3/nnet-component-itf.h
index 01697353308..6fd94a1738a 100644
--- a/src/nnet3/nnet-component-itf.h
+++ b/src/nnet3/nnet-component-itf.h
@@ -443,6 +443,8 @@ class UpdatableComponent: public Component {
   // InitLearningRatesFromConfig() should be changed too.
   UpdatableComponent(): learning_rate_(0.001), learning_rate_factor_(1.0),
                         l2_regularize_(0.0), is_gradient_(false),
+                        min_param_value_(std::numeric_limits<BaseFloat>::lowest()),
+                        max_param_value_(std::numeric_limits<BaseFloat>::max()),
                         max_change_(0.0) { }

   virtual ~UpdatableComponent() { }
@@ -520,6 +522,13 @@ class UpdatableComponent: public Component {
   virtual void UnVectorize(const VectorBase<BaseFloat> &params) {
     KALDI_ASSERT(0);
   }
+
+  // This function applies the min/max parameter-value constraints by
+  // clamping each parameter to the range [min_param_value_, max_param_value_].
+  virtual void ApplyMinMaxToWeights() = 0;
+
+  BaseFloat MaxParamValue() const { return max_param_value_; }
+
+  BaseFloat MinParamValue() const { return min_param_value_; }

  protected:
   // to be called from child classes, extracts any learning rate information
@@ -538,7 +547,11 @@ class UpdatableComponent: public Component {
   // tag and the learning-rate factor (if not 1.0) and the
   // learning rate;
   void WriteUpdatableCommon(std::ostream &os, bool binary) const;
-
+
+  BaseFloat max_param_value_;  ///< Max parameter value; parameters larger than
+                               ///< this are clamped to it by ApplyMinMaxToWeights().
+  BaseFloat min_param_value_;  ///< Min parameter value; parameters smaller than
+                               ///< this are clamped to it by ApplyMinMaxToWeights().
BaseFloat learning_rate_; ///< learning rate (typically 0.0..0.01) BaseFloat learning_rate_factor_; ///< learning rate factor (normally 1.0, but ///< can be set to another < value so that diff --git a/src/nnet3/nnet-convolutional-component.h b/src/nnet3/nnet-convolutional-component.h index 35cf0de11c9..c39622775d1 100644 --- a/src/nnet3/nnet-convolutional-component.h +++ b/src/nnet3/nnet-convolutional-component.h @@ -276,6 +276,7 @@ class TimeHeightConvolutionComponent: public UpdatableComponent { virtual void Scale(BaseFloat scale); virtual void Add(BaseFloat alpha, const Component &other); virtual void PerturbParams(BaseFloat stddev); + virtual void ApplyMinMaxToWeights() {} virtual BaseFloat DotProduct(const UpdatableComponent &other) const; virtual int32 NumParameters() const; virtual void Vectorize(VectorBase *params) const; diff --git a/src/nnet3/nnet-general-component.h b/src/nnet3/nnet-general-component.h index cff73a55b59..163c42837ab 100644 --- a/src/nnet3/nnet-general-component.h +++ b/src/nnet3/nnet-general-component.h @@ -502,6 +502,7 @@ class BackpropTruncationComponent: public Component { virtual void Scale(BaseFloat scale); virtual void Add(BaseFloat alpha, const Component &other); + virtual void ApplyMinMaxToWeights() {} virtual void Read(std::istream &is, bool binary); // This Read function // requires that the Component has the correct type. /// Write component to stream @@ -675,6 +676,7 @@ class ConstantComponent: public UpdatableComponent { virtual void Scale(BaseFloat scale); virtual void Add(BaseFloat alpha, const Component &other); virtual void PerturbParams(BaseFloat stddev); + virtual void ApplyMinMaxToWeights() {} virtual BaseFloat DotProduct(const UpdatableComponent &other) const; virtual int32 NumParameters() const; virtual void Vectorize(VectorBase *params) const; diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index 4eb078c0fcb..0011b2989c2 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -1007,6 +1007,49 @@ void RectifiedLinearComponent::StoreStats( StoreStatsInternal(out_value, &temp_deriv); } + +void AffineComponent::ApplyMinMaxToWeights() { + BaseFloat max_param_value = MaxParamValue(), + min_param_value = MinParamValue(); + CuMatrix linear_params_diff(linear_params_); + if (min_param_value > std::numeric_limits::lowest()) { + linear_params_.ApplyFloor(min_param_value); + bias_params_.ApplyFloor(min_param_value); + } + int32 tot_dim = InputDim() * OutputDim(); + // percentage of weight, smaller than min-param-value, which mapped to + // min-param-value. + if (GetVerboseLevel() > 0) { + linear_params_diff.AddMat(-1.0, linear_params_); + linear_params_diff.Scale(-1.0); + linear_params_diff.ApplyHeaviside(); + BaseFloat num_min_weights = linear_params_diff.Sum(); + KALDI_LOG << num_min_weights / tot_dim * 100.0 << " % of parameters floored " + << " to min-param-value="<< min_param_value; + } + + linear_params_diff.CopyFromMat(linear_params_); + if (max_param_value < std::numeric_limits::max()) { + // apply min(max_value, w) + linear_params_.ApplyCeiling(max_param_value); + bias_params_.ApplyCeiling(max_param_value); + } + // percentage of weight, larger than max-param-value, which mapped to + // max-param-value. 
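+  // Counting trick used below: after clamping, diff = old - new is positive
+  // exactly where a weight was ceiled, so ApplyHeaviside() turns the
+  // difference matrix into 0/1 indicators and Sum() counts the clamped
+  // entries; dividing by tot_dim gives the fraction that is reported.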
+ if (GetVerboseLevel() > 0) { + linear_params_diff.AddMat(-1.0, linear_params_); + linear_params_diff.ApplyHeaviside(); + BaseFloat num_max_weights = linear_params_diff.Sum(); + KALDI_LOG << num_max_weights / tot_dim * 100.0 << " % of parameters ceiled " + << " to max-param-value="<< max_param_value; + } + + KALDI_ASSERT(linear_params_.Max() <= max_param_value && + linear_params_.Min() >= min_param_value && + bias_params_.Max() <= max_param_value && + bias_params_.Min() >= min_param_value); +} + void AffineComponent::Scale(BaseFloat scale) { if (scale == 0.0) { // If scale == 0.0 we call SetZero() which will get rid of NaN's and inf's. @@ -1094,6 +1137,9 @@ BaseFloat AffineComponent::DotProduct(const UpdatableComponent &other_in) const void AffineComponent::Init(int32 input_dim, int32 output_dim, BaseFloat param_stddev, BaseFloat bias_stddev) { + BaseFloat max_param_value = MaxParamValue(), + min_param_value = MinParamValue(); + KALDI_ASSERT(param_stddev < std::abs(max_param_value)); linear_params_.Resize(output_dim, input_dim); bias_params_.Resize(output_dim); KALDI_ASSERT(output_dim > 0 && input_dim > 0 && param_stddev >= 0.0); @@ -2245,7 +2291,9 @@ void ScaleAndOffsetComponent::InitFromConfig(ConfigLine *cfl) { } cfl->GetValue("rank", &rank); scales_.Resize(block_dim); - scales_.Set(1.0); + BaseFloat scale_value = 1.0; + cfl->GetValue("scale", &scale_value); + scales_.Set(scale_value); offsets_.Resize(block_dim); // offsets are all zero when initialized. if (cfl->HasUnusedValues()) @@ -2942,6 +2990,19 @@ void NaturalGradientAffineComponent::FreezeNaturalGradient(bool freeze) { preconditioner_out_.Freeze(freeze); } +void LinearComponent::ApplyMinMaxToWeights() { + BaseFloat max_param_value = MaxParamValue(), + min_param_value = MinParamValue(); + if (min_param_value > std::numeric_limits::lowest()) + params_.ApplyFloor(min_param_value); + + // apply min(max_value, w) + if (max_param_value < std::numeric_limits::max()) + params_.ApplyCeiling(max_param_value); + + KALDI_ASSERT(params_.Max() <= max_param_value && + params_.Min() >= min_param_value); +} void LinearComponent::Read(std::istream &is, bool binary) { std::string token = ReadUpdatableCommon(is, binary); @@ -3005,6 +3066,9 @@ void LinearComponent::InitFromConfig(ConfigLine *cfl) { if (!ok) KALDI_ERR << "Bad initializer " << cfl->WholeLine(); BaseFloat param_stddev = 1.0 / std::sqrt(input_dim); + BaseFloat max_param_value = MaxParamValue(), + min_param_value = MinParamValue(); + param_stddev = std::min(param_stddev, max_param_value); cfl->GetValue("param-stddev", ¶m_stddev); params_.Resize(output_dim, input_dim); KALDI_ASSERT(output_dim > 0 && input_dim > 0 && param_stddev >= 0.0); @@ -5860,6 +5924,389 @@ void SumBlockComponent::Backprop( } } +//For raw data +ShiftInputComponent::ShiftInputComponent(const ShiftInputComponent &other): + RandomComponent(other), + input_dim_(other.input_dim_), + output_dim_(other.output_dim_), + max_shift_(other.max_shift_), + rand_vol_var_(other.rand_vol_var_), + shift_per_frame_(other.shift_per_frame_), + dither_(other.dither_), + preprocess_(other.preprocess_) { } + +Component* ShiftInputComponent::Copy() const { + ShiftInputComponent *ans = new ShiftInputComponent(*this); + return ans; +} + +std::string ShiftInputComponent::Info() const { + std::stringstream stream; + stream << Type() << ", input-dim=" << input_dim_ + << ", output-dim=" << output_dim_ + << ", max-shift=" << max_shift_ + << ", shift-per-frame=" << shift_per_frame_ + << ", dither=" << dither_ + << ", preprocess=" << preprocess_; + 
return stream.str(); +} + +void ShiftInputComponent::Init(int32 input_dim, int32 output_dim, BaseFloat max_shift, + BaseFloat rand_vol_var, + BaseFloat dither, bool preprocess) { + input_dim_ = input_dim; + output_dim_ = output_dim; + max_shift_ = max_shift; + rand_vol_var_ = rand_vol_var; + dither_ = dither; + preprocess_ = preprocess; + KALDI_ASSERT(input_dim_ - output_dim_ >= 0 && input_dim_ > 0); + KALDI_ASSERT(max_shift >= 0.0 && max_shift <= 1.0); + KALDI_ASSERT(rand_vol_var >= 0.0 && rand_vol_var <= 1.0); +} + +void ShiftInputComponent::InitFromConfig(ConfigLine *cfl) { + bool ok = true, preprocess = false; + test_mode_ = false; + int32 input_dim, output_dim; + BaseFloat max_shift = 1.0, rand_vol_var = 0.0, dither = 0.0; + ok = ok && cfl->GetValue("input-dim", &input_dim); + ok = ok && cfl->GetValue("output-dim", &output_dim); + // It only makes sense to set test-mode in the config for testing purposes. + cfl->GetValue("test-mode", &test_mode_); + if (cfl->GetValue("max-shift", &max_shift)) + KALDI_ASSERT(max_shift >= 0.0 && max_shift <= 1.0); + if (cfl->GetValue("rand-vol-var", &rand_vol_var)) + KALDI_ASSERT(rand_vol_var >= 0 && rand_vol_var <= 1.0); + cfl->GetValue("shift-per-frame", &shift_per_frame_); + cfl->GetValue("dither", &dither); + cfl->GetValue("preprocess", &preprocess); + Init(input_dim, output_dim, max_shift, rand_vol_var, dither, preprocess); +} + +void ShiftInputComponent::Read(std::istream &is, bool binary) { + ExpectOneOrTwoTokens(is, binary, "", ""); + ReadBasicType(is, binary, &input_dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &output_dim_); + std::string token; + ReadToken(is, binary, &token); + if (token == "") { + ReadBasicType(is, binary, &max_shift_); + ReadToken(is, binary, &token); + if (token == "") { + ReadBasicType(is, binary, &rand_vol_var_); + ReadToken(is, binary, &token); + } + } + if (token == "") { + ReadBasicType(is, binary, &shift_per_frame_); + ReadToken(is, binary, &token); + } + if (token == "") { + ReadBasicType(is, binary, &dither_); + ReadToken(is, binary, &token); + } + if (token == "") { + ReadBasicType(is, binary, &preprocess_); + ReadToken(is, binary, &token); + } + + if (token == "") { + ReadBasicType(is, binary, &test_mode_); // read test mode + ExpectToken(is, binary, ""); + } else { + test_mode_ = false; + KALDI_ASSERT(token == ""); + } +} + +void ShiftInputComponent::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, input_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, output_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, max_shift_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, rand_vol_var_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, shift_per_frame_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, dither_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, preprocess_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, test_mode_); + WriteToken(os, binary, ""); +} + +void* ShiftInputComponent::Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const { + // dithering is done on both train and test time. + // it is done to make zero values nonzero on raw frame of signal. 
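+  // dither_ is the standard deviation of the Gaussian noise added below:
+  // SetRandn() fills a matrix with unit-variance samples and Scale(dither_)
+  // sets their magnitude before they are added to the input.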
+ CuMatrix modified_in(in); + if (dither_ != 0.0) { + CuMatrix dither_mat(in.NumRows(), in.NumCols()); + dither_mat.SetRandn(); + dither_mat.Scale(dither_); + modified_in.AddMat(1.0, dither_mat); + } + if (test_mode_) + out->CopyFromMat(modified_in.Range(0,in.NumRows(), 0, output_dim_)); + else { + int32 in_out_diff = input_dim_ - output_dim_, + shift; + KALDI_ASSERT(in_out_diff >= 0); + int32 max_shift_int = static_cast(max_shift_ * in_out_diff); + if (shift_per_frame_) { + int32 block_size = 1024, + num_blocks = out->NumRows() / block_size; + CuMatrix tmp(1, num_blocks, kUndefined); + const_cast&>(random_generator_).RandUniform(&tmp); + tmp.Scale(max_shift_int); + for (int i = 0; i < num_blocks; i++) { + int32 start_row = i * block_size, + num_shifted_row = std::min(block_size, out->NumRows() - start_row); + shift = static_cast(tmp(0,i) + 0.5); + CuSubMatrix out_row = out->Range(start_row, num_shifted_row, + 0, output_dim_); + out_row.CopyFromMat(modified_in.Range(start_row, num_shifted_row, + shift, output_dim_)); + } + } else { + // Generate random shift integer value. + shift = RandInt(0, max_shift_int); + out->CopyFromMat(modified_in.Range(0, in.NumRows(), shift, output_dim_)); + + BaseFloat rand_vol = (1.0 + rand_vol_var_ * + (Rand() % 2 ? -1.0 : 1.0) * RandUniform()); + if (rand_vol != 0 && rand_vol != 1.0) + out->Scale(rand_vol); + } + } + if (preprocess_) + Preprocess(out); + return NULL; +} + +void ShiftInputComponent::Preprocess(CuMatrixBase *preprocessed_in) const { + // removing dc offset + int32 dim = preprocessed_in->NumCols(), + num_rows = preprocessed_in->NumRows(); + BaseFloat scale_w = 1.0 / float(dim), preemph = 0.97; + CuVector mean(num_rows); + mean.AddColSumMat(scale_w, *preprocessed_in, 0); + preprocessed_in->AddVecToCols(-1.0, mean); + + // Doing pre-emphasis + CuMatrix shifted_in(preprocessed_in->ColRange(0, dim-1)); + preprocessed_in->ColRange(1, dim-1).AddMat(-1.0 * preemph, shifted_in); + preprocessed_in->ColRange(0,1).Scale(1.0 - preemph); + + + // Apply windowing + CuVector window(dim); + double a = M_2PI / (dim-1); + for (int32 i = 1; i < dim-1; i++) { + double i_fl = static_cast(i); + window(i) = std::pow(0.5 - 0.5 * cos(a * i_fl), 0.85); + } + window(0) = window(1); + window(dim - 1) = window(dim - 2); + CuMatrix window_mat(preprocessed_in->NumRows(), dim); + window_mat.CopyRowsFromVec(window); + preprocessed_in->MulElements(window_mat); +} + +void ShiftInputComponent::Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &, + const CuMatrixBase &, + const CuMatrixBase &, + void *memo, + Component *, + CuMatrixBase *in_deriv) const { + in_deriv->SetZero(); +} + +std::string LogComponent::Info() const { + std::stringstream stream; + stream << NonlinearComponent::Info() + << ", log-floor=" << log_floor_ + << ", additive-offset=" << additive_offset_; + return stream.str(); +} + +void LogComponent::InitFromConfig(ConfigLine *cfl) { + cfl->GetValue("log-floor", &log_floor_); + cfl->GetValue("additive-offset", &additive_offset_); + NonlinearComponent::InitFromConfig(cfl); +} + +void* LogComponent::Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const { + out->CopyFromMat(in); + if (additive_offset_) { + // Apply log(abs(x)+epsi) + out->ApplyPowAbs(1.0); + out->Add(log_floor_); + out->ApplyLog(); + } else { + // Apply log function (x >= epsi ? log(x) : log(epsi)). 
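+    // The matching derivative, used in Backprop below, is
+    // d/dx log(max(x, epsi)) = (x > epsi ? 1/x : 0), i.e. gradients are
+    // cut off wherever the floor was active.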
+ out->ApplyFloor(log_floor_); + out->ApplyLog(); + } + return NULL; +} + +void LogComponent::Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update, + CuMatrixBase *in_deriv) const { + if (in_deriv != NULL) { + CuMatrix divided_in_value(in_value), floored_in_value(in_value); + divided_in_value.Set(1.0); + floored_in_value.CopyFromMat(in_value); + if (additive_offset_) { + in_deriv->CopyFromMat(in_value); + in_deriv->ApplyHeaviside(); // (x > 0 ? 1 : 0) + in_deriv->Scale(2.0); + in_deriv->Add(-1.0); // (x > 0 ? 1 : -1) + floored_in_value.ApplyPowAbs(1.0); + floored_in_value.Add(log_floor_); // (abs(x) + epsi) + divided_in_value.DivElements(floored_in_value); // 1 / (abs(x) + epsi) + in_deriv->MulElements(divided_in_value); // (dy/dx: x > 0 : 1/(abs(x) + epsi), -1/(abs(x) + epsi)) + in_deriv->MulElements(out_deriv); // dF/dx = dF/dy * dy/dx + } else { + floored_in_value.ApplyFloor(log_floor_); // (x > epsi ? x : epsi) + divided_in_value.DivElements(floored_in_value); // (x > epsi ? 1/x : 1/epsi) + in_deriv->CopyFromMat(in_value); + in_deriv->Add(-1.0 * log_floor_); // (x - epsi) + in_deriv->ApplyHeaviside(); // (x > epsi ? 1 : 0) + in_deriv->MulElements(divided_in_value); // (dy/dx: x > epsi ? 1/x : 0) + in_deriv->MulElements(out_deriv); // dF/dx = dF/dy * dy/dx + } + } +} + +void LogComponent::Read(std::istream &is, bool binary) { + std::ostringstream ostr_beg, ostr_end; + ostr_beg << "<" << Type() << ">"; // e.g. "" + ostr_end << ""; // e.g. "" + ExpectOneOrTwoTokens(is, binary, ostr_beg.str(), ""); + ReadBasicType(is, binary, &dim_); // Read dimension. + ExpectToken(is, binary, ""); + value_sum_.Read(is, binary); + ExpectToken(is, binary, ""); + deriv_sum_.Read(is, binary); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &count_); + value_sum_.Scale(count_); + deriv_sum_.Scale(count_); + + std::string token; + ReadToken(is, binary, &token); + if (token == "") { + ReadBasicType(is, binary, &self_repair_lower_threshold_); + ReadToken(is, binary, &token); + } + if (token == "") { + ReadBasicType(is, binary, &self_repair_upper_threshold_); + ReadToken(is, binary, &token); + } + if (token == "") { + ReadBasicType(is, binary, &self_repair_scale_); + ReadToken(is, binary, &token); + } + if (token == "") { + ReadBasicType(is, binary, &log_floor_); + ReadToken(is, binary, &token); + } + + if (token == "") { + ReadBasicType(is, binary, &additive_offset_); + ReadToken(is, binary, &token); + } + + if (token != ostr_end.str()) { + KALDI_ERR << "Expected token " << ostr_end.str() + << ", got " << token; + } +} + +void LogComponent::Write(std::ostream &os, bool binary) const { + std::ostringstream ostr_beg, ostr_end; + ostr_beg << "<" << Type() << ">"; // e.g. "" + ostr_end << ""; // e.g. "" + WriteToken(os, binary, ostr_beg.str()); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, dim_); + // Write the values and derivatives in a count-normalized way, for + // greater readability in text form. 
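+  // Read() undoes this normalization by scaling the stored averages back up
+  // by the count, so the round trip preserves the raw sums.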
+ WriteToken(os, binary, ""); + Vector temp(value_sum_); + if (count_ != 0.0) temp.Scale(1.0 / count_); + temp.Write(os, binary); + WriteToken(os, binary, ""); + + temp.Resize(deriv_sum_.Dim(), kUndefined); + temp.CopyFromVec(deriv_sum_); + if (count_ != 0.0) temp.Scale(1.0 / count_); + temp.Write(os, binary); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, count_); + if (self_repair_lower_threshold_ != kUnsetThreshold) { + WriteToken(os, binary, ""); + WriteBasicType(os, binary, self_repair_lower_threshold_); + } + if (self_repair_upper_threshold_ != kUnsetThreshold) { + WriteToken(os, binary, ""); + WriteBasicType(os, binary, self_repair_upper_threshold_); + } + if (self_repair_scale_ != 0.0) { + WriteToken(os, binary, ""); + WriteBasicType(os, binary, self_repair_scale_); + } + WriteToken(os, binary, ""); + WriteBasicType(os, binary, log_floor_); + + WriteToken(os, binary, ""); + WriteBasicType(os, binary, additive_offset_); + WriteToken(os, binary, ostr_end.str()); +} + +void* ExpComponent::Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const { + // Applied exp function + out->CopyFromMat(in); + out->ApplyExp(); + return NULL; +} + +void ExpComponent::Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &,//in_value, + const CuMatrixBase &out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update, + CuMatrixBase *in_deriv) const { + if (in_deriv != NULL) { + in_deriv->CopyFromMat(out_value); + in_deriv->MulElements(out_deriv); + } +} + + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index 3929c253aab..b1664473a4e 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -387,6 +387,7 @@ class PerElementOffsetComponent; // AffineComponent. 
class AffineComponent: public UpdatableComponent { public: + virtual void ApplyMinMaxToWeights(); virtual int32 InputDim() const { return linear_params_.NumCols(); } virtual int32 OutputDim() const { return linear_params_.NumRows(); } @@ -482,6 +483,7 @@ class RepeatedAffineComponent; /// num-blocks must divide both input-dim and output-dim class BlockAffineComponent : public UpdatableComponent { public: + virtual void ApplyMinMaxToWeights() {} virtual int32 InputDim() const { return linear_params_.NumCols() * num_blocks_; } virtual int32 OutputDim() const { return linear_params_.NumRows(); } @@ -547,7 +549,7 @@ class BlockAffineComponent : public UpdatableComponent { class RepeatedAffineComponent: public UpdatableComponent { public: - + virtual void ApplyMinMaxToWeights() {} virtual int32 InputDim() const { return linear_params_.NumCols() * num_repeats_; } virtual int32 OutputDim() const { return linear_params_.NumRows() * num_repeats_; } @@ -898,6 +900,7 @@ class NaturalGradientAffineComponent: public AffineComponent { */ class LinearComponent: public UpdatableComponent { public: + virtual void ApplyMinMaxToWeights(); virtual int32 InputDim() const { return params_.NumCols(); } virtual int32 OutputDim() const { return params_.NumRows(); } @@ -1462,6 +1465,7 @@ class PermuteComponent: public Component { */ class PerElementScaleComponent: public UpdatableComponent { public: + virtual void ApplyMinMaxToWeights() {} virtual int32 InputDim() const { return scales_.Dim(); } virtual int32 OutputDim() const { return scales_.Dim(); } @@ -1558,6 +1562,7 @@ class PerElementScaleComponent: public UpdatableComponent { */ class PerElementOffsetComponent: public UpdatableComponent { public: + virtual void ApplyMinMaxToWeights() {} virtual int32 InputDim() const { return dim_; } virtual int32 OutputDim() const { return dim_; } @@ -1622,6 +1627,7 @@ class PerElementOffsetComponent: public UpdatableComponent { // no inputs]. class ConstantFunctionComponent: public UpdatableComponent { public: + virtual void ApplyMinMaxToWeights() {} virtual int32 InputDim() const { return input_dim_; } virtual int32 OutputDim() const { return output_.Dim(); } @@ -1794,6 +1800,7 @@ class NaturalGradientPerElementScaleComponent: public PerElementScaleComponent { */ class ScaleAndOffsetComponent: public UpdatableComponent { public: + virtual void ApplyMinMaxToWeights() {} virtual int32 InputDim() const { return dim_; } virtual int32 OutputDim() const { return dim_; } @@ -1963,6 +1970,7 @@ class ScaleAndOffsetComponent: public UpdatableComponent { */ class ConvolutionComponent: public UpdatableComponent { public: + virtual void ApplyMinMaxToWeights() {} enum TensorVectorizationType { kYzx = 0, kZyx = 1 @@ -2185,6 +2193,7 @@ class ConvolutionComponent: public UpdatableComponent { class LstmNonlinearityComponent: public UpdatableComponent { public: + virtual void ApplyMinMaxToWeights() {} virtual int32 InputDim() const; virtual int32 OutputDim() const; virtual std::string Info() const; @@ -2426,6 +2435,7 @@ class MaxpoolingComponent: public Component { */ class CompositeComponent: public UpdatableComponent { public: + virtual void ApplyMinMaxToWeights() {} virtual int32 InputDim() const; virtual int32 OutputDim() const; @@ -2519,6 +2529,149 @@ class CompositeComponent: public UpdatableComponent { }; +//For raw data +/* + * The shiftedComponent shifts the input using random or constant shift. + * The output y contains the shifted version of input and it is equal to + * x.Range(shift * diff, output_dim_), where 0 <= shift < 1. 
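+ * For example, with input_dim_ = 440 and output_dim_ = 400 (so diff = 40),
+ * max_shift = 0.5 limits the starting offset to the first 20 samples, and
+ * each output frame is a 400-sample window taken from a randomly shifted
+ * position in the 440-sample input.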
+ * The output_dim_ is the target dimension of the output and the input_dim_ is the input + * dim of this component and diff = input_dim_ - output_dim_ and input_dim_ > output_dim_ and the diff should be > = original frame_length. + * This component is useful when we train a DNN using raw-waveform + * and we can shift the input e.g. shift the input by 20% of original frame-length. + * max_shift_ is the max shift used to shift the input(0 <= max_shift_ <= 1, the default is 0.5.) + */ +class ShiftInputComponent: public RandomComponent { + public: + void Init(int32 input_dim, int32 output_dim, BaseFloat max_shift, + BaseFloat rand_vol_var = 0.0, BaseFloat dither = 0.0, bool preprocess = false); + + explicit ShiftInputComponent(const ShiftInputComponent &other); + + explicit ShiftInputComponent(int32 input_dim, int32 output_dim, + BaseFloat max_shift, + BaseFloat rand_vol_var = 0.0, + BaseFloat dither = 0.0, + bool preprocess = false) { + Init(input_dim, output_dim, max_shift, rand_vol_var, dither, preprocess); } + ShiftInputComponent(): input_dim_(0), output_dim_(0), max_shift_(1.0), + rand_vol_var_(0.0), shift_per_frame_(false), dither_(0.0), + preprocess_(false) { } + + virtual std::string Type() const { return "ShiftInputComponent"; } + virtual std::string Info() const; + virtual void InitFromConfig(ConfigLine *cfl); + void SetShiftAndVolume(BaseFloat shift, BaseFloat vol_var) { max_shift_ = shift; + rand_vol_var_ = vol_var; } + virtual int32 InputDim() const { return input_dim_; } + virtual int32 OutputDim() const { return output_dim_; } + virtual int32 Properties() const { + return kSimpleComponent|kRandomComponent; + } + virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const; + + virtual void Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update, + CuMatrixBase *in_deriv) const; + + virtual Component* Copy() const; + + virtual void Read(std::istream &is, bool binary); // This Read function + // requires that the Component has the correct type. + + /// Write component to stream + virtual void Write(std::ostream &os, bool binary) const; + protected: + void Preprocess(CuMatrixBase *preprocessed_in) const; + int32 input_dim_; + int32 output_dim_; + BaseFloat max_shift_; // max shift is the max shift used to shift the input. + // max_shift_ should be between 0 and 1. + BaseFloat rand_vol_var_; // The variance used to generate random volume perturbation value. + bool shift_per_frame_; // If true, different random shift is applied per frame of input. + BaseFloat dither_; // The random vector with stddev of dither_ is added to input before random shift. + // The main reason is to make zero values on raw waveform nonzero. + // This is done on both test and train. + bool preprocess_; // If true, the preemphasis, mean-removal and windowing is applied + // on outputs. 
+}; + +// The ExpComponent outputs the exp of input values as y = Exp(x) +class ExpComponent: public NonlinearComponent { + public: + explicit ExpComponent(const ExpComponent &other): + NonlinearComponent(other) { } + ExpComponent() { } + virtual std::string Type() const { return "ExpComponent"; } + virtual int32 Properties() const { + return kSimpleComponent|kBackpropNeedsOutput|kStoresStats; + } + virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const; + virtual void Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &, + const CuMatrixBase &out_value, + const CuMatrixBase &, + void *memo, + Component *to_update, + CuMatrixBase *in_deriv) const; + + virtual Component* Copy() const { return new ExpComponent(*this); } + private: + ExpComponent &operator = (const ExpComponent &other); // Disallow. +}; + +// The LogComponent outputs the log of input values as y = Log(max(x, epsi)) +class LogComponent: public NonlinearComponent { + public: + //explicit LogComponent(int32 dim): dim_(dim) { } + explicit LogComponent(const LogComponent &other): + NonlinearComponent(other), log_floor_(other.log_floor_), + additive_offset_(other.additive_offset_) {} + LogComponent(): log_floor_(1e-10), additive_offset_(false) { } + virtual std::string Type() const { return "LogComponent"; } + virtual int32 Properties() const { + return kSimpleComponent|kBackpropNeedsInput|kStoresStats; + } + + virtual std::string Info() const; + + virtual void InitFromConfig(ConfigLine *cfl); + + virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const; + virtual void Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update, + CuMatrixBase *in_deriv) const; + + virtual Component* Copy() const { return new LogComponent(*this); } + + virtual void Read(std::istream &is, bool binary); + + virtual void Write(std::ostream &os, bool binary) const; + + private: + LogComponent &operator = (const LogComponent &other); // Disallow. + BaseFloat log_floor_; + bool additive_offset_; // If true, log is computed using abs(x) + log_floor_ + // otherwise it is computed as log(max(x,log_floor_)) +}; + + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index 1d00125a361..8246148abc6 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -2051,6 +2051,22 @@ void ApplyL2Regularization(const Nnet &nnet, } } +bool PositiveUpdatableWeights(Nnet *nnet) { + for (int32 c = 0; c < nnet->NumComponents(); c++) { + Component *comp = nnet->GetComponent(c); + if (comp->Properties() & kUpdatableComponent) { + UpdatableComponent *src_comp = + dynamic_cast(comp); + BaseFloat min_param_value = src_comp->MinParamValue(), + max_param_value = src_comp->MaxParamValue(); + KALDI_ASSERT(min_param_value < max_param_value); + // apply min and max weight constraints to linear and bias parameters. 
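+      // Components with no configured constraints keep the defaults
+      // (lowest()/max() of BaseFloat), and several component types implement
+      // ApplyMinMaxToWeights() as a no-op, so this call only affects
+      // components that opted in via min/max-param-value.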
+ src_comp->ApplyMinMaxToWeights(); + } + } + return true; +} + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h index 3b304b8fb39..00aeb4a1661 100644 --- a/src/nnet3/nnet-utils.h +++ b/src/nnet3/nnet-utils.h @@ -497,6 +497,15 @@ void ConstrainOrthonormal(Nnet *nnet); int32 GetNumNvalues(const std::vector &io_vec, bool exhaustive); +/** + This function is used as part of the regular training workflow, after + UpdateNnetWithMaxChange(). + For each Updatable component c in the neural net, it makes updatable params + less than min_param_value_ to be equal to this value and also params larger + than max_param_value_ to max_param_value_. +*/ +bool PositiveUpdatableWeights(Nnet *nnet); + } // namespace nnet3 } // namespace kaldi From e3b272735d0003bc3b2ca5d0f5479da1028d2c2b Mon Sep 17 00:00:00 2001 From: LvHang Date: Fri, 11 May 2018 15:18:16 -0400 Subject: [PATCH 2/2] Add fvector code add _separate version fvector code --- egs/mini_librispeech/s5/cmd.sh | 7 +- .../s5/conf/queue_no_k20.conf | 16 + .../s5/local/fvector/add_output_node.py | 68 ++++ .../local/fvector/generate_sin_cos_matrix.py | 52 +++ .../s5/local/fvector/run_fvector.sh | 78 +++++ .../s5/local/fvector/run_fvector_separate.sh | 79 +++++ .../s5/local/fvector/run_tdnn.sh | 214 ++++++++++++ egs/mini_librispeech/s5/run.sh | 2 + egs/mini_librispeech/s5/run_fvector.sh | 64 ++++ .../s5/run_fvector_separate.sh | 64 ++++ egs/wsj/s5/steps/nnet3/fvector/get_egs.sh | 262 +++++++++++++++ .../steps/nnet3/fvector/get_egs_separate.sh | 165 ++++++++++ .../nnet3/fvector/make_fvector_feature.sh | 158 +++++++++ egs/wsj/s5/steps/nnet3/xvector/train.sh | 253 ++++++++++++++ .../s5/steps/nnet3/xvector/train_separate.sh | 257 +++++++++++++++ src/Makefile | 13 +- src/cudamatrix/cu-kernels-ansi.h | 9 + src/cudamatrix/cu-kernels.cu | 40 +++ src/cudamatrix/cu-kernels.h | 15 + src/cudamatrix/cu-math.cc | 43 +++ src/cudamatrix/cu-math.h | 31 ++ src/cudamatrix/cu-packed-matrix.cc | 9 + src/cudamatrix/cu-packed-matrix.h | 3 + src/fvector/Makefile | 19 ++ src/fvector/fvector-perturb-test.cc | 86 +++++ src/fvector/fvector-perturb.cc | 289 ++++++++++++++++ src/fvector/fvector-perturb.h | 172 ++++++++++ src/fvectorbin/Makefile | 25 ++ src/fvectorbin/compute-wav-to-rawmatrix.cc | 123 +++++++ src/fvectorbin/fvector-add-noise-block.cc | 61 ++++ src/fvectorbin/fvector-add-noise-separate.cc | 72 ++++ src/fvectorbin/fvector-add-noise.cc | 59 ++++ src/fvectorbin/fvector-chunk-block.cc | 212 ++++++++++++ src/fvectorbin/fvector-chunk-separate.cc | 207 ++++++++++++ src/fvectorbin/fvector-chunk.cc | 195 +++++++++++ .../fvector-debug-check-filter-bank.cc | 64 ++++ src/fvectorbin/fvector-debug-wav-to-vector.cc | 41 +++ src/fvectorbin/fvector-debug-write-to-wav.cc | 52 +++ src/fvectorbin/fvector-get-egs-block.cc | 122 +++++++ src/fvectorbin/fvector-get-egs.cc | 143 ++++++++ src/nnet3/nnet-example-utils.cc | 17 +- src/nnet3/nnet-utils.cc | 63 ++++ src/nnet3/nnet-utils.h | 7 + src/xvector/Makefile | 22 ++ src/xvector/nnet-xvector-compute.cc | 99 ++++++ src/xvector/nnet-xvector-compute.h | 55 ++++ src/xvector/nnet-xvector-diagnostics.cc | 214 ++++++++++++ src/xvector/nnet-xvector-diagnostics.h | 95 ++++++ src/xvector/nnet-xvector-training.cc | 272 +++++++++++++++ src/xvector/nnet-xvector-training.h | 89 +++++ src/xvector/xvector-test.cc | 311 ++++++++++++++++++ src/xvector/xvector.cc | 130 ++++++++ src/xvector/xvector.h | 94 ++++++ src/xvectorbin/Makefile | 28 ++ src/xvectorbin/nnet3-xvector-compute-prob.cc | 81 +++++ 
.../nnet3-xvector-compute-simple.cc | 155 +++++++++ src/xvectorbin/nnet3-xvector-compute.cc | 211 ++++++++++++ .../nnet3-xvector-get-egs-sre-subsample.cc | 264 +++++++++++++++ src/xvectorbin/nnet3-xvector-get-egs-sre.cc | 237 +++++++++++++ src/xvectorbin/nnet3-xvector-get-egs.cc | 244 ++++++++++++++ src/xvectorbin/nnet3-xvector-scoring.cc | 151 +++++++++ src/xvectorbin/nnet3-xvector-show-progress.cc | 158 +++++++++ src/xvectorbin/nnet3-xvector-train.cc | 94 ++++++ tools/config/common_path.sh | 2 + 64 files changed, 6927 insertions(+), 10 deletions(-) mode change 100644 => 100755 egs/mini_librispeech/s5/cmd.sh create mode 100644 egs/mini_librispeech/s5/conf/queue_no_k20.conf create mode 100644 egs/mini_librispeech/s5/local/fvector/add_output_node.py create mode 100644 egs/mini_librispeech/s5/local/fvector/generate_sin_cos_matrix.py create mode 100755 egs/mini_librispeech/s5/local/fvector/run_fvector.sh create mode 100755 egs/mini_librispeech/s5/local/fvector/run_fvector_separate.sh create mode 100755 egs/mini_librispeech/s5/local/fvector/run_tdnn.sh create mode 100755 egs/mini_librispeech/s5/run_fvector.sh create mode 100755 egs/mini_librispeech/s5/run_fvector_separate.sh create mode 100755 egs/wsj/s5/steps/nnet3/fvector/get_egs.sh create mode 100755 egs/wsj/s5/steps/nnet3/fvector/get_egs_separate.sh create mode 100755 egs/wsj/s5/steps/nnet3/fvector/make_fvector_feature.sh create mode 100755 egs/wsj/s5/steps/nnet3/xvector/train.sh create mode 100755 egs/wsj/s5/steps/nnet3/xvector/train_separate.sh create mode 100644 src/fvector/Makefile create mode 100644 src/fvector/fvector-perturb-test.cc create mode 100644 src/fvector/fvector-perturb.cc create mode 100644 src/fvector/fvector-perturb.h create mode 100644 src/fvectorbin/Makefile create mode 100644 src/fvectorbin/compute-wav-to-rawmatrix.cc create mode 100644 src/fvectorbin/fvector-add-noise-block.cc create mode 100644 src/fvectorbin/fvector-add-noise-separate.cc create mode 100644 src/fvectorbin/fvector-add-noise.cc create mode 100644 src/fvectorbin/fvector-chunk-block.cc create mode 100644 src/fvectorbin/fvector-chunk-separate.cc create mode 100644 src/fvectorbin/fvector-chunk.cc create mode 100644 src/fvectorbin/fvector-debug-check-filter-bank.cc create mode 100644 src/fvectorbin/fvector-debug-wav-to-vector.cc create mode 100644 src/fvectorbin/fvector-debug-write-to-wav.cc create mode 100644 src/fvectorbin/fvector-get-egs-block.cc create mode 100644 src/fvectorbin/fvector-get-egs.cc create mode 100644 src/xvector/Makefile create mode 100644 src/xvector/nnet-xvector-compute.cc create mode 100644 src/xvector/nnet-xvector-compute.h create mode 100644 src/xvector/nnet-xvector-diagnostics.cc create mode 100644 src/xvector/nnet-xvector-diagnostics.h create mode 100644 src/xvector/nnet-xvector-training.cc create mode 100644 src/xvector/nnet-xvector-training.h create mode 100644 src/xvector/xvector-test.cc create mode 100644 src/xvector/xvector.cc create mode 100644 src/xvector/xvector.h create mode 100644 src/xvectorbin/Makefile create mode 100644 src/xvectorbin/nnet3-xvector-compute-prob.cc create mode 100644 src/xvectorbin/nnet3-xvector-compute-simple.cc create mode 100644 src/xvectorbin/nnet3-xvector-compute.cc create mode 100644 src/xvectorbin/nnet3-xvector-get-egs-sre-subsample.cc create mode 100644 src/xvectorbin/nnet3-xvector-get-egs-sre.cc create mode 100644 src/xvectorbin/nnet3-xvector-get-egs.cc create mode 100644 src/xvectorbin/nnet3-xvector-scoring.cc create mode 100644 src/xvectorbin/nnet3-xvector-show-progress.cc create mode 
100644 src/xvectorbin/nnet3-xvector-train.cc diff --git a/egs/mini_librispeech/s5/cmd.sh b/egs/mini_librispeech/s5/cmd.sh old mode 100644 new mode 100755 index 71dd849a93b..4f3b12aa700 --- a/egs/mini_librispeech/s5/cmd.sh +++ b/egs/mini_librispeech/s5/cmd.sh @@ -10,6 +10,7 @@ # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -export train_cmd="queue.pl --mem 2G" -export decode_cmd="queue.pl --mem 4G" -export mkgraph_cmd="queue.pl --mem 8G" +export train_cmd="queue.pl --mem 2G --config conf/queue_no_k20.conf --allow-k10-k20 true" +export decode_cmd="queue.pl --mem 4G --config conf/queue_no_k20.conf --allow-k10-k20 true" +export mkgraph_cmd="queue.pl --mem 8G --config conf/queue_no_k20.conf --allow-k10-k20 true" +export cuda_cmd="queue.pl --gpu 1 --config conf/queue_no_k20.conf --allow-k10-k20 true" diff --git a/egs/mini_librispeech/s5/conf/queue_no_k20.conf b/egs/mini_librispeech/s5/conf/queue_no_k20.conf new file mode 100644 index 00000000000..e8d19a24ef7 --- /dev/null +++ b/egs/mini_librispeech/s5/conf/queue_no_k20.conf @@ -0,0 +1,16 @@ +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -l 'hostname=!a08*&!a09*&!a10*&!c04*&!b18*&!b19*&!b20*' +option mem=* -l mem_free=$0,ram_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -pe smp $0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q +option gpu=* -l gpu=$0 -q g.q +default allow_k20=true +option allow_k20=true +option allow_k20=false -l 'hostname=!g01*&!g02*&!b06*' +default allow_k10_k20=true +option allow_k10_k20=true +option allow_k10_k20=false -l 'hostname=!b0*&!b10*&!g01*&!g02' diff --git a/egs/mini_librispeech/s5/local/fvector/add_output_node.py b/egs/mini_librispeech/s5/local/fvector/add_output_node.py new file mode 100644 index 00000000000..c3d446e5db4 --- /dev/null +++ b/egs/mini_librispeech/s5/local/fvector/add_output_node.py @@ -0,0 +1,68 @@ +#!/usr/bin/env/python + +from __future__ import print_function +import argparse +import logging +import os +import pprint +import sys +import shutil +import traceback + +def get_args(): + parser = argparse.ArgumentParser(description="Add the S and b output node " + "which is used in plda object function.", + epilog="Called by local/fvector/run_fvector.sh") + parser.add_argument("--input-dim", type=int, required=True, + help="The input dimension of fvector network.") + parser.add_argument("--output-dim", type=int, required=True, + help="The output dimension of fvector network which is used to " + "compute the dimension of S matrix.") + parser.add_argument("--s-scale", type=float, default=0.2, + help="Scaling factor on the output 's' (s is a symmetric matrix " + "used for scoring).") + parser.add_argument("--b-scale", type=float, default=0.2, + help="Scaling factor on output 'b' (b is a scalar offset used in scoring).") + parser.add_argument("--config-file", type=str, required=True, + help="The file is needed to be modified. 
It is typically configs/final.config.")
+
+    print(' '.join(sys.argv), file=sys.stderr)
+    print(sys.argv, file=sys.stderr)
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = get_args()
+
+    f = open(args.config_file, "a")
+    # The 's' output: a symmetric scoring matrix stored in packed form, so
+    # its dimension is d*(d+1)/2; integer division keeps the config integral.
+    s_dim = args.output_dim * (args.output_dim + 1) // 2
+
+    print('component name=x-s type=ConstantFunctionComponent input-dim={0} output-dim={1} '
+          'output-mean=0 output-stddev=0 '.format(
+              args.input_dim, s_dim), file=f)
+    print('component-node name=x-s component=x-s input=IfDefined(input)',
+          file=f)
+    print('component name=x-s-scale type=FixedScaleComponent dim={0} scale={1}'.format(
+        s_dim, args.s_scale), file=f)
+    print('component-node name=x-s-scale component=x-s-scale input=x-s',
+          file=f)
+    print('output-node name=s input=x-s-scale', file=f)
+
+    # now the 'b' output, which is just a scalar.
+    b_dim = 1
+    print('component name=x-b type=ConstantFunctionComponent input-dim={0} output-dim=1 '
+          'output-mean=0 output-stddev=0 '.format(args.input_dim), file=f)
+    print('component-node name=x-b component=x-b input=IfDefined(input)', file=f)
+    print('component name=x-b-scale type=FixedScaleComponent dim=1 scale={0}'.format(
+        args.b_scale), file=f)
+    print('component-node name=x-b-scale component=x-b-scale input=x-b',
+          file=f)
+    print('output-node name=b input=x-b-scale', file=f)
+    f.close()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/mini_librispeech/s5/local/fvector/generate_sin_cos_matrix.py b/egs/mini_librispeech/s5/local/fvector/generate_sin_cos_matrix.py
new file mode 100644
index 00000000000..45e986723a3
--- /dev/null
+++ b/egs/mini_librispeech/s5/local/fvector/generate_sin_cos_matrix.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+import argparse
+import logging
+import os
+import pprint
+import shutil
+import sys
+import traceback
+
+sys.path.insert(0, 'steps')
+import libs.common as common_lib
+
+def get_args():
+    parser = argparse.ArgumentParser(description="Generate sine_transform.mat "
+                                     "and cosine_transform.mat for the frequency-domain "
+                                     "raw-waveform setup.",
+                                     epilog="Called by local/fvector/run_fvector.sh")
+    parser.add_argument("--feat-dim", type=int, required=True,
+                        help="The dimension of the input.")
+    parser.add_argument("--add-bias", type=str,
+                        help="If true, add a bias column to the fft matrix.",
+                        default="True", choices=["True", "False"])
+    parser.add_argument("--half-range", type=str,
+                        help="If true, generate only the first half of the fft matrix.",
+                        default="True", choices=["True", "False"])
+    parser.add_argument("--dir", type=str, required=True,
+                        help="The output directory.")
+
+    print(' '.join(sys.argv), file=sys.stderr)
+    print(sys.argv, file=sys.stderr)
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = get_args()
+
+    feat_dim = args.feat_dim
+    num_fft_bins = (2**(args.feat_dim-1).bit_length())
+    # Convert the "True"/"False" strings to booleans (both strings would
+    # otherwise be truthy when used as flags).
+    add_bias = (args.add_bias == "True")
+    half_range = (args.half_range == "True")
+
+    common_lib.write_sin_cos_transform_matrix(feat_dim, num_fft_bins,
+        "{0}/configs/cos_transform.mat".format(args.dir),
+        compute_cosine=True, add_bias=add_bias, half_range=half_range)
+    common_lib.write_sin_cos_transform_matrix(feat_dim, num_fft_bins,
+        "{0}/configs/sin_transform.mat".format(args.dir),
+        compute_cosine=False, add_bias=add_bias, half_range=half_range)
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/mini_librispeech/s5/local/fvector/run_fvector.sh b/egs/mini_librispeech/s5/local/fvector/run_fvector.sh
new file mode 100755
index 00000000000..d57c6813428
--- /dev/null
+++
b/egs/mini_librispeech/s5/local/fvector/run_fvector.sh @@ -0,0 +1,78 @@ +#!/bin/bash + +. ./cmd.sh +set -e + +stage=3 +train_stage=-10 +data=data/train_clean_5 +noise_data=data/noise +egs_dir=exp/fvector/egs +fvector_dir=exp/fvector +use_gpu=true + +. ./path.sh +. ./cmd.sh +. ./utils/parse_options.sh + +if [ $stage -le 3 ]; then + #dump egs + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b{11,12,13}/$USER/kaldi-data/egs/minilibrispeech-$(date +'%m_%d_%H_%M')/s5/$egs_dir/storage $egs_dir/storage + fi + + steps/nnet3/fvector/get_egs.sh --cmd "$train_cmd" \ + --nj 8 \ + --stage 0 \ + --egs-per-iter 12500 \ + --egs-per-iter-diagnostic 10000 \ + --num-diagnostic-percent 5 \ + --frame-length 25 \ + --left-padding 1 \ + --right-padding 1 \ + "$data" "$noise_data" "$egs_dir" +fi + +if [ $stage -le 4 ]; then + #prepare configs + echo "$0: creating neural net configs using the xconfig parser"; + #options + input_dim=400 + num_filters=100 + + mkdir -p $fvector_dir/configs + + cat < $fvector_dir/configs/network.xconfig + input dim=$input_dim name=input + # Each eg contains 8 frames, do Frequency-domain feature learning, and then + # use TDNN model split it into one vector + preprocess-fft-abs-lognorm-affine-log-layer name=raw0 cos-transform-file=$fvector_dir/configs/cos_transform.mat sin-transform-file=$fvector_dir/configs/sin_transform.mat num-filters=$num_filters half-fft-range=true + conv-relu-batchnorm-layer name=cnn1 height-in=$num_filters height-out=$[$num_filters/2] time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 height-subsample-out=2 learning-rate-factor=0.34 max-change=0.25 + + relu-batchnorm-layer name=tdnn0 input=cnn1 dim=625 + relu-batchnorm-layer name=tdnn1 input=Append(0,1,2) dim=625 + relu-batchnorm-layer name=tdnn2 input=Append(0,1,2) dim=625 + relu-batchnorm-layer name=tdnn3 input=Append(0,1,2) dim=625 + relu-batchnorm-layer name=tdnn4 input=Append(0,1) dim=625 + output-layer name=output input=tdnn4 dim=200 include-log-softmax=False param-stddev=0.04 bias-stddev=1.0 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $fvector_dir/configs/network.xconfig --config-dir $fvector_dir/configs/ + # Modify the final.config and generate sin.mat/cos.mat manually + python local/fvector/add_output_node.py --input-dim 400 --output-dim 200 --config-file $fvector_dir/configs/final.config + python local/fvector/generate_sin_cos_matrix.py \ + --feat-dim 400 --dir $fvector_dir +fi + +if [ $stage -le 5 ]; then + #training + steps/nnet3/xvector/train.sh --cmd "$train_cmd" \ + --initial-effective-lrate 0.002 \ + --final-effective-lrate 0.0002 \ + --max-param-change 0.2 \ + --minibatch-size 16 \ + --num-epochs 8 --use-gpu $use_gpu --stage $train_stage \ + --num-jobs-initial 1 --num-jobs-final 5 \ + --egs-dir $egs_dir \ + $fvector_dir +fi diff --git a/egs/mini_librispeech/s5/local/fvector/run_fvector_separate.sh b/egs/mini_librispeech/s5/local/fvector/run_fvector_separate.sh new file mode 100755 index 00000000000..17f4e95f667 --- /dev/null +++ b/egs/mini_librispeech/s5/local/fvector/run_fvector_separate.sh @@ -0,0 +1,79 @@ +#!/bin/bash + +. ./cmd.sh +set -e + +stage=5 +train_stage=-10 +data=data/train_clean_5 +noise_data=data/noise +egs_dir=exp/fvector/egs +fvector_dir=exp/fvector +use_gpu=true + +. ./path.sh +. ./cmd.sh +. ./utils/parse_options.sh + +if [ $stage -le 3 ]; then + #dump egs + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b{11,12,13}/$USER/kaldi-data/egs/minilibrispeech-$(date +'%m_%d_%H_%M')/s5/$egs_dir/storage $egs_dir/storage + fi + + steps/nnet3/fvector/get_egs_separate.sh --cmd "$train_cmd" \ + --nj 8 \ + --stage 3 \ + --egs-per-iter 100000 \ + --egs-per-iter-diagnostic 10000 \ + --num-diagnostic-percent 5 \ + "$data" "$noise_data" "$egs_dir" +fi + +if [ $stage -le 4 ]; then + #prepare configs + echo "$0: creating neural net configs using the xconfig parser"; + #options + input_dim=400 + num_filters=200 + + mkdir -p $fvector_dir/configs + + cat < $fvector_dir/configs/network.xconfig + input dim=$input_dim name=input + # Each eg contains 8 frames, do Frequency-domain feature learning, and then + # use TDNN model split it into one vector + preprocess-fft-abs-lognorm-affine-log-layer name=raw0 cos-transform-file=$fvector_dir/configs/cos_transform.mat sin-transform-file=$fvector_dir/configs/sin_transform.mat num-filters=$num_filters half-fft-range=true + conv-relu-batchnorm-layer name=cnn1 height-in=$num_filters height-out=$[$num_filters/2] time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 height-subsample-out=2 learning-rate-factor=0.34 max-change=0.25 + + relu-batchnorm-layer name=tdnn0 input=cnn1 dim=625 + relu-batchnorm-layer name=tdnn1 input=Append(0,1,2) dim=625 + relu-batchnorm-layer name=tdnn2 input=Append(0,1,2) dim=625 + relu-batchnorm-layer name=tdnn3 input=Append(0,1,2) dim=625 + relu-batchnorm-layer name=tdnn4 input=Append(0,1) dim=625 + output-layer name=output input=tdnn4 dim=200 include-log-softmax=False param-stddev=0.04 bias-stddev=1.0 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $fvector_dir/configs/network.xconfig --config-dir $fvector_dir/configs/ + # Modify the final.config and generate sin.mat/cos.mat manually + python local/fvector/add_output_node.py --input-dim 400 --output-dim 200 --config-file $fvector_dir/configs/final.config + python local/fvector/generate_sin_cos_matrix.py \ + --feat-dim 400 --dir $fvector_dir +fi + +if [ $stage -le 5 ]; then + #training + steps/nnet3/xvector/train_separate.sh --cmd "$train_cmd" \ + --initial-effective-lrate 0.002 \ + --final-effective-lrate 0.0002 \ + --max-param-change 0.2 \ + --minibatch-size 16 \ + --left-padding 1 \ + --right-padding 1 \ + --max-snr 20 \ + --min-snr 10 \ + --num-epochs 8 --use-gpu $use_gpu --stage $train_stage \ + --num-jobs-initial 1 --num-jobs-final 3 \ + --egs-dir $egs_dir \ + $fvector_dir +fi diff --git a/egs/mini_librispeech/s5/local/fvector/run_tdnn.sh b/egs/mini_librispeech/s5/local/fvector/run_tdnn.sh new file mode 100755 index 00000000000..a69a26c6bb4 --- /dev/null +++ b/egs/mini_librispeech/s5/local/fvector/run_tdnn.sh @@ -0,0 +1,214 @@ +#!/bin/bash + +# 1e is as 1d but instead of the --proportional-shrink option, using +# the newly added xconfig-layer-specific 'l2-regularize' options. 
+ +# local/chain/compare_wer.sh exp/chain/tdnn1d_sp exp/chain/tdnn1e_sp +# System tdnn1d_sp tdnn1e_sp +#WER dev_clean_2 (tgsmall) 14.21 13.43 +#WER dev_clean_2 (tglarge) 10.41 9.76 +# Final train prob -0.0473 -0.0510 +# Final valid prob -0.0893 -0.0889 +# Final train prob (xent) -1.0757 -1.4148 +# Final valid prob (xent) -1.4222 -1.6640 + + +# steps/info/chain_dir_info.pl exp/chain/tdnn1{d,e}_sp +# exp/chain/tdnn1d_sp: num-iters=17 nj=2..5 num-params=7.5M dim=40+100->2309 combine=-0.063->-0.052 xent:train/valid[10,16,final]=(-1.65,-1.23,-1.08/-1.91,-1.55,-1.42) logprob:train/valid[10,16,final]=(-0.084,-0.057,-0.047/-0.125,-0.100,-0.089) +# exp/chain/tdnn1e_sp: num-iters=17 nj=2..5 num-params=7.5M dim=40+100->2309 combine=-0.061->-0.056 xent:train/valid[10,16,final]=(-1.69,-1.41,-1.41/-1.91,-1.67,-1.66) logprob:train/valid[10,16,final]=(-0.065,-0.055,-0.051/-0.104,-0.095,-0.089) + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=13 +decode_nj=10 +train_set=train_clean_5 +test_sets=dev_clean_2 +gmm=tri3b +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1e_fvector # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training options +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true +reporting_email= + +#decode options +test_online_decoding=true # if true, it will run the last decoding stage. + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=200 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=512 + relu-batchnorm-layer name=tdnn2 $opts dim=512 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn3 $opts dim=512 + relu-batchnorm-layer name=tdnn4 $opts dim=512 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn5 $opts dim=512 + relu-batchnorm-layer name=tdnn6 $opts dim=512 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn7 $opts dim=512 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn8 $opts dim=512 input=Append(-6,-3,0) + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain $opts dim=512 target-rms=0.5 + output-layer name=output include-log-softmax=false $output_opts dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn8 $opts dim=512 target-rms=0.5 + output-layer name=output-xent $output_opts dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=10 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=256,128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l data/noise/utt2dur_fix +fi + +if [ $stage -le 2 ]; then +#generate fvector egs and train model. 
+  local/fvector/run_fvector.sh --data data/train_clean_5 --noise-data data/noise \
+    --egs-dir exp/fvector/egs --fvector-dir exp/fvector
+fi
+
+if [ $stage -le 3 ]; then
+  for part in dev_clean_2_hires train_clean_5_sp_hires; do
+    if [ -e data/${part}_mfcc ]; then
+      if [ -e data/${part} ]; then
+        rm -rf data/${part}
+      fi
+      mv data/${part}_mfcc data/${part}
+    fi
+
+    mv data/${part} data/${part}_mfcc
+    cp -r data/${part}_mfcc data/${part}
+    for f in $(ls data/${part}); do
+      if [ $f != "spk2gender" -a $f != "spk2utt" -a $f != "text" -a $f != "utt2spk" -a $f != "wav.scp" ]; then
+        rm -rf data/$part/$f
+      fi
+    done
+    steps/nnet3/fvector/make_fvector_feature.sh --cmd "$train_cmd" --nj 10 \
+      data/${part} exp/fvector exp/make_fvector/train fvector_feature
+  done
+fi
+
+if [ $stage -le 4 ]; then
+  local/fvector/run_tdnn.sh --stage 14 --train-stage 9
+fi
diff --git a/egs/mini_librispeech/s5/run_fvector_separate.sh b/egs/mini_librispeech/s5/run_fvector_separate.sh
new file mode 100755
index 00000000000..34c0e800aa6
--- /dev/null
+++ b/egs/mini_librispeech/s5/run_fvector_separate.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+# Note: this works only on pre-downloaded data on the CLSP servers
+data=/export/a05/dgalvez/
+
+data_url=www.openslr.org/resources/31
+lm_url=www.openslr.org/resources/11
+
+. ./cmd.sh
+. ./path.sh
+
+stage=3
+. utils/parse_options.sh
+
+set -euo pipefail
+
+mkdir -p $data
+# Stage 1: run run.sh from scratch to generate a chain model.
+if [ $stage -le 0 ]; then
+  run.sh
+fi
+
+# Stage 2: prepare a noise dir (maybe a special noise dataset).  In
+# mini_librispeech we just use the train set directly.
+if [ $stage -le 1 ]; then
+  cp -r data/train_clean_5 data/noise
+  # For the noise dir, we prepare a file utt2dur_fix.  Each line is "utt_id dur-0.2".
+  # This file is used in "fvector-chunk.cc": it is read into a vector in binary
+  # form.  For each target chunk, we randomly select two utt_ids from this
+  # vector, and the corresponding start points.
+  utils/data/get_utt2dur.sh data/noise  # wav-to-duration
+  cat data/noise/utt2dur | awk '{print $1,$2-0.2}' > data/noise/utt2dur_fix
+fi
+
+if [ $stage -le 2 ]; then
+  # generate fvector egs and train the model.
+  local/fvector/run_fvector_separate.sh --data data/train_clean_5 --noise-data data/noise \
+    --egs-dir exp/fvector/egs --fvector-dir exp/fvector
+fi
+
+if [ $stage -le 3 ]; then
+  for part in dev_clean_2_hires train_clean_5_sp_hires; do
+    if [ -e data/${part}_mfcc ]; then
+      if [ -e data/${part} ]; then
+        rm -rf data/${part}
+      fi
+      mv data/${part}_mfcc data/${part}
+    fi
+
+    mv data/${part} data/${part}_mfcc
+    cp -r data/${part}_mfcc data/${part}
+    for f in $(ls data/${part}); do
+      if [ $f != "spk2gender" -a $f != "spk2utt" -a $f != "text" -a $f != "utt2spk" -a $f != "wav.scp" ]; then
+        rm -rf data/$part/$f
+      fi
+    done
+    steps/nnet3/fvector/make_fvector_feature.sh --cmd "$train_cmd" --nj 10 \
+      data/${part} exp/fvector exp/make_fvector/train fvector_feature
+  done
+fi
+
+if [ $stage -le 4 ]; then
+  local/fvector/run_tdnn.sh --stage 14 --train-stage -10
+fi
diff --git a/egs/wsj/s5/steps/nnet3/fvector/get_egs.sh b/egs/wsj/s5/steps/nnet3/fvector/get_egs.sh
new file mode 100755
index 00000000000..1063a51fb32
--- /dev/null
+++ b/egs/wsj/s5/steps/nnet3/fvector/get_egs.sh
@@ -0,0 +1,262 @@
+#!/bin/bash
+
+# This script dumps training examples (egs) for fvector training.  Each eg has
+# at least two "NnetIo"s (data chunks), which come from the same original
+# source signal fragment.  The two data-chunks in each eg will have respectively
+# n=0 and n=1.
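+#
+# Schematically (one plausible reading of the pipeline below, not a format
+# specification), an eg could be pictured as
+#   eg = { NnetIo n=0: the source chunk mixed with one noise chunk,
+#          NnetIo n=1: the same source chunk mixed with another noise chunk }
+# i.e. the two NnetIos would differ only in the additive noise drawn by
+# fvector-add-noise.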
+#
+#
+# This script, which will generally be called from other neural-net training
+# scripts, extracts the training examples used to train the neural net (and also
+# the validation examples used for diagnostics), and puts them in archives.
+
+# Begin configuration section.
+cmd=run.pl
+egs_per_iter=12500 # have this many egs per archive.
+                   # In the xvector setup this is 2 million frames, and each
+                   # frame has 40 dims.  In the fvector case, one eg is about
+                   # 100 ms = 2 NnetIos * 8 frames * (16 kHz * 25 ms) = 6400
+                   # samples, so we use 2 million * 40 / 6400 = 12500.
+                   # If frame-length=10ms, it should be 30000.
+                   # That keeps the capacity of the fvector archives in line
+                   # with the xvector ones.
+egs_per_iter_diagnostic=10000 # have this many egs per archive for the
+                              # archives used for diagnostics.
+num_diagnostic_percent=5 # we want to test the training and validation likelihoods
+                         # on a range of utterance lengths, and this number
+                         # controls how many archives we evaluate on.  We select
+                         # num_diagnostic_percent% of the train data as the
+                         # validation set.
+compress=true
+srand=0
+generate_egs_scp=true
+
+stage=0
+nj=8 # This should be set to the maximum number of jobs you are comfortable
+     # running in parallel
+
+echo "$0 $@"
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+if [ $# != 3 ]; then
+  echo "Usage: $0 [opts] <data-dir> <noise-dir> <egs-dir>"
+  echo " e.g.: $0 data/train data/noise exp/fvector/egs"
+  echo ""
+  echo "Main options (for others, see top of script file)"
+  echo "  --nj <nj|8>                                      # The maximum number of jobs you want to run in"
+  echo "                                                   # parallel (increase this only if you have good disk and"
+  echo "                                                   # network speed). default=8"
+  echo "  --cmd (utils/run.pl;utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --egs-per-iter <#egs|12500>                      # Target number of egs per archive"
+  echo "                                                   # {train_subset,valid}.*.egs"
+  echo "  --stage <stage|0>                                # Used to run a partially-completed training process from"
+  echo "                                                   # somewhere in the middle."
+  echo ""
+
+  exit 1;
+fi
+
+data_dir=$1
+noise_dir=$2
+egs_dir=$3
+
+for f in $data_dir/wav.scp $noise_dir/wav.scp $noise_dir/utt2dur_fix; do
+  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1;
+done
+
+mkdir -p $egs_dir
+mkdir -p $egs_dir/log
+mkdir -p $egs_dir/info
+num_utts=$(cat $data_dir/wav.scp | wc -l)
+num_valid=$[$num_utts*$num_diagnostic_percent/100];
+
+# Assume recording-id == utt-id.
+if [ $stage -le 1 ]; then
+  # Get the list of validation utterances.
+  awk '{print $1}' $data_dir/wav.scp | utils/shuffle_list.pl | head -$num_valid \
+    > ${egs_dir}/info/valid_uttlist
+  cat $data_dir/wav.scp | utils/filter_scp.pl --exclude $egs_dir/info/valid_uttlist | \
+    awk '{print $1}' > ${egs_dir}/info/train_uttlist
+  cat ${egs_dir}/info/train_uttlist | utils/shuffle_list.pl | head -$num_valid \
+    > ${egs_dir}/info/train_diagnostic_uttlist
+fi
+# Get the (120 ms) chunks from wav.scp and noise.scp, and compose 1 source
+# chunk and 2 noise chunks into a matrix.
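+# As a worked illustration of the chunk sizes (hypothetical numbers, assuming
+# 16 kHz audio as in the capacity comment above): a 120 ms chunk is
+# 16000 * 0.120 = 1920 samples, so each matrix written by fvector-chunk below
+# would stack 3 rows of 1920 samples (1 source chunk + 2 noise chunks).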
+if [ $stage -le 2 ]; then + sdata=$data_dir/split$nj + utils/data/split_data.sh $data_dir $nj || exit 1; + $cmd JOB=1:$nj $egs_dir/log/cut_train_wav_into_chunks.JOB.log \ + fvector-chunk --chunk-size=120 "scp:utils/filter_scp.pl --exclude $egs_dir/info/valid_uttlist $sdata/JOB/wav.scp |" \ + scp:$noise_dir/wav.scp $noise_dir/utt2dur_fix \ + ark,scp:$egs_dir/orign_train_chunks.JOB.ark,$egs_dir/orign_train_chunks.JOB.scp + for n in $(seq $nj); do + cat $egs_dir/orign_train_chunks.${n}.scp || exit 1; + done > $data_dir/orign_train_chunks.all.scp + + $cmd $egs_dir/log/cut_valid_wav_into_chunks.log \ + fvector-chunk --chunk-size=120 "scp:utils/filter_scp.pl $egs_dir/info/valid_uttlist $data_dir/wav.scp |" \ + scp:$noise_dir/wav.scp $noise_dir/utt2dur_fix \ + ark,scp:$egs_dir/orign_valid_chunks.ark,$egs_dir/orign_valid_chunks.scp + cp $egs_dir/orign_valid_chunks.scp $data_dir/orign_valid_chunks.scp + + $cmd $egs_dir/log/cut_train_diagnostic_wav_into_chunks.log \ + fvector-chunk --chunk-size=120 "scp:utils/filter_scp.pl $egs_dir/info/train_diagnostic_uttlist $data_dir/wav.scp |" \ + scp:$noise_dir/wav.scp $noise_dir/utt2dur_fix \ + ark,scp:$egs_dir/orign_train_diagnostic_chunks.ark,$egs_dir/orign_train_diagnostic_chunks.scp + cp $egs_dir/orign_train_diagnostic_chunks.scp $data_dir/orign_train_diagnostic_chunks.scp +fi + +echo "$0: Generate the egs for train dataset." + +#each chunk will generate two "NnetIo"s +num_egs=$(cat $data_dir/orign_train_chunks.all.scp | wc -l) +num_archives=$[$num_egs/$egs_per_iter+1] +# We may have to first create a smaller number of larger archives, with number +# $num_archives_intermediate, if $num_archives is more than the maximum number +# of open filehandles that the system allows per process (ulimit -n). +# This sometimes gives a misleading answer as GridEngine sometimes changes that +# somehow, so we limit it to 512. +max_open_filehandles=$(ulimit -n) || exit 1 +[ $max_open_filehandles -gt 512 ] && max_open_filehandles=512 +num_archives_intermediate=$num_archives +archives_multiple=1 +while [ $[$num_archives_intermediate+4] -gt $max_open_filehandles ]; do + archives_multiple=$[$archives_multiple+1] + num_archives_intermediate=$[$num_archives/$archives_multiple+1]; +done +# now make sure num_archives is an exact multiple of archives_multiple. +num_archives=$[$archives_multiple*$num_archives_intermediate] +echo $num_archives > $egs_dir/info/num_archives + +# prepare the dir link +if [ -e $egs_dir/storage ]; then + # Make soft links to storage directories, if distributing this way.. See + # utils/create_split_dir.pl. + echo "$0: creating data links" + utils/create_data_link.pl $(for x in $(seq $num_archives); do echo $egs_dir/egs.$x.ark; done) + for x in $(seq $num_archives_intermediate); do + utils/create_data_link.pl $(for y in $(seq $nj); do echo $egs_dir/egs_orig.$y.$x.ark; done) + done +fi +# Deal with the chunk one-by-one, add the noise. +# convert the chunk data into Nnet3eg +if [ $stage -le 3 ]; then + # create egs_orig.*.*.ark; the first index goes to $nj, + # the second to $num_archives_intermediate. 
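+  # Worked example of the archive arithmetic above (hypothetical numbers):
+  # num_egs=100000 and egs_per_iter=12500 give num_archives = 100000/12500+1 = 9;
+  # since 9+4 does not exceed the filehandle cap of 512, archives_multiple
+  # stays 1 and num_archives_intermediate = 9.  If num_archives were 1000, we
+  # would get archives_multiple=2, num_archives_intermediate = 1000/2+1 = 501,
+  # and num_archives would be rounded up to 2*501 = 1002.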
+  egs_list=
+  for n in $(seq $num_archives_intermediate); do
+    egs_list="$egs_list ark:$egs_dir/egs_orig.JOB.$n.ark"
+  done
+  echo "$0: Do data perturbation and dump egs to disk"
+  # More options could be added to this command line.
+  $cmd JOB=1:$nj $egs_dir/log/do_train_perturbation_and_get_egs.JOB.log \
+    fvector-add-noise --max-snr=20 --min-snr=10 scp:$egs_dir/orign_train_chunks.JOB.scp ark:- \| \
+    fvector-get-egs ark:- ark:- \| \
+    nnet3-copy-egs --random=true --srand=\$[JOB+$srand] ark:- $egs_list || exit 1;
+fi
+
+# num_archives_intermediate acts as a bridge: it is used to convert egs_orig
+# (an nj x num_archives_intermediate grid of archives) into egs
+# (num_archives_intermediate x archives_multiple).  Each shuffle job takes one
+# column of egs_orig and distributes its contents across one row of egs.
+if [ $stage -le 4 ]; then
+  echo "$0: recombining and shuffling order of archives on disk"
+  # combine all the "egs_orig.*.JOB.scp" (over the $nj splits of the data) and
+  # shuffle the order, writing to the egs.JOB.ark
+
+  # the input is a concatenation over the input jobs.
+  egs_list=
+  for n in $(seq $nj); do
+    egs_list="$egs_list $egs_dir/egs_orig.$n.JOB.ark"
+  done
+
+  if [ $archives_multiple == 1 ]; then # normal case.
+    if $generate_egs_scp; then
+      output_archive="ark,scp:$egs_dir/egs.JOB.ark,$egs_dir/egs.JOB.scp"
+    else
+      output_archive="ark:$egs_dir/egs.JOB.ark"
+    fi
+    $cmd --max-jobs-run $nj JOB=1:$num_archives_intermediate $egs_dir/log/shuffle.JOB.log \
+      nnet3-shuffle-egs --srand=\$[JOB+$srand] "ark:cat $egs_list|" $output_archive || exit 1;
+
+    if $generate_egs_scp; then
+      # concatenate the egs.JOB.scp files into a single egs.scp
+      rm $egs_dir/egs.scp 2> /dev/null || true
+      for j in $(seq $num_archives_intermediate); do
+        cat $egs_dir/egs.$j.scp || exit 1;
+      done > $egs_dir/egs.scp || exit 1;
+      for f in $egs_dir/egs.*.scp; do rm $f; done
+    fi
+  else
+    # we need to shuffle the 'intermediate archives' and then split into the
+    # final archives.  we create soft links to manage this splitting, because
+    # otherwise managing the output names is quite difficult (and we don't want
+    # to submit separate queue jobs for each intermediate archive, because then
+    # the --max-jobs-run option is hard to enforce).
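+    # Illustration of the splitting (hypothetical numbers): with
+    # archives_multiple=2 and num_archives_intermediate=3, shuffle job JOB=2
+    # reads the column egs_orig.{1..nj}.2.ark, shuffles it, and splits it into
+    # egs.2.1.ark and egs.2.2.ark, which the soft links below map onto the
+    # final archives egs.3.ark and egs.4.ark (archive_index = (2-1)*2 + y).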
+    if $generate_egs_scp; then
+      output_archives="$(for y in $(seq $archives_multiple); do echo ark,scp:$egs_dir/egs.JOB.$y.ark,$egs_dir/egs.JOB.$y.scp; done)"
+    else
+      output_archives="$(for y in $(seq $archives_multiple); do echo ark:$egs_dir/egs.JOB.$y.ark; done)"
+    fi
+    for x in $(seq $num_archives_intermediate); do
+      for y in $(seq $archives_multiple); do
+        archive_index=$[($x-1)*$archives_multiple+$y]
+        # egs.intermediate_archive.{1,2,...}.ark will point to egs.archive.ark
+        ln -sf egs.$archive_index.ark $egs_dir/egs.$x.$y.ark || exit 1
+      done
+    done
+    $cmd --max-jobs-run $nj JOB=1:$num_archives_intermediate $egs_dir/log/shuffle.JOB.log \
+      nnet3-shuffle-egs --srand=\$[JOB+$srand] "ark:cat $egs_list|" ark:- \| \
+      nnet3-copy-egs ark:- $output_archives || exit 1;
+
+    if $generate_egs_scp; then
+      # concatenate the egs.JOB.*.scp files into a single egs.scp
+      rm $egs_dir/egs.scp 2> /dev/null || true
+      for j in $(seq $num_archives_intermediate); do
+        for y in $(seq $archives_multiple); do
+          cat $egs_dir/egs.$j.$y.scp || exit 1;
+        done
+      done > $egs_dir/egs.scp || exit 1;
+      for f in $egs_dir/egs.*.*.scp; do rm $f; done
+    fi
+  fi
+fi
+# At this point we have the egs.*.ark archives
+# (num_archives_intermediate * archives_multiple of them) and egs.scp.
+
+echo "$0: Generate the egs for the valid dataset"
+if [ $stage -le 5 ]; then
+  $cmd $egs_dir/log/do_valid_perturbation_and_get_egs.log \
+    fvector-add-noise --max-snr=20 --min-snr=10 scp:$egs_dir/orign_valid_chunks.scp ark:- \| \
+    fvector-get-egs ark:- ark:- \| \
+    nnet3-copy-egs --random=true --srand=$srand ark:- ark:$egs_dir/valid.egs || exit 1;
+  # get the valid.egs
+  cp $egs_dir/valid.egs $egs_dir/valid_diagnostic_egs.1.ark
+fi
+
+echo "$0: Generate the egs for the train diagnostic"
+if [ $stage -le 6 ]; then
+  $cmd $egs_dir/log/do_train_diagnostic_perturbation_and_get_egs.log \
+    fvector-add-noise --max-snr=20 --min-snr=10 scp:$egs_dir/orign_train_diagnostic_chunks.scp ark:- \| \
+    fvector-get-egs ark:- ark:- \| \
+    nnet3-copy-egs --random=true --srand=$srand ark:- ark:$egs_dir/train_diagnostic.egs || exit 1;
+  # get the train_diagnostic.egs
+  cp $egs_dir/train_diagnostic.egs $egs_dir/train_diagnostic_egs.1.ark
+  echo "1" > $egs_dir/info/num_diagnostic_archives
+fi
+
+# remove unnecessary arks and links.
+if [ $stage -le 7 ]; then
+  echo "$0: removing temporary archives"
+  for x in $(seq $nj); do
+    for y in $(seq $num_archives_intermediate); do
+      file=$egs_dir/egs_orig.$x.$y.ark
+      [ -L $file ] && rm $(utils/make_absolute.sh $file)
+      rm $file
+    done
+  done
+  if [ $archives_multiple -gt 1 ]; then
+    # there are some extra soft links that we should delete.
+    for f in $egs_dir/egs.*.*.ark; do rm $f; done
+  fi
+fi
+echo "$0: Finished preparing fvector training examples"
diff --git a/egs/wsj/s5/steps/nnet3/fvector/get_egs_separate.sh b/egs/wsj/s5/steps/nnet3/fvector/get_egs_separate.sh
new file mode 100755
index 00000000000..3040b269f87
--- /dev/null
+++ b/egs/wsj/s5/steps/nnet3/fvector/get_egs_separate.sh
@@ -0,0 +1,165 @@
+#!/bin/bash
+
+# This script dumps training examples (egs) for fvector training.  Each eg has
+# at least two "NnetIo"s (data chunks), which come from the same original
+# source signal fragment.  The two data-chunks in each eg will have respectively
+# n=0 and n=1.
+#
+#
+# This script, which will generally be called from other neural-net training
+# scripts, extracts the training examples used to train the neural net (and also
+# the validation examples used for diagnostics), and puts them in archives.
+
+# Begin configuration section.
+cmd=run.pl
+egs_per_iter=12500 # have this many egs per archive.
+egs_per_iter_diagnostic=10000 # have this many egs per archive for the
+                              # archives used for diagnostics.
+num_diagnostic_percent=5 # we want to test the training and validation likelihoods
+                         # on a range of utterance lengths, and this number
+                         # controls how many archives we evaluate on.  We select
+                         # num_diagnostic_percent% of the train data as the
+                         # validation set.
+chunk_size=120
+compress=true
+srand=0
+generate_egs_scp=true
+
+stage=0
+nj=8 # This should be set to the maximum number of jobs you are comfortable
+     # running in parallel
+
+echo "$0 $@"
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+if [ $# != 3 ]; then
+  echo "Usage: $0 [opts] <data-dir> <noise-dir> <egs-dir>"
+  echo " e.g.: $0 data/train data/noise exp/fvector/egs"
+  echo ""
+  echo "Main options (for others, see top of script file)"
+  echo "  --nj <nj|8>                                      # The maximum number of jobs you want to run in"
+  echo "                                                   # parallel (increase this only if you have good disk and"
+  echo "                                                   # network speed). default=8"
+  echo "  --cmd (utils/run.pl;utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --egs-per-iter <#egs|12500>                      # Target number of egs per archive"
+  echo "                                                   # {train_subset,valid}.*.egs"
+  echo "  --stage <stage|0>                                # Used to run a partially-completed training process from"
+  echo "                                                   # somewhere in the middle."
+  echo ""
+
+  exit 1;
+fi
+
+data_dir=$1
+noise_dir=$2
+egs_dir=$3
+
+for f in $data_dir/wav.scp $noise_dir/wav.scp $noise_dir/utt2dur_fix; do
+  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1;
+done
+
+mkdir -p $egs_dir
+mkdir -p $egs_dir/log
+mkdir -p $egs_dir/info
+num_utts=$(cat $data_dir/wav.scp | wc -l)
+num_valid=$[$num_utts*$num_diagnostic_percent/100];
+
+# Assume recording-id == utt-id.
+if [ $stage -le 1 ]; then
+  # Get the list of validation utterances.
+  awk '{print $1}' $data_dir/wav.scp | utils/shuffle_list.pl | head -$num_valid \
+    > ${egs_dir}/info/valid_uttlist
+  cat $data_dir/wav.scp | utils/filter_scp.pl --exclude $egs_dir/info/valid_uttlist | \
+    awk '{print $1}' > ${egs_dir}/info/train_uttlist
+  cat ${egs_dir}/info/train_uttlist | utils/shuffle_list.pl | head -$num_valid \
+    > ${egs_dir}/info/train_diagnostic_uttlist
+fi
+# Get the chunks (120 ms by default) from wav.scp and noise.scp, and compose
+# 1 source chunk and 2 noise chunks into a matrix.
+if [ $stage -le 2 ]; then + sdata=$data_dir/split$nj + utils/data/split_data.sh $data_dir $nj || exit 1; + $cmd JOB=1:$nj $egs_dir/log/cut_train_wav_into_chunks.JOB.log \ + fvector-chunk-separate --chunk-size=$chunk_size "scp:utils/filter_scp.pl --exclude $egs_dir/info/valid_uttlist $sdata/JOB/wav.scp |" \ + scp:$noise_dir/wav.scp $noise_dir/utt2dur_fix \ + ark,scp:$egs_dir/orign_train_chunks.JOB.ark,$egs_dir/orign_train_chunks.JOB.scp \ + ark,scp:$egs_dir/orign_train_noise_chunks.JOB.ark,$egs_dir/orign_train_noise_chunks.JOB.scp + + for n in $(seq $nj); do + cat $egs_dir/orign_train_chunks.${n}.scp || exit 1; + done > $data_dir/orign_train_chunks.all.scp + for n in $(seq $nj); do + cat $egs_dir/orign_train_noise_chunks.${n}.scp || exit 1; + done > $data_dir/orign_train_noise_chunks.all.scp + cp $data_dir/orign_train_chunks.all.scp $egs_dir/orign_train_chunks.all.scp + cp $data_dir/orign_train_noise_chunks.all.scp $egs_dir/orign_train_noise_chunks.all.scp + + $cmd $egs_dir/log/cut_valid_wav_into_chunks.log \ + fvector-chunk-separate --chunk-size=$chunk_size "scp:utils/filter_scp.pl $egs_dir/info/valid_uttlist $data_dir/wav.scp |" \ + scp:$noise_dir/wav.scp $noise_dir/utt2dur_fix \ + ark,scp:$egs_dir/orign_valid_chunks.ark,$egs_dir/orign_valid_chunks.scp \ + ark,scp:$egs_dir/orign_valid_noise_chunks.ark,$egs_dir/orign_valid_noise_chunks.scp + cp $egs_dir/orign_valid_chunks.scp $data_dir/orign_valid_chunks.scp + cp $egs_dir/orign_valid_noise_chunks.scp $data_dir/orign_valid_noise_chunks.scp + + $cmd $egs_dir/log/cut_train_diagnostic_wav_into_chunks.log \ + fvector-chunk-separate --chunk-size=$chunk_size "scp:utils/filter_scp.pl $egs_dir/info/train_diagnostic_uttlist $data_dir/wav.scp |" \ + scp:$noise_dir/wav.scp $noise_dir/utt2dur_fix \ + ark,scp:$egs_dir/orign_train_diagnostic_chunks.ark,$egs_dir/orign_train_diagnostic_chunks.scp \ + ark,scp:$egs_dir/orign_train_diagnostic_noise_chunks.ark,$egs_dir/orign_train_diagnostic_noise_chunks.scp + cp $egs_dir/orign_train_diagnostic_chunks.scp $data_dir/orign_train_diagnostic_chunks.scp + cp $egs_dir/orign_train_diagnostic_noise_chunks.scp $data_dir/orign_train_diagnostic_noise_chunks.scp +fi + +echo "$0: Generate the egs for train dataset." 
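+# Each source chunk below is paired with two noise chunks (hence the factor of
+# 2 in the head/tail arithmetic in stage 3).  Worked illustration (hypothetical
+# counts): if archive 1 holds 3 source chunks and archive 2 holds 2, then
+# egs.noise.1.scp.tmp receives noise lines 1-6 and egs.noise.2.scp.tmp
+# receives lines 7-10 of the shuffled noise scp.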
+
+num_egs=$(cat $data_dir/orign_train_chunks.all.scp | wc -l)
+num_archives=$[$num_egs/$egs_per_iter+1]
+echo $num_archives > $egs_dir/info/num_archives
+
+if [ -e $egs_dir/storage ]; then
+  echo "$0: creating data links"
+  utils/create_data_link.pl $(for x in $(seq $num_archives); do echo $egs_dir/egs.$x.ark; done)
+  utils/create_data_link.pl $(for x in $(seq $num_archives); do echo $egs_dir/egs.noise.$x.ark; done)
+fi
+
+if [ $stage -le 3 ]; then
+  echo "$0: shuffle and recombine the train set"
+  egs_scp_list=
+  for n in $(seq $num_archives); do
+    egs_scp_list="$egs_scp_list $egs_dir/egs.$n.scp.tmp"
+  done
+  utils/shuffle_list.pl $egs_dir/orign_train_chunks.all.scp > $egs_dir/orign_train_chunks.all.scp.shuffled
+  utils/split_scp.pl $egs_dir/orign_train_chunks.all.scp.shuffled $egs_scp_list
+
+  utils/shuffle_list.pl $egs_dir/orign_train_noise_chunks.all.scp > $egs_dir/orign_train_noise_chunks.all.scp.shuffled
+  count=0
+  for n in $(seq $num_archives); do
+    current_count=$(cat $egs_dir/egs.$n.scp.tmp | wc -l)
+    count=$[$count+2*$current_count]
+    cat $egs_dir/orign_train_noise_chunks.all.scp.shuffled | head -n $count | tail -n $[2*$current_count] > $egs_dir/egs.noise.$n.scp.tmp
+  done
+  $cmd JOB=1:$num_archives $egs_dir/log/get_egs.JOB.log \
+    copy-vector scp:$egs_dir/egs.JOB.scp.tmp ark,scp:$egs_dir/egs.JOB.ark,$egs_dir/egs.JOB.scp || exit 1;
+  $cmd JOB=1:$num_archives $egs_dir/log/get_egs_noise.JOB.log \
+    copy-vector scp:$egs_dir/egs.noise.JOB.scp.tmp ark,scp:$egs_dir/egs.noise.JOB.ark,$egs_dir/egs.noise.JOB.scp || exit 1;
+fi
+
+if [ $stage -le 4 ]; then
+  echo "$0: shuffle and recombine the valid set"
+  $cmd $egs_dir/log/get_egs_valid.log \
+    copy-vector scp:$egs_dir/orign_valid_chunks.scp ark,scp:$egs_dir/valid_diagnostic_egs.1.ark,$egs_dir/valid_diagnostic_egs.1.scp || exit 1;
+  $cmd $egs_dir/log/get_egs_valid_noise.log \
+    copy-vector scp:$egs_dir/orign_valid_noise_chunks.scp ark,scp:$egs_dir/valid_diagnostic_egs.noise.1.ark,$egs_dir/valid_diagnostic_egs.noise.1.scp || exit 1;
+fi
+
+if [ $stage -le 5 ]; then
+  echo "$0: shuffle and recombine the train_diagnostic set"
+  $cmd $egs_dir/log/get_egs_train_diagnostic.log \
+    copy-vector scp:$egs_dir/orign_train_diagnostic_chunks.scp ark,scp:$egs_dir/train_diagnostic_egs.1.ark,$egs_dir/train_diagnostic_egs.1.scp || exit 1;
+  $cmd $egs_dir/log/get_egs_train_diagnostic_noise.log \
+    copy-vector scp:$egs_dir/orign_train_diagnostic_noise_chunks.scp ark,scp:$egs_dir/train_diagnostic_egs.noise.1.ark,$egs_dir/train_diagnostic_egs.noise.1.scp || exit 1;
+  echo "1" > $egs_dir/info/num_diagnostic_archives
+fi
+echo "$0: Finished preparing fvector training examples"
diff --git a/egs/wsj/s5/steps/nnet3/fvector/make_fvector_feature.sh b/egs/wsj/s5/steps/nnet3/fvector/make_fvector_feature.sh
new file mode 100755
index 00000000000..bf6faa5391d
--- /dev/null
+++ b/egs/wsj/s5/steps/nnet3/fvector/make_fvector_feature.sh
@@ -0,0 +1,158 @@
+#!/bin/bash
+
+# Copyright 2012-2016  Johns Hopkins University (Author: Daniel Povey)
+# Apache 2.0
+# To be run from .. (one directory up from here)
+# see ../run.sh for example
+
+# Begin configuration section.
+nj=4
+cmd=run.pl
+compress=true
+write_utt2num_frames=false  # if true writes utt2num_frames
+# End configuration section.
+
+echo "$0 $@"  # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+if [ $# -lt 2 ] || [ $# -gt 4 ]; then
+  echo "Usage: $0 [options] <data-dir> <fvector-dir> [<log-dir> [<feature-dir>] ]";
+  echo "e.g.: $0 data/train exp/fvector exp/make_fvector/train fvector_feature"
+  echo "Note: <log-dir> defaults to <data-dir>/log, and <feature-dir> defaults to <data-dir>/data"
+  echo "Options: "
+  echo "  --nj <nj|4>                                      # number of parallel jobs"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --write-utt2num-frames <true|false>              # If true, write utt2num_frames file."
+  exit 1;
+fi
+
+data=$1
+fvectordir=$2
+if [ $# -ge 3 ]; then
+  logdir=$3
+else
+  logdir=$data/log
+fi
+if [ $# -ge 4 ]; then
+  feadir=$4
+else
+  feadir=$data/data
+fi
+
+# make $feadir an absolute pathname.
+feadir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $feadir ${PWD}`
+
+# use "name" as part of name of the archive.
+name=`basename $data`
+
+mkdir -p $feadir || exit 1;
+mkdir -p $logdir || exit 1;
+
+if [ -f $data/feats.scp ]; then
+  mkdir -p $data/.backup
+  echo "$0: moving $data/feats.scp to $data/.backup"
+  mv $data/feats.scp $data/.backup
+fi
+
+scp=$data/wav.scp
+
+required="$scp $fvectordir/final.raw"
+
+for f in $required; do
+  if [ ! -f $f ]; then
+    echo "make_fvector_feature.sh: no such file $f"
+    exit 1;
+  fi
+done
+utils/validate_data_dir.sh --no-text --no-feats $data || exit 1;
+
+for n in $(seq $nj); do
+  # the next command does nothing unless $feadir/storage/ exists, see
+  # utils/create_data_link.pl for more info.
+  utils/create_data_link.pl $feadir/raw_fvector_$name.$n.ark
+done
+
+
+if $write_utt2num_frames; then
+  write_num_frames_opt="--write-num-frames=ark,t:$logdir/utt2num_frames.JOB"
+else
+  write_num_frames_opt=
+fi
+
+
+if [ -f $data/segments ]; then
+  echo "$0 [info]: segments file exists: using that."
+
+  split_segments=""
+  for n in $(seq $nj); do
+    split_segments="$split_segments $logdir/segments.$n"
+  done
+
+  utils/split_scp.pl $data/segments $split_segments || exit 1;
+  rm $logdir/.error 2>/dev/null
+
+  $cmd JOB=1:$nj $logdir/make_fvector_${name}.JOB.log \
+    extract-segments scp,p:$scp $logdir/segments.JOB ark:- \| \
+    compute-wav-to-rawmatrix ark:- ark:- \| \
+    nnet3-compute --use-gpu=no $fvectordir/final.raw ark:- ark:- \| \
+    copy-feats --compress=$compress $write_num_frames_opt ark:- \
+      ark,scp:$feadir/raw_fvector_$name.JOB.ark,$feadir/raw_fvector_$name.JOB.scp \
+    || exit 1;
+
+else
+  echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance."
+  split_scps=""
+  for n in $(seq $nj); do
+    split_scps="$split_scps $logdir/wav_${name}.$n.scp"
+  done
+
+  utils/split_scp.pl $scp $split_scps || exit 1;
+
+
+  # add ,p to the input rspecifier so that we can just skip over
+  # utterances that have bad wave data.
+
+  $cmd JOB=1:$nj $logdir/make_fvector_${name}.JOB.log \
+    compute-wav-to-rawmatrix scp,p:$logdir/wav_${name}.JOB.scp ark:- \| \
+    nnet3-compute --use-gpu=no $fvectordir/final.raw ark:- ark:- \| \
+    copy-feats $write_num_frames_opt --compress=$compress ark:- \
+      ark,scp:$feadir/raw_fvector_$name.JOB.ark,$feadir/raw_fvector_$name.JOB.scp \
+    || exit 1;
+fi
+
+
+if [ -f $logdir/.error.$name ]; then
+  echo "Error producing fvector features for $name:"
+  tail $logdir/make_fvector_${name}.1.log
+  exit 1;
+fi
+
+# concatenate the .scp files together.
+for n in $(seq $nj); do
+  cat $feadir/raw_fvector_$name.$n.scp || exit 1;
+done > $data/feats.scp || exit 1
+
+if $write_utt2num_frames; then
+  for n in $(seq $nj); do
+    cat $logdir/utt2num_frames.$n || exit 1;
+  done > $data/utt2num_frames || exit 1
+  rm $logdir/utt2num_frames.*
+fi
+
+rm $logdir/wav_${name}.*.scp $logdir/segments.* 2>/dev/null
+
+nf=`cat $data/feats.scp | wc -l`
+nu=`cat $data/utt2spk | wc -l`
+if [ $nf -ne $nu ]; then
+  echo "It seems not all of the feature files were successfully processed ($nf != $nu);"
+  echo "consider using utils/fix_data_dir.sh $data"
+fi
+
+if [ $nf -lt $[$nu - ($nu/20)] ]; then
+  echo "Less than 95% of the features were successfully generated.  Probably a serious error."
+  exit 1;
+fi
+
+echo "Succeeded creating fvector features for $name"
diff --git a/egs/wsj/s5/steps/nnet3/xvector/train.sh b/egs/wsj/s5/steps/nnet3/xvector/train.sh
new file mode 100755
index 00000000000..9e148ea3ab0
--- /dev/null
+++ b/egs/wsj/s5/steps/nnet3/xvector/train.sh
@@ -0,0 +1,253 @@
+#!/bin/bash
+
+# Copyright 2016  Johns Hopkins University (Author: Daniel Povey).
+# Apache 2.0.
+
+
+# Begin configuration section.
+cmd=run.pl
+num_epochs=4 # Number of epochs of training;
+             # the number of iterations is worked out from this.
+num_shifts=1
+initial_effective_lrate=0.0003
+final_effective_lrate=0.00003
+num_jobs_initial=2 # Number of neural net jobs to run in parallel at the start of training
+num_jobs_final=8   # Number of neural net jobs to run in parallel at the end of training
+stage=-3
+diagnostic_period=5
+compute_accuracy=true
+
+
+shuffle_buffer_size=1000 # This "buffer_size" variable controls randomization of the samples
+                # on each iter.  You could set it to 0 or to a large value for complete
+                # randomization, but this would both consume memory and cause spikes in
+                # disk I/O.  Smaller is easier on disk and memory but less random.  It's
+                # not a huge deal though, as samples are anyway randomized right at the start.
+                # (The point of this is to get data in different minibatches on different
+                # iterations, since in the preconditioning method, 2 samples in the same
+                # minibatch can affect each others' gradients.)
+
+max_param_change=0.2 # max param change per minibatch to use eventually
+                     # (for first epoch we use half this)
+minibatch_size=256   # minibatch size to use eventually
+                     # (for first epoch we use half this)
+
+use_gpu=true # if true, we run on GPU.
+egs_dir=
+cleanup=true # if true, remove models from earlier iterations as we go.
+
+# End configuration section.
+
+trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM
+
+echo "$0 $@" # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+if [ $# != 1 ]; then
+  echo "Usage: $0 [opts] <exp-dir>"
+  echo " e.g.: $0 exp/fvector"
+  echo "This script trains the xvector system; see egs/swbd/s5c/local/xvector/train.sh for"
+  echo "example (you have to create the nnet configs and the egs first)."
+  echo ""
+  echo "Main options (for others, see top of script file)"
+  echo "  --config <config-file>                           # config file containing options"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --num-epochs <#epochs|4>                         # Number of epochs of training"
+  echo "  --initial-effective-lrate <lrate|0.0003>         # effective learning rate at start of training."
+  echo "  --final-effective-lrate <lrate|0.00003>          # effective learning rate at end of training."
+  echo "                                                   # data, 0.00025 for large data"
+  echo "  --num-jobs-initial <num-jobs|2>                  # Number of parallel jobs to use for neural net training, at the start."
+ echo " --num-jobs-final # Number of parallel jobs to use for neural net training, at the end" + echo " --egs-dir # If supplied, overrides /egs as location of egs" + echo " --stage # Used to run a partially-completed training process from somewhere in" + echo " # the middle." + exit 1; +fi + +dir=$1 + +[ -z $egs_dir ] && egs_dir=$dir/egs + +if [ ! -d $egs_dir/info ]; then + echo "$0: expected $egs_dir/info to exist: did you run steps/nnet3/xvector/get_egs.sh first?" + exit 1 +fi +if [ ! -f $dir/configs/final.config ]; then + echo "$0: expected $dir/configs/final.config to exist (e.g. run steps/nnet3/xvector/make_jesus_configs.py first)" + exit 1 +fi + + +num_archives=$(cat $egs_dir/info/num_archives) +num_diagnostic_archives=$(cat $egs_dir/info/num_diagnostic_archives) + + + +[ $num_jobs_initial -gt $num_jobs_final ] && \ + echo "$0: --initial-num-jobs cannot exceed --final-num-jobs" && exit 1; + +[ $num_jobs_final -gt $num_archives ] && \ + echo "$0: --final-num-jobs cannot exceed #archives $num_archives." && exit 1; + +# set num_iters so that as close as possible, we process the data $num_epochs +# times $num_shifts times, times, i.e. $num_iters*$avg_num_jobs) == +# $num_epochs*$num_archives*$num_shifts, where +# avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. +num_archives_to_process=$[$num_epochs*$num_archives*$num_shifts] +num_archives_processed=0 +num_iters=$[($num_archives_to_process*2)/($num_jobs_initial+$num_jobs_final)] + +echo "$0: Will train for $num_epochs epochs = $num_iters iterations" + +if $use_gpu; then + parallel_suffix="" + train_queue_opt="--gpu 1" + parallel_train_opts= + if ! cuda-compiled; then + echo "$0: WARNING: you are running with one thread but you have not compiled" + echo " for CUDA. You may be running a setup optimized for GPUs. If you have" + echo " GPUs and have nvcc installed, go to src/ and do ./configure; make" + exit 1 + fi +else + echo "$0: without using a GPU this will be very slow. nnet3 does not yet support multiple threads." + parallel_train_opts="--use-gpu=no" +fi + +if [ $stage -le -1 ]; then + $cmd $dir/log/nnet_init.log \ + nnet3-init $dir/configs/final.config $dir/0.raw || exit 1 +fi + + +x=0 + +while [ $x -lt $num_iters ]; do + + this_num_jobs=$(perl -e "print int(0.5+$num_jobs_initial+($num_jobs_final-$num_jobs_initial)*$x/$num_iters);") + + ilr=$initial_effective_lrate; flr=$final_effective_lrate; np=$num_archives_processed; nt=$num_archives_to_process; + this_effective_learning_rate=$(perl -e "print ($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt));"); + this_learning_rate=$(perl -e "print ($this_effective_learning_rate*$this_num_jobs);"); + + if [ $stage -le $x ]; then + echo "On iteration $x, learning rate is $this_learning_rate" + raw="nnet3-copy --learning-rate=$this_learning_rate $dir/$x.raw - |" + + if [ $[$x%$diagnostic_period] == 0 ]; then + # Set off jobs doing some diagnostics, in the background. 
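+      # (With the default diagnostic_period=5 this fires on iterations 0, 5, 10, ...
+      # As a worked illustration of the learning-rate schedule computed above,
+      # with the hypothetical defaults ilr=0.0003 and flr=0.00003, halfway through
+      # training (np/nt = 0.5) the effective rate is 0.0003 * exp(0.5 * ln(0.1))
+      # ~= 9.5e-5, which is then multiplied by the current number of jobs.)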
+ $cmd JOB=1:$num_diagnostic_archives $dir/log/compute_prob_valid.$x.JOB.log \ + nnet3-xvector-compute-prob --compute-accuracy=${compute_accuracy} $dir/$x.raw \ + "ark:nnet3-merge-egs --measure-output-frames=false ark:$egs_dir/valid_diagnostic_egs.JOB.ark ark:- |" & + $cmd JOB=1:$num_diagnostic_archives $dir/log/compute_prob_train.$x.JOB.log \ + nnet3-xvector-compute-prob --compute-accuracy=${compute_accuracy} $dir/$x.raw \ + "ark:nnet3-merge-egs --measure-output-frames=false ark:$egs_dir/train_diagnostic_egs.JOB.ark ark:- |" & + fi + if [ $x -gt 0 ]; then + $cmd $dir/log/progress.$x.log \ + nnet3-info $dir/$x.raw '&&' \ + nnet3-show-progress --use-gpu=no $dir/$[$x-1].raw $dir/$x.raw & + fi + + echo "Training neural net (pass $x)" + + if [ $x -le 1 ]; then + do_average=false # for the first 2 iters, don't do averaging, pick the best. + else + do_average=true + fi + + rm $dir/.error 2>/dev/null + + + ( # this sub-shell is so that when we "wait" below, + # we only wait for the training jobs that we just spawned, + # not the diagnostic jobs that we spawned above. + + # We cannot easily use a single parallel SGE job to do the main training, + # because the computation of which archive and which --frame option + # to use for each job is a little complex, so we spawn each one separately. + # this is no longer true for RNNs as we use do not use the --frame option + # but we use the same script for consistency with FF-DNN code + + for n in $(seq $this_num_jobs); do + k=$[$num_archives_processed + $n - 1]; # k is a zero-based index that we will derive + # the other indexes from. + archive=$[($k%$num_archives)+1]; # work out the 1-based archive index. + frame_shift=$[($k/$num_archives)%$num_shifts]; + + this_max_param_change=$max_param_change + this_minibatch_size=$minibatch_size + # for the first 20 iterations or the first epoch, whichever comes earlier, + # use a smaller minibatch size and max-param-change. + if [ $k -lt $[$num_archives*$num_shifts] ] && [ $x -lt 20 ]; then + # if we're the first epoch, use half the minibatch size and half the + # max-param-change. + this_minibatch_size=$[$minibatch_size/2] + this_max_param_change=$(perl -e "print ($max_param_change / 2.0);") + fi + + $cmd $train_queue_opt $dir/log/train.$x.$n.log \ + nnet3-xvector-train $parallel_train_opts --print-interval=10 \ + --max-param-change=$this_max_param_change "$raw" \ + "ark:nnet3-copy-egs --frame-shift=$frame_shift ark:$egs_dir/egs.$archive.ark ark:- | nnet3-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-merge-egs --measure-output-frames=false --minibatch-size=$this_minibatch_size --discard-partial-minibatches=true ark:- ark:- |" \ + $dir/$[$x+1].$n.raw || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: error detected on iteration $x of training" + exit 1 + fi + ) + # the error message below is not that informative, but $cmd will + # have printed a more specific one. + [ -f $dir/.error ] && echo "$0: error on iteration $x of training" && exit 1; + + models_to_average=$(steps/nnet3/get_successful_models.py $this_num_jobs $dir/log/train.$x.%.log) + nnets_list= + for n in $models_to_average; do + nnets_list="$nnets_list $dir/$[$x+1].$n.raw" + done + + if $do_average; then + # average the output of the different jobs. + $cmd $dir/log/average.$x.log \ + nnet3-average $nnets_list $dir/$[$x+1].raw || exit 1; + else + # choose the best from the different jobs. 
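+      # (A sketch of what the embedded perl below does: for each job n it scans
+      # $dir/log/train.$x.$n.log for the last "log-prob-per-frame=" value and
+      # prints the 1-based index of the job with the highest value; that model
+      # is then copied to $dir/$[$x+1].raw.)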
+      n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) {
+          $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn";
+          undef $logprob; while (<F>) { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } }
+          close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob;
+          $best_n=$n; } } print "$best_n\n"; ' $this_num_jobs $dir/log/train.$x.%d.log) || exit 1;
+      [ -z "$n" ] && echo "Error getting best model" && exit 1;
+      $cmd $dir/log/select.$x.log \
+        cp $dir/$[$x+1].$n.raw $dir/$[$x+1].raw || exit 1;
+    fi
+
+    nnets_list=
+    for n in `seq 1 $this_num_jobs`; do
+      nnets_list="$nnets_list $dir/$[$x+1].$n.raw"
+    done
+
+    rm $nnets_list
+    [ ! -f $dir/$[$x+1].raw ] && exit 1;
+    if [ -f $dir/$[$x-1].raw ] && $cleanup && \
+       [ $[($x-1)%100] -ne 0 ]; then
+      rm $dir/$[$x-1].raw
+    fi
+  fi
+  rm $dir/cache.$x 2>/dev/null
+  x=$[$x+1]
+  num_archives_processed=$[$num_archives_processed+$this_num_jobs]
+done
+
+
+cp $dir/$x.raw $dir/final.raw
+
+# don't bother with combination for now - it makes very little difference.
+
+sleep 2
+
+echo Done
diff --git a/egs/wsj/s5/steps/nnet3/xvector/train_separate.sh b/egs/wsj/s5/steps/nnet3/xvector/train_separate.sh
new file mode 100755
index 00000000000..5388af1e21a
--- /dev/null
+++ b/egs/wsj/s5/steps/nnet3/xvector/train_separate.sh
@@ -0,0 +1,257 @@
+#!/bin/bash
+
+# Copyright 2016  Johns Hopkins University (Author: Daniel Povey).
+# Apache 2.0.
+
+
+# Begin configuration section.
+cmd=run.pl
+num_epochs=4 # Number of epochs of training;
+             # the number of iterations is worked out from this.
+num_shifts=1
+initial_effective_lrate=0.0003
+final_effective_lrate=0.00003
+num_jobs_initial=2 # Number of neural net jobs to run in parallel at the start of training
+num_jobs_final=8   # Number of neural net jobs to run in parallel at the end of training
+stage=-3
+diagnostic_period=5
+compute_accuracy=true
+
+
+shuffle_buffer_size=1000 # This "buffer_size" variable controls randomization of the samples
+                # on each iter.  You could set it to 0 or to a large value for complete
+                # randomization, but this would both consume memory and cause spikes in
+                # disk I/O.  Smaller is easier on disk and memory but less random.  It's
+                # not a huge deal though, as samples are anyway randomized right at the start.
+                # (The point of this is to get data in different minibatches on different
+                # iterations, since in the preconditioning method, 2 samples in the same
+                # minibatch can affect each others' gradients.)
+
+max_param_change=0.2 # max param change per minibatch to use eventually
+                     # (for first epoch we use half this)
+minibatch_size=256   # minibatch size to use eventually
+                     # (for first epoch we use half this)
+
+use_gpu=true # if true, we run on GPU.
+egs_dir=
+max_snr=20
+min_snr=10
+left_padding=1
+right_padding=1
+cleanup=true # if true, remove models from earlier iterations as we go.
+
+# End configuration section.
+
+trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM
+
+echo "$0 $@" # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+if [ $# != 1 ]; then
+  echo "Usage: $0 [opts] <exp-dir>"
+  echo " e.g.: $0 exp/fvector"
+  echo "This script trains the xvector system; see egs/swbd/s5c/local/xvector/train.sh for"
+  echo "example (you have to create the nnet configs and the egs first)."
+  echo ""
+  echo "Main options (for others, see top of script file)"
+  echo "  --config <config-file>                           # config file containing options"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+ echo " --num-epochs <#epochs|10> # Number of epochs of training" + echo " --initial-effective-lrate # effective learning rate at start of training." + echo " --final-effective-lrate # effective learning rate at end of training." + echo " # data, 0.00025 for large data" + echo " --num-jobs-initial # Number of parallel jobs to use for neural net training, at the start." + echo " --num-jobs-final # Number of parallel jobs to use for neural net training, at the end" + echo " --egs-dir # If supplied, overrides /egs as location of egs" + echo " --stage # Used to run a partially-completed training process from somewhere in" + echo " # the middle." + exit 1; +fi + +dir=$1 + +[ -z $egs_dir ] && egs_dir=$dir/egs + +if [ ! -d $egs_dir/info ]; then + echo "$0: expected $egs_dir/info to exist: did you run steps/nnet3/xvector/get_egs.sh first?" + exit 1 +fi +if [ ! -f $dir/configs/final.config ]; then + echo "$0: expected $dir/configs/final.config to exist (e.g. run steps/nnet3/xvector/make_jesus_configs.py first)" + exit 1 +fi + + +num_archives=$(cat $egs_dir/info/num_archives) +num_diagnostic_archives=$(cat $egs_dir/info/num_diagnostic_archives) + + + +[ $num_jobs_initial -gt $num_jobs_final ] && \ + echo "$0: --initial-num-jobs cannot exceed --final-num-jobs" && exit 1; + +[ $num_jobs_final -gt $num_archives ] && \ + echo "$0: --final-num-jobs cannot exceed #archives $num_archives." && exit 1; + +# set num_iters so that as close as possible, we process the data $num_epochs +# times $num_shifts times, times, i.e. $num_iters*$avg_num_jobs) == +# $num_epochs*$num_archives*$num_shifts, where +# avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. +num_archives_to_process=$[$num_epochs*$num_archives*$num_shifts] +num_archives_processed=0 +num_iters=$[($num_archives_to_process*2)/($num_jobs_initial+$num_jobs_final)] + +echo "$0: Will train for $num_epochs epochs = $num_iters iterations" + +if $use_gpu; then + parallel_suffix="" + train_queue_opt="--gpu 1" + parallel_train_opts= + if ! cuda-compiled; then + echo "$0: WARNING: you are running with one thread but you have not compiled" + echo " for CUDA. You may be running a setup optimized for GPUs. If you have" + echo " GPUs and have nvcc installed, go to src/ and do ./configure; make" + exit 1 + fi +else + echo "$0: without using a GPU this will be very slow. nnet3 does not yet support multiple threads." + parallel_train_opts="--use-gpu=no" +fi + +if [ $stage -le -1 ]; then + $cmd $dir/log/nnet_init.log \ + nnet3-init $dir/configs/final.config $dir/0.raw || exit 1 +fi + + +x=0 + +while [ $x -lt $num_iters ]; do + + this_num_jobs=$(perl -e "print int(0.5+$num_jobs_initial+($num_jobs_final-$num_jobs_initial)*$x/$num_iters);") + + ilr=$initial_effective_lrate; flr=$final_effective_lrate; np=$num_archives_processed; nt=$num_archives_to_process; + this_effective_learning_rate=$(perl -e "print ($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt));"); + this_learning_rate=$(perl -e "print ($this_effective_learning_rate*$this_num_jobs);"); + + if [ $stage -le $x ]; then + echo "On iteration $x, learning rate is $this_learning_rate" + raw="nnet3-copy --learning-rate=$this_learning_rate $dir/$x.raw - |" + + if [ $[$x%$diagnostic_period] == 0 ]; then + # Set off jobs doing some diagnostics, in the background. 
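+      # (The diagnostic pipelines below mix noise into the held-out chunks on the
+      # fly: fvector-add-noise-separate pairs each source chunk from the first scp
+      # with noise chunks from the second scp at an SNR between --min-snr and
+      # --max-snr (as the flag names suggest); fvector-get-egs then turns the
+      # padded chunks into egs, and nnet3-merge-egs batches them for
+      # nnet3-xvector-compute-prob.)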
+ $cmd JOB=1:$num_diagnostic_archives $dir/log/compute_prob_valid.$x.JOB.log \ + nnet3-xvector-compute-prob --compute-accuracy=${compute_accuracy} $dir/$x.raw \ + "ark:fvector-add-noise-separate --max-snr=$max_snr --min-snr=$min_snr scp:$egs_dir/valid_diagnostic_egs.JOB.scp scp:$egs_dir/valid_diagnostic_egs.noise.JOB.scp ark:- | fvector-get-egs --left-padding=$left_padding --right-padding=$right_padding ark:- ark:- | nnet3-merge-egs --measure-output-frames=false ark:- ark:- |" & + $cmd JOB=1:$num_diagnostic_archives $dir/log/compute_prob_train.$x.JOB.log \ + nnet3-xvector-compute-prob --compute-accuracy=${compute_accuracy} $dir/$x.raw \ + "ark:fvector-add-noise-separate --max-snr=$max_snr --min-snr=$min_snr scp:$egs_dir/train_diagnostic_egs.JOB.scp scp:$egs_dir/train_diagnostic_egs.noise.JOB.scp ark:- | fvector-get-egs --left-padding=$left_padding --right-padding=$right_padding ark:- ark:- | nnet3-merge-egs --measure-output-frames=false ark:- ark:- |" & + fi + if [ $x -gt 0 ]; then + $cmd $dir/log/progress.$x.log \ + nnet3-info $dir/$x.raw '&&' \ + nnet3-show-progress --use-gpu=no $dir/$[$x-1].raw $dir/$x.raw & + fi + + echo "Training neural net (pass $x)" + + if [ $x -le 1 ]; then + do_average=false # for the first 2 iters, don't do averaging, pick the best. + else + do_average=true + fi + + rm $dir/.error 2>/dev/null + + + ( # this sub-shell is so that when we "wait" below, + # we only wait for the training jobs that we just spawned, + # not the diagnostic jobs that we spawned above. + + # We cannot easily use a single parallel SGE job to do the main training, + # because the computation of which archive and which --frame option + # to use for each job is a little complex, so we spawn each one separately. + # this is no longer true for RNNs as we use do not use the --frame option + # but we use the same script for consistency with FF-DNN code + + for n in $(seq $this_num_jobs); do + k=$[$num_archives_processed + $n - 1]; # k is a zero-based index that we will derive + # the other indexes from. + archive=$[($k%$num_archives)+1]; # work out the 1-based archive index. + frame_shift=$[($k/$num_archives)%$num_shifts]; + + this_max_param_change=$max_param_change + this_minibatch_size=$minibatch_size + # for the first 20 iterations or the first epoch, whichever comes earlier, + # use a smaller minibatch size and max-param-change. + if [ $k -lt $[$num_archives*$num_shifts] ] && [ $x -lt 20 ]; then + # if we're the first epoch, use half the minibatch size and half the + # max-param-change. + this_minibatch_size=$[$minibatch_size/2] + this_max_param_change=$(perl -e "print ($max_param_change / 2.0);") + fi + + $cmd $train_queue_opt $dir/log/train.$x.$n.log \ + fvector-add-noise-separate --max-snr=$max_snr --min-snr=$min_snr scp:"utils/shuffle_list.pl --srand $k $egs_dir/egs.$archive.scp |" scp:"utils/shuffle_list.pl --srand $k $egs_dir/egs.noise.$archive.scp |" ark:- \| \ + fvector-get-egs --left-padding=$left_padding --right-padding=$right_padding ark:- ark:- \| \ + nnet3-merge-egs --measure-output-frames=false --minibatch-size=$this_minibatch_size --discard-partial-minibatches=true ark:- ark:- \| \ + nnet3-xvector-train $parallel_train_opts --print-interval=10 --max-param-change=$this_max_param_change "$raw" ark:- $dir/$[$x+1].$n.raw || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: error detected on iteration $x of training" + exit 1 + fi + ) + # the error message below is not that informative, but $cmd will + # have printed a more specific one. 
+    [ -f $dir/.error ] && echo "$0: error on iteration $x of training" && exit 1;
+
+    models_to_average=$(steps/nnet3/get_successful_models.py $this_num_jobs $dir/log/train.$x.%.log)
+    nnets_list=
+    for n in $models_to_average; do
+      nnets_list="$nnets_list $dir/$[$x+1].$n.raw"
+    done
+
+    if $do_average; then
+      # average the output of the different jobs.
+      $cmd $dir/log/average.$x.log \
+        nnet3-average $nnets_list $dir/$[$x+1].raw || exit 1;
+    else
+      # choose the best from the different jobs.
+      n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) {
+          $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn";
+          undef $logprob; while (<F>) { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } }
+          close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob;
+          $best_n=$n; } } print "$best_n\n"; ' $this_num_jobs $dir/log/train.$x.%d.log) || exit 1;
+      [ -z "$n" ] && echo "Error getting best model" && exit 1;
+      $cmd $dir/log/select.$x.log \
+        cp $dir/$[$x+1].$n.raw $dir/$[$x+1].raw || exit 1;
+    fi
+
+    nnets_list=
+    for n in `seq 1 $this_num_jobs`; do
+      nnets_list="$nnets_list $dir/$[$x+1].$n.raw"
+    done
+
+    rm $nnets_list
+    [ ! -f $dir/$[$x+1].raw ] && exit 1;
+    if [ -f $dir/$[$x-1].raw ] && $cleanup && \
+       [ $[($x-1)%100] -ne 0 ]; then
+      rm $dir/$[$x-1].raw
+    fi
+  fi
+  rm $dir/cache.$x 2>/dev/null
+  x=$[$x+1]
+  num_archives_processed=$[$num_archives_processed+$this_num_jobs]
+done
+
+
+cp $dir/$x.raw $dir/final.raw
+
+# don't bother with combination for now - it makes very little difference.
+
+sleep 2
+
+echo Done
diff --git a/src/Makefile b/src/Makefile
index 6dfd146e3d5..f58e69408a3 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -9,13 +9,15 @@ SUBDIRS = base matrix util feat tree gmm transform \
         fstext hmm lm decoder lat kws cudamatrix nnet \
         bin fstbin gmmbin fgmmbin featbin \
         nnetbin latbin sgmm2 sgmm2bin nnet2 nnet3 rnnlm chain nnet3bin nnet2bin kwsbin \
-        ivector ivectorbin online2 online2bin lmbin chainbin rnnlmbin
+        ivector ivectorbin online2 online2bin lmbin chainbin rnnlmbin \
+        xvector xvectorbin fvector fvectorbin
 
 MEMTESTDIRS = base matrix util feat tree gmm transform \
         fstext hmm lm decoder lat nnet kws chain \
         bin fstbin gmmbin fgmmbin featbin \
         nnetbin latbin sgmm2 nnet2 nnet3 rnnlm nnet2bin nnet3bin sgmm2bin kwsbin \
-        ivector ivectorbin online2 online2bin lmbin
+        ivector ivectorbin online2 online2bin lmbin \
+        xvector xvectorbin fvector fvectorbin
 
 CUDAMEMTESTDIR = cudamatrix
 
@@ -149,9 +151,10 @@ $(EXT_SUBDIRS) : mklibdir ext_depend
 ### Dependency list ###
 # this is necessary for correct parallel compilation
 #1)The tools depend on all the libraries
-bin fstbin gmmbin fgmmbin sgmm2bin featbin nnetbin nnet2bin nnet3bin chainbin latbin ivectorbin lmbin kwsbin online2bin rnnlmbin: \
+bin fstbin gmmbin fgmmbin sgmm2bin featbin nnetbin nnet2bin nnet3bin chainbin latbin ivectorbin lmbin kwsbin online2bin rnnlmbin xvectorbin fvectorbin: \
 base matrix util feat tree gmm transform sgmm2 fstext hmm \
-lm decoder lat cudamatrix nnet nnet2 nnet3 ivector chain kws online2 rnnlm
+lm decoder lat cudamatrix nnet nnet2 nnet3 ivector chain kws online2 rnnlm \
+xvector fvector
 
 #2)The libraries have inter-dependencies
 base: base/.depend.mk
@@ -174,6 +177,8 @@ nnet3: base util matrix lat gmm hmm tree transform cudamatrix chain fstext
 rnnlm: base util matrix cudamatrix nnet3 lm hmm
 chain: lat hmm tree fstext matrix cudamatrix util base
 ivector: base util matrix transform tree gmm
+xvector: base util matrix cudamatrix nnet3
+fvector: base util matrix cudamatrix nnet3
 
 #3)Dependencies for optional parts of Kaldi
 onlinebin: base matrix util feat tree gmm transform sgmm2 fstext hmm lm decoder lat cudamatrix nnet nnet2 online
 # python-kaldi-decoding: base matrix util feat tree gmm transform sgmm2 fstext hmm decoder lat online
diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h
index 6b99a77e73b..232b274a344 100644
--- a/src/cudamatrix/cu-kernels-ansi.h
+++ b/src/cudamatrix/cu-kernels-ansi.h
@@ -789,6 +789,15 @@ void cuda_uncompress_uint8(dim3 Gr, dim3 Bl, BaseFloat *dest, MatrixDim dim,
                            const uint8_t *src, int src_stride, float scale);
 
+/// For Xvector
+void cudaD_compute_xvector_objf(dim3 Gr, dim3 Bl, const double *scores,
+                                MatrixDim scores_dim, double *objf_terms,
+                                MatrixDim objf_dim, double *objf_derivs,
+                                MatrixDim derivs_dim);
+void cudaF_compute_xvector_objf(dim3 Gr, dim3 Bl, const float *scores,
+                                MatrixDim scores_dim, float *objf_terms,
+                                MatrixDim objf_dim, float *objf_derivs,
+                                MatrixDim derivs_dim);
 
 } // extern "C"
diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index 934a860a055..b31bd92d760 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -5445,3 +5445,43 @@ void cuda_uncompress_int16(dim3 Gr, dim3 Bl, BaseFloat *dest, MatrixDim dim,
                            const int16_t *src, int src_stride, float scale) {
   _cuda_uncompress<<<Gr, Bl>>>(dest, dim, src, src_stride, scale);
 }
+/// For Xvector
+template<typename Real>
+__global__
+static void _compute_xvector_objf(const Real* scores, MatrixDim scores_dim,
+                                  Real* objf_terms, MatrixDim objf_dim,
+                                  Real* objf_derivs, MatrixDim derivs_dim) {
+  int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x;
+  int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y;
+  int32_cuda scores_index = i + j * scores_dim.stride;
+  int32_cuda objf_index = i + j * objf_dim.stride;
+  int32_cuda derivs_index = i + j * derivs_dim.stride;
+  Real K = 1.0 / (scores_dim.rows - 2.0);
+  if (i < scores_dim.cols && j < scores_dim.rows) {
+    // Only read 'scores' once we know this thread is inside the matrix.
+    Real L = scores[scores_index];
+    if (i + 1 == j && i % 2 == 0) {
+      objf_terms[objf_index] = L < -15 ? L : -log(1.0 + exp(-L));
+      objf_derivs[derivs_index] = L > 15 ? 0.0 : 1.0 / (1.0 + exp(L));
+    } else if (i < j) {
+      objf_terms[objf_index] = K * (L > 15 ? -L : -log(1.0 + exp(L)));
+      objf_derivs[derivs_index] = L < -15 ? 0 : -K / (1.0 + exp(-L));
+void cudaD_compute_xvector_objf(dim3 Gr, dim3 Bl, const double *scores,
+                                MatrixDim scores_dim, double *objf_terms,
+                                MatrixDim objf_dim, double *objf_derivs,
+                                MatrixDim derivs_dim) {
+  _compute_xvector_objf<<<Gr, Bl>>>(scores, scores_dim, objf_terms, objf_dim,
+                                    objf_derivs, derivs_dim);
+}
+void cudaF_compute_xvector_objf(dim3 Gr, dim3 Bl, const float *scores,
+                                MatrixDim scores_dim, float *objf_terms,
+                                MatrixDim objf_dim, float *objf_derivs,
+                                MatrixDim derivs_dim) {
+  _compute_xvector_objf<<<Gr, Bl>>>(scores, scores_dim, objf_terms, objf_dim,
+                                    objf_derivs, derivs_dim);
+}
diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h
index 8f719a8c4a1..27804c9339f 100644
--- a/src/cudamatrix/cu-kernels.h
+++ b/src/cudamatrix/cu-kernels.h
@@ -1546,6 +1546,21 @@ inline void cuda_mat_uncompress(dim3 Gr, dim3 Bl, BaseFloat *dest,
                                 int src_stride, float scale) {
   cuda_uncompress_uint16(Gr, Bl, dest, dim, src, src_stride, scale);
 }
+/// For Xvector
+inline void cuda_compute_xvector_objf(dim3 Gr, dim3 Bl, const float *scores,
+                                      MatrixDim scores_dim, float *objf_terms,
+                                      MatrixDim objf_dim, float *objf_derivs,
+                                      MatrixDim derivs_dim) {
+  cudaF_compute_xvector_objf(Gr, Bl, scores, scores_dim, objf_terms, objf_dim,
+                             objf_derivs, derivs_dim);
+}
+inline void cuda_compute_xvector_objf(dim3 Gr, dim3 Bl, const double *scores,
+                                      MatrixDim scores_dim, double *objf_terms,
+                                      MatrixDim objf_dim, double *objf_derivs,
+                                      MatrixDim derivs_dim) {
+  cudaD_compute_xvector_objf(Gr, Bl, scores, scores_dim, objf_terms, objf_dim,
+                             objf_derivs, derivs_dim);
+}
 
 } // namespace kaldi
 
diff --git a/src/cudamatrix/cu-math.cc b/src/cudamatrix/cu-math.cc
index 3fbeff3a470..17e1864ec63 100644
--- a/src/cudamatrix/cu-math.cc
+++ b/src/cudamatrix/cu-math.cc
@@ -991,6 +991,49 @@ void BackpropLstmNonlinearity(const CuMatrixBase<double> &input,
                               CuMatrixBase<double> *value_sum_out,
                               CuMatrixBase<double> *deriv_sum_out,
                               CuMatrixBase<double> *self_repair_sum_out);
 
+// For Xvector
+void ComputeXvectorObjfFromScores(const CuMatrixBase<BaseFloat> &scores,
+                                  CuMatrixBase<BaseFloat> *objf_terms,
+                                  CuMatrixBase<BaseFloat> *objf_derivs) {
+  KALDI_ASSERT(SameDim(*objf_terms, *objf_derivs) &&
+               SameDim(*objf_terms, scores) &&
+               scores.NumRows() == scores.NumCols());
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) {
+    CuTimer tim;
+    dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
+    dim3 dimGrid(n_blocks(scores.NumCols(), CU2DBLOCK),
+                 n_blocks(scores.NumRows(), CU2DBLOCK));
+
+    cuda_compute_xvector_objf(dimGrid, dimBlock, scores.Data(), scores.Dim(),
+                              objf_terms->Data(), objf_terms->Dim(),
+                              objf_derivs->Data(), objf_derivs->Dim());
+    CU_SAFE_CALL(cudaGetLastError());
+
+    CuDevice::Instantiate().AccuProfile(__func__, tim);
+  } else
+#endif
+  {
+    // Compute the xvector objective function and its derivatives on the CPU.
+    int32 num_rows = scores.NumRows();
+    BaseFloat K = 1.0 / (num_rows - 2.0);
+    for (int32 i = 0; i < num_rows; i++) {
+      for (int32 j = 0; j < num_rows; j++) {
+        BaseFloat L = scores(i, j);
+        if (i + 1 == j && i % 2 == 0) {
+          (*objf_terms)(i, j) = L < -15 ? L : -log(1.0 + exp(-L));
+          (*objf_derivs)(i, j) = L > 15 ? 0.0 : 1.0 / (1.0 + exp(L));
+        } else if (i < j) {
+          (*objf_terms)(i, j) = K * (L > 15 ? -L : -log(1.0 + exp(L)));
+          (*objf_derivs)(i, j) = L < -15 ? 0 : -K / (1.0 + exp(-L));
+        } else {
+          (*objf_terms)(i, j) = 0;
+          (*objf_derivs)(i, j) = 0;
+        }
+      }
+    }
+  }
+}
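To make the pairing convention concrete, here is a minimal CPU-only sketch of the same objective math on a toy 4x4 score matrix, in plain C++ rather than Kaldi's matrix types; the score values are illustrative only.

    #include <cmath>
    #include <cstdio>

    // Same rule as the kernel: rows (2k, 2k+1) are same-class pairs; only the
    // upper triangle (i < j) is scored; K down-weights the "different" pairs.
    int main() {
      const int n = 4;
      double scores[4][4] = {{ 0.0,  2.0, -1.0,  0.5},
                             { 2.0,  0.0,  0.3, -0.7},
                             {-1.0,  0.3,  0.0,  1.5},
                             { 0.5, -0.7,  1.5,  0.0}};
      double K = 1.0 / (n - 2.0), objf = 0.0;
      for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
          double L = scores[i][j];
          if (i + 1 == j && i % 2 == 0)   // (0,1) and (2,3): same class
            objf += -log(1.0 + exp(-L));
          else if (i < j)                 // e.g. (0,2), (1,3): different class
            objf += K * -log(1.0 + exp(L));
        }
      }
      std::printf("objf for this minibatch: %f\n", objf);
      return 0;
    }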
diff --git a/src/cudamatrix/cu-math.h b/src/cudamatrix/cu-math.h
index df533dd73ff..3ec4a693e6f 100644
--- a/src/cudamatrix/cu-math.h
+++ b/src/cudamatrix/cu-math.h
@@ -297,6 +297,37 @@ void DiffNormalizePerRow(const CuMatrixBase<Real> &in_value,
                          const CuMatrixBase<Real> &out_deriv,
                          const Real target_rms, const bool add_log_stddev,
                          CuMatrixBase<Real>* in_deriv);
 
+// For Xvector
+/*
+  This function is used in computing the objective function and derivatives
+  in xvector training.
+  @param [in] scores  'scores' is a symmetric matrix of scores which are to
+  be interpreted as log-odds (according to the model) of pairs coming from the
+  same class, so scores(i, j) is the model's log p(same/different) for
+  elements i and j of the original minibatch of input. We assume that the data
+  in 'scores' has been arranged in such a way that pairs of indexes of the form
+  (2k, 2k+1), e.g. (0, 1), (2, 3), (4, 5), etc., are from the same class, but
+  indexes of any other form, such as (0, 2), (1, 2), etc., are from different
+  classes.
+  @param [out] objf_terms  'objf_terms' is a matrix of the same dimension as
+  'scores' whose elements we will sum to get the objective function for this
+  minibatch. This function computes the appropriate contributions to the
+  objective function, as follows (only the upper triangle i < j is scored):
+    if j == i + 1 and i is even (i.e. (i, j) is a same-class pair (2k, 2k+1)):
+      objf_terms(i, j) = log(p(same))
+                       = -log(1 + exp(-scores(i, j)))
+    else if i < j:
+      objf_terms(i, j) = 1 / (scores.NumRows() - 2) * log(p(different))
+                       = -1 / (scores.NumRows() - 2) * log(1 + exp(scores(i, j)))
+    else:
+      objf_terms(i, j) = 0
+  @param [out] objf_derivs  Element (i, j) of this matrix is the derivative
+  of objf_terms(i, j) with respect to scores(i, j).
+*/
+void ComputeXvectorObjfFromScores(const CuMatrixBase<BaseFloat> &scores,
+                                  CuMatrixBase<BaseFloat> *objf_terms,
+                                  CuMatrixBase<BaseFloat> *objf_derivs);
+
 } // namespace cu
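A plausible way a training computation would consume this function, as a sketch built only on the declarations in this patch; the XvectorObjf wrapper name is an assumption, not code from the patch.

    #include "cudamatrix/cu-matrix.h"
    #include "cudamatrix/cu-math.h"

    namespace kaldi {
    // Sums the per-pair terms into a single minibatch objective; 'derivs' can
    // then be backpropagated through whatever produced 'scores'.
    BaseFloat XvectorObjf(const CuMatrixBase<BaseFloat> &scores,
                          CuMatrix<BaseFloat> *derivs) {
      CuMatrix<BaseFloat> terms(scores.NumRows(), scores.NumCols());
      derivs->Resize(scores.NumRows(), scores.NumCols());
      cu::ComputeXvectorObjfFromScores(scores, &terms, derivs);
      return terms.Sum();
    }
    }  // namespace kaldi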
+*/ +void ComputeXvectorObjfFromScores(const CuMatrixBase &scores, + CuMatrixBase *objf_terms, + CuMatrixBase *objf_derivs); + } // namespace cu diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index 64f8afe0616..972e3b03b73 100644 --- a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -194,6 +194,15 @@ void CuPackedMatrix::CopyToPacked(PackedMatrix *dst) const { } } +//For Xvector +template +void CuPackedMatrix::CopyFromVec(const CuVectorBase &vec) { + MatrixIndexT size = (NumRows() * (NumRows() + 1)) / 2; + KALDI_ASSERT(vec.Dim() == size); + CuSubVector this_as_vec(data_, (num_rows_ * (num_rows_ + 1)) / 2); + this_as_vec.CopyFromVec(vec); +} + /* template void CuPackedMatrix::CopyRowsFromPacked(int32 r, const CuPackedMatrix &src, int32 src_ro, int32 dst_ro) { diff --git a/src/cudamatrix/cu-packed-matrix.h b/src/cudamatrix/cu-packed-matrix.h index 0131ba6c101..499506c694c 100644 --- a/src/cudamatrix/cu-packed-matrix.h +++ b/src/cudamatrix/cu-packed-matrix.h @@ -99,6 +99,9 @@ class CuPackedMatrix { void CopyFromPacked(const PackedMatrix &src); void CopyToPacked(PackedMatrix *dst) const; + // For Xvector + void CopyFromVec(const CuVectorBase &vec); + void Read(std::istream &in, bool binary); void Write(std::ostream &out, bool binary) const; diff --git a/src/fvector/Makefile b/src/fvector/Makefile new file mode 100644 index 00000000000..882336a20c1 --- /dev/null +++ b/src/fvector/Makefile @@ -0,0 +1,19 @@ + +all: + +OPENFST_CXXFLAGS = +OPENFST_LDLIBS = +include ../kaldi.mk + +TESTFILES = fvector-perturb-test + +OBJFILES = fvector-perturb.o + +LIBNAME = kaldi-fvector + +ADDLIBS = ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \ + ../tree/kaldi-tree.a ../util/kaldi-util.a \ + ../matrix/kaldi-matrix.a ../base/kaldi-base.a ../feat/kaldi-feat.a + + +include ../makefiles/default_rules.mk diff --git a/src/fvector/fvector-perturb-test.cc b/src/fvector/fvector-perturb-test.cc new file mode 100644 index 00000000000..3e12797aba2 --- /dev/null +++ b/src/fvector/fvector-perturb-test.cc @@ -0,0 +1,86 @@ +// fvector/fvector-perturb-test.cc + +#include +#include +#include "fvector/fvector-perturb.h" +#include "feat/wave-reader.h" + +using namespace kaldi; + +static void UnitTestSpeedPerturb() { + std::cout << "=== UnitTestSpeedPerturb ===" << std::endl; + Vector input, output; + BaseFloat sample_freq; + { + std::ifstream is("./test_data/test.wav", std::ios_base::binary); + WaveData wave; + wave.Read(is); + const Matrix data(wave.Data()); + KALDI_ASSERT(data.NumRows() == 1); + input.Resize(data.NumCols()); + input.CopyFromVec(data.Row(0)); + sample_freq = wave.SampFreq(); + } + BaseFloat speed_factor = 1.2; + FvectorPerturbOptions opts; + FvectorPerturb perturb(opts); + output.Resize(static_cast(ceil(input.Dim()/speed_factor))); + perturb.SpeedPerturbation(input, sample_freq, speed_factor, &output); + { + std::ofstream os("./test_data/test_speedperturbed.wav.txt", std::ios::out); + output.Write(os, false); + } + std::cout << "With fvector class, The dim of input is: " << input.Dim() << std::endl; + std::cout << "With fvector class, The dim of output is: " << output.Dim() << std::endl; + // Write the perturbed data into wav format + { + Matrix output_matrix(1, output.Dim()); + output_matrix.CopyRowFromVec(output, 0); + WaveData perturbed_wave(sample_freq, output_matrix); + std::ofstream os("./test_data/test_speedperturbed.wav", std::ios::out); + perturbed_wave.Write(os); + } + // print the wav data which is dealed by sox. 
diff --git a/src/fvector/Makefile b/src/fvector/Makefile
new file mode 100644
index 00000000000..882336a20c1
--- /dev/null
+++ b/src/fvector/Makefile
@@ -0,0 +1,19 @@
+
+all:
+
+OPENFST_CXXFLAGS =
+OPENFST_LDLIBS =
+include ../kaldi.mk
+
+TESTFILES = fvector-perturb-test
+
+OBJFILES = fvector-perturb.o
+
+LIBNAME = kaldi-fvector
+
+ADDLIBS = ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \
+          ../tree/kaldi-tree.a ../util/kaldi-util.a \
+          ../matrix/kaldi-matrix.a ../base/kaldi-base.a ../feat/kaldi-feat.a
+
+
+include ../makefiles/default_rules.mk
diff --git a/src/fvector/fvector-perturb-test.cc b/src/fvector/fvector-perturb-test.cc
new file mode 100644
index 00000000000..3e12797aba2
--- /dev/null
+++ b/src/fvector/fvector-perturb-test.cc
@@ -0,0 +1,86 @@
+// fvector/fvector-perturb-test.cc
+
+#include <iostream>
+#include <fstream>
+#include "fvector/fvector-perturb.h"
+#include "feat/wave-reader.h"
+
+using namespace kaldi;
+
+static void UnitTestSpeedPerturb() {
+  std::cout << "=== UnitTestSpeedPerturb ===" << std::endl;
+  Vector<BaseFloat> input, output;
+  BaseFloat sample_freq;
+  {
+    std::ifstream is("./test_data/test.wav", std::ios_base::binary);
+    WaveData wave;
+    wave.Read(is);
+    const Matrix<BaseFloat> data(wave.Data());
+    KALDI_ASSERT(data.NumRows() == 1);
+    input.Resize(data.NumCols());
+    input.CopyFromVec(data.Row(0));
+    sample_freq = wave.SampFreq();
+  }
+  BaseFloat speed_factor = 1.2;
+  FvectorPerturbOptions opts;
+  FvectorPerturb perturb(opts);
+  output.Resize(static_cast<MatrixIndexT>(ceil(input.Dim() / speed_factor)));
+  perturb.SpeedPerturbation(input, sample_freq, speed_factor, &output);
+  {
+    std::ofstream os("./test_data/test_speedperturbed.wav.txt", std::ios::out);
+    output.Write(os, false);
+  }
+  std::cout << "With the fvector class, the dim of input is: " << input.Dim() << std::endl;
+  std::cout << "With the fvector class, the dim of output is: " << output.Dim() << std::endl;
+  // Write the perturbed data in wav format.
+  {
+    Matrix<BaseFloat> output_matrix(1, output.Dim());
+    output_matrix.CopyRowFromVec(output, 0);
+    WaveData perturbed_wave(sample_freq, output_matrix);
+    std::ofstream os("./test_data/test_speedperturbed.wav", std::ios::out);
+    perturbed_wave.Write(os);
+  }
+  // Print the wav data that was processed by sox.
+  // Command: sox -t wav test.wav -t wav test_speed12.wav speed 1.2
+  Vector<BaseFloat> sox_input;
+  BaseFloat sox_sample_freq;
+  {
+    std::ifstream is("./test_data/test_speed12.wav", std::ios_base::binary);
+    WaveData wave;
+    wave.Read(is);
+    const Matrix<BaseFloat> data(wave.Data());
+    KALDI_ASSERT(data.NumRows() == 1);
+    sox_input.Resize(data.NumCols());
+    sox_input.CopyFromVec(data.Row(0));
+    sox_sample_freq = wave.SampFreq();
+    std::ofstream os("./test_data/test_sox.wav.txt", std::ios::out);
+    sox_input.Write(os, false);
+  }
+  KALDI_ASSERT(sample_freq == sox_sample_freq);
+  if (output.ApproxEqual(sox_input, 0.01)) {
+    std::cout << "Equal" << std::endl;
+  } else {
+    std::cout << "Not Equal" << std::endl;
+    BaseFloat prod_output = VecVec(output, output);
+    BaseFloat prod_sox = VecVec(sox_input, sox_input);
+    BaseFloat cross_prod = VecVec(output, sox_input);
+    std::cout << "The cosine distance is: "
+              << cross_prod / (sqrt(prod_output) * sqrt(prod_sox))
+              << std::endl;
+  }
+  std::cout << "=== UnitTestSpeedPerturb finish ===" << std::endl;
+}
+
+static void UnitTestFvectorPerturb() {
+  UnitTestSpeedPerturb();
+}
+
+int main() {
+  try {
+    UnitTestFvectorPerturb();
+    std::cout << "Tests succeeded." << std::endl;
+  } catch (const std::exception &e) {
+    std::cerr << e.what();
+    return 1;
+  }
+}
diff --git a/src/fvector/fvector-perturb.cc b/src/fvector/fvector-perturb.cc
new file mode 100644
index 00000000000..9ed6fb2b9c7
--- /dev/null
+++ b/src/fvector/fvector-perturb.cc
@@ -0,0 +1,289 @@
+#include "fvector/fvector-perturb.h"
+
+namespace kaldi {
+
+void FvectorPerturb::ApplyPerturbation(const MatrixBase<BaseFloat>& input_chunk,
+                                       Matrix<BaseFloat>* perturbed_chunk) {
+  // The original_dim_matrix is a matrix whose dimension is the same as
+  // input_chunk. Assuming sample_frequency = 8kHz and an original length of
+  // 120ms, it will be a (4, 960) matrix.
+  Matrix<BaseFloat> original_dim_matrix(input_chunk);
+  // First, we add additive noise with some probability.
+  AddNoise(opts_.add_noise, &original_dim_matrix);
+  // We do Resize() here, because Resize() belongs to Matrix<> rather than
+  // MatrixBase<>.
+  original_dim_matrix.Resize(2, original_dim_matrix.NumCols(), kCopyData);
+  KALDI_ASSERT(original_dim_matrix.NumRows() == 2);
+  // After AddNoise(), the shape of original_dim_matrix is (2, original_dim).
+  if (opts_.volume_perturbation) {
+    VolumePerturbation(&original_dim_matrix);
+  }
+  // The expected_dim_matrix is a matrix of size (original_dim_matrix.NumRows(),
+  // expected-chunk-length * sample_frequency / 1000), e.g. a (2, 800) matrix.
+  Matrix<BaseFloat> expected_dim_matrix(original_dim_matrix.NumRows(),
+      opts_.expected_chunk_length * opts_.sample_frequency / 1000);
+  if (opts_.speed_perturbation) {
+    //1. generate the speed perturb factor randomly for each line (notice: the
+    //expected_length is always smaller than the original_length).
+    //(1) a = min{original_length/expected_length - 1, max-speed-perturb-rate}
+    //(2) the range of the factor is (1-a, 1+a)
+    BaseFloat boundary = std::min(static_cast<BaseFloat>(
+        (original_dim_matrix.NumCols() * 1.0 / opts_.sample_frequency)
+        * 1000 / opts_.expected_chunk_length - 1), opts_.max_speed_perturb_rate);
+    for (MatrixIndexT i = 0; i < original_dim_matrix.NumRows(); ++i) {
+      // calculate the speed factor
+      BaseFloat factor = static_cast<BaseFloat>(RandInt(
+          (int)((1-boundary)*100), (int)((1+boundary)*100)) * 1.0 / 100.0);
+
+      Vector<BaseFloat> speed_input_vector(original_dim_matrix.Row(i));
+
+      MatrixIndexT speed_output_dim = static_cast<MatrixIndexT>(
+          ceil(original_dim_matrix.NumCols() / factor));
+      KALDI_ASSERT(speed_output_dim >=
+                   opts_.expected_chunk_length * opts_.sample_frequency / 1000);
+      Vector<BaseFloat> speed_output_vector(speed_output_dim);
+
+      SpeedPerturbation(speed_input_vector, opts_.sample_frequency, factor,
+                        &speed_output_vector);
+
+      Vector<BaseFloat> time_shifted_vector(expected_dim_matrix.NumCols());
+      if (opts_.time_shift) {
+        TimeShift(speed_output_vector, &time_shifted_vector);
+      } else {
+        time_shifted_vector.CopyFromVec(
+            speed_output_vector.Range(0, expected_dim_matrix.NumCols()));
+      }
+      expected_dim_matrix.CopyRowFromVec(time_shifted_vector, i);
+    }
+  } else {  // no speed perturbation
+    if (opts_.time_shift) {
+      for (MatrixIndexT i = 0; i < original_dim_matrix.NumRows(); ++i) {
+        Vector<BaseFloat> input_vector(original_dim_matrix.Row(i));
+        Vector<BaseFloat> time_shifted_vector(expected_dim_matrix.NumCols());
+        TimeShift(input_vector, &time_shifted_vector);
+        expected_dim_matrix.CopyRowFromVec(time_shifted_vector, i);
+      }
+    } else {
+      expected_dim_matrix.CopyFromMat(
+          original_dim_matrix.Range(0, expected_dim_matrix.NumRows(),
+                                    0, expected_dim_matrix.NumCols()));
+    }
+  }
+  // Now we operate on the "expected_dim_matrix".
+  perturbed_chunk->Resize(2, expected_dim_matrix.NumCols());
+  MatrixIndexT indices[2] = {0, 1};
+  perturbed_chunk->CopyRows(expected_dim_matrix, indices);
+}
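+// A usage sketch (illustrative, not called anywhere in this patch): the
+// caller assembles a 4-row chunk (S1, S2, N1, N2) and receives back two
+// perturbed copies of the same source signal.
+//
+//   FvectorPerturbOptions opts;            // defaults: 16kHz, 100ms output
+//   opts.sample_frequency = 8000;
+//   FvectorPerturb perturb(opts);
+//   Matrix<BaseFloat> chunk(4, 960);       // 120ms at 8kHz per row
+//   chunk.SetRandn();                      // stand-in for real wave data
+//   Matrix<BaseFloat> perturbed;
+//   perturb.ApplyPerturbation(chunk, &perturbed);
+//   // perturbed is now (2, 800): two 100ms variants of the same source.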
+
+void FvectorPerturb::VolumePerturbation(MatrixBase<BaseFloat>* chunk) {
+  //1. randomly generate 2 numbers from (1-max-volume-variance, 1+max-volume-variance)
+  std::vector<BaseFloat> volume_factors;
+  for (MatrixIndexT i = 0; i < chunk->NumRows(); ++i) {
+    BaseFloat factor = static_cast<BaseFloat>(
+        RandInt((int)((1-opts_.max_volume_variance)*100),
+                (int)((1+opts_.max_volume_variance)*100)) / 100.0);
+    volume_factors.push_back(factor);
+  }
+  //2. scale each line respectively.
+  for (MatrixIndexT i = 0; i < chunk->NumRows(); ++i) {
+    chunk->Row(i).Scale(volume_factors[i]);
+  }
+}
+
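+// Worked example of the resampling arithmetic below (illustrative numbers):
+// with samp_freq = 8000 and speed_factor = 1.2, a 960-sample (120ms) input
+// maps to ceil(960 / 1.2) = 800 output samples, and output sample i is taken
+// at time speed_factor * i / samp_freq, i.e. the signal is read 1.2x faster.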
+// We stretch the signal from beginning to end:
+// y(t) = x(s*t) for t = 0,...,n. If s>1, the output will be shorter than the
+// input, which represents speeding up; and vice versa.
+// Use ArbitraryResample to deal with each line.
+//
+// In ArbitraryResample, according to num_zeros and filter_cutoff, it generates
+// the "filter_width". Then each output_sample(t) corresponds to the few
+// input_samples from (t-filter_width) to (t+filter_width), whose positions are
+// stored in "first_index_". And "weights_" will be adjusted by a Hanning
+// window in the function FilterFunc.
+// In brief, you can think of each output sample as a weighted sum of a few
+// input samples.
+void FvectorPerturb::SpeedPerturbation(VectorBase<BaseFloat>& input_vector,
+                                       BaseFloat samp_freq,
+                                       BaseFloat speed_factor,
+                                       VectorBase<BaseFloat>* output_vector) {
+  if (speed_factor == 1.0) {
+    output_vector->CopyFromVec(input_vector);
+  } else {
+    Vector<BaseFloat> in_vec(input_vector),
+        out_vec(output_vector->Dim());
+    int32 input_dim = in_vec.Dim(),
+        output_dim = out_vec.Dim();
+    Vector<BaseFloat> samp_points_secs(output_dim);
+    int32 num_zeros = 4;  // Number of zeros of the sinc function that the window extends out to.
+    // Lowpass frequency that's lower than 95% of the Nyquist.
+    BaseFloat filter_cutoff_hz = samp_freq * 0.475;
+    for (int32 i = 0; i < output_dim; i++) {
+      samp_points_secs(i) = static_cast<BaseFloat>(speed_factor * i / samp_freq);
+    }
+    ArbitraryResample time_resample(input_dim, samp_freq,
+                                    filter_cutoff_hz,
+                                    samp_points_secs,
+                                    num_zeros);
+    time_resample.Resample(in_vec, &out_vec);
+    output_vector->CopyFromVec(out_vec);
+  }
+}
+
+void FvectorPerturb::TimeShift(VectorBase<BaseFloat>& input_vector,
+                               VectorBase<BaseFloat>* output_vector) {
+  //1. generate the start point randomly; its range is
+  //   [0, input_vector.Dim() - output_vector->Dim()]
+  int32 start_point = static_cast<int32>(
+      RandInt(0, input_vector.Dim() - output_vector->Dim()));
+  //2. take the successive expected_chunk_length * sample_frequency data.
+  output_vector->CopyFromVec(input_vector.Range(start_point, output_vector->Dim()));
+}
+
+void FvectorPerturb::AddNoise(BaseFloat probability_threshold,
+                              MatrixBase<BaseFloat>* chunk) {
+  //1. generate 2 SNRs from (min-snr, max-snr)
+  //2. add N1 (line 3) to S1 (line 1) with snr1, with probability;
+  //   add N2 (line 4) to S2 (line 2) with snr2, with probability.
+  for (MatrixIndexT i = 0; i < 2; i++) {
+    BaseFloat probability = static_cast<BaseFloat>(RandInt(0, 100) / 100.0);
+    if (probability <= probability_threshold) {
+      Vector<BaseFloat> source(chunk->Row(i));
+      Vector<BaseFloat> noise(chunk->Row(i+2));
+      BaseFloat source_power = VecVec(source, source) / source.Dim();
+      BaseFloat noise_power = VecVec(noise, noise) / noise.Dim();
+      int32 snr = RandInt(opts_.min_snr, opts_.max_snr);
+      // Use floating-point division here: with integer division, -snr/10
+      // would truncate and distort the target SNR.
+      BaseFloat scale_factor = sqrt(pow(10.0, -snr / 10.0) * source_power / noise_power);
+      //BaseFloat source_energy = VecVec(source, source);
+      //BaseFloat noise_energy = VecVec(noise, noise);
+      // The smaller the value, the greater the snr.
+      //BaseFloat scale_factor = sqrt(source_energy / noise_energy / pow(10.0, snr / 20.0));
+      chunk->Row(i).AddVec(scale_factor, noise);
+    }
+  }
+}
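+// Worked example of the SNR scaling above (illustrative numbers): suppose
+// snr = 10 dB, source_power = 0.04 and noise_power = 0.01. Then
+//   scale_factor = sqrt(10^(-10/10) * 0.04 / 0.01) = sqrt(0.4) ~= 0.632,
+// and the scaled noise has power 0.632^2 * 0.01 = 0.004, so
+//   10 * log10(source_power / scaled_noise_power) = 10 * log10(10) = 10 dB,
+// i.e. the mix comes out at exactly the requested SNR.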
+
+// The following functions belong to the class FvectorPerturbBlock.
+
+void FvectorPerturbBlock::ApplyPerturbationBlock(Matrix<BaseFloat>* perturbed_chunk) {
+  // 1. Add additive noise with probability.
+  AddNoiseBlock(opts_.add_noise, perturbed1, noise1);
+  AddNoiseBlock(opts_.add_noise, perturbed2, noise2);
+
+  // 2. After AddNoise(), conduct volume perturbation.
+  if (opts_.volume_perturbation) {
+    VolumePerturbationBlock(perturbed1);
+    VolumePerturbationBlock(perturbed2);
+  }
+
+  // 3. Conduct the speed perturbation and time shift together. At the end,
+  // the NumCols of each perturbed matrix equals expected_chunk_length (e.g. 100ms).
+  if (opts_.speed_perturbation) {
+    SpeedPerturbationBlock(perturbed1);
+    SpeedPerturbationBlock(perturbed2);
+    if (opts_.time_shift) {
+      TimeShiftBlock(perturbed1);
+      TimeShiftBlock(perturbed2);
+    } else {
+      // Note: divide by 1000 because expected_chunk_length is in milliseconds.
+      int32 output_cols = static_cast<int32>(
+          opts_.expected_chunk_length * opts_.sample_frequency / 1000);
+      KALDI_ASSERT(perturbed1.NumRows() == perturbed2.NumRows());
+      int32 output_rows = perturbed1.NumRows();
+      perturbed1.Resize(output_rows, output_cols, kCopyData);
+      perturbed2.Resize(output_rows, output_cols, kCopyData);
+    }
+  } else {
+    if (opts_.time_shift) {
+      TimeShiftBlock(perturbed1);
+      TimeShiftBlock(perturbed2);
+    } else {
+      int32 output_cols = static_cast<int32>(
+          opts_.expected_chunk_length * opts_.sample_frequency / 1000);
+      KALDI_ASSERT(perturbed1.NumRows() == perturbed2.NumRows());
+      int32 output_rows = perturbed1.NumRows();
+      perturbed1.Resize(output_rows, output_cols, kCopyData);
+      perturbed2.Resize(output_rows, output_cols, kCopyData);
+    }
+  }
+
+  // 4. Finally, compose the two perturbed matrices into one matrix.
+  // Each two consecutive rows come from the same original source signal.
+  KALDI_ASSERT(perturbed1.NumRows() == perturbed2.NumRows());
+  int32 output_rows = perturbed1.NumRows() * 2;
+  KALDI_ASSERT(perturbed1.NumCols() == perturbed2.NumCols());
+  int32 output_cols = perturbed1.NumCols();
+  perturbed_chunk->Resize(output_rows, output_cols);
+  for (MatrixIndexT i = 0; i < output_rows / 2; i++) {
+    perturbed_chunk->Row(2*i).CopyFromVec(perturbed1.Row(i));
+    perturbed_chunk->Row(2*i+1).CopyFromVec(perturbed2.Row(i));
+  }
+}
+
+// For each row of source, we add additive noise to it, with a random SNR,
+// with some probability.
+void FvectorPerturbBlock::AddNoiseBlock(BaseFloat probability_threshold,
+                                        MatrixBase<BaseFloat>& source,
+                                        MatrixBase<BaseFloat>& noise) {
+  KALDI_ASSERT(source.NumRows() == noise.NumRows());
+  for (MatrixIndexT i = 0; i < source.NumRows(); i++) {
+    // Divide by 10000.0 so that 'probability' lies in [0, 1], the same range
+    // as probability_threshold.
+    BaseFloat probability = static_cast<BaseFloat>(RandInt(0, 10000) / 10000.0);
+    if (probability <= probability_threshold) {
+      Vector<BaseFloat> source_signal(source.Row(i));
+      Vector<BaseFloat> noise_signal(noise.Row(i));
+      BaseFloat source_energy = VecVec(source_signal, source_signal);
+      BaseFloat noise_energy = VecVec(noise_signal, noise_signal);
+      // The smaller the value, the greater the snr.
+      int32 snr = RandInt(opts_.min_snr, opts_.max_snr);
+      BaseFloat scale_factor = sqrt(source_energy / noise_energy / pow(10.0, snr / 20.0));
+      source.Row(i).AddVec(scale_factor, noise_signal);
+    }
+  }
+}
+
+// For the whole block, we use a uniform scale factor.
+void FvectorPerturbBlock::VolumePerturbationBlock(MatrixBase<BaseFloat>& block) {
+  BaseFloat factor = static_cast<BaseFloat>(
+      RandInt((int)((1-opts_.max_volume_variance)*100),
+              (int)((1+opts_.max_volume_variance)*100)) / 100.0);
+  block.Scale(factor);
+}
+
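+// Note on the row layout produced by ApplyPerturbationBlock above
+// (illustrative): if perturbed1 holds rows a0, a1 and perturbed2 holds rows
+// b0, b1, the composed output is a0, b0, a1, b1, so rows (0,1) and (2,3) are
+// same-source pairs -- exactly the (2k, 2k+1) convention that
+// cu::ComputeXvectorObjfFromScores assumes for same-class pairs.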
+// It is similar to FvectorPerturb::SpeedPerturbation(); it uses ArbitraryResample.
+// For the whole block, we use one uniform random speed factor.
+void FvectorPerturbBlock::SpeedPerturbationBlock(Matrix<BaseFloat>& block) {
+  //1. generate the speed perturb factor randomly (notice: the expected_length
+  //is always smaller than the original_length).
+  //(1) a = min{original_length/expected_length - 1, max-speed-perturb-rate}
+  //(2) the range of the factor is (1-a, 1+a)
+  BaseFloat boundary = std::min(static_cast<BaseFloat>(
+      (block.NumCols() * 1.0 / opts_.sample_frequency) * 1000
+      / opts_.expected_chunk_length - 1), opts_.max_speed_perturb_rate);
+  // calculate the speed factor
+  BaseFloat speed_factor = static_cast<BaseFloat>(RandInt(
+      (int)((1-boundary)*100), (int)((1+boundary)*100)) * 1.0 / 100.0);
+  MatrixIndexT output_dim = static_cast<MatrixIndexT>(ceil(block.NumCols() / speed_factor));
+  KALDI_ASSERT(output_dim >= opts_.expected_chunk_length * opts_.sample_frequency / 1000);
+
+  if (speed_factor == 1.0) {
+    // leave the block unchanged
+  } else {
+    int32 input_dim = block.NumCols();
+    Vector<BaseFloat> samp_points_secs(output_dim);
+    int32 num_zeros = 4;  // Number of zeros of the sinc function that the window extends out to.
+    // Lowpass frequency that's lower than 95% of the Nyquist.
+    BaseFloat filter_cutoff_hz = opts_.sample_frequency * 0.475;
+    for (int32 i = 0; i < output_dim; i++) {
+      samp_points_secs(i) = static_cast<BaseFloat>(speed_factor * i / opts_.sample_frequency);
+    }
+    ArbitraryResample time_resample(input_dim, opts_.sample_frequency,
+                                    filter_cutoff_hz,
+                                    samp_points_secs,
+                                    num_zeros);
+    Matrix<BaseFloat> tmp_block(block.NumRows(), output_dim);
+    time_resample.Resample(block, &tmp_block);
+    block.Resize(tmp_block.NumRows(), tmp_block.NumCols());
+    block.CopyFromMat(tmp_block);
+  }
+}
+
+// Choose a uniform random start_point in the range
+// [0, block.NumCols() - expected_chunk_length * sample_frequency / 1000], and
+// take the successive expected_chunk_length * sample_frequency / 1000 samples.
+void FvectorPerturbBlock::TimeShiftBlock(Matrix<BaseFloat>& block) {
+  int32 output_cols = static_cast<int32>(
+      opts_.expected_chunk_length * opts_.sample_frequency / 1000);
+  int32 output_rows = block.NumRows();
+  int32 start_point = static_cast<int32>(RandInt(0, block.NumCols() - output_cols));
+  Matrix<BaseFloat> tmp_block(output_rows, output_cols);
+  tmp_block.CopyFromMat(block.Range(0, output_rows, start_point, output_cols));
+  block.Resize(output_rows, output_cols);
+  block.CopyFromMat(tmp_block);
+}
+
+} // end of namespace kaldi
diff --git a/src/fvector/fvector-perturb.h b/src/fvector/fvector-perturb.h
new file mode 100644
index 00000000000..c7dc5674820
--- /dev/null
+++ b/src/fvector/fvector-perturb.h
@@ -0,0 +1,172 @@
+#ifndef KALDI_FVECTOR_PERTURB_H_
+#define KALDI_FVECTOR_PERTURB_H_
+
+#include <string>
+#include <vector>
+#include <algorithm>
+#include <cmath>
+
+#include "base/kaldi-error.h"
+#include "matrix/matrix-lib.h"
+#include "util/common-utils.h"
+
+#include "feat/resample.h"
+#include "matrix/matrix-functions.h"
+
+namespace kaldi {
+
+// Options class for distorting signals in egs.
+struct FvectorPerturbOptions {
+  BaseFloat sample_frequency;
+  BaseFloat expected_chunk_length;
+  BaseFloat max_speed_perturb_rate;
+  BaseFloat max_volume_variance;
+  BaseFloat max_snr;
+  BaseFloat min_snr;
+  bool volume_perturbation;
+  bool speed_perturbation;
+  bool time_shift;
+  BaseFloat add_noise;
+
+  FvectorPerturbOptions(): sample_frequency(16000),
+                           expected_chunk_length(100),
+                           max_speed_perturb_rate(0.1),
+                           max_volume_variance(0.03),
+                           max_snr(20),
+                           min_snr(0),
+                           volume_perturbation(true),
+                           speed_perturbation(true),
+                           time_shift(true),
+                           add_noise(0.85) { }
+
+  void Register(OptionsItf *opts) {
+    opts->Register("sample-frequency", &sample_frequency, "The sample frequency "
+                   "of the wav signal.");
+    opts->Register("expected-chunk-length", &expected_chunk_length, "The "
+                   "expected length of an output chunk, e.g. 100ms: the length "
+                   "of the output will correspond to 100ms. It also "
+                   "affects the speed perturbation: the speed factor is drawn "
+                   "from a range bounded by min{original-length/expected-length - 1, "
+                   "max-speed-perturb-rate}.");
+    opts->Register("max-speed-perturb-rate", &max_speed_perturb_rate,
+                   "Max speed perturbation applied to the matrix. It works together "
+                   "with expected-chunk-length. E.g. 0.1 means we generate the "
+                   "speed_factor randomly from the range (1-a, 1+a), where a = "
+                   "min{original_length/expected_length - 1, 0.1}.");
+    opts->Register("max-volume-variance", &max_volume_variance, "The volume "
+                   "is scaled by a factor drawn randomly from "
+                   "1-max-volume-variance to 1+max-volume-variance.");
+    opts->Register("max-snr", &max_snr, "Specify an upper bound on the Signal-to-Noise "
+                   "Ratio. We scale the noise according to the original signal and the "
+                   "SNR. Normally, it's a non-zero number between -30 and 30.");
+    opts->Register("min-snr", &min_snr, "Specify a lower bound on the Signal-to-Noise "
+                   "Ratio. We scale the noise according to the original signal and the "
+                   "SNR. Normally, it's a non-zero number between -30 and 30.");
+    opts->Register("volume-perturbation", &volume_perturbation, "If true, we will "
+                   "apply variations in volume.");
+    opts->Register("speed-perturbation", &speed_perturbation, "If true, we will "
+                   "apply variations in speed.");
+    opts->Register("time-shift", &time_shift, "If true, we will "
+                   "apply a time shift: randomly select the start point from the "
+                   "range [0, input.NumCols() - expected_chunk_length], and then "
+                   "take the successive 'expected_chunk_length' data. Otherwise, we take "
+                   "the data from the head.");
+    opts->Register("add-noise", &add_noise, "The probability of adding additive noise "
+                   "to a source chunk. E.g. 0.85 means we add noise with 85 percent "
+                   "probability, and leave the chunk as-is with 15 percent probability.");
+  }
+};
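+// Example invocation of a binary that registers these options (the flags are
+// as registered above; the archive names and values are illustrative only):
+//
+//   fvector-add-noise --sample-frequency=8000 --expected-chunk-length=100 \
+//       --max-speed-perturb-rate=0.1 --min-snr=0 --max-snr=20 --add-noise=0.85 \
+//       ark:chunks.ark ark:perturbed.ark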
+
+/* This class is used to apply up to four kinds of perturbation operations to
+ * an fvector chunk. According to the FvectorPerturbOptions, we choose which
+ * of them to apply.
+ * The input is always a Matrix which contains four rows (S1, S2, N1, N2),
+ * where S1 == S2. We then call the different perturbation methods. (For
+ * details, see the comments on FvectorPerturbOptions.)
+ * For details about the four kinds of perturbation operations, please see
+ * the documentation in fvector-perturb.cc.
+ */
+class FvectorPerturb {
+ public:
+  FvectorPerturb(FvectorPerturbOptions opts) { opts_ = opts; }
+  void ApplyPerturbation(const MatrixBase<BaseFloat>& input_chunk,
+                         Matrix<BaseFloat>* perturbed_chunk);
+
+  // Randomly generate 2 scale factors and scale each row respectively.
+  void VolumePerturbation(MatrixBase<BaseFloat>* chunk);
+
+  // Use ArbitraryResample. For each row, randomly generate a speed factor,
+  // then stretch the time axis. As the speed factor differs per row, we deal
+  // with each vector separately. The output dimension is larger than
+  // expected_chunk_length (ms).
+  void SpeedPerturbation(VectorBase<BaseFloat>& input_vector,
+                         BaseFloat samp_freq,
+                         BaseFloat speed_factor,
+                         VectorBase<BaseFloat>* output_vector);
+
+  // Randomly choose an expected_chunk_length (ms) segment.
+  void TimeShift(VectorBase<BaseFloat>& input_vector,
+                 VectorBase<BaseFloat>* output_vector);
+
+  // The input is a matrix containing four consecutive rows, (S1, S2, N1, N2).
+  // Each row is original_chunk_length (ms) long (e.g. 960 dims = 120ms).
+  // Adds N1 to S1 and N2 to S2 with random SNRs, each with probability
+  // probability_threshold. After that, only the first two rows are
+  // meaningful; they represent two perturbed signals from the same source
+  // waveform signal.
+  // After calling this function, you may need to resize the output.
+  // (Notice: Resize() belongs to Matrix<> rather than MatrixBase<>.)
+  void AddNoise(BaseFloat probability_threshold,
+                MatrixBase<BaseFloat>* chunk);
+
+ private:
+  FvectorPerturbOptions opts_;
+};
+
+
+/* This class applies up to four kinds of perturbation operations to an
+ * fvector chunk. According to the FvectorPerturbOptions, we choose which of
+ * them to apply.
+ * It is the block version: it processes a whole matrix at a time. Unlike
+ * class FvectorPerturb, this class applies the perturbation operations to its
+ * private members (perturbed1, perturbed2, noise1 and noise2).
+ * We call the different perturbation methods. (For details, see the comments
+ * on FvectorPerturbOptions.)
+ * For details about the four kinds of perturbation operations, please see
+ * the documentation in fvector-perturb.cc.
+ */
+class FvectorPerturbBlock {
+ public:
+  FvectorPerturbBlock(FvectorPerturbOptions opts,
+                      const MatrixBase<BaseFloat> &source,
+                      const MatrixBase<BaseFloat> &noise1,
+                      const MatrixBase<BaseFloat> &noise2) : opts_(opts),
+      perturbed1(source), perturbed2(source), noise1(noise1), noise2(noise2) {}
+
+  // The interface that applies the different perturbation operations. First,
+  // the function conducts the different perturbation operations; then it
+  // composes the final matrices, perturbed1 and perturbed2, together.
+  void ApplyPerturbationBlock(Matrix<BaseFloat>* perturbed_chunk);
+
+  // The input is two matrices: a source matrix (e.g. perturbed1) and a noise
+  // matrix (e.g. noise1). Each row is original_chunk_length (ms) long
+  // (e.g. 960 dims = 120ms). Adds noise row-by-row with a random SNR, with
+  // probability probability_threshold. After that, the source signal has been
+  // perturbed by the noise signal.
+  void AddNoiseBlock(BaseFloat probability_threshold,
+                     MatrixBase<BaseFloat>& source,
+                     MatrixBase<BaseFloat>& noise);
+
+  // Randomly generate a scale factor and scale the whole matrix.
+  void VolumePerturbationBlock(MatrixBase<BaseFloat>& block);
+
+  // Use ArbitraryResample. Generate one random speed factor for the whole
+  // matrix, then stretch the time axis. The output dimension is larger than
+  // expected_chunk_length (ms).
+  void SpeedPerturbationBlock(Matrix<BaseFloat>& block);
+
+  // Randomly choose an expected_chunk_length (ms) segment.
+ void TimeShiftBlock(Matrix& block); + + private: + FvectorPerturbOptions opts_; + Matrix perturbed1, perturbed2, noise1, noise2; + +}; + +} // end of namespace kaldi +#endif // KALDI_FVECTOR_PERTURB_H_ diff --git a/src/fvectorbin/Makefile b/src/fvectorbin/Makefile new file mode 100644 index 00000000000..ea54a572168 --- /dev/null +++ b/src/fvectorbin/Makefile @@ -0,0 +1,25 @@ + +all: + +EXTRA_CXXFLAGS = -Wno-sign-compare +include ../kaldi.mk + +BINFILES = fvector-add-noise fvector-chunk fvector-get-egs \ + fvector-chunk-block fvector-add-noise-block fvector-get-egs-block \ + compute-wav-to-rawmatrix fvector-debug-write-to-wav \ + fvector-debug-wav-to-vector fvector-debug-check-filter-bank \ + fvector-chunk-separate fvector-add-noise-separate + +OBJFILES = + + + +TESTFILES = + +ADDLIBS = ../lat/kaldi-lat.a ../lm/kaldi-lm.a ../fstext/kaldi-fstext.a \ + ../hmm/kaldi-hmm.a ../tree/kaldi-tree.a ../util/kaldi-util.a \ + ../matrix/kaldi-matrix.a ../fvector/kaldi-fvector.a \ + ../feat/kaldi-feat.a ../base/kaldi-base.a ../nnet3/kaldi-nnet3.a \ + ../cudamatrix/kaldi-cudamatrix.a + +include ../makefiles/default_rules.mk diff --git a/src/fvectorbin/compute-wav-to-rawmatrix.cc b/src/fvectorbin/compute-wav-to-rawmatrix.cc new file mode 100644 index 00000000000..6f4ba1b60c2 --- /dev/null +++ b/src/fvectorbin/compute-wav-to-rawmatrix.cc @@ -0,0 +1,123 @@ +// featbin/compute-mfcc-feats.cc + +// Copyright 2009-2012 Microsoft Corporation +// Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "feat/feature-mfcc.h" +#include "feat/wave-reader.h" + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + const char *usage = + "Convert wav to rawmatrix.\n" + "Usage: compute-wav-to-rawmatrix [options...] \n"; + + // construct all the global objects + ParseOptions po(usage); + FrameExtractionOptions extraction_opts; + int32 channel = -1; + BaseFloat min_duration = 0.0; + // Register the MFCC option struct + extraction_opts.Register(&po); + + // Register the options + po.Register("channel", &channel, "Channel to extract (-1 -> expect mono, " + "0 -> left, 1 -> right)"); + po.Register("min-duration", &min_duration, "Minimum duration of segments " + "to process (in seconds)."); + + po.Read(argc, argv); + + if (po.NumArgs() != 2) { + po.PrintUsage(); + exit(1); + } + + std::string wav_rspecifier = po.GetArg(1); + std::string output_wspecifier = po.GetArg(2); + + SequentialTableReader reader(wav_rspecifier); + BaseFloatMatrixWriter kaldi_writer; // typedef to TableWriter. 
+ + if (!kaldi_writer.Open(output_wspecifier)) { + KALDI_ERR << "Could not initialize output with wspecifier " + << output_wspecifier; + } + + int32 num_utts = 0, num_success = 0; + for (; !reader.Done(); reader.Next()) { + num_utts++; + std::string utt = reader.Key(); + const WaveData &wave_data = reader.Value(); + if (wave_data.Duration() < min_duration) { + KALDI_WARN << "File: " << utt << " is too short (" + << wave_data.Duration() << " sec): producing no output."; + continue; + } + int32 num_chan = wave_data.Data().NumRows(), this_chan = channel; + { // This block works out the channel (0=left, 1=right...) + KALDI_ASSERT(num_chan > 0); // should have been caught in + // reading code if no channels. + if (channel == -1) { + this_chan = 0; + if (num_chan != 1) + KALDI_WARN << "Channel not specified but you have data with " + << num_chan << " channels; defaulting to zero"; + } else { + if (this_chan >= num_chan) { + KALDI_WARN << "File with id " << utt << " has " + << num_chan << " channels but you specified channel " + << channel << ", producing no output."; + continue; + } + } + } + SubVector waveform(wave_data.Data(), this_chan); + Matrix features; + try { + int32 rows_out = NumFrames(waveform.Dim(), extraction_opts); + int32 cols_out = extraction_opts.WindowSize(); + features.Resize(rows_out, cols_out); + for (int32 i = 0; i < rows_out; i++) { + features.CopyRowFromVec( + SubVector(waveform, i*extraction_opts.WindowShift(), + extraction_opts.WindowSize()), i); + } + } catch (...) { + KALDI_WARN << "Failed to compute features for utterance " + << utt; + continue; + } + kaldi_writer.Write(utt, features); + if (num_utts % 10 == 0) + KALDI_LOG << "Processed " << num_utts << " utterances"; + KALDI_VLOG(2) << "Processed features for key " << utt; + num_success++; + } + KALDI_LOG << " Done " << num_success << " out of " << num_utts + << " utterances."; + return (num_success != 0 ? 0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} + diff --git a/src/fvectorbin/fvector-add-noise-block.cc b/src/fvectorbin/fvector-add-noise-block.cc new file mode 100644 index 00000000000..f59babf3078 --- /dev/null +++ b/src/fvectorbin/fvector-add-noise-block.cc @@ -0,0 +1,61 @@ +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "fvector/fvector-perturb.h" + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + const char *usage = + "Perturb the chunk data. Each time the input is a source chunk block and\n" + "two noise chunk block. The two noise blocks are added to source block separately,\n" + "and then we maybe do volume perturbate, speed perturb or time shift.\n" + "At last, the output is a matrix. Each two consecutive rows of the matrix\n" + "come from same source wave, but were used different perturbation method.\n" + "Usage: fvector-add-noise-block [options...] 
" + " \n"; + + // construct all the global objects + ParseOptions po(usage); + FvectorPerturbOptions perturb_opts; + perturb_opts.Register(&po); + + po.Read(argc, argv); + if (po.NumArgs() != 4) { + po.PrintUsage(); + exit(1); + } + + std::string source_chunk_rspecifier = po.GetArg(1), + noise1_chunk_rspecifier = po.GetArg(2), + noise2_chunk_rspecifier = po.GetArg(3), + perturbed_chunk_rspecifier = po.GetArg(4); + + SequentialBaseFloatMatrixReader source_chunk_reader(source_chunk_rspecifier); + RandomAccessBaseFloatMatrixReader noise1_chunk_reader(noise1_chunk_rspecifier); + RandomAccessBaseFloatMatrixReader noise2_chunk_reader(noise2_chunk_rspecifier); + BaseFloatMatrixWriter perturbed_chunk_writer(perturbed_chunk_rspecifier); + + int64 num_read = 0, num_written = 0; + for (; !source_chunk_reader.Done(); source_chunk_reader.Next(), num_read++) { + std::string key = source_chunk_reader.Key(); + // get source and 2 noise matrices. + const Matrix &source_input = source_chunk_reader.Value(); + const Matrix &noise1_input = noise1_chunk_reader.Value(key); + const Matrix &noise2_input = noise2_chunk_reader.Value(key); + + // the class FvectorPerturbBlock conduct the different perturb operation. + FvectorPerturbBlock perturb_fvector_block(perturb_opts, source_input, + noise1_input, noise2_input); + Matrix perturbed_chunk; + perturb_fvector_block.ApplyPerturbationBlock(&perturbed_chunk); + perturbed_chunk_writer.Write(key, perturbed_chunk); + num_written++; + } + KALDI_LOG << " Done " << num_written << " out of " << num_read + << " utterances."; + return (num_written != 0 ? 0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/src/fvectorbin/fvector-add-noise-separate.cc b/src/fvectorbin/fvector-add-noise-separate.cc new file mode 100644 index 00000000000..3d0652ef01e --- /dev/null +++ b/src/fvectorbin/fvector-add-noise-separate.cc @@ -0,0 +1,72 @@ +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "fvector/fvector-perturb.h" + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + const char *usage = + "Perturb the chunk data. We read in one source chunk and two noise chunks separately\n" + "According to the setup, use (0-4) kinds of perturbation opertation, and then each output chunk \n" + "is a 2 consecutive rows of output matrix.\n" + "The two rows come from the same source wavform signal, but now they are different.\n" + "Usage: fvector-add-noise [options...] \n"; + + // construct all the global objects + ParseOptions po(usage); + FvectorPerturbOptions perturb_opts; + perturb_opts.Register(&po); + + po.Read(argc, argv); + if (po.NumArgs() != 3) { + po.PrintUsage(); + exit(1); + } + + std::string chunk_rspecifier = po.GetArg(1), + noise_chunk_rspecifier = po.GetArg(2), + perturbed_chunk_rspecifier = po.GetArg(3); + + SequentialBaseFloatVectorReader chunk_reader(chunk_rspecifier); + SequentialBaseFloatVectorReader noise_chunk_reader(noise_chunk_rspecifier); + BaseFloatMatrixWriter perturbed_chunk_writer(perturbed_chunk_rspecifier); + + int64 num_read = 0, num_written = 0; + for (; !chunk_reader.Done(); chunk_reader.Next(), num_read++) { + // Read 2 noise chunks + if (noise_chunk_reader.Done()) { + KALDI_ERR << "Noise chunk is too short to enough"; + } + const Vector noise1_chunk(noise_chunk_reader.Value()); + noise_chunk_reader.Next(); + const Vector noise2_chunk(noise_chunk_reader.Value()); + noise_chunk_reader.Next(); + + std::string key = chunk_reader.Key(); + // input_chunk has 3 lines. 
+ const Vector &input_chunk = chunk_reader.Value(); + // whole_chunk has 4 lines, it copies the first line and will be operate. + Matrix whole_chunk(4, input_chunk.Dim()); + // For here, we copy the first line. So in the "whole_chunk" the first + // two lines come from the same source wavform signal. And the third/forth + // line is the random noise. + whole_chunk.CopyRowFromVec(input_chunk, 0); + whole_chunk.CopyRowFromVec(input_chunk, 1); + whole_chunk.CopyRowFromVec(noise1_chunk, 2); + whole_chunk.CopyRowFromVec(noise2_chunk, 3); + Matrix perturbed_chunk; + + // the class FvectorPerturb conduct the different perturb operation. + FvectorPerturb perturb_fvector(perturb_opts); + perturb_fvector.ApplyPerturbation(whole_chunk, &perturbed_chunk); + perturbed_chunk_writer.Write(key, perturbed_chunk); + num_written++; + } + KALDI_LOG << " Done " << num_written << " out of " << num_read + << " utterances."; + return (num_written != 0 ? 0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/src/fvectorbin/fvector-add-noise.cc b/src/fvectorbin/fvector-add-noise.cc new file mode 100644 index 00000000000..be5438fea7f --- /dev/null +++ b/src/fvectorbin/fvector-add-noise.cc @@ -0,0 +1,59 @@ +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "fvector/fvector-perturb.h" + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + const char *usage = + "Perturb the chunk data. Each input chunk is a four consecutive rows matrix(S1, S2, N1, N2).\n" + "According to the setup, use (0-4) kinds of perturbation opertation, and then each output chunk \n" + "is a 2 consecutive rows of output matrix.\n" + "The two rows come from the same source wavform signal, but now they are different.\n" + "Usage: fvector-add-noise [options...] \n"; + + // construct all the global objects + ParseOptions po(usage); + FvectorPerturbOptions perturb_opts; + perturb_opts.Register(&po); + + po.Read(argc, argv); + if (po.NumArgs() != 2) { + po.PrintUsage(); + exit(1); + } + + std::string chunk_rspecifier = po.GetArg(1), + perturbed_chunk_rspecifier = po.GetArg(2); + + SequentialBaseFloatMatrixReader chunk_reader(chunk_rspecifier); + BaseFloatMatrixWriter perturbed_chunk_writer(perturbed_chunk_rspecifier); + + int64 num_read = 0, num_written = 0; + for (; !chunk_reader.Done(); chunk_reader.Next(), num_read++) { + std::string key = chunk_reader.Key(); + // input_chunk has 3 lines. + const Matrix &input_chunk = chunk_reader.Value(); + // whole_chunk has 4 lines, it copies the first line and will be operate. + Matrix whole_chunk(4, input_chunk.NumCols()); + // For here, we copy the first line. So in the "whole_chunk" the first + // two lines come from the same source wavform signal. And the third/forth + // line is the random noise. + MatrixIndexT indices[4] = {0, 0, 1, 2}; + whole_chunk.CopyRows(input_chunk, indices); + Matrix perturbed_chunk; + + // the class FvectorPerturb conduct the different perturb operation. + FvectorPerturb perturb_fvector(perturb_opts); + perturb_fvector.ApplyPerturbation(whole_chunk, &perturbed_chunk); + perturbed_chunk_writer.Write(key, perturbed_chunk); + num_written++; + } + KALDI_LOG << " Done " << num_written << " out of " << num_read + << " utterances."; + return (num_written != 0 ? 
0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/src/fvectorbin/fvector-chunk-block.cc b/src/fvectorbin/fvector-chunk-block.cc new file mode 100644 index 00000000000..ab8daa93f0b --- /dev/null +++ b/src/fvectorbin/fvector-chunk-block.cc @@ -0,0 +1,212 @@ +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "feat/wave-reader.h" + +namespace kaldi { +//Randomly select two groups (uttid, startpoint) from noise list, respectively. +//The length of selected utterance is bigger than chunk_size, which guarantees +//we can get a complete noise chunk. At the same time, the startpoint is randomly +//selected from [0, len(utt)-chunk_size]. +void RandomSelectTwoNoiseUtt(const std::vector>& utt2dur_list, + const int32& utt2dur_len, + const int32& chunk_size, + std::vector>* output) { + for(int32 index = 0; index < 2; ++index) { + int32 r_index = -1; + do { + // r_index indicate the random index of utt2dur_list + r_index = RandInt(0, utt2dur_len-1); + } while (utt2dur_list[r_index].second > chunk_size); + // random number in [0, utt2dur] + float start_point = RandInt(0, (int)(utt2dur_list[r_index].second)*100) * 1.0 / 100; + output->push_back(std::make_pair(utt2dur_list[r_index].first, start_point)); + } + KALDI_ASSERT(output->size() == 2); +} + + +} //The end of namespace kaldi + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + const char *usage = + "Get the data chunks. We sequentially read the wav files. And cut them\n" + "into 'chunk_size' length fragment. And we randomly select two 'chunk_size'\n" + "length fragments from noise-list. Then we the store the 'source' chunk and\n" + "'noise' chunks into the corresponding matrix separately." + "Usage: fvector-chunk [options...] " + " \n"; + + // construct all the global objects + ParseOptions po(usage); + int32 chunk_size = 120; + int32 channel = -1; + int32 shift_time = 60; + BaseFloat min_duration = 0.0; + int32 srand_seed = 1; + int32 block_size = 32; + BaseFloat samp_freq = 8000; + + po.Register("channel", &channel, "Channel to extract (-1 -> expect mono, " + "0 -> left, 1 -> right)"); + po.Register("chunk-size", &chunk_size, "The expected length of the chunk."); + po.Register("shift-time", &shift_time, "Time shift, which decide the overlap " + "of two adjacent chunks in the same utterance."); + po.Register("min-duration", &min_duration, "Minimum duration of segments " + "to process (in seconds)."); + po.Register("srand", &srand_seed, "Seed for random number generator."); + po.Register("block-size",&block_size, "Specify the number of lines of feature " + "block; the number of lines of noise block will be twice."); + po.Register("sample-frequency", &samp_freq, "Specify the sample frequency. " + "(default=8000)"); + + po.Read(argc, argv); + + if (po.NumArgs() != 6) { + po.PrintUsage(); + exit(1); + } + + srand(srand_seed); + + std::string wav_rspecifier = po.GetArg(1); + std::string noise_rspecifier = po.GetArg(2); + std::string utt2dur_rxfilename = po.GetArg(3); + std::string output_feature_wspecifier = po.GetArg(4); + std::string output_noise1_wspecifier = po.GetArg(5); + std::string output_noise2_wspecifier = po.GetArg(6); + + + SequentialTableReader reader(wav_rspecifier); + RandomAccessTableReader noise_reader(noise_rspecifier); + Input ki(utt2dur_rxfilename); + BaseFloatMatrixWriter feature_writer; // typedef to TableWriter. 
+ BaseFloatMatrixWriter noise1_writer; + BaseFloatMatrixWriter noise2_writer; + + //Read the utt2dur file + //the vector--utt2dur is used to randomly select the noise chunk. + std::vector> utt2dur; + std::string line; + while (std::getline(ki.Stream(), line)) { + std::vector split_line; + // Split the line by space or tab and check the number of fields in each + // line. There must be 2 fields--segment utt_id and duration + SplitStringToVector(line, " \t\r", true, &split_line); + if (split_line.size() != 2) { + KALDI_WARN << "Invalid line in segments file: " << line; + continue; + } + std::string utt = split_line[0], + duration_str = split_line[1]; + + double duration; + if (!ConvertStringToReal(duration_str, &duration)) { + KALDI_WARN << "Invalid line in utt2dur file: " << line; + continue; + } + utt2dur.push_back(std::make_pair(utt, duration)); + } + //random number in [0, utt2dur_len), so we get variable "utt2dur_len" + int32 utt2dur_len = utt2dur.size(); + + // Start to chunk the data, each source chunk and 2 corresponding noise + // chunks were store into corresping block matrix. When counter == block_size, + // write one source block and two noise blocks. + int32 num_utts = 0, num_success = 0; + int32 counter = 0; + int32 dim = static_cast(samp_freq * chunk_size / 1000); + Matrix feature_block(block_size, dim), + noise_block1(block_size, dim), + noise_block2(block_size, dim); + + for (; !reader.Done(); reader.Next()) { + num_utts++; + std::string utt = reader.Key(); + const WaveData &wave_data = reader.Value(); + if (wave_data.Duration() < min_duration) { + KALDI_WARN << "File: " << utt << " is too short (" + << wave_data.Duration() << " sec): producing no output."; + continue; + } + int32 num_chan = wave_data.Data().NumRows(), this_chan = channel; + { // This block works out the channel (0=left, 1=right...) + KALDI_ASSERT(num_chan > 0); // should have been caught in + // reading code if no channels. + if (channel == -1) { + this_chan = 0; + if (num_chan != 1) + KALDI_WARN << "Channel not specified but you have data with " + << num_chan << " channels; defaulting to zero"; + } else { + if (this_chan >= num_chan) { + KALDI_WARN << "File with id " << utt << " has " + << num_chan << " channels but you specified channel " + << channel << ", producing no output."; + continue; + } + } + } + + KALDI_ASSERT(wave_data.SampFreq() == samp_freq); + SubVector waveform(wave_data.Data(), this_chan); + //e.g. A "waveform" is 285ms, chunk_size is 120ms, shift_time is 70ms. At last, the chunks + //will be 0-120ms, 70-190ms, 140-260ms. So num_chunk = 3 + int32 num_chunk = (int)((waveform.Dim() / wave_data.SampFreq() - chunk_size ) / shift_time) + 1; + try { + for (int32 index = 0; index < num_chunk; ++index) { + int32 source_start = wave_data.SampFreq() * (index * shift_time); + feature_block.CopyRowFromVec(SubVector(waveform, source_start, dim), counter); + //1. Generate 2 random number form [0, utt2dur_len) + //2. From vector utt2dur, get the 2 pairs + //3. Generate 2 random "start point" number from [0, utt2dur[x][1]) + //The three steps is implemented by function--"RandomSelectTwoNoiseUtt" + //The output vector, "two_random_uttid", contains two pairs. For each + //pair, its content is + std::vector> two_random_uttid; + RandomSelectTwoNoiseUtt(utt2dur, utt2dur_len, chunk_size/1000, + &two_random_uttid); + //4. According to the utt2dur[x][0]--utt_id and startpoint form RandomAccessTable + // read noise chunk. + //5. The features matrix has 3 lines: source, nosie1, noise2. 
+ const WaveData &noise_wav1 = noise_reader.Value(two_random_uttid[0].first); + KALDI_ASSERT(wave_data.SampFreq() == noise_wav1.SampFreq()); + SubVector noise1(noise_wav1.Data(), 0); + noise_block1.CopyRowFromVec(SubVector(noise1, two_random_uttid[0].second, dim), counter); + + const WaveData &noise_wav2 = noise_reader.Value(two_random_uttid[1].first); + KALDI_ASSERT(wave_data.SampFreq() == noise_wav2.SampFreq()); + SubVector noise2(noise_wav2.Data(), 0); + noise_block2.CopyRowFromVec(SubVector(noise2, two_random_uttid[1].second, dim), counter); + counter++; + + // when "counter == block_size", store the matrices. + if (counter == block_size) { + std::ostringstream utt_id_new; + utt_id_new << utt << '_' << index; + feature_writer.Write(utt_id_new.str(), feature_block); + noise1_writer.Write(utt_id_new.str(), noise_block1); + noise2_writer.Write(utt_id_new.str(), noise_block2); + counter = 0; + } + } + } catch (...) { + KALDI_WARN << "Failed to compute features for utterance " + << utt; + continue; + } + + if (num_utts % 10 == 0) + KALDI_LOG << "Processed " << num_utts << " utterances"; + KALDI_VLOG(2) << "Processed features for key " << utt; + num_success++; + } + KALDI_LOG << " Done " << num_success << " out of " << num_utts + << " utterances."; + return (num_success != 0 ? 0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/src/fvectorbin/fvector-chunk-separate.cc b/src/fvectorbin/fvector-chunk-separate.cc new file mode 100644 index 00000000000..f9a00f880f3 --- /dev/null +++ b/src/fvectorbin/fvector-chunk-separate.cc @@ -0,0 +1,207 @@ +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "feat/wave-reader.h" + +namespace kaldi { +//Randomly select two groups (uttid, startpoint) from noise list, respectively. +//The length of selected utterance is bigger than chunk_size, which guarantees +//we can get a complete noise chunk. At the same time, the startpoint is randomly +//selected from [0, len(utt)-chunk_size]. +void RandomSelectTwoNoiseUtt(const std::vector>& utt2dur_list, + const int32& utt2dur_len, + const BaseFloat& sample_frequency, + const BaseFloat& chunk_size, + std::vector>* output) { + for(int32 index = 0; index < 2; ++index) { + int32 r_index = -1; + do { + // r_index indicate the random index of utt2dur_list + r_index = RandInt(0, utt2dur_len-1); + } while (utt2dur_list[r_index].second < chunk_size); + // random number in [0, utt2dur] + int boundary = (int)((utt2dur_list[r_index].second - chunk_size) * 1000); + float start_point = RandInt(0, boundary); + output->push_back(std::make_pair(utt2dur_list[r_index].first, start_point)); + } + KALDI_ASSERT(output->size() == 2); +} + + +} //The end of namespace kaldi + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + const char *usage = + "Get the data chunks. We sequentially read the wav files. And cut them\n" + "into 'chunk_size' length fragment. And we randomly select two 'chunk_size'\n" + "length fragments from noise-list. We call the three chunks (S1,N1,N2)\n" + "separately. Then we store the S1 into and sotre the\n" + "(N1,N2) into separately\n" + "Usage: fvector-chunk-separate [options...] 
" + " \n"; + + // construct all the global objects + ParseOptions po(usage); + int32 chunk_size = 120; + int32 channel = -1; + int32 shift_time = 60; + BaseFloat min_duration = 0.0; + int32 srand_seed = 1; + + po.Register("channel", &channel, "Channel to extract (-1 -> expect mono, " + "0 -> left, 1 -> right)"); + po.Register("chunk-size", &chunk_size, "The expected length of the chunk."); + po.Register("shift-time", &shift_time, "Time shift, which decide the overlap " + "of two adjacent chunks in the same utterance."); + po.Register("min-duration", &min_duration, "Minimum duration of segments " + "to process (in seconds)."); + po.Register("srand", &srand_seed, "Seed for random number generator."); + + po.Read(argc, argv); + + if (po.NumArgs() != 5) { + po.PrintUsage(); + exit(1); + } + + srand(srand_seed); + + std::string wav_rspecifier = po.GetArg(1); + std::string noise_rspecifier = po.GetArg(2); + std::string utt2dur_rxfilename = po.GetArg(3); + std::string output_wspecifier = po.GetArg(4); + std::string noise_wspecifier = po.GetArg(5); + + + SequentialTableReader reader(wav_rspecifier); + RandomAccessTableReader noise_reader(noise_rspecifier); + Input ki(utt2dur_rxfilename); + BaseFloatVectorWriter source_writer; // typedef to TableWriter. + BaseFloatVectorWriter noise_writer; + + if (!source_writer.Open(output_wspecifier)) { + KALDI_ERR << "Could not initialize output with wspecifier " + << output_wspecifier; + } + if (!noise_writer.Open(noise_wspecifier)) { + KALDI_ERR << "Could not initialize output with wspecifier " + << noise_wspecifier; + } + //Read the utt2dur file + //the vector--utt2dur is used to randomly select the noise chunk. + std::vector> utt2dur; + std::string line; + while (std::getline(ki.Stream(), line)) { + std::vector split_line; + // Split the line by space or tab and check the number of fields in each + // line. There must be 2 fields--segment utt_id and duration + SplitStringToVector(line, " \t\r", true, &split_line); + if (split_line.size() != 2) { + KALDI_WARN << "Invalid line in segments file: " << line; + continue; + } + std::string utt = split_line[0], + duration_str = split_line[1]; + + double duration; + if (!ConvertStringToReal(duration_str, &duration)) { + KALDI_WARN << "Invalid line in utt2dur file: " << line; + continue; + } + utt2dur.push_back(std::make_pair(utt, duration)); + } + //random number in [0, utt2dur_len), so we get variable "utt2dur_len" + int32 utt2dur_len = utt2dur.size(); + + // Start to chunk the data, compose 1 source chunk and 2 noise chunks into + // a matrix. + int32 num_utts = 0, num_success = 0; + for (; !reader.Done(); reader.Next()) { + num_utts++; + std::string utt = reader.Key(); + const WaveData &wave_data = reader.Value(); + if (wave_data.Duration() < min_duration) { + KALDI_WARN << "File: " << utt << " is too short (" + << wave_data.Duration() << " sec): producing no output."; + continue; + } + int32 num_chan = wave_data.Data().NumRows(), this_chan = channel; + { // This block works out the channel (0=left, 1=right...) + KALDI_ASSERT(num_chan > 0); // should have been caught in + // reading code if no channels. 
+ if (channel == -1) { + this_chan = 0; + if (num_chan != 1) + KALDI_WARN << "Channel not specified but you have data with " + << num_chan << " channels; defaulting to zero"; + } else { + if (this_chan >= num_chan) { + KALDI_WARN << "File with id " << utt << " has " + << num_chan << " channels but you specified channel " + << channel << ", producing no output."; + continue; + } + } + } + SubVector waveform(wave_data.Data(), this_chan); + //e.g. A "waveform" is 285ms, chunk_size is 120ms, shift_time is 70ms. At last, the chunks + //will be 0-120ms, 70-190ms, 140-260ms. So num_chunk = 3 + int32 num_chunk = (int)(((waveform.Dim() * 1.0 / wave_data.SampFreq()) * 1000 - chunk_size ) / shift_time) + 1; + int32 dim = wave_data.SampFreq() * chunk_size / 1000; + try { + for (int32 index = 0; index < num_chunk; ++index) { + Matrix features(3, dim); + int32 source_start = static_cast(wave_data.SampFreq() * (index * shift_time / 1000.0)); + features.CopyRowFromVec(SubVector(waveform, source_start, dim), 0); + //1. Generate 2 random number form [0, utt2dur_len) + //2. From vector utt2dur, get the 2 pairs + //3. Generate 2 random "start point" number from [0, utt2dur[x][1]) + //The three steps is implemented by function--"RandomSelectTwoNoiseUtt" + //The output vector, "two_random_uttid", contains two pairs. For each + //pair, its content is + std::vector> two_random_uttid; + RandomSelectTwoNoiseUtt(utt2dur, utt2dur_len, wave_data.SampFreq(), chunk_size/1000.0, + &two_random_uttid); + //4. According to the utt2dur[x][0]--utt_id and startpoint form RandomAccessTable + // read noise chunk. + //5. The features matrix has 3 lines: source, nosie1, noise2. + const WaveData &noise_wav1 = noise_reader.Value(two_random_uttid[0].first); + KALDI_ASSERT(wave_data.SampFreq() == noise_wav1.SampFreq()); + SubVector noise1(noise_wav1.Data(), 0); + features.CopyRowFromVec(SubVector(noise1, two_random_uttid[0].second, dim), 1); + + const WaveData &noise_wav2 = noise_reader.Value(two_random_uttid[1].first); + KALDI_ASSERT(wave_data.SampFreq() == noise_wav2.SampFreq()); + SubVector noise2(noise_wav2.Data(), 0); + features.CopyRowFromVec(SubVector(noise2, two_random_uttid[1].second, dim), 2); + + std::ostringstream source_id; + source_id << utt << '_' << index << "_s"; + source_writer.Write(source_id.str(), Vector(SubVector(features, 0))); + std::ostringstream noise1_id; + noise1_id << utt << '_' << index << "_n1"; + noise_writer.Write(noise1_id.str(), Vector(SubVector(features, 1))); + std::ostringstream noise2_id; + noise2_id << utt << '_' << index << "_n2"; + noise_writer.Write(noise2_id.str(), Vector(SubVector(features, 2))); + } + } catch (...) { + KALDI_WARN << "Failed to compute features for utterance " + << utt; + continue; + } + + if (num_utts % 10 == 0) + KALDI_LOG << "Processed " << num_utts << " utterances"; + KALDI_VLOG(2) << "Processed features for key " << utt; + num_success++; + } + KALDI_LOG << " Done " << num_success << " out of " << num_utts + << " utterances."; + return (num_success != 0 ? 0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/src/fvectorbin/fvector-chunk.cc b/src/fvectorbin/fvector-chunk.cc new file mode 100644 index 00000000000..ad280cf9577 --- /dev/null +++ b/src/fvectorbin/fvector-chunk.cc @@ -0,0 +1,195 @@ +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "feat/wave-reader.h" + +namespace kaldi { +//Randomly select two groups (uttid, startpoint) from noise list, respectively. 
+// Each selected utterance is longer than chunk_size, which guarantees that a
+// complete noise chunk can be cut from it. The start point is chosen
+// uniformly from [0, len(utt) - chunk_size].
+void RandomSelectTwoNoiseUtt(
+    const std::vector<std::pair<std::string, BaseFloat> > &utt2dur_list,
+    const int32 &utt2dur_len,
+    const BaseFloat &sample_frequency,
+    const BaseFloat &chunk_size,
+    std::vector<std::pair<std::string, int32> > *output) {
+  for (int32 index = 0; index < 2; ++index) {
+    int32 r_index = -1;
+    do {
+      // r_index is a random index into utt2dur_list.
+      r_index = RandInt(0, utt2dur_len - 1);
+    } while (utt2dur_list[r_index].second < chunk_size);
+    // Random start point (in samples) in [0, dur - chunk_size].
+    int32 boundary = static_cast<int32>(
+        (utt2dur_list[r_index].second - chunk_size) * sample_frequency);
+    int32 start_point = RandInt(0, boundary);
+    output->push_back(std::make_pair(utt2dur_list[r_index].first, start_point));
+  }
+  KALDI_ASSERT(output->size() == 2);
+}
+
+
+}  // The end of namespace kaldi.
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    const char *usage =
+        "Get the data chunks. We sequentially read the wav files and cut them\n"
+        "into 'chunk_size'-length fragments; we also randomly select two\n"
+        "'chunk_size'-length fragments from the noise list. Then we compose\n"
+        "the three vectors into a matrix, which we call a 'chunk'. So each\n"
+        "item in the output file is a matrix with 3 rows (S1, N1, N2).\n"
+        "Usage: fvector-chunk [options...] <wav-rspecifier> <noise-rspecifier>\n"
+        "<utt2dur-rxfilename> <chunks-wspecifier>\n";
+
+    // Construct all the global objects.
+    ParseOptions po(usage);
+    int32 chunk_size = 120;
+    int32 channel = -1;
+    int32 shift_time = 60;
+    BaseFloat min_duration = 0.0;
+    int32 srand_seed = 1;
+
+    po.Register("channel", &channel, "Channel to extract (-1 -> expect mono, "
+                "0 -> left, 1 -> right)");
+    po.Register("chunk-size", &chunk_size, "The expected length of the chunk.");
+    po.Register("shift-time", &shift_time, "Time shift, which decides the "
+                "overlap of two adjacent chunks in the same utterance.");
+    po.Register("min-duration", &min_duration, "Minimum duration of segments "
+                "to process (in seconds).");
+    po.Register("srand", &srand_seed, "Seed for random number generator.");
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 4) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    srand(srand_seed);
+
+    std::string wav_rspecifier = po.GetArg(1);
+    std::string noise_rspecifier = po.GetArg(2);
+    std::string utt2dur_rxfilename = po.GetArg(3);
+    std::string output_wspecifier = po.GetArg(4);
+
+    SequentialTableReader<WaveHolder> reader(wav_rspecifier);
+    RandomAccessTableReader<WaveHolder> noise_reader(noise_rspecifier);
+    Input ki(utt2dur_rxfilename);
+    BaseFloatMatrixWriter kaldi_writer;  // typedef to TableWriter.
+
+    if (!kaldi_writer.Open(output_wspecifier)) {
+      KALDI_ERR << "Could not initialize output with wspecifier "
+                << output_wspecifier;
+    }
+    // Read the utt2dur file; the vector utt2dur is used to randomly select
+    // the noise chunks.
+    std::vector<std::pair<std::string, BaseFloat> > utt2dur;
+    std::string line;
+    while (std::getline(ki.Stream(), line)) {
+      std::vector<std::string> split_line;
+      // Split the line by space or tab and check the number of fields in
+      // each line: there must be 2 fields, the utterance id and its duration.
+      SplitStringToVector(line, " \t\r", true, &split_line);
+      if (split_line.size() != 2) {
+        KALDI_WARN << "Invalid line in utt2dur file: " << line;
+        continue;
+      }
+      std::string utt = split_line[0],
+          duration_str = split_line[1];
+
+      double duration;
+      if (!ConvertStringToReal(duration_str, &duration)) {
+        KALDI_WARN << "Invalid line in utt2dur file: " << line;
+        continue;
+      }
+      utt2dur.push_back(std::make_pair(utt, static_cast<BaseFloat>(duration)));
+    }
+    // We will draw random indexes in [0, utt2dur_len), so cache the size.
+    int32 utt2dur_len = utt2dur.size();
+
+    // Start to chunk the data; compose 1 source chunk and 2 noise chunks
+    // into a matrix.
+    int32 num_utts = 0, num_success = 0;
+    for (; !reader.Done(); reader.Next()) {
+      num_utts++;
+      std::string utt = reader.Key();
+      const WaveData &wave_data = reader.Value();
+      if (wave_data.Duration() < min_duration) {
+        KALDI_WARN << "File: " << utt << " is too short ("
+                   << wave_data.Duration() << " sec): producing no output.";
+        continue;
+      }
+      int32 num_chan = wave_data.Data().NumRows(), this_chan = channel;
+      {  // This block works out the channel (0=left, 1=right...)
+        KALDI_ASSERT(num_chan > 0);  // should have been caught in
+                                     // reading code if no channels.
+        if (channel == -1) {
+          this_chan = 0;
+          if (num_chan != 1)
+            KALDI_WARN << "Channel not specified but you have data with "
+                       << num_chan << " channels; defaulting to zero";
+        } else {
+          if (this_chan >= num_chan) {
+            KALDI_WARN << "File with id " << utt << " has "
+                       << num_chan << " channels but you specified channel "
+                       << channel << ", producing no output.";
+            continue;
+          }
+        }
+      }
+      SubVector<BaseFloat> waveform(wave_data.Data(), this_chan);
+      // E.g. if a waveform is 285 ms, chunk_size is 120 ms and shift_time is
+      // 70 ms, the chunks will be 0-120 ms, 70-190 ms and 140-260 ms, so
+      // num_chunk = 3.
+      int32 num_chunk = static_cast<int32>(
+          ((waveform.Dim() * 1.0 / wave_data.SampFreq()) * 1000 - chunk_size)
+          / shift_time) + 1;
+      int32 dim = wave_data.SampFreq() * chunk_size / 1000;
+      try {
+        for (int32 index = 0; index < num_chunk; ++index) {
+          Matrix<BaseFloat> features(3, dim);
+          int32 source_start = static_cast<int32>(
+              wave_data.SampFreq() * (index * shift_time / 1000.0));
+          features.CopyRowFromVec(
+              SubVector<BaseFloat>(waveform, source_start, dim), 0);
+          // 1. Generate 2 random numbers from [0, utt2dur_len).
+          // 2. Look up the corresponding 2 pairs in the vector utt2dur.
+          // 3. Generate 2 random start points from
+          //    [0, utt2dur[x][1] - chunk_size].
+          // These three steps are implemented by RandomSelectTwoNoiseUtt().
+          // The output vector "two_random_uttid" contains two pairs; each
+          // pair is (utt_id, start point in samples).
+          std::vector<std::pair<std::string, int32> > two_random_uttid;
+          RandomSelectTwoNoiseUtt(utt2dur, utt2dur_len, wave_data.SampFreq(),
+                                  chunk_size / 1000.0, &two_random_uttid);
+          // 4. Using utt2dur[x][0] (the utt_id) and the start point, read the
+          //    noise chunk from the RandomAccessTableReader.
+          // 5. The features matrix has 3 rows: source, noise1, noise2.
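+          // A concrete example (hypothetical numbers): with
+          //   utt2dur = { ("noise_a", 3.2), ("noise_b", 0.09) }
+          // and chunk_size = 0.12 s at 8 kHz, "noise_b" is always rejected as
+          // too short, so both draws come from "noise_a"; a possible result is
+          //   two_random_uttid = { ("noise_a", 18400), ("noise_a", 960) },
+          // where the second member is a start offset in samples drawn from
+          //   [0, (3.2 - 0.12) * 8000] = [0, 24640].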
+          const WaveData &noise_wav1 =
+              noise_reader.Value(two_random_uttid[0].first);
+          KALDI_ASSERT(wave_data.SampFreq() == noise_wav1.SampFreq());
+          SubVector<BaseFloat> noise1(noise_wav1.Data(), 0);
+          features.CopyRowFromVec(
+              SubVector<BaseFloat>(noise1, two_random_uttid[0].second, dim), 1);
+
+          const WaveData &noise_wav2 =
+              noise_reader.Value(two_random_uttid[1].first);
+          KALDI_ASSERT(wave_data.SampFreq() == noise_wav2.SampFreq());
+          SubVector<BaseFloat> noise2(noise_wav2.Data(), 0);
+          features.CopyRowFromVec(
+              SubVector<BaseFloat>(noise2, two_random_uttid[1].second, dim), 2);
+
+          std::ostringstream utt_id_new;
+          utt_id_new << utt << '_' << index;
+          kaldi_writer.Write(utt_id_new.str(), features);
+        }
+      } catch (...) {
+        KALDI_WARN << "Failed to compute features for utterance " << utt;
+        continue;
+      }
+
+      if (num_utts % 10 == 0)
+        KALDI_LOG << "Processed " << num_utts << " utterances";
+      KALDI_VLOG(2) << "Processed features for key " << utt;
+      num_success++;
+    }
+    KALDI_LOG << " Done " << num_success << " out of " << num_utts
+              << " utterances.";
+    return (num_success != 0 ? 0 : 1);
+  } catch(const std::exception &e) {
+    std::cerr << e.what();
+    return -1;
+  }
+}
diff --git a/src/fvectorbin/fvector-debug-check-filter-bank.cc b/src/fvectorbin/fvector-debug-check-filter-bank.cc
new file mode 100644
index 00000000000..67a8140d5f9
--- /dev/null
+++ b/src/fvectorbin/fvector-debug-check-filter-bank.cc
@@ -0,0 +1,64 @@
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "fvector/fvector-perturb.h"
+#include "feat/wave-reader.h"
+#include "nnet3/nnet-utils.h"
+#include "nnet3/nnet-simple-component.h"
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace kaldi::nnet3;
+    typedef kaldi::int32 int32;
+    typedef kaldi::int64 int64;
+
+    const char *usage =
+        "This binary is used to check the filter bank which is modeled by an\n"
+        "affine component. It computes the bandwidth of each learned filter.\n"
+        "Usage: fvector-debug-check-filter-bank [options...] <nnet-rxfilename>\n"
+        "<component-name> <stats-wxfilename>\n";
+
+    ParseOptions po(usage);
+
+    po.Read(argc, argv);
+    if (po.NumArgs() != 3) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string nnet_rxfilename = po.GetArg(1),
+        component_name = po.GetArg(2),
+        stats_wxfilename = po.GetArg(3);
+
+    Nnet nnet;
+    ReadKaldiObject(nnet_rxfilename, &nnet);
+    int32 component_index = nnet.GetComponentIndex(component_name);
+    Matrix<BaseFloat> filter_bank(
+        dynamic_cast<AffineComponent*>(
+            nnet.GetComponent(component_index))->LinearParams());
+    std::ofstream out;
+    out.open(stats_wxfilename, std::ios::out);
+    if (!out.is_open()) {
+      std::cout << "File open error." << std::endl;
+      return -1;
+    }
+    int32 num_rows = filter_bank.NumRows();
+    int32 num_columns = filter_bank.NumCols();
+    out << "Number of rows: " << num_rows << std::endl;
+    out << "Number of columns: " << num_columns << std::endl;
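+    // The statistic printed below is a rough effective-width measure: for a
+    // filter w it is sum_i(w_i^2) / max_i(w_i^2), i.e. the filter's energy
+    // relative to its largest tap. For example (hypothetical numbers), taps
+    // (0.1, 0.9, 0.2) give (0.01 + 0.81 + 0.04) / 0.81 ~= 1.06, close to the
+    // minimum of 1.0 scored by a single-tap (maximally narrow) filter, while
+    // a filter with k equal taps scores exactly k.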
+    // Each row can be regarded as a filter.
+    for (MatrixIndexT i = 0; i < num_rows; i++) {
+      const SubVector<BaseFloat> current_row = filter_bank.Row(i);
+      BaseFloat current_sum_2 = VecVec(current_row, current_row);
+      BaseFloat current_max_2 = current_row.Max() * current_row.Max();
+      BaseFloat band_width = current_sum_2 / current_max_2;
+      out << "Filter " << i+1 << ": Quadratic sum is " << current_sum_2
+          << " ;The square of max value is " << current_max_2
+          << " ;Bandwidth is " << band_width << std::endl;
+    }
+
+    out.close();
+    return 0;
+  } catch(const std::exception &e) {
+    std::cerr << e.what();
+    return -1;
+  }
+}
diff --git a/src/fvectorbin/fvector-debug-wav-to-vector.cc b/src/fvectorbin/fvector-debug-wav-to-vector.cc
new file mode 100644
index 00000000000..ad202b88742
--- /dev/null
+++ b/src/fvectorbin/fvector-debug-wav-to-vector.cc
@@ -0,0 +1,41 @@
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "fvector/fvector-perturb.h"
+#include "feat/wave-reader.h"
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    const char *usage =
+        "Usage: fvector-wav-to-vector [options...] <wav-rspecifier>\n"
+        "<vector-wspecifier>\n";
+
+    ParseOptions po(usage);
+    BaseFloat sample_freq = 16000;
+    po.Register("sample-frequency", &sample_freq,
+                "sample-frequency of the wave.");
+    po.Read(argc, argv);
+    if (po.NumArgs() != 2) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string wav_rspecifier = po.GetArg(1),
+        output_wspecifier = po.GetArg(2);
+
+    SequentialTableReader<WaveHolder> reader(wav_rspecifier);
+    BaseFloatVectorWriter kaldi_writer(output_wspecifier);
+
+    int64 num_read = 0, num_written = 0;
+    for (; !reader.Done(); reader.Next(), num_read++) {
+      std::string utt = reader.Key();
+      const WaveData &wave_data = reader.Value();
+      Vector<BaseFloat> waveform(SubVector<BaseFloat>(wave_data.Data(), 0));
+      kaldi_writer.Write(utt, waveform);
+      num_written++;
+    }
+    KALDI_LOG << " Done " << num_written << " out of " << num_read
+              << " utterances.";
+    return (num_written != 0 ? 0 : 1);
+  } catch(const std::exception &e) {
+    std::cerr << e.what();
+    return -1;
+  }
+}
diff --git a/src/fvectorbin/fvector-debug-write-to-wav.cc b/src/fvectorbin/fvector-debug-write-to-wav.cc
new file mode 100644
index 00000000000..6385a6c5a1b
--- /dev/null
+++ b/src/fvectorbin/fvector-debug-write-to-wav.cc
@@ -0,0 +1,52 @@
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "fvector/fvector-perturb.h"
+#include "feat/wave-reader.h"
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    const char *usage =
+        "Usage: fvector-write-to-wav [options...] <chunk-rspecifier>\n"
+        "<wave-path>\n";
+
+    ParseOptions po(usage);
+    BaseFloat sample_freq = 16000;
+    po.Register("sample-frequency", &sample_freq,
+                "sample-frequency of the wave.");
+    po.Read(argc, argv);
+    if (po.NumArgs() != 2) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string chunk_rspecifier = po.GetArg(1),
+        wave_path = po.GetArg(2);
+
+    SequentialBaseFloatMatrixReader chunk_reader(chunk_rspecifier);
+
+    int64 num_read = 0, num_written = 0;
+    for (; !chunk_reader.Done(); chunk_reader.Next(), num_read++) {
+      std::string key = chunk_reader.Key();
+      // input_chunk has 3 rows.
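+      // Each 3-row chunk (source, noise1, noise2) is written out as three
+      // separate mono wav files under <wave-path>; the
+      // "<wave-path>.<key>.<row>.wav" naming used below is only illustrative,
+      // any scheme that keeps the rows distinguishable would do.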
+      const Matrix<BaseFloat> &input_chunk = chunk_reader.Value();
+      for (int32 i = 0; i < input_chunk.NumRows(); i++) {
+        std::ostringstream path;
+        path << wave_path << "." << key << "." << i << ".wav";
+        Output os(path.str(), true, false);  // binary, no Kaldi header.
+        Matrix<BaseFloat> temp(1, input_chunk.NumCols());
+        temp.CopyRowFromVec(input_chunk.Row(i), 0);
+        WaveData wave(sample_freq, temp);
+        wave.Write(os.Stream());
+        num_written++;
+      }
+    }
+    KALDI_LOG << " Done " << num_written << " out of " << num_read
+              << " chunks.";
+    return (num_written != 0 ? 0 : 1);
+  } catch(const std::exception &e) {
+    std::cerr << e.what();
+    return -1;
+  }
+}
diff --git a/src/fvectorbin/fvector-get-egs-block.cc b/src/fvectorbin/fvector-get-egs-block.cc
new file mode 100644
index 00000000000..acc98827e46
--- /dev/null
+++ b/src/fvectorbin/fvector-get-egs-block.cc
@@ -0,0 +1,122 @@
+// fvectorbin/fvector-get-egs-block.cc
+
+// Copyright 2012-2016 Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sstream>
+#include "util/common-utils.h"
+#include "nnet3/nnet-example.h"
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace kaldi::nnet3;
+    typedef kaldi::int32 int32;
+
+    const char *usage =
+        "Get examples for training an nnet3 neural network for the fvector\n"
+        "system. Each output example contains a pair of feature chunks.\n"
+        "Different from fvector-get-egs, the input is a matrix block which\n"
+        "contains several pairs rather than one pair.\n"
+        "Usage: fvector-get-egs-block [options] <features-rspecifier>\n"
+        "<egs-wspecifier>\n"
+        "For example:\n"
+        "fvector-get-egs-block scp:perturbed_chunks.scp ark:egs.ark";
+
+    bool compress = true;
+    BaseFloat frame_length_ms = 25;  // in milliseconds
+    BaseFloat frame_shift_ms = 10;   // in milliseconds
+    BaseFloat samp_freq = 16000;
+
+    ParseOptions po(usage);
+    po.Register("compress", &compress, "If true, write egs in "
+                "compressed format.");
+    po.Register("frame-length", &frame_length_ms,
+                "Frame length in milliseconds");
+    po.Register("frame-shift", &frame_shift_ms,
+                "Frame shift in milliseconds");
+    po.Register("sample-frequency", &samp_freq,
+                "Waveform data sample frequency ("
+                "must match the waveform file, if specified there)");
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 2) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string feature_rspecifier = po.GetArg(1);
+    NnetExampleWriter example_writer(po.GetArg(2));
+
+    SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
+
+    int32 num_read = 0,
+        num_egs_written = 0;
+    for (; !feature_reader.Done(); feature_reader.Next(), num_read++) {
+      std::string key = feature_reader.Key();
+      const Matrix<BaseFloat> &feats = feature_reader.Value();
+      // Note: 'feats' is a matrix block generated by fvector-add-noise-block.
+      // Each two consecutive rows of the matrix represent two perturbed
+      // vectors (e.g. 100 ms of waveform) which come from the same source
+      // signal; chunk1 and chunk2 each correspond to one row.
+      for (MatrixIndexT i = 0; i < feats.NumRows() / 2; i++) {
+        SubVector<BaseFloat> chunk1(feats, 2 * i),
+            chunk2(feats, 2 * i + 1);
+        // According to frame_length and frame_shift, cut the chunk into
+        // frames so that it resembles the normal feature-extraction
+        // procedure.
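+        // Worked example (hypothetical numbers): a 100 ms chunk at
+        // samp_freq = 16000 with 25 ms frames and a 10 ms shift gives
+        //   num_rows = (100 - 25) / 10 + 1 = 8 frames,
+        //   num_cols = 16000 / 1000 * 25 = 400 samples per frame,
+        // and frame j starts at sample j * 10 * 16 = 160 * j.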
+        int32 num_rows = static_cast<int32>(
+            ((chunk1.Dim() * 1.0 / samp_freq) * 1000 - frame_length_ms) /
+            frame_shift_ms) + 1;
+        int32 num_cols =
+            static_cast<int32>(samp_freq / 1000.0 * frame_length_ms);
+        Matrix<BaseFloat> chunk1_matrix(num_rows, num_cols),
+            chunk2_matrix(num_rows, num_cols);
+        for (MatrixIndexT j = 0; j < num_rows; j++) {
+          MatrixIndexT offset =
+              static_cast<MatrixIndexT>(j * frame_shift_ms * samp_freq / 1000);
+          chunk1_matrix.Row(j).CopyFromVec(chunk1.Range(offset, num_cols));
+          chunk2_matrix.Row(j).CopyFromVec(chunk2.Range(offset, num_cols));
+        }
+        // Generate the NnetIo.
+        NnetIo nnet_io1 = NnetIo("input", 0, chunk1_matrix),
+            nnet_io2 = NnetIo("input", 0, chunk2_matrix);
+        // Modify the n index, so that in a merged minibatch NnetExample the
+        // adjacent two NnetIos come from the same source signal.
+        for (std::vector<Index>::iterator indx_it = nnet_io1.indexes.begin();
+             indx_it != nnet_io1.indexes.end(); ++indx_it) {
+          indx_it->n = 0;
+        }
+        for (std::vector<Index>::iterator indx_it = nnet_io2.indexes.begin();
+             indx_it != nnet_io2.indexes.end(); ++indx_it) {
+          indx_it->n = 1;
+        }
+        NnetExample eg;
+        eg.io.push_back(nnet_io1);
+        eg.io.push_back(nnet_io2);
+        if (compress) {
+          eg.Compress();
+        }
+        std::ostringstream os;
+        os << key << "-" << i;
+        std::string key_new = os.str();
+        example_writer.Write(key_new, eg);
+        num_egs_written += 1;
+      }
+    }
+    KALDI_LOG << "Finished generating examples; successfully converted "
+              << num_egs_written << " chunks into examples out of "
+              << num_read << " chunks.";
+    return (num_egs_written == 0);
+  } catch(const std::exception &e) {
+    std::cerr << e.what() << '\n';
+    return -1;
+  }
+}
diff --git a/src/fvectorbin/fvector-get-egs.cc b/src/fvectorbin/fvector-get-egs.cc
new file mode 100644
index 00000000000..c5b59e02c3c
--- /dev/null
+++ b/src/fvectorbin/fvector-get-egs.cc
@@ -0,0 +1,143 @@
+// fvectorbin/fvector-get-egs.cc
+
+// Copyright 2012-2016 Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sstream>
+#include "util/common-utils.h"
+#include "nnet3/nnet-example.h"
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace kaldi::nnet3;
+    typedef kaldi::int32 int32;
+
+    const char *usage =
+        "Get examples for training an nnet3 neural network for the fvector\n"
+        "system.
Each output example contains a pair of feature chunks.\n" + "Usage: fvector-get-egs [options] \n" + "For example:\n" + "fvector-get-egs scp:perturbed_chunks.scp ark:egs.ark"; + + bool compress = true; + BaseFloat frame_length_ms = 25; // in milliseconds + BaseFloat frame_shift_ms = 10; // in milliseconds + BaseFloat samp_freq=16000; + int left_padding=0; + int right_padding=0; + + ParseOptions po(usage); + po.Register("compress", &compress, "If true, write egs in " + "compressed format."); + po.Register("frame-length", &frame_length_ms, "Frame length in milliseconds"); + po.Register("frame-shift", &frame_shift_ms, "Frame shift in milliseconds"); + po.Register("sample-frequency", &samp_freq, "Waveform data sample frequency (" + "must match the waveform file, if specified there)"); + po.Register("left-padding", &left_padding, "When we use convolutional NN," + "we tend to pad on the time axis with repeats of the first frame."); + po.Register("right-padding", &right_padding, "When we use convolutional NN," + "we tend to pad on the time axis with repeats of the last frame."); + + + po.Read(argc, argv); + + if (po.NumArgs() != 2) { + po.PrintUsage(); + exit(1); + } + + std::string feature_rspecifier = po.GetArg(1); + NnetExampleWriter example_writer(po.GetArg(2)); + + SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); + + int32 num_read = 0, + num_egs_written = 0; + for (; !feature_reader.Done(); feature_reader.Next(), num_read++) { + std::string key = feature_reader.Key(); + const Matrix &feats = feature_reader.Value(); + //Please take care. Here, the 'feats' is a 2-lines matrix which is generated + //by fvector-add-noise.cc. The 2-lines matrix represents two perturbed + //vectors(e.g 100ms wavform) which come from the same source signal. + //chunk1 and chunk2 corresponds to one line respectively. + SubVector chunk1(feats, 0), + chunk2(feats, 1); + + //According to frame_length and frame_shift, cut the chunk into few pieces + //so that it is similiar with normal feature extract procedure. 
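+      // With left/right padding (used for convolutional models), the first
+      // and last frames are repeated below: e.g. (hypothetical numbers)
+      // num_rows = 8, left_padding = 2 and right_padding = 1 yield 11 output
+      // rows copied from input rows 0,0,0,1,...,7,7, and the NnetIo 't'
+      // index starts at -left_padding = -2 so the unpadded frames keep
+      // t = 0..7.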
+ int num_rows = ((int)(((chunk1.Dim() * 1.0 / samp_freq) * 1000 - frame_length_ms) / + frame_shift_ms) + 1); + int num_cols = (int)(samp_freq / 1000.0 * frame_length_ms); + Matrix chunk1_matrix(num_rows, num_cols), + chunk2_matrix(num_rows, num_cols); + for (MatrixIndexT i = 0; i < num_rows; i++) { + chunk1_matrix.Row(i).CopyFromVec(chunk1.Range(i*frame_shift_ms*samp_freq/1000, num_cols)); + chunk2_matrix.Row(i).CopyFromVec(chunk2.Range(i*frame_shift_ms*samp_freq/1000, num_cols)); + } + Matrix chunk1_matrix_out(chunk1_matrix), + chunk2_matrix_out(chunk2_matrix); + if((left_padding !=0) || (right_padding != 0)) { + int32 tot_num_rows = num_rows+left_padding+right_padding; + chunk1_matrix_out.Resize(tot_num_rows, num_cols, kUndefined); + chunk2_matrix_out.Resize(tot_num_rows, num_cols, kUndefined); + for(int32 row = 0; row < tot_num_rows; row++) { + int32 row_in = row - left_padding; + if (row_in < 0) { + row_in = 0; + } else if (row_in >= num_rows ) { + row_in = num_rows -1; + } + SubVector vec_chunk1_in(chunk1_matrix, row_in), + vec_chunk1_out(chunk1_matrix_out, row), + vec_chunk2_in(chunk2_matrix, row_in), + vec_chunk2_out(chunk2_matrix_out, row); + vec_chunk1_out.CopyFromVec(vec_chunk1_in); + vec_chunk2_out.CopyFromVec(vec_chunk2_in); + } + } + //generate the NnetIo + NnetIo nnet_io1 = NnetIo("input", -left_padding, chunk1_matrix_out), + nnet_io2 = NnetIo("input", -left_padding, chunk2_matrix_out); + //modify the n index, so that in a mini-batch Nnet3Example, the adjacent + //two NnetIos come from the same source signal. + for (std::vector::iterator indx_it = nnet_io1.indexes.begin(); + indx_it != nnet_io1.indexes.end(); ++indx_it) { + indx_it->n = 0; + } + for (std::vector::iterator indx_it = nnet_io2.indexes.begin(); + indx_it != nnet_io2.indexes.end(); ++indx_it) { + indx_it->n = 1; + } + NnetExample eg; + eg.io.push_back(nnet_io1); + eg.io.push_back(nnet_io2); + if (compress) { + eg.Compress(); + } + example_writer.Write(key, eg); + num_egs_written += 1; + } + KALDI_LOG << "Finished generating examples, " + << "successfully convert " << num_egs_written << " chunks into examples out of " + << num_read << " chunks"; + return (num_egs_written == 0); + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} diff --git a/src/nnet3/nnet-example-utils.cc b/src/nnet3/nnet-example-utils.cc index cc5fe3cc050..63cdb93b813 100644 --- a/src/nnet3/nnet-example-utils.cc +++ b/src/nnet3/nnet-example-utils.cc @@ -113,7 +113,9 @@ static void MergeIo(const std::vector &src, names_end = names.end(); std::vector::const_iterator eg_iter = src.begin(), eg_end = src.end(); + int32 n_offset = 0; for (int32 n = 0; eg_iter != eg_end; ++eg_iter, ++n) { + int32 max_source_n = 0; std::vector::const_iterator io_iter = eg_iter->io.begin(), io_end = eg_iter->io.end(); for (; io_iter != io_end; ++io_iter) { @@ -139,12 +141,21 @@ static void MergeIo(const std::vector &src, for (int32 i = this_offset; i < this_offset + this_size; i++) { // we could easily support merging already-merged egs, but I don't see a // need for it right now. - KALDI_ASSERT(output_iter[i].n == 0 && - "Merging already-merged egs? Not currentlysupported."); - output_iter[i].n = n; + + // For fvector, the NnetIos in the same NnetExample may have the same + // name, however the index.ns of them are different. + //KALDI_ASSERT(output_iter[i].n == 0 && + // "Merging already-merged egs? 
Not currentlysupported."); + //output_iter[i].n = n; + KALDI_ASSERT(output_iter[i].n >= 0); + if (output_iter[i].n > max_source_n) { + max_source_n = output_iter[i].n; + } + output_iter[i].n += n_offset; } this_offset += this_size; // note: this_offset is a reference. } + n_offset += max_source_n + 1; } KALDI_ASSERT(cur_size == sizes); for (int32 f = 0; f < num_feats; f++) { diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index 8246148abc6..650e8b5aecf 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -2067,6 +2067,69 @@ bool PositiveUpdatableWeights(Nnet *nnet) { return true; } +/// For Xvector +void GetConstantOutput(const Nnet &nnet_const, const std::string &output_name, + Vector *output) { + Nnet nnet(nnet_const); + std::string input_name = "input"; + int32 left_context, + right_context, + input_node_index = nnet.GetNodeIndex(input_name), + output_node_index = nnet.GetNodeIndex(output_name); + if (output_node_index == -1 && !nnet.IsOutputNode(output_node_index)) + KALDI_ERR << "No output node called '" << output_name + << "' in the network."; + if (input_node_index == -1 && nnet.IsInputNode(input_node_index)) + KALDI_ERR << "No input node called '" << input_name + << "' in the network."; + KALDI_ASSERT(output->Dim() == nnet.OutputDim(output_name)); + ComputeSimpleNnetContext(nnet, &left_context, &right_context); + + // It's difficult to get the output of the node + // directly. Instead, we can create some fake input, + // propagate it through the network, and read out the + // output. + CuMatrix cu_feats(left_context + right_context + 1, + nnet.InputDim(input_name)); + Matrix feats(cu_feats); + + ComputationRequest request; + NnetIo nnet_io = NnetIo(input_name, 0, feats); + request.inputs.clear(); + request.outputs.clear(); + request.inputs.resize(1); + request.outputs.resize(1); + request.need_model_derivative = false; + request.store_component_stats = false; + + std::vector output_indexes; + request.inputs[0].name = input_name; + request.inputs[0].indexes = nnet_io.indexes; + request.inputs[0].has_deriv = false; + output_indexes.resize(1); + output_indexes[0].n = 0; + output_indexes[0].t = 0; + request.outputs[0].name = output_name; + request.outputs[0].indexes = output_indexes; + request.outputs[0].has_deriv = false; + + CachingOptimizingCompiler compiler(nnet, NnetOptimizeOptions()); + std::shared_ptr computation = compiler.Compile(request); + NnetComputer computer(NnetComputeOptions(), *computation, + nnet, &nnet); + + // check to see if something went wrong. + if (request.inputs.empty()) + KALDI_ERR << "No input in computation request."; + if (request.outputs.empty()) + KALDI_ERR << "No output in computation request."; + + computer.AcceptInput("input", &cu_feats); + computer.Run(); + const CuMatrixBase &output_mat = computer.GetOutput(output_name); + CuSubVector output_vec(output_mat, 0); + output->CopyFromVec(output_vec); +} } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h index 00aeb4a1661..932d0b8ed06 100644 --- a/src/nnet3/nnet-utils.h +++ b/src/nnet3/nnet-utils.h @@ -506,6 +506,13 @@ int32 GetNumNvalues(const std::vector &io_vec, */ bool PositiveUpdatableWeights(Nnet *nnet); +/// For Xvector +/// This function assumes that the node named in 'output_node' is a constant +/// function of the input features (e.g, a ConstantFunctionComponent is +/// its input) and returns it in 'out'. 
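+/// A minimal usage sketch (assuming the nnet has a constant output node
+/// called "const"; the node name is illustrative):
+///
+///   Vector<BaseFloat> c(nnet.OutputDim("const"));
+///   GetConstantOutput(nnet, "const", &c);
+///
+/// Internally this propagates a dummy all-zero input through the network and
+/// reads the value of the named output at frame t = 0.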
+void GetConstantOutput(const Nnet &nnet, const std::string &output_name, + Vector *out); + } // namespace nnet3 } // namespace kaldi diff --git a/src/xvector/Makefile b/src/xvector/Makefile new file mode 100644 index 00000000000..093dd68f1a8 --- /dev/null +++ b/src/xvector/Makefile @@ -0,0 +1,22 @@ + +all: + +OPENFST_CXXFLAGS = +OPENFST_LDLIBS = +include ../kaldi.mk + +LDFLAGS += $(CUDA_LDFLAGS) +LDLIBS += $(CUDA_LDLIBS) + +TESTFILES = xvector-test + +OBJFILES = xvector.o nnet-xvector-training.o nnet-xvector-diagnostics.o nnet-xvector-compute.o + +LIBNAME = kaldi-xvector + +ADDLIBS = ../gmm/kaldi-gmm.a ../tree/kaldi-tree.a ../transform/kaldi-transform.a \ + ../nnet3/kaldi-nnet3.a ../cudamatrix/kaldi-cudamatrix.a \ + ../matrix/kaldi-matrix.a ../base/kaldi-base.a \ + ../util/kaldi-util.a + +include ../makefiles/default_rules.mk diff --git a/src/xvector/nnet-xvector-compute.cc b/src/xvector/nnet-xvector-compute.cc new file mode 100644 index 00000000000..a83e6e063f0 --- /dev/null +++ b/src/xvector/nnet-xvector-compute.cc @@ -0,0 +1,99 @@ +// xvector/nnet-xvector-compute.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) +// 2016 David Snyder +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
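+// A minimal usage sketch (assumptions: "final.raw" holds the trained nnet,
+// 'feats' is one chunk of input features, and the "output" node yields one
+// xvector per chunk):
+//
+//   NnetSimpleComputationOptions opts;
+//   Nnet nnet;
+//   ReadKaldiObject("final.raw", &nnet);
+//   NnetXvectorComputer computer(opts, &nnet);
+//   Vector<BaseFloat> xvector(nnet.OutputDim("output"));
+//   computer.ComputeXvector(feats, &xvector);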
+ +#include "xvector/nnet-xvector-compute.h" +#include "nnet3/nnet-utils.h" + +namespace kaldi { +namespace nnet3 { + +NnetXvectorComputer::NnetXvectorComputer( + const NnetSimpleComputationOptions &config, + Nnet *nnet): + nnet_(nnet), + config_(config), + compiler_(*nnet, config.optimize_config) { +} + +void NnetXvectorComputer::ComputeXvector(const MatrixBase &feats, + Vector *xvector) { + + ComputationRequest request; + GetComputationRequest(feats, &request); + std::shared_ptr computation = compiler_.Compile(request); + NnetComputer computer(config_.compute_config, *computation, + *nnet_, + nnet_); + std::string input_name = "input"; + CuMatrix cu_feats(feats); + computer.AcceptInput(input_name, &cu_feats); + computer.Run(); + const CuMatrixBase &output = computer.GetOutput("output"); + KALDI_ASSERT(output.NumRows() == 1 && output.NumCols() == xvector->Dim()); + CuSubVector xvector_tmp(output, 0); + xvector->CopyFromVec(xvector_tmp); +} + +void NnetXvectorComputer::GetComputationRequest( + const MatrixBase &feats, + ComputationRequest *request) { + std::string input_name = "input", + output_name = "output"; + NnetIo nnet_io = NnetIo(input_name, 0, feats); + request->inputs.clear(); + request->outputs.clear(); + request->inputs.resize(1); + request->outputs.resize(1); + request->need_model_derivative = false; + request->store_component_stats = false; + + int32 input_node_index = nnet_->GetNodeIndex(input_name); + + if (input_node_index == -1 && !nnet_->IsInputNode(input_node_index)) + KALDI_ERR << "No input node called '" << input_name + << "' in the network."; + + request->inputs[0].name = input_name; + request->inputs[0].indexes = nnet_io.indexes; + request->inputs[0].has_deriv = false; + + // We only need the output on frame t=0. + std::vector output_indexes; + output_indexes.resize(1); + output_indexes[0].n = 0; + output_indexes[0].t = 0; + + // Add an io_spec for the output node. + int32 output_node_index = nnet_->GetNodeIndex(output_name); + if (!nnet_->IsOutputNode(output_node_index)) + KALDI_ERR << "No output node called '" << output_name + << "' in the network."; + request->outputs[0].name = output_name; + request->outputs[0].indexes = output_indexes; + request->outputs[0].has_deriv = false; + + // check to see if something went wrong. + if (request->inputs.empty()) + KALDI_ERR << "No input in computation request."; + if (request->outputs.empty()) + KALDI_ERR << "No output in computation request."; +} + +} // namespace nnet3 +} // namespace kaldi diff --git a/src/xvector/nnet-xvector-compute.h b/src/xvector/nnet-xvector-compute.h new file mode 100644 index 00000000000..b551c24e76c --- /dev/null +++ b/src/xvector/nnet-xvector-compute.h @@ -0,0 +1,55 @@ +// xvector/nnet-xvector-compute.h + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) +// 2016 David Snyder +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
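+// Note on the computation request built by this class: the input
+// IoSpecification copies the indexes of the whole feature chunk, while the
+// output requests the single index (n = 0, t = 0), so the compiled
+// computation emits exactly one xvector row per chunk regardless of the
+// chunk length.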
+ +#ifndef KALDI_XVECTOR_NNET_XVECTOR_COMPUTE_H_ +#define KALDI_XVECTOR_NNET_XVECTOR_COMPUTE_H_ + +#include "nnet3/nnet-am-decodable-simple.h" // For NnetSimpleComputationOptions +#include "nnet3/nnet-computation.h" +#include "nnet3/nnet-compute.h" +#include "xvector/xvector.h" + +namespace kaldi { +namespace nnet3 { + +/** + class NnetXvectorComputer is responsible for extracting xvectors from + feature chunks. +**/ +class NnetXvectorComputer { + public: + /// Constructor. + NnetXvectorComputer(const NnetSimpleComputationOptions &opts, + Nnet *nnet); + /// Extracts an xvector given input features. + void ComputeXvector(const MatrixBase &feats, + Vector *xvector); + private: + Nnet *nnet_; + const NnetSimpleComputationOptions config_; + CachingOptimizingCompiler compiler_; + + /// Creates a computation request from the input features. + void GetComputationRequest(const MatrixBase &feats, + ComputationRequest *request); +}; +} // namespace nnet3 +} // namespace kaldi + +#endif // diff --git a/src/xvector/nnet-xvector-diagnostics.cc b/src/xvector/nnet-xvector-diagnostics.cc new file mode 100644 index 00000000000..071ed65a241 --- /dev/null +++ b/src/xvector/nnet-xvector-diagnostics.cc @@ -0,0 +1,214 @@ +// xvector/nnet-xvector-diagnostics.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) +// Copyright 2016 Pegah Ghahremani +// David Snyder +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "nnet3/nnet-diagnostics.h" +#include "xvector/nnet-xvector-diagnostics.h" +#include "nnet3/nnet-utils.h" + +namespace kaldi { +namespace nnet3 { + +NnetXvectorComputeProb::NnetXvectorComputeProb(const NnetComputeProbOptions + &config, + const Nnet &nnet): + config_(config), + nnet_(nnet), + deriv_nnet_(NULL), + compiler_(nnet), + num_minibatches_processed_(0) { + if (config_.compute_deriv) { + deriv_nnet_ = new Nnet(nnet_); + ScaleNnet(0.0, deriv_nnet_); // force simple update + SetNnetAsGradient(deriv_nnet_); + } +} + +const Nnet &NnetXvectorComputeProb::GetDeriv() const { + if (deriv_nnet_ == NULL) + KALDI_ERR << "GetDeriv() called when no derivatives were requested."; + return *deriv_nnet_; +} + +NnetXvectorComputeProb::~NnetXvectorComputeProb() { + delete deriv_nnet_; // delete does nothing if pointer is NULL. 
+} + +void NnetXvectorComputeProb::Reset() { + num_minibatches_processed_ = 0; + objf_info_.clear(); + if (deriv_nnet_) { + ScaleNnet(0.0, deriv_nnet_); + SetNnetAsGradient(deriv_nnet_); + } +} + +void NnetXvectorComputeProb::Compute(const NnetExample &eg) { + bool need_model_derivative = config_.compute_deriv, + store_component_stats = false; + ComputationRequest request; + GetComputationRequestXvector(nnet_, eg, need_model_derivative, + store_component_stats, + &request); + std::shared_ptr computation = compiler_.Compile(request); + NnetComputer computer(config_.compute_config, *computation, + nnet_, deriv_nnet_); + // give the inputs to the computer object. + computer.AcceptInputs(nnet_, eg.io); + computer.Run(); + this->ProcessOutputs(&computer); + if (config_.compute_deriv) + computer.Run(); +} + +void NnetXvectorComputeProb::ProcessOutputs(NnetComputer *computer) { + for (int32 node_index = 0; node_index < nnet_.NumNodes(); node_index++) { + if (nnet_.IsOutputNode(node_index)) { + std::string xvector_name = nnet_.GetNodeName(node_index), + s_name = "s", b_name = "b"; + if (nnet_.GetNodeIndex(s_name) == -1 || nnet_.GetNodeIndex(b_name) == -1) + KALDI_ERR << "The nnet expected to have two output nodes with " + << "name s and b."; + + if (xvector_name == "output") { + const CuMatrixBase &xvector_pairs + = computer->GetOutput(xvector_name), + &xvec_s = computer->GetOutput(s_name), + &xvec_b = computer->GetOutput(b_name); + int32 num_rows = xvector_pairs.NumRows(), + num_cols = xvector_pairs.NumCols(); + CuMatrix xvector_deriv(num_rows, num_cols, kUndefined), + raw_scores(num_rows, num_rows, kUndefined); + int32 s_dim = num_cols * (num_cols + 1) / 2; + + // convert CuVector to CuSpMatrix + CuSpMatrix xvec_s_sp(num_cols); + xvec_s_sp.CopyFromVec(xvec_s.Row(0)); + CuVector deriv_s(s_dim); + + BaseFloat xvec_b_val = xvec_b(0,0), deriv_b; + BaseFloat tot_weight, tot_objf; + bool supply_deriv = config_.compute_deriv; + bool compute_accuracy = config_.compute_accuracy; + ComputeXvectorObjfAndDeriv(xvector_pairs, xvec_s_sp, xvec_b_val, + (supply_deriv ? &xvector_deriv : NULL), + (supply_deriv ? &deriv_s : NULL), + (supply_deriv ? &deriv_b : NULL), + (compute_accuracy ? &raw_scores : NULL), + &tot_objf, + &tot_weight); + if (supply_deriv) { + CuMatrix deriv_s_mat(1, s_dim), + deriv_b_mat(1,1); + deriv_b_mat(0,0) = deriv_b; + deriv_s_mat.CopyRowsFromVec(deriv_s); + computer->AcceptInput(xvector_name, &xvector_deriv); + computer->AcceptInput(s_name, &deriv_s_mat); + computer->AcceptInput(b_name, &deriv_b_mat); + + } + SimpleObjectiveInfo &totals = objf_info_[xvector_name]; + totals.tot_weight += tot_weight; + totals.tot_objective += tot_objf; + if (compute_accuracy) { + BaseFloat tot_acc; + SimpleObjectiveInfo &acc_totals = acc_info_[xvector_name]; + ComputeAccuracy(raw_scores, &tot_acc); + acc_totals.tot_objective += tot_weight * tot_acc; + acc_totals.tot_weight += tot_weight; + } + } + num_minibatches_processed_++; + } + } +} + +bool NnetXvectorComputeProb::PrintTotalStats() const { + bool ans = false; + unordered_map::const_iterator + iter, end; + { // First print regular objectives + iter = objf_info_.begin(); + end = objf_info_.end(); + for (; iter != end; ++iter) { + const std::string &name = iter->first; + int32 node_index = nnet_.GetNodeIndex(name); + KALDI_ASSERT(node_index >= 0); + ObjectiveType obj_type = nnet_.GetNode(node_index).u.objective_type; + const SimpleObjectiveInfo &info = iter->second; + KALDI_LOG << "Overall " + << (obj_type == kLinear ? 
"log-likelihood" : "objective") + << " for '" << name << "' is " + << (info.tot_objective / info.tot_weight) << " per chunk" + << ", over " << info.tot_weight << " chunks."; + if (info.tot_weight > 0) + ans = true; + } + } + if (config_.compute_accuracy) { // Now print the accuracy. + iter = acc_info_.begin(); + end = acc_info_.end(); + for (; iter != end; ++iter) { + const std::string &name = iter->first; + const SimpleObjectiveInfo &info = iter->second; + KALDI_LOG << "Overall accuracy for '" << name << "' is " + << (info.tot_objective / info.tot_weight) + << " per chunk" + << ", over " << ceil(info.tot_weight) << " chunks."; + } + } + return ans; +} + +void NnetXvectorComputeProb::ComputeAccuracy( + const CuMatrixBase &raw_scores, + BaseFloat *tot_accuracy_out) { + int32 num_rows = raw_scores.NumRows(); + BaseFloat K = 1.0 / (num_rows - 2.0), + threshold = 0; // Corresponds to prob_same(u,v) = 0.5. + BaseFloat count = 0, + error = 0; + for (int32 i = 0; i < num_rows; i++) { + for (int32 j = 0; j < num_rows; j++) { + if (i + 1 == j && i % 2 == 0) { + if (raw_scores(i, j) < threshold) + error++; + count++; + } else if (i < j) { + if (raw_scores(i, j) >= threshold) + error += K; + count += K; + } + } + } + (*tot_accuracy_out) = 1.0 - error / count; +} + +const SimpleObjectiveInfo* NnetXvectorComputeProb::GetObjective( + const std::string &output_name) const { + unordered_map::const_iterator + iter = objf_info_.find(output_name); + if (iter != objf_info_.end()) + return &(iter->second); + else + return NULL; +} + +} // namespace nnet3 +} // namespace kaldi diff --git a/src/xvector/nnet-xvector-diagnostics.h b/src/xvector/nnet-xvector-diagnostics.h new file mode 100644 index 00000000000..6a2de6b38bd --- /dev/null +++ b/src/xvector/nnet-xvector-diagnostics.h @@ -0,0 +1,95 @@ +// xvector/nnet-xvector-diagnostics.h + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) +// Copyright 2016 Pegah Ghahremani + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_XVECTOR_NNET_XVECTOR_DIAGNOSTICS_H_ +#define KALDI_XVECTOR_NNET_XVECTOR_DIAGNOSTICS_H_ + +#include "nnet3/nnet-example.h" +#include "nnet3/nnet-computation.h" +#include "nnet3/nnet-compute.h" +#include "nnet3/nnet-optimize.h" +#include "nnet3/nnet-example-utils.h" +#include "nnet3/nnet-training.h" +#include "xvector/nnet-xvector-training.h" +#include "xvector/xvector.h" + +namespace kaldi { +namespace nnet3 { + + + +/** This class is for computing cross-entropy values in a neural + network with xvector as output and unsupervised objective, for diagnostics. + Note: because we put a "logsoftmax" component in the nnet, the actual + objective function becomes linear at the output, but the printed messages + reflect the fact that it's the cross-entropy objective. 
+ + TODO: In future we plan to check that the same values are returned whether + we run the computation with or without optimization. + */ +class NnetXvectorComputeProb { + public: + // does not store a reference to 'config' but does store one to 'nnet'. + NnetXvectorComputeProb(const NnetComputeProbOptions &config, + const Nnet &nnet); + + // Reset the likelihood stats, and the derivative stats (if computed). + void Reset(); + + // compute objective on one minibatch. + void Compute(const NnetExample &eg); + + // Prints out the final stats, and return true if there was a nonzero count. + bool PrintTotalStats() const; + + + // returns the objective-function info for this output name (e.g. "output"), + // or NULL if there is no such info. + const SimpleObjectiveInfo *GetObjective(const std::string &output_name) const; + + // if config.compute_deriv == true, returns a reference to the + // computed derivative. Otherwise crashes. + const Nnet &GetDeriv() const; + + ~NnetXvectorComputeProb(); + private: + void ProcessOutputs(NnetComputer *computer); + // Computes the accuracy for this minibatch. + void ComputeAccuracy(const CuMatrixBase &raw_scores, + BaseFloat *tot_accuracy_out); + NnetComputeProbOptions config_; + const Nnet &nnet_; + + Nnet *deriv_nnet_; + CachingOptimizingCompiler compiler_; + + // this is only for diagnostics. + int32 num_minibatches_processed_; + + unordered_map objf_info_; + unordered_map acc_info_; + +}; + + +} // namespace nnet3 +} // namespace kaldi + +#endif // KALDI_XVECTOR_NNET_XVECTOR_DIAGNOSTICS_H_ diff --git a/src/xvector/nnet-xvector-training.cc b/src/xvector/nnet-xvector-training.cc new file mode 100644 index 00000000000..8fc9423df1b --- /dev/null +++ b/src/xvector/nnet-xvector-training.cc @@ -0,0 +1,272 @@ +// xvector/nnet-xvector-training.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) +// 2015 Xiaohui Zhang +// Copyright 2016 Pegah Ghahremani +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "xvector/nnet-xvector-training.h" +#include "nnet3/nnet-utils.h" + +namespace kaldi { +namespace nnet3 { + +NnetXvectorTrainer::NnetXvectorTrainer(const NnetTrainerOptions &config, + Nnet *nnet): + config_(config), + nnet_(nnet), + compiler_(*nnet, config_.optimize_config), + num_minibatches_processed_(0) { + if (config_.zero_component_stats) + ZeroComponentStats(nnet); + if (config_.momentum == 0.0 && + config_.max_param_change == 0.0) { + delta_nnet_= NULL; + } else { + KALDI_ASSERT(config_.momentum >= 0.0 && + config_.max_param_change >= 0.0); + delta_nnet_ = nnet_->Copy(); + bool is_gradient = false; // setting this to true would disable the + // natural-gradient updates. 
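+  // delta_nnet_ accumulates the parameter change of one minibatch, scaled by
+  // (1 - momentum); Train() below adds it into nnet_ and then rescales it by
+  // momentum, so with e.g. momentum = 0.9 each update contributes 10% new
+  // change and carries 90% over from earlier minibatches. If the 2-norm of
+  // the change exceeds --max-param-change, it is scaled down: a change of
+  // norm 4.0 against max-param-change=2.0 is halved before being applied.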
+ if (is_gradient) { + ScaleNnet(0.0, delta_nnet_); + SetNnetAsGradient(delta_nnet_); + } else { + ScaleNnet(0.0, delta_nnet_); + } + } + if (config_.read_cache != "") { + bool binary; + try { + Input ki(config_.read_cache, &binary); + compiler_.ReadCache(ki.Stream(), binary); + } catch (...) { + KALDI_WARN << "Could not open cached computation. " + "Probably this is the first training iteration."; + } + } +} + + +void NnetXvectorTrainer::Train(const NnetExample &eg) { + bool need_model_derivative = true; + ComputationRequest request; + GetComputationRequestXvector(*nnet_, eg, need_model_derivative, + config_.store_component_stats, + &request); + std::shared_ptr computation = compiler_.Compile(request); + + NnetComputer computer(config_.compute_config, *computation, + *nnet_, + (delta_nnet_ == NULL ? nnet_ : delta_nnet_)); + // give the inputs to the computer object. + computer.AcceptInputs(*nnet_, eg.io); + computer.Run(); + + this->ProcessOutputs(&computer); + computer.Run(); + + if (delta_nnet_ != NULL) { + BaseFloat scale = (1.0 - config_.momentum); + if (config_.max_param_change != 0.0) { + BaseFloat param_delta = + std::sqrt(DotProduct(*delta_nnet_, *delta_nnet_)) * scale; + if (param_delta > config_.max_param_change) { + if (param_delta - param_delta != 0.0) { + KALDI_WARN << "Infinite parameter change, will not apply."; + ScaleNnet(0.0, delta_nnet_); + } else { + scale *= config_.max_param_change / param_delta; + KALDI_LOG << "Parameter change too big: " << param_delta << " > " + << "--max-param-change=" << config_.max_param_change + << ", scaling by " << config_.max_param_change / param_delta; + } + } + } + AddNnet(*delta_nnet_, scale, nnet_); + // impose positivity for AffineComponent max(W,0) + PositiveUpdatableWeights(nnet_); + + ScaleNnet(config_.momentum, delta_nnet_); + } + if (config_.write_cache != "") { + Output ko(config_.write_cache, + config_.binary_write_cache); + compiler_.WriteCache(ko.Stream(), config_.binary_write_cache); + } +} + +void NnetXvectorTrainer::ProcessOutputs(NnetComputer *computer) { + for (int32 node_index = 0; node_index < nnet_->NumNodes(); node_index++) { + if (nnet_->IsOutputNode(node_index)) { + BaseFloat tot_weight, tot_objf; + bool supply_deriv = true; + // For each xvector output node, we expect two output nodes with name "s" + // and "b", which store symmetric affine transformation and bias term + // for xvector-objective computation. + std::string xvector_name = nnet_->GetNodeName(node_index), + s_name = "s", b_name = "b"; + if (nnet_->GetNodeIndex(s_name) == -1 || nnet_->GetNodeIndex(b_name) == -1) + KALDI_ERR << "The nnet expected to have two output nodes with name s and b."; + + if (xvector_name == "output") { + const CuMatrixBase &xvector_pairs = computer->GetOutput(xvector_name), + &xvec_s = computer->GetOutput(s_name), + &xvec_b = computer->GetOutput(b_name); + CuMatrix xvector_deriv(xvector_pairs.NumRows(), xvector_pairs.NumCols(), + kUndefined); + int32 s_dim = xvector_pairs.NumCols() * (xvector_pairs.NumCols() + 1) / 2; + + // convert CuVector to CuSpMatrix + CuSpMatrix xvec_s_sp(xvector_pairs.NumCols()); + xvec_s_sp.CopyFromVec(xvec_s.Row(0)); + + CuVector deriv_s(s_dim); + BaseFloat xvec_b_val = xvec_b(0,0), deriv_b; + ComputeXvectorObjfAndDeriv(xvector_pairs, xvec_s_sp, xvec_b_val, + (supply_deriv ? &xvector_deriv : NULL), + (supply_deriv ? &deriv_s : NULL), + (supply_deriv ? 
&deriv_b : NULL), + NULL, // The raw scores aren't needed + &tot_objf, + &tot_weight); + + if (supply_deriv) { + CuMatrix deriv_s_mat(1, s_dim), + deriv_b_mat(1,1); + deriv_b_mat(0,0) = deriv_b; + deriv_s_mat.CopyRowsFromVec(deriv_s); + computer->AcceptInput(xvector_name, &xvector_deriv); + computer->AcceptInput(s_name, &deriv_s_mat); + computer->AcceptInput(b_name, &deriv_b_mat); + } + + objf_info_[xvector_name].UpdateStats(xvector_name, + config_.print_interval, + num_minibatches_processed_++, + tot_weight, tot_objf); + } + } + } +} + +bool NnetXvectorTrainer::PrintTotalStats() const { + unordered_map::const_iterator + iter = objf_info_.begin(), + end = objf_info_.end(); + std::vector > all_pairs; + for (; iter != end; ++iter) + all_pairs.push_back(std::pair( + iter->first, &(iter->second))); + // ensure deterministic order of these names (this will matter in situations + // where a script greps for the objective from the log). + std::sort(all_pairs.begin(), all_pairs.end()); + bool ans = false; + for (size_t i = 0; i < all_pairs.size(); i++) { + const std::string &name = all_pairs[i].first; + const ObjectiveFunctionInfo &info = *(all_pairs[i].second); + bool ok = info.PrintTotalStats(name); + ans = ans || ok; + } + return ans; +} + +NnetXvectorTrainer::~NnetXvectorTrainer() { + delete delta_nnet_; +} + +void GetComputationRequestXvector(const Nnet &nnet, + const NnetExample &eg, + bool need_model_derivative, + bool store_component_stats, + ComputationRequest *request) { + request->inputs.clear(); + request->inputs.reserve(eg.io.size()); + request->outputs.clear(); + request->outputs.reserve(eg.io.size()); + request->need_model_derivative = need_model_derivative; + request->store_component_stats = store_component_stats; + + // Examples for xvectors have no outputs. + for (size_t i = 0; i < eg.io.size(); i++) { + const NnetIo &io = eg.io[i]; + const std::string &name = io.name; + int32 node_index = nnet.GetNodeIndex(name); + + if (node_index == -1 && + !nnet.IsInputNode(node_index)) + KALDI_ERR << "xvector example has input named '" << name + << "', but no such input node is in the network."; + + std::vector &dest = request->inputs; + dest.resize(dest.size() + 1); + IoSpecification &io_spec = dest.back(); + io_spec.name = name; + io_spec.indexes = io.indexes; + io_spec.has_deriv = false; + } + + // We only need the output on frame t=0 for each n. + // So the output index for the output node is (n, 0, 0) + // for n=0 to max(n). + // Indexes for "s" and "b" output nodes are equal to (0,0,0). + int32 io_index_size = request->inputs[0].indexes.size(), + n_indx_size = 0; + std::vector output_indexes, + affine_output_indexes; + affine_output_indexes.resize(1); + affine_output_indexes[0].n = 0; + affine_output_indexes[0].t = 0; + + for (int32 indx = 0; indx < io_index_size; indx++) + n_indx_size = std::max(n_indx_size, + request->inputs[0].indexes[indx].n + 1); + + output_indexes.resize(n_indx_size); + for (int32 indx = 0; indx < n_indx_size; indx++) { + output_indexes[indx].n = indx; + output_indexes[indx].t = 0; + } + + // In order to generate computation request for output nodes, + // we should find output nodes and add io_spec for each one. 
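+  // Example of the resulting index layout (hypothetical minibatch): if the
+  // input indexes cover n = 0..3 (two merged pairs), the "output" node is
+  // requested at (n=0,t=0), (1,0), (2,0), (3,0), i.e. one xvector per
+  // sequence, while the "s" and "b" nodes are requested only at (n=0,t=0)
+  // since they are constant across the minibatch.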
+ int32 num_nodes = nnet.NumNodes(); + for (size_t node_index = 0; node_index < num_nodes; node_index++) { + if (nnet.IsOutputNode(node_index)) { + std::vector &dest = request->outputs; + dest.resize(dest.size() + 1); + IoSpecification &io_spec = dest.back(); + io_spec.name = nnet.GetNodeName(node_index); + if (nnet.GetNodeName(node_index) == "s" || + nnet.GetNodeName(node_index) == "b") + io_spec.indexes = affine_output_indexes; + else + io_spec.indexes = output_indexes; + io_spec.has_deriv = need_model_derivative; + } + } + + // check to see if something went wrong. + if (request->inputs.empty()) + KALDI_ERR << "No inputs in computation request."; + if (request->outputs.empty()) + KALDI_ERR << "No outputs in computation request."; +} + + + +} // namespace nnet3 +} // namespace kaldi diff --git a/src/xvector/nnet-xvector-training.h b/src/xvector/nnet-xvector-training.h new file mode 100644 index 00000000000..121abcf0380 --- /dev/null +++ b/src/xvector/nnet-xvector-training.h @@ -0,0 +1,89 @@ +// xvector/nnet-xvector-training.h + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) +// 2016 Xiaohui Zhang +// Copyright 2016 Pegah Ghahremani +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_XVECTOR_NNET_XVECTOR_TRAINING_H_ +#define KALDI_XVECTOR_NNET_XVECTOR_TRAINING_H_ + +#include "nnet3/nnet-example.h" +#include "nnet3/nnet-computation.h" +#include "nnet3/nnet-compute.h" +#include "nnet3/nnet-optimize.h" +#include "nnet3/nnet-example-utils.h" +#include "xvector/xvector.h" +#include "nnet3/nnet-training.h" + +namespace kaldi { +namespace nnet3 { + + +/** This class is for single-threaded training of neural nets using + standard objective functions such as cross-entropy (implemented with + logsoftmax nonlinearity and a linear objective function) and quadratic loss. + + Something that we should do in the future is to make it possible to have + two different threads, one for the compilation, and one for the computation. + This would only improve efficiency in the cases where the structure of the + input example was different each time, which isn't what we expect to see in + speech-recognition training. (If the structure is the same each time, + the CachingOptimizingCompiler notices this and uses the computation from + last time). + */ +class NnetXvectorTrainer { + public: + NnetXvectorTrainer(const NnetTrainerOptions &config, + Nnet *nnet); + + // train on one minibatch. + void Train(const NnetExample &eg); + + // Prints out the final stats, and return true if there was a nonzero count. + bool PrintTotalStats() const; + + ~NnetXvectorTrainer(); + private: + void ProcessOutputs(NnetComputer *computer); + + const NnetTrainerOptions config_; + Nnet *nnet_; + Nnet *delta_nnet_; // Only used if momentum != 0.0. 
nnet representing + // accumulated parameter-change (we'd call this + // gradient_nnet_, but due to natural-gradient update, + // it's better to consider it as a delta-parameter nnet. + CachingOptimizingCompiler compiler_; + + // This code supports multiple output layers, even though in the + // normal case there will be just one output layer named "output". + // So we store the objective functions per output layer. + int32 num_minibatches_processed_; + + unordered_map objf_info_; +}; + + + +void GetComputationRequestXvector(const Nnet &nnet, + const NnetExample &eg, + bool need_model_derivative, + bool store_component_stats, + ComputationRequest *request); +} // namespace nnet3 +} // namespace kaldi + +#endif // diff --git a/src/xvector/xvector-test.cc b/src/xvector/xvector-test.cc new file mode 100644 index 00000000000..873e382851f --- /dev/null +++ b/src/xvector/xvector-test.cc @@ -0,0 +1,311 @@ +// ivector/xvector-test.cc + +// Copyright 2016 David Snyder + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "xvector/xvector.h" +#include "util/kaldi-io.h" +#include "cudamatrix/cu-matrix-lib.h" + +namespace kaldi { +BaseFloat TestSimilarityScore(const CuVector &v, + const CuVector &w, const CuSpMatrix &S, + BaseFloat b); + +void TestGetDeriv(const CuVector &v, + const CuVector &w, const CuSpMatrix &S, + BaseFloat b, bool is_same, BaseFloat similarity_score, + CuVector *deriv_v, CuVector *deriv_w, + CuVector *deriv_S, BaseFloat *deriv_b); + +void TestComputeXvectorObjfAndDeriv( + const CuMatrixBase &xvector_pairs, + const CuSpMatrix &S, + BaseFloat b, CuMatrixBase *deriv_xvector, + CuVector *deriv_S, BaseFloat *deriv_b, BaseFloat *tot_objf, + BaseFloat *tot_weight); + +bool TestXvectorExtractorDerivative(BaseFloat perturb_delta) { + int32 xvector_dim = RandInt(4, 100), + num_rows = 2 * RandInt(2, 10); // The number of rows must be even + // and greater than 2. + int32 num_rows_subset = RandInt(1, num_rows); + CuSpMatrix S(xvector_dim); + S.SetRandn(); + // Necessary to keep the similarity scores from getting too large or small. + S.Scale(1.0e-01); + BaseFloat b = RandInt(-100, 100) / 10.0, + tot_weight, + tot_objf, + deriv_b; + int32 S_dim = S.NumCols() * (S.NumCols() + 1) / 2; + CuMatrix xvector_pairs(num_rows, xvector_dim, kSetZero), + deriv_xvector(num_rows, xvector_dim, kSetZero); + CuVector deriv_S(S_dim, kSetZero); + xvector_pairs.SetRandn(); + ComputeXvectorObjfAndDeriv(xvector_pairs, S, b, &deriv_xvector, + &deriv_S, &deriv_b, NULL, &tot_objf, &tot_weight); + CuVector deriv_xvector_vec(xvector_dim); + + // Sum over the derivatives for xvector input. + deriv_xvector_vec.AddRowSumMat(1.0, deriv_xvector.RowRange(0, num_rows_subset), + 0.0); + BaseFloat l2_xvector = 0, + l2_S = 0, + l2_b = 0; + + // Compare the xvector derivatives calculated above with a numerical + // approximation. 
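+  // The checks below use a symmetric finite difference: each coordinate is
+  // perturbed by +/- perturb_delta and the derivative is approximated as
+  //   (f(x + delta) - f(x - delta)) / (2 * delta),
+  // whose error is O(delta^2); the squared differences from the analytic
+  // derivatives are accumulated and required to stay below 1.0e-03.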
+ for (int32 i = 0; i < xvector_dim; i++) { + CuMatrix xvector_pairs_p(xvector_pairs); + CuMatrix xvector_pairs_n(xvector_pairs); + for (int32 j = 0; j < num_rows_subset; j++) { + xvector_pairs_p(j, i) += perturb_delta; + xvector_pairs_n(j, i) += -perturb_delta; + } + BaseFloat tot_objf_p, + tot_objf_n; + ComputeXvectorObjfAndDeriv(xvector_pairs_p, S, b, NULL, + NULL, NULL, NULL, &tot_objf_p, &tot_weight); + ComputeXvectorObjfAndDeriv(xvector_pairs_n, S, b, NULL, + NULL, NULL, NULL, &tot_objf_n, &tot_weight); + BaseFloat delta = (tot_objf_p - tot_objf_n) + * 1.0 / (2.0 * perturb_delta); + l2_xvector += pow(deriv_xvector_vec(i) - delta, 2); + } + + // Compare the S derivative calculated above with a numerical + // approximation. + for (int32 i = 0; i < S_dim; i++) { + CuSpMatrix S_p(S); + CuSpMatrix S_n(S); + CuSubVector S_p_vec(S_p.Data(), S_dim); + CuSubVector S_n_vec(S_n.Data(), S_dim); + S_p_vec(i) += perturb_delta; + S_n_vec(i) += -perturb_delta; + BaseFloat tot_objf_p, + tot_objf_n; + ComputeXvectorObjfAndDeriv(xvector_pairs, S_p, b, NULL, + NULL, NULL, NULL, &tot_objf_p, &tot_weight); + ComputeXvectorObjfAndDeriv(xvector_pairs, S_n, b, NULL, + NULL, NULL, NULL, &tot_objf_n, &tot_weight); + BaseFloat delta = (tot_objf_p - tot_objf_n) + * 1.0 / (2.0 * perturb_delta); + l2_S += pow(deriv_S(i) - delta, 2); + } + + // Compare the b derivative calculated above with a numerical + // approximation. + BaseFloat b_p = b + perturb_delta; + BaseFloat b_n = b - perturb_delta; + BaseFloat tot_objf_p; + BaseFloat tot_objf_n; + ComputeXvectorObjfAndDeriv(xvector_pairs, S, b_p, NULL, + NULL, NULL, NULL, &tot_objf_p, &tot_weight); + ComputeXvectorObjfAndDeriv(xvector_pairs, S, b_n, NULL, + NULL, NULL, NULL, &tot_objf_n, &tot_weight); + BaseFloat delta = (tot_objf_p - tot_objf_n) + * 1.0 / (2.0 * perturb_delta); + l2_b = pow(deriv_b - delta, 2); + KALDI_ASSERT(l2_xvector < 1.0e-03); + KALDI_ASSERT(l2_S < 1.0e-03); + KALDI_ASSERT(l2_b < 1.0e-03); + return true; +} + +bool TestXvectorComputeObjf() { + int32 xvector_dim = RandInt(4, 100), + num_rows = 2 * RandInt(2, 10); // The number of rows must be even + // and greater than 2. + CuSpMatrix S(xvector_dim); + S.SetRandn(); + // Necessary to keep the similarity scores from getting too large or small. + S.Scale(1.0e-01); + BaseFloat b = RandInt(-200, 200) / 10.0, + tot_weight, + tot_weight_test, + tot_objf, + tot_objf_test, + deriv_b, + deriv_b_test; + int32 S_dim = S.NumCols() * (S.NumCols() + 1) / 2; + CuMatrix xvector_pairs(num_rows, xvector_dim, kSetZero), + deriv_xvector(num_rows, xvector_dim, kSetZero), + deriv_xvector_test(num_rows, xvector_dim, kSetZero); + CuVector deriv_S(S_dim, kSetZero), + deriv_S_test(S_dim, kSetZero); + xvector_pairs.SetRandn(); + + ComputeXvectorObjfAndDeriv(xvector_pairs, S, b, &deriv_xvector, + &deriv_S, &deriv_b, NULL, &tot_objf, &tot_weight); + TestComputeXvectorObjfAndDeriv(xvector_pairs, S, b, &deriv_xvector_test, + &deriv_S_test, &deriv_b_test, &tot_objf_test, &tot_weight_test); + + CuVector deriv_xvector_vec(xvector_dim); + deriv_xvector_vec.AddRowSumMat(1.0, deriv_xvector, 0.0); + CuVector deriv_xvector_vec_test(xvector_dim); + deriv_xvector_vec_test.AddRowSumMat(1.0, deriv_xvector_test, 0.0); + KALDI_ASSERT(deriv_xvector.ApproxEqual(deriv_xvector_test, 0.01)); + + // Verify that the objfs are the same. + KALDI_ASSERT(ApproxEqual(tot_objf, tot_objf_test, 0.001)); + + // Also verify that the gradients are the same. 
+ for (int32 i = 0; i < deriv_xvector_vec.Dim(); i++)
+ KALDI_ASSERT(ApproxEqual(deriv_xvector_vec(i),
+ deriv_xvector_vec_test(i), 0.001));
+
+ // Verify that the S derivatives are the same.
+ for (int32 i = 0; i < deriv_S.Dim(); i++)
+ KALDI_ASSERT(ApproxEqual(deriv_S(i), deriv_S_test(i), 0.001));
+
+ // Verify that the b derivatives are the same.
+ KALDI_ASSERT(ApproxEqual(deriv_b, deriv_b_test, 0.001));
+ return true;
+}
+
+void TestComputeXvectorObjfAndDeriv(
+ const CuMatrixBase<BaseFloat> &xvector_pairs,
+ const CuSpMatrix<BaseFloat> &S,
+ BaseFloat b, CuMatrixBase<BaseFloat> *deriv_xvector,
+ CuVector<BaseFloat> *deriv_S, BaseFloat *deriv_b, BaseFloat *tot_objf,
+ BaseFloat *tot_weight) {
+
+ int32 N = xvector_pairs.NumRows();
+ BaseFloat same_objf = 0,
+ diff_objf = 0;
+ BaseFloat K = 1.0 / (N - 2.0);
+ (*deriv_b) = 0;
+ // Handle the portion of the objf corresponding to pairs of xvectors
+ // from the same class.
+ for (int32 i = 0; i < N/2; i++) {
+ const CuVector<BaseFloat> &v(xvector_pairs.Row(2 * i)),
+ &w(xvector_pairs.Row(2 * i + 1));
+ CuVector<BaseFloat> deriv_v,
+ deriv_w,
+ deriv_S_part;
+ BaseFloat similarity_score = TestSimilarityScore(v, w, S, b),
+ deriv_b_part = 0;
+ same_objf += Log(1 + Exp(-similarity_score));
+ TestGetDeriv(v, w, S, b, true, similarity_score, &deriv_v,
+ &deriv_w, &deriv_S_part, &deriv_b_part);
+ deriv_xvector->Row(2 * i).AddVec(1.0, deriv_v);
+ deriv_xvector->Row(2 * i + 1).AddVec(1.0, deriv_w);
+ deriv_S->AddVec(1.0, deriv_S_part);
+ (*deriv_b) += deriv_b_part;
+ }
+
+ // Handle the portion of the objf corresponding to pairs of xvectors
+ // from different classes.
+ for (int32 i = 0; i < N; i++) {
+ for (int32 j = 2 * std::ceil((i + 1) / 2.0); j < N; j++) {
+ const CuVector<BaseFloat> &v(xvector_pairs.Row(i)),
+ &w(xvector_pairs.Row(j));
+ CuVector<BaseFloat> deriv_v,
+ deriv_w,
+ deriv_S_part;
+ BaseFloat similarity_score = TestSimilarityScore(v, w, S, b),
+ deriv_b_part = 0;
+ diff_objf += Log(1 + Exp(similarity_score));
+ TestGetDeriv(v, w, S, b, false, similarity_score, &deriv_v,
+ &deriv_w, &deriv_S_part, &deriv_b_part);
+ deriv_xvector->Row(i).AddVec(K, deriv_v);
+ deriv_xvector->Row(j).AddVec(K, deriv_w);
+ deriv_S->AddVec(K, deriv_S_part);
+ (*deriv_b) += K * deriv_b_part;
+ }
+ }
+ // Scale the same and different portions of the objective function
+ // so that both contribute a weight of N.
+ (*tot_objf) = -same_objf - K * diff_objf;
+ (*tot_weight) = N;
+}
+
+
+void TestGetDeriv(const CuVector<BaseFloat> &v,
+ const CuVector<BaseFloat> &w, const CuSpMatrix<BaseFloat> &S,
+ BaseFloat b, bool is_same, BaseFloat similarity_score,
+ CuVector<BaseFloat> *deriv_v, CuVector<BaseFloat> *deriv_w,
+ CuVector<BaseFloat> *deriv_S, BaseFloat *deriv_b) {
+ int32 d = is_same ? 1 : -1,
+ S_dim = S.NumCols() * (S.NumCols() + 1) / 2;
+ deriv_v->Resize(v.Dim(), kSetZero);
+ deriv_w->Resize(v.Dim(), kSetZero);
+ deriv_S->Resize(S_dim, kSetZero);
+
+ // This scalar is common to the different derivatives.
+ BaseFloat deriv_coef = -d * Exp(-1 * d * similarity_score)
+ / (1 + Exp(-1 * d * similarity_score));
+
+ // Handle derivative with respect to v and w.
+ deriv_v->CopyFromVec(w);
+ deriv_w->CopyFromVec(v);
+ deriv_v->AddSpVec(2.0, S, v, -1.0);
+ deriv_w->AddSpVec(2.0, S, w, -1.0);
+ deriv_v->Scale(deriv_coef);
+ deriv_w->Scale(deriv_coef);
+
+ // Handle derivative with respect to S.
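(Editorial note on the S-derivative code below, not part of the patch: for a
symmetric S in packed triangular storage, each off-diagonal entry s_ij with
i != j occupies two positions of the full matrix, so
d(v' S v)/d s_ij = 2 v_i v_j off the diagonal, while d(v' S v)/d s_ii = v_i^2
on it. The AddVec2(2.0, .) calls followed by halving the diagonal therefore
build exactly the packed gradient entries of v' S v + w' S w, which are then
scaled by the common factor deriv_coef.)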
+ CuSpMatrix deriv_S_mat(S.NumCols(), kSetZero); + deriv_S_mat.AddVec2(2.0, v); + deriv_S_mat.AddVec2(2.0, w); + for (int32 i = 0; i < S.NumCols(); i++) + deriv_S_mat(i, i) = 0.5 * deriv_S_mat(i, i); + CuSubVector deriv_S_vec(deriv_S_mat.Data(), S_dim); + deriv_S->AddVec(deriv_coef, deriv_S_vec); + + // Handle derivative with respect to b. + (*deriv_b) = -deriv_coef; +} + +BaseFloat TestSimilarityScore(const CuVector &v, + const CuVector &w, const CuSpMatrix &S, + BaseFloat b) { + CuVector Sv(v.Dim()); + Sv.AddSpVec(1.0, S, v, 0); + CuVector Sw(w.Dim()); + Sw.AddSpVec(1.0, S, w, 0); + BaseFloat L = VecVec(v, w) - VecVec(v, Sv) - VecVec(w, Sw) + b; + return L; +} + +void UnitTestXvectorExtractor() { + if (!TestXvectorComputeObjf()) + KALDI_ERR << "Xvector objf test failed"; + if (!TestXvectorExtractorDerivative(1.0e-02) && + !TestXvectorExtractorDerivative(1.0e-03) && + !TestXvectorExtractorDerivative(1.0e-04) && + !TestXvectorExtractorDerivative(1.0e-05)) + KALDI_ERR << "Xvector derivative test failed"; +} + +} // namespace kaldi + +int main() { + using namespace kaldi; + for (int32 i = 0; i < 2; i++) { +#if HAVE_CUDA == 1 + if (i == 0) + CuDevice::Instantiate().SelectGpuId("no"); + else + CuDevice::Instantiate().SelectGpuId("yes"); +#endif + UnitTestXvectorExtractor(); + } + std::cout << "Xvector tests succeeded.\n"; + return 0; +} diff --git a/src/xvector/xvector.cc b/src/xvector/xvector.cc new file mode 100644 index 00000000000..10e05f8eef6 --- /dev/null +++ b/src/xvector/xvector.cc @@ -0,0 +1,130 @@ +// xvector/xvector.cc + +// Copyright 2016 David Snyder + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "xvector/xvector.h" + +namespace kaldi { + +void ComputeXvectorObjfAndDeriv( + const CuMatrixBase &xvector_pairs, + const CuSpMatrix &S, + BaseFloat b, CuMatrixBase *deriv_xvector, + CuVector *deriv_S, BaseFloat *deriv_b, + CuMatrixBase *scores_out, + BaseFloat *tot_objf, + BaseFloat *tot_weight) { + + int32 S_dim = S.NumCols() * (S.NumCols() + 1) / 2, + N = xvector_pairs.NumRows(), + xvector_dim = xvector_pairs.NumCols(); + (*tot_objf) = 0; + + if (deriv_xvector == NULL) + KALDI_ASSERT(deriv_S == NULL && deriv_b == NULL); + else { + KALDI_ASSERT(deriv_xvector->NumCols() == xvector_dim); + KALDI_ASSERT(deriv_xvector->NumRows() == N); + KALDI_ASSERT(deriv_S->Dim() == S_dim); + deriv_xvector->SetZero(); + deriv_S->SetZero(); + } + + + CuMatrix S_tmp(S), + P(N, xvector_dim), + Q(N, N), + R(N, N), + scores(N, N), // The raw scores. + objf_terms(N, N, kUndefined), + scores_deriv(N, N, // Derivative of the + kUndefined); // objf w.r.t. the scores. 
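(Editorial sketch, not part of the patch: the vectorized block below fills
'scores' so that scores(i, j) = v_i' v_j - v_i' S v_i - v_j' S v_j + b, where
v_i is row i of xvector_pairs. A naive quadratic-time reference, with a
hypothetical name and mirroring the Kaldi calls used elsewhere in this file,
might look like:)

void NaiveScores(const CuMatrixBase<BaseFloat> &xvector_pairs,
                 const CuSpMatrix<BaseFloat> &S, BaseFloat b,
                 CuMatrix<BaseFloat> *scores) {
  int32 N = xvector_pairs.NumRows();
  CuVector<BaseFloat> r(N);  // r(i) = v_i' S v_i, computed once per row.
  for (int32 i = 0; i < N; i++) {
    CuVector<BaseFloat> v(xvector_pairs.Row(i)), Sv(v.Dim());
    Sv.AddSpVec(1.0, S, v, 0.0);
    r(i) = VecVec(v, Sv);
  }
  for (int32 i = 0; i < N; i++) {
    CuVector<BaseFloat> v(xvector_pairs.Row(i));
    for (int32 j = 0; j < N; j++) {
      CuVector<BaseFloat> w(xvector_pairs.Row(j));
      (*scores)(i, j) = VecVec(v, w) - r(i) - r(j) + b;
    }
  }
}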
+ CuVector r(N); + + P.AddMatMat(1.0, xvector_pairs, kNoTrans, S_tmp, kNoTrans, 0.0); + r.AddDiagMatMat(1.0, xvector_pairs, kNoTrans, P, kTrans, 0.0); + R.AddVecToRows(1.0, r); + Q.SymAddMat2(1.0, xvector_pairs, kNoTrans, 0.0); + Q.CopyLowerToUpper(); + scores.AddMat(1.0, Q, kNoTrans); + scores.AddMat(-1.0, R, kTrans); + scores.AddMat(-1.0, R, kNoTrans); + scores.Add(b); + if (scores_out != NULL) { + KALDI_ASSERT(scores_out->NumCols() == scores.NumCols() + && scores_out->NumRows() == scores.NumRows()); + scores_out->CopyFromMat(scores); + } + + cu::ComputeXvectorObjfFromScores(scores, &objf_terms, &scores_deriv); + CuVector objf_terms_vec(N); + objf_terms_vec.AddRowSumMat(1.0, objf_terms); + (*tot_objf) = objf_terms_vec.Sum(); + + if (deriv_xvector != NULL) { + // compute the derivatives of tot_objf w.r.t the inputs. + CuMatrix scores_deriv_plus_trans(scores_deriv, kTrans); + scores_deriv_plus_trans.AddMat(1.0, scores_deriv, kNoTrans); + CuVector r_deriv(N); + r_deriv.AddRowSumMat(-1.0, scores_deriv_plus_trans, 0.0); + + // Compute derivative of the objf with respect to the xvectors. + deriv_xvector->AddDiagVecMat(2.0, r_deriv, P, kNoTrans, 0.0); + deriv_xvector->AddMatMat(1.0, scores_deriv_plus_trans, kNoTrans, + xvector_pairs, kNoTrans, 1.0); + + // Compute derivative of the objf with respect to the symmetric matrix S: + // S_deriv += xvector_pairs' * diag(r_deriv) * xvector_pairs + CuMatrix S_deriv_mat(xvector_dim, xvector_dim); + // we don't need P any more so re-use it temporarily + // rderiv_xvector_pairs is the product of diag(r_deriv) times xvector_pairs. + CuMatrix &rderiv_xvector_pairs(P); + rderiv_xvector_pairs.AddDiagVecMat(1.0, r_deriv, xvector_pairs, kNoTrans, 0.0); + S_deriv_mat.AddMatMat(1.0, xvector_pairs, kTrans, rderiv_xvector_pairs, kNoTrans, 0.0); + CuSpMatrix S_deriv_sp(xvector_dim); + S_deriv_sp.CopyFromMat(S_deriv_mat, kTakeLower); + + // at this point S_deriv_sp represents the deriv w.r.t. S represented as a + // symmetric matrix; but we need the deriv w.r.t. S represented as a packed + // vector, which is a little different because each off-diagonal element is + // only represented once in the packed vector. This means we need + // to scale the off-diag elements by 2. + S_deriv_sp.Scale(2.0); + S_deriv_sp.ScaleDiag(0.5); + deriv_S->CopyFromVec(CuSubVector(S_deriv_sp.Data(), + S_dim)); + + // Compute derivative of objf with respect to the scalar offset b. + (*deriv_b) = scores_deriv.Sum(); + } + (*tot_weight) = N; +} + +BaseFloat SimilarityScore(const Vector &v, + const Vector &w, const SpMatrix &S, + BaseFloat b) { + KALDI_ASSERT(v.Dim() == w.Dim() && v.Dim() == S.NumRows()); + Vector Sv(v.Dim()); + Sv.AddSpVec(1.0, S, v, 0); + Vector Sw(w.Dim()); + Sw.AddSpVec(1.0, S, w, 0); + BaseFloat L = VecVec(v, w) - VecVec(v, Sv) - VecVec(w, Sw) + b; + return L; +} + +} // namespace kaldi diff --git a/src/xvector/xvector.h b/src/xvector/xvector.h new file mode 100644 index 00000000000..fa6c580ab43 --- /dev/null +++ b/src/xvector/xvector.h @@ -0,0 +1,94 @@ +// xvector/xvector.h + +// Copyright 2016 Johns Hopkins University (Author: Daniel Povey) +// 2016 David Snyder + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_XVECTOR_XVECTOR_H_
+#define KALDI_XVECTOR_XVECTOR_H_
+
+#include <vector>
+#include "base/kaldi-common.h"
+#include "cudamatrix/cu-matrix-lib.h"
+#include "itf/options-itf.h"
+#include "util/common-utils.h"
+#include "matrix/matrix-lib.h"
+
+namespace kaldi {
+ /*
+ Computes the training objective function and the derivatives for
+ the xvector. Let N = xvector_pairs.NumRows() be the number of
+ xvectors. There are N(N-1)/2 pairs in total and N/2 from the same
+ class. Let v(n) be the n'th row of the matrix xvector_pairs.
+ The total objective function written to 'tot_objf' is
+ \sum_{n=0}^{N/2 - 1} log p_same(v(2n), v(2n+1))
+ + 1/(N-2) \sum_{n=0}^{N-1} \sum_{m=2*ceil((n+1)/2)}^{N-1}
+ log p_different(v(n), v(m)),
+ and the normalizer N, written to 'tot_weight', equals the total
+ (weighted) number of samples over which the objective function is
+ computed. It is useful for displaying the objective function correctly.
+ Let the log-odds L(v,w) [interpreted as log(p_same(v,w) / p_different(v,w))]
+ be defined as:
+ L(v, w) = v' w - v' S v - w' S w + b
+ then log p_same(v, w) = -log(1 + exp(-L(v, w))), and
+ log p_different(v, w) = log(1 - p_same(v, w)) = -log(1 + exp(L(v, w))).
+
+ @param [in] xvector_pairs Each row of 'xvector_pairs' is an xvector
+ extracted by the network for one sample, and the assumption is that
+ pairs of the form (2*k, 2*k+1), e.g., (0, 1), (2, 3), (4, 5), etc,
+ are from the same class, but any other pairs, e.g., (0, 2), (1, 2),
+ (2, 4), etc, are from different classes.
+ @param [out] deriv_xvector If non-NULL, the derivative of the objective
+ function with respect to the xvectors is written here.
+ @param [out] deriv_S If non-NULL, the derivative of the objective
+ function with respect to the parameter S is written here.
+ @param [out] deriv_b If the other derivatives are non-NULL, the derivative
+ of the objective function with respect to the parameter b is written here.
+ @param [out] scores_out If non-NULL, the matrix of raw scores
+ L(v(i), v(j)) is written here.
+ @param [out] tot_objf The total objective function described above.
+ @param [out] tot_weight The total normalizing factor for the objective
+ function, equal to xvector_pairs.NumRows().
+ */
+ void ComputeXvectorObjfAndDeriv(const CuMatrixBase<BaseFloat> &xvector_pairs,
+ const CuSpMatrix<BaseFloat> &S,
+ BaseFloat b,
+ CuMatrixBase<BaseFloat> *deriv_xvector,
+ CuVector<BaseFloat> *deriv_S,
+ BaseFloat *deriv_b,
+ CuMatrixBase<BaseFloat> *scores_out,
+ BaseFloat *tot_objf,
+ BaseFloat *tot_weight);
+
+ /*
+ Compute the similarity score between two input xvectors. The score is
+ defined as:
+ L(v, w) = v' w - v' S v - w' S w + b
+ @param [in] v The first xvector.
+ @param [in] w The second xvector.
+ @param [in] S A symmetric matrix, usually a constant output of the
+ Nnet the xvectors came from.
+ @param [in] b A scalar offset, usually a constant output of the Nnet
+ the xvectors came from.
+ @return The score between vectors v and w.
+ */
+ BaseFloat SimilarityScore(const Vector<BaseFloat> &v,
+ const Vector<BaseFloat> &w, const SpMatrix<BaseFloat> &S,
+ BaseFloat b);
+
+} // namespace kaldi
+
+#endif // KALDI_XVECTOR_XVECTOR_H_
diff --git a/src/xvectorbin/Makefile b/src/xvectorbin/Makefile
new file mode 100644
index 00000000000..63b78a36880
--- /dev/null
+++ b/src/xvectorbin/Makefile
@@ -0,0 +1,28 @@
+
+all:
+EXTRA_CXXFLAGS = -Wno-sign-compare
+include ../kaldi.mk
+
+LDFLAGS += $(CUDA_LDFLAGS)
+LDLIBS += $(CUDA_LDLIBS)
+
+BINFILES = nnet3-xvector-get-egs nnet3-xvector-compute-prob \
+ nnet3-xvector-show-progress nnet3-xvector-train \
+ nnet3-xvector-compute nnet3-xvector-scoring nnet3-xvector-get-egs-sre \
+ nnet3-xvector-get-egs-sre-subsample nnet3-xvector-compute-simple
+
+OBJFILES =
+
+# Add this dependency to force cuda-compiled.o to be rebuilt when we reconfigure.
+cuda-compiled.o: ../kaldi.mk
+
+TESTFILES =
+
+ADDLIBS = ../xvector/kaldi-xvector.a ../nnet3/kaldi-nnet3.a ../gmm/kaldi-gmm.a \
+ ../decoder/kaldi-decoder.a ../lat/kaldi-lat.a ../hmm/kaldi-hmm.a \
+ ../transform/kaldi-transform.a ../tree/kaldi-tree.a \
+ ../cudamatrix/kaldi-cudamatrix.a \
+ ../matrix/kaldi-matrix.a ../fstext/kaldi-fstext.a \
+ ../util/kaldi-util.a ../base/kaldi-base.a
+
+include ../makefiles/default_rules.mk
diff --git a/src/xvectorbin/nnet3-xvector-compute-prob.cc b/src/xvectorbin/nnet3-xvector-compute-prob.cc
new file mode 100644
index 00000000000..fb3975b259d
--- /dev/null
+++ b/src/xvectorbin/nnet3-xvector-compute-prob.cc
@@ -0,0 +1,81 @@
+// xvectorbin/nnet3-xvector-compute-prob.cc
+
+// Copyright 2015 Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "nnet3/nnet-diagnostics.h"
+#include "xvector/nnet-xvector-diagnostics.h"
+
+
+int main(int argc, char *argv[]) {
+ try {
+ using namespace kaldi;
+ using namespace kaldi::nnet3;
+ typedef kaldi::int32 int32;
+ typedef kaldi::int64 int64;
+
+ const char *usage =
+ "Computes and prints in logging messages the average log-prob per frame of\n"
+ "the given data with an nnet3 neural net. The input of this is the output of\n"
+ "e.g. nnet3-xvector-get-egs | nnet3-merge-egs.\n"
+ "\n"
+ "Usage: nnet3-xvector-compute-prob [options] <raw-model-in> <training-examples-in>\n"
+ "e.g.: nnet3-xvector-compute-prob 0.raw ark:valid.egs\n";
+
+
+ // This program doesn't support using a GPU, because these probabilities are
+ // used for diagnostics, and you can just compute them with a small enough
+ // amount of data that a CPU can do it within reasonable time.
+ + NnetComputeProbOptions opts; + + ParseOptions po(usage); + + opts.Register(&po); + + po.Read(argc, argv); + + if (po.NumArgs() != 2) { + po.PrintUsage(); + exit(1); + } + + std::string raw_nnet_rxfilename = po.GetArg(1), + examples_rspecifier = po.GetArg(2); + + Nnet nnet; + ReadKaldiObject(raw_nnet_rxfilename, &nnet); + + NnetXvectorComputeProb prob_computer(opts, nnet); + + SequentialNnetExampleReader example_reader(examples_rspecifier); + + for (; !example_reader.Done(); example_reader.Next()) + prob_computer.Compute(example_reader.Value()); + + bool ok = prob_computer.PrintTotalStats(); + + return (ok ? 0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} + + diff --git a/src/xvectorbin/nnet3-xvector-compute-simple.cc b/src/xvectorbin/nnet3-xvector-compute-simple.cc new file mode 100644 index 00000000000..e588edd90b7 --- /dev/null +++ b/src/xvectorbin/nnet3-xvector-compute-simple.cc @@ -0,0 +1,155 @@ +// nnet3bin/nnet3-compute.cc + +// Copyright 2012-2015 Johns Hopkins University (author: Daniel Povey) +// 2016 David Snyder + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "base/timer.h" +#include "nnet3/nnet-utils.h" +#include "xvector/nnet-xvector-compute.h" + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "Propagate the features through the network and write the output\n" + "xvectors. By default, xvectors are extracted once every\n" + "--xvector-period using --chunk-size frames and output as an archive\n" + "of matrices. If --repeat=true, the xvectors are copied between\n" + "periods, so that the output matrix has the same number of rows as\n" + "the input. If --output-as-vector=true, the xvectors are averaged\n" + "across periods, and the output is a single vector for each utterance.\n" + "\n" + "Usage: nnet3-xvector-compute [options] " + " \n" + " e.g.: nnet3-xvector-compute --xvector-period=50 final.raw " + "scp:feats.scp ark:xvectors.ark\n"; + + ParseOptions po(usage); + Timer timer; + + NnetSimpleComputationOptions opts; + std::string use_gpu = "yes"; + int32 chunk_size = 100; + + opts.Register(&po); + + po.Register("use-gpu", &use_gpu, + "yes|no|optional|wait, only has effect if compiled with CUDA"); + po.Register("chunk-size", &chunk_size, + "Feature chunk size over which the xvector is computed. 
" + "If not set, defaults to xvector-period."); + po.Read(argc, argv); + + if (po.NumArgs() != 3) { + po.PrintUsage(); + exit(1); + } + +#if HAVE_CUDA==1 + CuDevice::Instantiate().SelectGpuId(use_gpu); +#endif + + KALDI_ASSERT(chunk_size > 0); + + std::string nnet_rxfilename = po.GetArg(1), + feat_rspecifier = po.GetArg(2), + vector_wspecifier = po.GetArg(3); + Nnet nnet; + ReadKaldiObject(nnet_rxfilename, &nnet); + NnetXvectorComputer nnet_computer(opts, &nnet); + + BaseFloatVectorWriter vector_writer(vector_wspecifier); + + int32 num_success = 0, + num_fail = 0, + left_context, + right_context, + xvector_dim = nnet.OutputDim("output"); + int32 min_chunk_size = 100; + int64 frame_count = 0; + SequentialBaseFloatMatrixReader feat_reader(feat_rspecifier); + for (; !feat_reader.Done(); feat_reader.Next()) { + std::string utt = feat_reader.Key(); + const Matrix &feats (feat_reader.Value()); + int32 num_rows = feats.NumRows(), + feat_dim = feats.NumCols(), + this_chunk_size = chunk_size; + + if (num_rows < min_chunk_size) { + KALDI_WARN << "Minimum chunk size of " << min_chunk_size + << " is greater than the number of rows " + << "in utterance: " << utt; + num_fail++; + continue; + } else if (num_rows < this_chunk_size) { + KALDI_LOG << "Chunk size of " << this_chunk_size << " is greater than " + << "the number of rows in utterance: " << utt + << ", using chunk size of " << num_rows; + this_chunk_size = num_rows; + } + + int32 num_chunks = ceil(num_rows / static_cast(chunk_size)); + + Vector xvector_avg(xvector_dim, kSetZero); + BaseFloat tot_weight = 0.0; + + // Iterate over the feature chunks. + for (int32 chunk_indx = 0; chunk_indx < num_chunks; chunk_indx++) { + // If we're nearing the end of the input, we may need to shift the + // offset back so that we can get this_chunk_size frames of input to + // the nnet. + int32 offset = std::min(chunk_size, num_rows - chunk_indx * chunk_size); + if (offset < min_chunk_size) + continue; + SubMatrix sub_feats(feats, chunk_indx * chunk_size, offset, + 0, feat_dim); + Vector xvector(xvector_dim); + nnet_computer.ComputeXvector(sub_feats, &xvector); + tot_weight += offset; + xvector_avg.AddVec(offset, xvector); + } + + // If output is a vector, scale it by the total weight. + xvector_avg.Scale(1.0 / tot_weight); + vector_writer.Write(utt, xvector_avg); + + frame_count += feats.NumRows(); + num_success++; + } + + double elapsed = timer.Elapsed(); + KALDI_LOG << "Time taken "<< elapsed + << "s: real-time factor assuming 100 frames/sec is " + << (elapsed*100.0/frame_count); + KALDI_LOG << "Done " << num_success << " utterances, failed for " + << num_fail; + + if (num_success != 0) return 0; + else return 1; + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/src/xvectorbin/nnet3-xvector-compute.cc b/src/xvectorbin/nnet3-xvector-compute.cc new file mode 100644 index 00000000000..c2d16c867ca --- /dev/null +++ b/src/xvectorbin/nnet3-xvector-compute.cc @@ -0,0 +1,211 @@ +// nnet3bin/nnet3-compute.cc + +// Copyright 2012-2015 Johns Hopkins University (author: Daniel Povey) +// 2016 David Snyder + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "base/timer.h" +#include "nnet3/nnet-utils.h" +#include "xvector/nnet-xvector-compute.h" + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "Propagate the features through the network and write the output\n" + "xvectors. By default, xvectors are extracted once every\n" + "--xvector-period using --chunk-size frames and output as an archive\n" + "of matrices. If --repeat=true, the xvectors are copied between\n" + "periods, so that the output matrix has the same number of rows as\n" + "the input. If --output-as-vector=true, the xvectors are averaged\n" + "across periods, and the output is a single vector for each utterance.\n" + "\n" + "Usage: nnet3-xvector-compute [options] " + " \n" + " e.g.: nnet3-xvector-compute --xvector-period=50 final.raw " + "scp:feats.scp ark:xvectors.ark\n"; + + ParseOptions po(usage); + Timer timer; + + NnetSimpleComputationOptions opts; + std::string use_gpu = "yes"; + int32 xvector_period = 10, + chunk_size = -1; + bool output_as_vector = false, + repeat = false; + + opts.Register(&po); + + po.Register("use-gpu", &use_gpu, + "yes|no|optional|wait, only has effect if compiled with CUDA"); + po.Register("xvector-period", &xvector_period, + "Extract a new xvector once for each period."); + po.Register("chunk-size", &chunk_size, + "Feature chunk size over which the xvector is computed. " + "If not set, defaults to xvector-period."); + po.Register("output-as-vector", &output_as_vector, + "If true, average the chunk-level xvectors and output as an " + "archive of vectors."); + po.Register("repeat", &repeat, "If true, the xvectors are copied between " + "periods so that the output has the same number of rows as the input."); + po.Read(argc, argv); + + if (po.NumArgs() != 3) { + po.PrintUsage(); + exit(1); + } + +#if HAVE_CUDA==1 + CuDevice::Instantiate().SelectGpuId(use_gpu); +#endif + + if (output_as_vector && repeat) + KALDI_ERR << "Options --output-as-vector and --repeat cannot both " + << "be true."; + if (chunk_size == -1) + chunk_size = xvector_period; + + KALDI_ASSERT(chunk_size > 0 && xvector_period > 0); + + std::string nnet_rxfilename = po.GetArg(1), + feat_rspecifier = po.GetArg(2), + vector_wspecifier = po.GetArg(3); + Nnet nnet; + ReadKaldiObject(nnet_rxfilename, &nnet); + NnetXvectorComputer nnet_computer(opts, &nnet); + + BaseFloatMatrixWriter matrix_writer(output_as_vector + ? "" : vector_wspecifier); + BaseFloatVectorWriter vector_writer(output_as_vector + ? 
vector_wspecifier : ""); + + int32 num_success = 0, + num_fail = 0, + left_context, + right_context, + xvector_dim = nnet.OutputDim("output"); + ComputeSimpleNnetContext(nnet, &left_context, &right_context); + int32 min_chunk_size = left_context + right_context; + int64 frame_count = 0; + + SequentialBaseFloatMatrixReader feat_reader(feat_rspecifier); + for (; !feat_reader.Done(); feat_reader.Next()) { + std::string utt = feat_reader.Key(); + const Matrix &feats (feat_reader.Value()); + int32 num_rows = feats.NumRows(), + feat_dim = feats.NumCols(), + this_chunk_size = chunk_size; + + if (num_rows < min_chunk_size) { + KALDI_WARN << "Minimum chunk size of " << min_chunk_size + << " is greater than the number of rows " + << "in utterance: " << utt; + num_fail++; + continue; + } else if (num_rows < this_chunk_size) { + KALDI_LOG << "Chunk size of " << this_chunk_size << " is greater than " + << "the number of rows in utterance: " << utt + << ", using chunk size of " << num_rows; + this_chunk_size = num_rows; + } + + int32 num_chunks = ceil((num_rows - this_chunk_size) + / static_cast(xvector_period)) + 1; + int32 num_xvectors = repeat ? num_rows : num_chunks; + + // The number of frames by which the last two chunks overlap. + int32 overlap = std::max(0, (num_chunks - 1) * xvector_period + - num_rows + this_chunk_size); + BaseFloat total_chunk_weight = 0.0; + Vector xvector_avg; + Matrix xvector_mat; + + // Create the output xvector vector or matrix. Only allocate memory + // for the one we're going to output. + if (output_as_vector) + xvector_avg.Resize(xvector_dim); + else + xvector_mat.Resize(num_xvectors, xvector_dim); + + // Iterate over the feature chunks. + for (int32 chunk_indx = 0; chunk_indx < num_chunks; chunk_indx++) { + // If we're nearing the end of the input, we may need to shift the + // offset back so that we can get this_chunk_size frames of input to + // the nnet. + int32 offset = std::min(chunk_indx * xvector_period, + num_rows - this_chunk_size); + SubMatrix sub_feats(feats, offset, this_chunk_size, + 0, feat_dim); + Vector xvector(xvector_dim); + nnet_computer.ComputeXvector(sub_feats, &xvector); + + if (output_as_vector) { + // The second to last chunk may have extra overlap with the + // final chunk. We need to reduce the weight on these + // chunks, so that the overlapping portion isn't counted twice. + BaseFloat weight; + if (chunk_indx < num_chunks - 2) + weight = this_chunk_size; + else + weight = this_chunk_size - 0.5 * overlap; + total_chunk_weight += weight; + xvector_avg.AddVec(weight, xvector); + // Cases for outputting as a matrix: + } else if (repeat) { + int32 out_offset = chunk_indx * xvector_period; + for (int32 i = out_offset; + i < std::min(out_offset + xvector_period, num_rows); i++) + xvector_mat.Row(i).CopyFromVec(xvector); + } else { + xvector_mat.Row(chunk_indx).CopyFromVec(xvector); + } + } + + // If output is a vector, scale it by the total weight. 
+ if (output_as_vector) { + xvector_avg.Scale(1.0 / total_chunk_weight); + vector_writer.Write(utt, xvector_avg); + } else { + matrix_writer.Write(utt, xvector_mat); + } + + frame_count += feats.NumRows(); + num_success++; + } + + double elapsed = timer.Elapsed(); + KALDI_LOG << "Time taken "<< elapsed + << "s: real-time factor assuming 100 frames/sec is " + << (elapsed*100.0/frame_count); + KALDI_LOG << "Done " << num_success << " utterances, failed for " + << num_fail; + + if (num_success != 0) return 0; + else return 1; + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/src/xvectorbin/nnet3-xvector-get-egs-sre-subsample.cc b/src/xvectorbin/nnet3-xvector-get-egs-sre-subsample.cc new file mode 100644 index 00000000000..685279f356a --- /dev/null +++ b/src/xvectorbin/nnet3-xvector-get-egs-sre-subsample.cc @@ -0,0 +1,264 @@ +// xvectorbin/nnet3-xvector-get-egs.cc + +// Copyright 2012-2016 Johns Hopkins University (author: Daniel Povey) +// 2016 David Snyder + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "util/common-utils.h" +#include "nnet3/nnet-example.h" + +namespace kaldi { +namespace nnet3 { + +// A struct for holding information about the position and +// duration of each pair of chunks. +struct ChunkPairInfo { + std::string pair_name; + std::string utt1; + std::string utt2; + int32 output_archive_id; + int32 start_frame1; + int32 start_frame2; + int32 num_frames1; + int32 num_frames2; +}; + +// Process the range input file and store it as a map from utterance +// name to vector of ChunkPairInfo structs. 
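(Editorial worked example for the parser below; the utterance names are
hypothetical. The 8-field line

  utt1 utt2 3 13 0 65 112 110

is parsed as utt1 = "utt1", utt2 = "utt2", output_archive_id = 3, fields[3]
("13") unused by this variant, start_frame1 = 0, num_frames1 = 65,
start_frame2 = 112, num_frames2 = 110; each parsed pair is appended to the
flat 'pairs' vector under the pair_name "utt1-0-65-utt2-112-110".)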
+static void ProcessRangeFile(const std::string &range_rxfilename,
+ std::vector<ChunkPairInfo *> *pairs) {
+ Input range_input(range_rxfilename);
+ if (!range_rxfilename.empty()) {
+ std::string line;
+ while (std::getline(range_input.Stream(), line)) {
+ ChunkPairInfo *pair = new ChunkPairInfo();
+ std::vector<std::string> fields;
+ SplitStringToVector(line, " \t\n\r", true, &fields);
+ if (fields.size() != 8)
+ KALDI_ERR << "Expected 8 fields in line of range file, got "
+ << fields.size() << " instead.";
+
+ std::string utt1 = fields[0],
+ utt2 = fields[1],
+ start_frame1_str = fields[4],
+ num_frames1_str = fields[5],
+ start_frame2_str = fields[6],
+ num_frames2_str = fields[7];
+ pair->utt1 = utt1;
+ pair->utt2 = utt2;
+ if (!ConvertStringToInteger(fields[2], &(pair->output_archive_id))
+ || !ConvertStringToInteger(start_frame1_str, &(pair->start_frame1))
+ || !ConvertStringToInteger(start_frame2_str, &(pair->start_frame2))
+ || !ConvertStringToInteger(num_frames1_str, &(pair->num_frames1))
+ || !ConvertStringToInteger(num_frames2_str, &(pair->num_frames2)))
+ KALDI_ERR << "Expected integers for the archive index and frame "
+ << "fields of the range file.";
+ pair->pair_name = utt1 + "-" + start_frame1_str + "-" + num_frames1_str
+ + "-" + utt2
+ + "-" + start_frame2_str + "-" + num_frames2_str;
+ pairs->push_back(pair);
+ }
+ }
+}
+
+static void WriteExample(const MatrixBase<BaseFloat> &feat1,
+ const MatrixBase<BaseFloat> &feat2,
+ const ChunkPairInfo *pair,
+ int32 subsample,
+ bool compress,
+ int32 *num_egs_written,
+ std::vector<NnetExampleWriter *> *example_writers) {
+ NnetExample eg;
+ int32 num_rows1 = feat1.NumRows(),
+ feat_dim1 = feat1.NumCols(),
+ num_rows2 = feat2.NumRows(),
+ feat_dim2 = feat2.NumCols();
+ std::string utt1 = pair->utt1,
+ utt2 = pair->utt2;
+
+ KALDI_ASSERT(feat_dim1 == feat_dim2);
+
+ if (num_rows1 < pair->num_frames1) {
+ KALDI_WARN << "Unable to create examples for utterance "
+ << utt1
+ << ". Requested chunk size of "
+ << pair->num_frames1
+ << " but utterance has only " << num_rows1 << " frames.";
+ return;
+ }
+ if (num_rows2 < pair->num_frames2) {
+ KALDI_WARN << "Unable to create examples for utterance "
+ << utt2
+ << ". Requested chunk size of "
+ << pair->num_frames2
+ << " but utterance has only " << num_rows2 << " frames.";
+ return;
+ }
+ // The requested chunk positions are approximate. It's possible
+ // that they slightly exceed the number of frames in the utterance.
+ // If that occurs, we can shift the chunk's location back slightly.
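(For instance, editorial: with num_rows1 = 500, start_frame1 = 450 and
num_frames1 = 100, shift1 = min(0, 500 - 450 - 100) = -50, so the chunk is
taken from frame 400 and still contains the requested 100 frames.)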
+ int32 shift1 = std::min(0, num_rows1 - pair->start_frame1
+ - pair->num_frames1),
+ shift2 = std::min(0, num_rows2 - pair->start_frame2
+ - pair->num_frames2);
+
+ SubMatrix<BaseFloat> chunk1_sub(feat1, pair->start_frame1 + shift1,
+ pair->num_frames1, 0, feat_dim1),
+ chunk2_sub(feat2, pair->start_frame2 + shift2,
+ pair->num_frames2, 0, feat_dim2);
+ Matrix<BaseFloat> chunk1_tmp(chunk1_sub);
+ Matrix<BaseFloat> chunk2_tmp(chunk2_sub);
+
+ int32 real_chunk_size1 = chunk1_tmp.NumRows() / subsample;
+ int32 real_chunk_size2 = chunk2_tmp.NumRows() / subsample;
+ Matrix<BaseFloat> chunk1(real_chunk_size1, chunk1_tmp.NumCols());
+ Matrix<BaseFloat> chunk2(real_chunk_size2, chunk2_tmp.NumCols());
+
+ std::vector<int32> index_vector1;
+ for (int32 i = 0; i < chunk1_tmp.NumRows(); i++)
+ index_vector1.push_back(i);
+
+ std::vector<int32> index_vector2;
+ for (int32 i = 0; i < chunk2_tmp.NumRows(); i++)
+ index_vector2.push_back(i);
+
+ // Keep a random 1/subsample of the frames of each chunk: after shuffling
+ // the frame indexes, copy the rows selected by the first real_chunk_size
+ // entries of the shuffled list.
+ std::random_shuffle(index_vector1.begin(), index_vector1.end());
+ for (int32 i = 0; i < real_chunk_size1; i++)
+ chunk1.Row(i).CopyFromVec(chunk1_tmp.Row(index_vector1[i]));
+
+ std::random_shuffle(index_vector2.begin(), index_vector2.end());
+ for (int32 i = 0; i < real_chunk_size2; i++)
+ chunk2.Row(i).CopyFromVec(chunk2_tmp.Row(index_vector2[i]));
+
+ NnetIo nnet_io1 = NnetIo("input", 0, chunk1),
+ nnet_io2 = NnetIo("input", 0, chunk2);
+ for (std::vector<Index>::iterator indx_it = nnet_io1.indexes.begin();
+ indx_it != nnet_io1.indexes.end(); ++indx_it)
+ indx_it->n = 0;
+ for (std::vector<Index>::iterator indx_it = nnet_io2.indexes.begin();
+ indx_it != nnet_io2.indexes.end(); ++indx_it)
+ indx_it->n = 1;
+
+ eg.io.push_back(nnet_io1);
+ eg.io.push_back(nnet_io2);
+ if (compress)
+ eg.Compress();
+
+ if (pair->output_archive_id >= example_writers->size())
+ KALDI_ERR << "Requested output index exceeds number of specified "
+ << "output files.";
+ (*example_writers)[pair->output_archive_id]->Write(
+ pair->pair_name, eg);
+ (*num_egs_written) += 1;
+}
+
+// Delete the dynamically allocated memory.
+static void Cleanup(std::vector<ChunkPairInfo *> *pairs,
+ std::vector<NnetExampleWriter *> *writers) {
+ for (std::vector<ChunkPairInfo *>::iterator
+ vec_it = pairs->begin(); vec_it != pairs->end();
+ ++vec_it)
+ delete *vec_it;
+ for (std::vector<NnetExampleWriter *>::iterator
+ it = writers->begin(); it != writers->end(); ++it)
+ delete *it;
+}
+
+} // namespace nnet3
+} // namespace kaldi
+
+int main(int argc, char *argv[]) {
+ try {
+ using namespace kaldi;
+ using namespace kaldi::nnet3;
+ typedef kaldi::int32 int32;
+
+ const char *usage =
+ "Get examples for training an nnet3 neural network for the xvector\n"
+ "system. Each output example contains a pair of feature chunks from\n"
+ "the same utterance. The location and length of the feature chunks\n"
+ "are specified in the 'ranges' file. Each line is interpreted as\n"
+ "follows:\n"
+ " <source-utterance1> <source-utterance2> <relative-output-archive-index> "
+ " <absolute-archive-index> <start-frame-index1> <num-frames1> "
+ " <start-frame-index2> <num-frames2>\n"
+ "where <relative-output-archive-index> is interpreted as a zero-based\n"
+ "index into the wspecifiers specified on the command line (<egs-0-out>\n"
+ "and so on), and <absolute-archive-index> is ignored by this program.\n"
+ "For example:\n"
+ " utt1 utt2 3 13 0 65 112 110\n"
+ " utt3 utt4 0 10 160 50 214 180\n"
+ " utt5 ...\n"
+ "\n"
+ "Usage: nnet3-xvector-get-egs [options] <ranges-filename> <feats-rspecifier> "
+ " ... 
\n" + "\n" + "For example:\n" + "nnet3-xvector-get-egs ranges.1 \"$feats\" ark:egs_temp.1.ark" + " ark:egs_temp.2.ark ark:egs_temp.3.ark\n"; + + bool compress = true; + int32 subsample = 5; + + ParseOptions po(usage); + po.Register("compress", &compress, "If true, write egs in " + "compressed format."); + po.Register("subsample", &subsample, "TODO"); + + po.Read(argc, argv); + + if (po.NumArgs() < 3) { + po.PrintUsage(); + exit(1); + } + + std::string + range_rspecifier = po.GetArg(1), + feature_rspecifier = po.GetArg(2); + std::vector example_writers; + + for (int32 i = 3; i <= po.NumArgs(); i++) + example_writers.push_back(new NnetExampleWriter(po.GetArg(i))); + + std::vector pairs; + ProcessRangeFile(range_rspecifier, &pairs); + RandomAccessBaseFloatMatrixReader feat_reader1(feature_rspecifier); + RandomAccessBaseFloatMatrixReader feat_reader2(feature_rspecifier); + int32 num_done = 0, + num_err = 0, + num_egs_written = 0; + for (int32 i = 0; i < pairs.size(); i++) { + ChunkPairInfo *pair = pairs[i]; + const Matrix &feat1(feat_reader1.Value(pair->utt1)); + const Matrix &feat2(feat_reader2.Value(pair->utt2)); + WriteExample(feat1, feat2, pair, subsample, compress, &num_egs_written, + &example_writers); + num_done++; + } + Cleanup(&pairs, &example_writers); + + KALDI_LOG << "Finished generating examples, " + << "successfully processed " << num_done + << " feature files, wrote " << num_egs_written << " examples; " + << num_err << " files had errors."; + return (num_egs_written == 0 || num_err > num_done ? 1 : 0); + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} diff --git a/src/xvectorbin/nnet3-xvector-get-egs-sre.cc b/src/xvectorbin/nnet3-xvector-get-egs-sre.cc new file mode 100644 index 00000000000..28fde0fbf36 --- /dev/null +++ b/src/xvectorbin/nnet3-xvector-get-egs-sre.cc @@ -0,0 +1,237 @@ +// xvectorbin/nnet3-xvector-get-egs.cc + +// Copyright 2012-2016 Johns Hopkins University (author: Daniel Povey) +// 2016 David Snyder + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "util/common-utils.h" +#include "nnet3/nnet-example.h" + +namespace kaldi { +namespace nnet3 { + +// A struct for holding information about the position and +// duration of each pair of chunks. +struct ChunkPairInfo { + std::string pair_name; + std::string utt1; + std::string utt2; + int32 output_archive_id; + int32 start_frame1; + int32 start_frame2; + int32 num_frames1; + int32 num_frames2; +}; + +// Process the range input file and store it as a map from utterance +// name to vector of ChunkPairInfo structs. 
+static void ProcessRangeFile(const std::string &range_rxfilename, + std::vector *pairs) { + Input range_input(range_rxfilename); + if (!range_rxfilename.empty()) { + std::string line; + while (std::getline(range_input.Stream(), line)) { + ChunkPairInfo *pair = new ChunkPairInfo(); + std::vector fields; + SplitStringToVector(line, " \t\n\r", true, &fields); + if (fields.size() != 8) + KALDI_ERR << "Expected 7 fields in line of range file, got " + << fields.size() << " instead."; + + std::string utt1 = fields[0], + utt2 = fields[1], + start_frame1_str = fields[4], + num_frames1_str = fields[5], + start_frame2_str = fields[6], + num_frames2_str = fields[7]; + pair->utt1 = utt1; + pair->utt2 = utt2; + if (!ConvertStringToInteger(fields[2], &(pair->output_archive_id)) + || !ConvertStringToInteger(start_frame1_str, &(pair->start_frame1)) + || !ConvertStringToInteger(start_frame2_str, &(pair->start_frame2)) + || !ConvertStringToInteger(num_frames1_str, &(pair->num_frames1)) + || !ConvertStringToInteger(num_frames2_str, &(pair->num_frames2))) + KALDI_ERR << "Expected integer for output archive in range file."; + pair->pair_name = utt1 + "-" + start_frame1_str + "-" + num_frames1_str + + "-" + utt2 + + "-" + start_frame2_str + "-" + num_frames2_str; + pairs->push_back(pair); + } + } +} + +static void WriteExample(const MatrixBase &feat1, + const MatrixBase &feat2, + const ChunkPairInfo *pair, + bool compress, + int32 *num_egs_written, + std::vector *example_writers) { + NnetExample eg; + int32 num_rows1 = feat1.NumRows(), + feat_dim1 = feat1.NumCols(), + num_rows2 = feat2.NumRows(), + feat_dim2 = feat2.NumCols(); + std::string utt1 = pair->utt1, + utt2 = pair->utt2; + + KALDI_ASSERT(feat_dim1 == feat_dim2); + + if (num_rows1 < pair->num_frames1) { + KALDI_WARN << "Unable to create examples for utterance " + << utt1 + << ". Requested chunk size of " + << pair->num_frames1 + << " but utterance has only " << num_rows1 << " frames."; + return; + } + if (num_rows2 < pair->num_frames2) { + KALDI_WARN << "Unable to create examples for utterance " + << utt2 + << ". Requested chunk size of " + << pair->num_frames2 + << " but utterance has only " << num_rows2 << " frames."; + return; + } + // The requested chunk positions are approximate. It's possible + // that they slightly exceed the number of frames in the utterance. + // If that occurs, we can shift the chunks location back slightly. + int32 shift1 = std::min(0, num_rows1 - pair->start_frame1 + - pair->num_frames1), + shift2 = std::min(0, num_rows2 - pair->start_frame2 + - pair->num_frames2); + + SubMatrix chunk1(feat1, pair->start_frame1 + shift1, + pair->num_frames1, 0, feat_dim1), + chunk2(feat2, pair->start_frame2 + shift2, + pair->num_frames2, 0, feat_dim2); + NnetIo nnet_io1 = NnetIo("input", 0, chunk1), + nnet_io2 = NnetIo("input", 0, chunk2); + for (std::vector::iterator indx_it = nnet_io1.indexes.begin(); + indx_it != nnet_io1.indexes.end(); ++indx_it) + indx_it->n = 0; + for (std::vector::iterator indx_it = nnet_io2.indexes.begin(); + indx_it != nnet_io2.indexes.end(); ++indx_it) + indx_it->n = 1; + + eg.io.push_back(nnet_io1); + eg.io.push_back(nnet_io2); + if (compress) + eg.Compress(); + + if (pair->output_archive_id >= example_writers->size()) + KALDI_ERR << "Requested output index exceeds number of specified " + << "output files."; + (*example_writers)[pair->output_archive_id]->Write( + pair->pair_name, eg); + (*num_egs_written) += 1; +} + +// Delete the dynamically allocated memory. 
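(Editorial aside: the explicit Cleanup() below is needed because the pair and
writer vectors hold raw owning pointers. A minimal sketch of the same
ownership pattern with std::unique_ptr, assuming C++11 were available, under
which the manual deletes become unnecessary:)

#include <memory>
#include <vector>

void OwnershipSketch() {
  // An owning container: when it goes out of scope, every element is freed,
  // so no explicit cleanup pass is required.
  std::vector<std::unique_ptr<int> > owned;
  owned.push_back(std::unique_ptr<int>(new int(42)));
}  // all elements deleted here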
+static void Cleanup(std::vector *pairs, + std::vector *writers) { + for (std::vector::iterator + vec_it = pairs->begin(); vec_it != pairs->end(); + ++vec_it) + delete *vec_it; + for (std::vector::iterator + it = writers->begin(); it != writers->end(); ++it) + delete *it; +} + +} // namespace nnet3 +} // namespace kaldi + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + typedef kaldi::int32 int32; + + const char *usage = + "Get examples for training an nnet3 neural network for the xvector\n" + "system. Each output example contains a pair of feature chunks from\n" + "the same utterance. The location and length of the feature chunks\n" + "are specified in the 'ranges' file. Each line is interpreted as\n" + "follows:\n" + " " + " " + " \n" + "where is interpreted as a zero-based\n" + "index into the wspecifiers specified on the command line (\n" + "and so on), and is ignored by this program.\n" + "For example:\n" + " utt1 3 13 0 65 112 110\n" + " utt1 0 10 160 50 214 180\n" + " utt2 ...\n" + "\n" + "Usage: nnet3-xvector-get-egs [options] " + " ... \n" + "\n" + "For example:\n" + "nnet3-xvector-get-egs ranges.1 \"$feats\" ark:egs_temp.1.ark" + " ark:egs_temp.2.ark ark:egs_temp.3.ark\n"; + + bool compress = true; + + ParseOptions po(usage); + po.Register("compress", &compress, "If true, write egs in " + "compressed format."); + + po.Read(argc, argv); + + if (po.NumArgs() < 3) { + po.PrintUsage(); + exit(1); + } + + std::string + range_rspecifier = po.GetArg(1), + feature_rspecifier = po.GetArg(2); + std::vector example_writers; + + for (int32 i = 3; i <= po.NumArgs(); i++) + example_writers.push_back(new NnetExampleWriter(po.GetArg(i))); + + std::vector pairs; + ProcessRangeFile(range_rspecifier, &pairs); + RandomAccessBaseFloatMatrixReader feat_reader1(feature_rspecifier); + RandomAccessBaseFloatMatrixReader feat_reader2(feature_rspecifier); + int32 num_done = 0, + num_err = 0, + num_egs_written = 0; + for (int32 i = 0; i < pairs.size(); i++) { + ChunkPairInfo *pair = pairs[i]; + const Matrix &feat1(feat_reader1.Value(pair->utt1)); + const Matrix &feat2(feat_reader2.Value(pair->utt2)); + WriteExample(feat1, feat2, pair, compress, &num_egs_written, + &example_writers); + num_done++; + } + Cleanup(&pairs, &example_writers); + + KALDI_LOG << "Finished generating examples, " + << "successfully processed " << num_done + << " feature files, wrote " << num_egs_written << " examples; " + << num_err << " files had errors."; + return (num_egs_written == 0 || num_err > num_done ? 1 : 0); + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} diff --git a/src/xvectorbin/nnet3-xvector-get-egs.cc b/src/xvectorbin/nnet3-xvector-get-egs.cc new file mode 100644 index 00000000000..ab9a020e839 --- /dev/null +++ b/src/xvectorbin/nnet3-xvector-get-egs.cc @@ -0,0 +1,244 @@ +// xvectorbin/nnet3-xvector-get-egs.cc + +// Copyright 2012-2016 Johns Hopkins University (author: Daniel Povey) +// 2016 David Snyder + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "util/common-utils.h" +#include "nnet3/nnet-example.h" + +namespace kaldi { +namespace nnet3 { + +// A struct for holding information about the position and +// duration of each pair of chunks. +struct ChunkPairInfo { + std::string pair_name; + int32 output_archive_id; + int32 start_frame1; + int32 start_frame2; + int32 num_frames1; + int32 num_frames2; +}; + +// Process the range input file and store it as a map from utterance +// name to vector of ChunkPairInfo structs. +static void ProcessRangeFile(const std::string &range_rxfilename, + unordered_map > *utt_to_pairs) { + Input range_input(range_rxfilename); + if (!range_rxfilename.empty()) { + std::string line; + while (std::getline(range_input.Stream(), line)) { + ChunkPairInfo *pair = new ChunkPairInfo(); + std::vector fields; + SplitStringToVector(line, " \t\n\r", true, &fields); + if (fields.size() != 7) + KALDI_ERR << "Expected 7 fields in line of range file, got " + << fields.size() << " instead."; + + std::string utt = fields[0], + start_frame1_str = fields[3], + num_frames1_str = fields[4], + start_frame2_str = fields[5], + num_frames2_str = fields[6]; + + if (!ConvertStringToInteger(fields[1], &(pair->output_archive_id)) + || !ConvertStringToInteger(start_frame1_str, &(pair->start_frame1)) + || !ConvertStringToInteger(start_frame2_str, &(pair->start_frame2)) + || !ConvertStringToInteger(num_frames1_str, &(pair->num_frames1)) + || !ConvertStringToInteger(num_frames2_str, &(pair->num_frames2))) + KALDI_ERR << "Expected integer for output archive in range file."; + pair->pair_name = utt + "-" + start_frame1_str + "-" + num_frames1_str + + "-" + start_frame2_str + "-" + num_frames2_str; + unordered_map >::iterator + got = utt_to_pairs->find(utt); + if (got == utt_to_pairs->end()) { + std::vector pairs; + pairs.push_back(pair); + utt_to_pairs->insert(std::pair > (utt, pairs)); + } else { + got->second.push_back(pair); + } + } + } +} + +static void WriteExamples(const MatrixBase &feats, + const std::vector &pairs, + const std::string &utt, + bool compress, + int32 *num_egs_written, + std::vector *example_writers) { + for (std::vector::const_iterator it = pairs.begin(); + it != pairs.end(); ++it) { + ChunkPairInfo *pair = *it; + NnetExample eg; + int32 num_rows = feats.NumRows(), + feat_dim = feats.NumCols(); + if (num_rows < std::max(pair->num_frames1, pair->num_frames2)) { + KALDI_WARN << "Unable to create examples for utterance " << utt + << ". Requested chunk size of " + << std::max(pair->num_frames1, pair->num_frames2) + << " but utterance has only " << num_rows << " frames."; + } else { + // The requested chunk positions are approximate. It's possible + // that they slightly exceed the number of frames in the utterance. + // If that occurs, we can shift the chunks location back slightly. 
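(Editorial note on the block below, not part of the patch: after clamping the
chunk positions, the two chunks of a pair are packed into one NnetExample as
two NnetIo objects both named "input"; the loops over 'indexes' relabel the
example index so the first chunk carries n = 0 and the second n = 1, which is
how downstream code tells the two members of a pair apart.)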
+ int32 shift1 = std::min(0, num_rows - pair->start_frame1 + - pair->num_frames1), + shift2 = std::min(0, num_rows - pair->start_frame2 + - pair->num_frames2); + SubMatrix chunk1(feats, pair->start_frame1 + shift1, + pair->num_frames1, 0, feat_dim), + chunk2(feats, pair->start_frame2 + shift2, + pair->num_frames2, 0, feat_dim); + NnetIo nnet_io1 = NnetIo("input", 0, chunk1), + nnet_io2 = NnetIo("input", 0, chunk2); + for (std::vector::iterator indx_it = nnet_io1.indexes.begin(); + indx_it != nnet_io1.indexes.end(); ++indx_it) + indx_it->n = 0; + for (std::vector::iterator indx_it = nnet_io2.indexes.begin(); + indx_it != nnet_io2.indexes.end(); ++indx_it) + indx_it->n = 1; + + NnetExample eg; + eg.io.push_back(nnet_io1); + eg.io.push_back(nnet_io2); + if (compress) + eg.Compress(); + + if (pair->output_archive_id >= example_writers->size()) + KALDI_ERR << "Requested output index exceeds number of specified " + << "output files."; + (*example_writers)[pair->output_archive_id]->Write( + pair->pair_name, eg); + (*num_egs_written) += 1; + } + } +} + +// Delete the dynamically allocated memory. +static void Cleanup(unordered_map > *utt_to_pairs, + std::vector *writers) { + for (unordered_map >::iterator + map_it = utt_to_pairs->begin(); + map_it != utt_to_pairs->end(); ++map_it) + for (std::vector::iterator + vec_it = map_it->second.begin(); vec_it != map_it->second.end(); + ++vec_it) + delete *vec_it; + for (std::vector::iterator + it = writers->begin(); it != writers->end(); ++it) + delete *it; +} + +} // namespace nnet3 +} // namespace kaldi + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + typedef kaldi::int32 int32; + + const char *usage = + "Get examples for training an nnet3 neural network for the xvector\n" + "system. Each output example contains a pair of feature chunks from\n" + "the same utterance. The location and length of the feature chunks\n" + "are specified in the 'ranges' file. Each line is interpreted as\n" + "follows:\n" + " " + " " + " \n" + "where is interpreted as a zero-based\n" + "index into the wspecifiers specified on the command line (\n" + "and so on), and is ignored by this program.\n" + "For example:\n" + " utt1 3 13 0 65 112 110\n" + " utt1 0 10 160 50 214 180\n" + " utt2 ...\n" + "\n" + "Usage: nnet3-xvector-get-egs [options] " + " ... 
\n" + "\n" + "For example:\n" + "nnet3-xvector-get-egs ranges.1 \"$feats\" ark:egs_temp.1.ark" + " ark:egs_temp.2.ark ark:egs_temp.3.ark\n"; + + bool compress = true; + + ParseOptions po(usage); + po.Register("compress", &compress, "If true, write egs in " + "compressed format."); + + po.Read(argc, argv); + + if (po.NumArgs() < 3) { + po.PrintUsage(); + exit(1); + } + + std::string + range_rspecifier = po.GetArg(1), + feature_rspecifier = po.GetArg(2); + std::vector example_writers; + + for (int32 i = 3; i <= po.NumArgs(); i++) + example_writers.push_back(new NnetExampleWriter(po.GetArg(i))); + + unordered_map > utt_to_pairs; + ProcessRangeFile(range_rspecifier, &utt_to_pairs); + SequentialBaseFloatMatrixReader feat_reader(feature_rspecifier); + + int32 num_done = 0, + num_err = 0, + num_egs_written = 0; + + for (; !feat_reader.Done(); feat_reader.Next()) { + std::string key = feat_reader.Key(); + const Matrix &feats = feat_reader.Value(); + unordered_map >::iterator + got = utt_to_pairs.find(key); + if (got == utt_to_pairs.end()) { + KALDI_WARN << "Could not create examples from utterance " + << key << " because it has no entry in the ranges " + << "input file."; + num_err++; + } else { + std::vector pairs = got->second; + WriteExamples(feats, pairs, key, compress, &num_egs_written, + &example_writers); + num_done++; + } + } + Cleanup(&utt_to_pairs, &example_writers); + + KALDI_LOG << "Finished generating examples, " + << "successfully processed " << num_done + << " feature files, wrote " << num_egs_written << " examples; " + << num_err << " files had errors."; + return (num_egs_written == 0 || num_err > num_done ? 1 : 0); + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} diff --git a/src/xvectorbin/nnet3-xvector-scoring.cc b/src/xvectorbin/nnet3-xvector-scoring.cc new file mode 100644 index 00000000000..0b2512df83d --- /dev/null +++ b/src/xvectorbin/nnet3-xvector-scoring.cc @@ -0,0 +1,151 @@ +// xvectorbin/nnet3-xvector-scoring.cc + +// Copyright 2013 Daniel Povey +// 2016 David Snyder + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "nnet3/nnet-utils.h"
+#include "xvector/xvector.h"
+
+
+int main(int argc, char *argv[]) {
+  using namespace kaldi;
+  using namespace kaldi::nnet3;
+  typedef kaldi::int32 int32;
+  typedef kaldi::int64 int64;
+  try {
+    const char *usage =
+        "Computes scores between pairs of xvectors.\n"
+        "The 'trials-file' has lines of the form\n"
+        "  <key1> <key2>\n"
+        "and the output will have the form\n"
+        "  <key1> <key2> [<score>]\n"
+        "(if either key could not be found, the score field in the output\n"
+        "will be absent, and this program will print a warning)\n"
+        "\n"
+        "Usage: nnet3-xvector-scoring [options] <nnet-in> <trials-in> "
+        "<xvector1-rspecifier> <xvector2-rspecifier> <scores-out>\n"
+        "e.g.: \n"
+        " nnet3-xvector-scoring nnet.final trials scp:spk_xvectors.scp "
+        "scp:test_xvectors.scp trials.scored\n"
+        "See also: ivector-plda-scoring and ivector-compute-dot-products\n";
+
+    ParseOptions po(usage);
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 5) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string nnet_rxfilename = po.GetArg(1),
+        trials_rxfilename = po.GetArg(2),
+        xvector1_rspecifier = po.GetArg(3),
+        xvector2_rspecifier = po.GetArg(4),
+        scores_wxfilename = po.GetArg(5);
+
+
+    int64 num_done = 0, num_err = 0;
+    Nnet nnet;
+    ReadKaldiObject(nnet_rxfilename, &nnet);
+    // We need to ensure that the Nnet has outputs called 's' and 'b'
+    // and that 'b' is a scalar and 's' can be interpreted as a symmetric
+    // matrix.
+    int32 s_index = nnet.GetNodeIndex("s"),
+        b_index = nnet.GetNodeIndex("b");
+    if (s_index == -1 || b_index == -1)
+      KALDI_ERR << "The input Nnet cannot be used for xvector scoring "
+                << "because it has no output called 's' or 'b'.";
+    if (!nnet.IsOutputNode(s_index) || !nnet.IsOutputNode(b_index))
+      KALDI_ERR << "The nodes 's' and 'b' must be output nodes.";
+
+    int32 s_dim = nnet.OutputDim("s"),
+        b_dim = nnet.OutputDim("b");
+    if (b_dim != 1)
+      KALDI_ERR << "The output 'b' should be a scalar offset, but the "
Input Nnet has an" + << "output called 'b' but it has a dimension of " << b_dim; + int32 d = (0.5) * (1 + sqrt(1 + 8 * s_dim)) - 1; + if (((d + 1) * d) / 2 != s_dim) + KALDI_ERR << "Output 's' cannot be interpretedas a symmetric matrix."; + Vector s_vec(s_dim); + Vector b_vec(1); + GetConstantOutput(nnet, "s", &s_vec); + GetConstantOutput(nnet, "b", &b_vec); + SpMatrix S(d); + SubVector s_vec_sub(s_vec, 0, s_dim); + S.CopyFromVec(s_vec_sub); + BaseFloat b = b_vec(0); + + RandomAccessBaseFloatVectorReader xvector1_reader(xvector1_rspecifier); + RandomAccessBaseFloatVectorReader xvector2_reader(xvector2_rspecifier); + + Input ki(trials_rxfilename); + + bool binary = false; + Output ko(scores_wxfilename, binary); + double sum = 0.0, sumsq = 0.0; + + std::string line; + while (std::getline(ki.Stream(), line)) { + std::vector fields; + SplitStringToVector(line, " \t\n\r", true, &fields); + if (fields.size() != 2) { + KALDI_ERR << "Bad line " << (num_done + num_err) << " in input " + << "(expected two fields: key1 key2): " << line; + } + std::string key1 = fields[0], key2 = fields[1]; + if (!xvector1_reader.HasKey(key1)) { + KALDI_WARN << "Key " << key1 << " not present in 1st table of xvectors."; + num_err++; + continue; + } + if (!xvector2_reader.HasKey(key2)) { + KALDI_WARN << "Key " << key2 << " not present in 2nd table of xvectors."; + num_err++; + continue; + } + const Vector &xvector1 = xvector1_reader.Value(key1), + &xvector2 = xvector2_reader.Value(key2); + // The following will crash if the dimensions differ, but + // they would likely also differ for all the xvectors so it's probably + // best to just crash. + BaseFloat score = SimilarityScore(xvector1, xvector2, S, b); + sum += score; + sumsq += score * score; + num_done++; + ko.Stream() << key1 << ' ' << key2 << ' ' << score << std::endl; + } + + if (num_done != 0) { + BaseFloat mean = sum / num_done, scatter = sumsq / num_done, + variance = scatter - mean * mean, stddev = sqrt(variance); + KALDI_LOG << "Mean score was " << mean << ", standard deviation was " + << stddev; + } + KALDI_LOG << "Processed " << num_done << " trials " << num_err + << " had errors."; + return (num_done != 0 ? 0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/src/xvectorbin/nnet3-xvector-show-progress.cc b/src/xvectorbin/nnet3-xvector-show-progress.cc new file mode 100644 index 00000000000..951a7b1eb3a --- /dev/null +++ b/src/xvectorbin/nnet3-xvector-show-progress.cc @@ -0,0 +1,158 @@ +// xvectorbin/nnet3-xvector-show-progress.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) +// 2015 Xingyu Na + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "hmm/transition-model.h"
+#include "nnet3/nnet-utils.h"
+#include "nnet3/nnet-diagnostics.h"
+#include "xvector/nnet-xvector-diagnostics.h"
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace kaldi::nnet3;
+    typedef kaldi::int32 int32;
+    typedef kaldi::int64 int64;
+
+    const char *usage =
+        "Given an old and a new 'raw' nnet3 network and some training examples\n"
+        "(possibly held-out), show the average objective function given the\n"
+        "mean of the two networks, and the breakdown by component of why this\n"
+        "happened (computed from derivative information). Also shows parameter\n"
+        "differences per layer. If training examples are not provided, only\n"
+        "shows parameter differences per layer.\n"
+        "\n"
+        "Usage: nnet3-xvector-show-progress [options] <old-net-in> "
+        "<new-net-in> [<training-examples-in>]\n"
+        "e.g.: nnet3-xvector-show-progress 1.nnet 2.nnet ark:valid.egs\n";
+
+    ParseOptions po(usage);
+
+    int32 num_segments = 1;
+    std::string use_gpu = "no";
+    NnetComputeProbOptions compute_prob_opts;
+    compute_prob_opts.compute_deriv = true;
+
+    po.Register("num-segments", &num_segments,
+                "Number of line segments used for computing derivatives");
+    po.Register("use-gpu", &use_gpu,
+                "yes|no|optional|wait, only has effect if compiled with CUDA");
+    compute_prob_opts.Register(&po);
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() < 2 || po.NumArgs() > 3) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+#if HAVE_CUDA==1
+    CuDevice::Instantiate().SelectGpuId(use_gpu);
+#endif
+
+    std::string nnet1_rxfilename = po.GetArg(1),
+        nnet2_rxfilename = po.GetArg(2),
+        examples_rspecifier = po.GetOptArg(3);
+
+    Nnet nnet1, nnet2;
+    ReadKaldiObject(nnet1_rxfilename, &nnet1);
+    ReadKaldiObject(nnet2_rxfilename, &nnet2);
+
+    if (NumParameters(nnet1) != NumParameters(nnet2)) {
+      KALDI_WARN << "Parameter-dim mismatch, cannot show progress.";
+      exit(0);
+    }
+
+    if (!examples_rspecifier.empty()) {
+      std::vector<NnetExample> examples;
+      SequentialNnetExampleReader example_reader(examples_rspecifier);
+      for (; !example_reader.Done(); example_reader.Next())
+        examples.push_back(example_reader.Value());
+
+      int32 num_examples = examples.size();
+
+      if (num_examples == 0)
+        KALDI_ERR << "No examples read.";
+
+      int32 num_updatable = NumUpdatableComponents(nnet1);
+      Vector<BaseFloat> diff(num_updatable);
+
+      for (int32 s = 0; s < num_segments; s++) {
+        // start and end segments of the line between 0 and 1
+        BaseFloat start = (s + 0.0) / num_segments,
+            end = (s + 1.0) / num_segments, middle = 0.5 * (start + end);
+        Nnet interp_nnet(nnet2);
+        ScaleNnet(middle, &interp_nnet);
+        AddNnet(nnet1, 1.0 - middle, &interp_nnet);
+
+        NnetXvectorComputeProb prob_computer(compute_prob_opts, interp_nnet);
+        std::vector<NnetExample>::const_iterator eg_iter = examples.begin(),
+            eg_end = examples.end();
+        for (; eg_iter != eg_end; ++eg_iter)
+          prob_computer.Compute(*eg_iter);
+        const SimpleObjectiveInfo *objf_info =
+            prob_computer.GetObjective("output");
+        double objf_per_frame = objf_info->tot_objective / objf_info->tot_weight;
+        const Nnet &nnet_gradient = prob_computer.GetDeriv();
+        KALDI_LOG << "At position " << middle
+                  << ", objf per frame is " << objf_per_frame;
+
+        Vector<BaseFloat> old_dotprod(num_updatable), new_dotprod(num_updatable);
+        ComponentDotProducts(nnet_gradient, nnet1, &old_dotprod);
+        ComponentDotProducts(nnet_gradient, nnet2, &new_dotprod);
+        old_dotprod.Scale(1.0 / objf_info->tot_weight);
+        new_dotprod.Scale(1.0 / objf_info->tot_weight);
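+        // Comment added for clarity: new_dotprod and old_dotprod now hold,
+        // per updatable component, the gradient dotted with nnet2 and nnet1
+        // respectively (normalized per frame by tot_weight). Their
+        // difference, scaled by 1 / num_segments, is a first-order
+        // (Taylor-series) estimate of this segment's contribution to the
+        // objective change along the line from nnet1 to nnet2; the next two
+        // calls accumulate it into 'diff'.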
+        diff.AddVec(1.0 / num_segments, new_dotprod);
+        diff.AddVec(-1.0 / num_segments, old_dotprod);
+        KALDI_VLOG(1) << "By segment " << s << ", objf change is "
+                      << PrintVectorPerUpdatableComponent(nnet1, diff);
+      }
+      KALDI_LOG << "Total objf change per component is "
+                << PrintVectorPerUpdatableComponent(nnet1, diff);
+    }
+
+    { // Get info about magnitude of parameter change.
+      Nnet diff_nnet(nnet1);
+      AddNnet(nnet2, -1.0, &diff_nnet);
+      int32 num_updatable = NumUpdatableComponents(diff_nnet);
+      Vector<BaseFloat> dot_prod(num_updatable);
+      ComponentDotProducts(diff_nnet, diff_nnet, &dot_prod);
+      dot_prod.ApplyPow(0.5); // take sqrt to get l2 norm of diff
+      KALDI_LOG << "Parameter differences per layer are "
+                << PrintVectorPerUpdatableComponent(nnet1, dot_prod);
+
+      Vector<BaseFloat> baseline_prod(num_updatable);
+      ComponentDotProducts(nnet1, nnet1, &baseline_prod);
+      baseline_prod.ApplyPow(0.5);
+      dot_prod.DivElements(baseline_prod);
+      KALDI_LOG << "Relative parameter differences per layer are "
+                << PrintVectorPerUpdatableComponent(nnet1, dot_prod);
+    }
+#if HAVE_CUDA==1
+    CuDevice::Instantiate().PrintProfile();
+#endif
+    return 0;
+  } catch(const std::exception &e) {
+    std::cerr << e.what() << '\n';
+    return -1;
+  }
+}
+
+
diff --git a/src/xvectorbin/nnet3-xvector-train.cc b/src/xvectorbin/nnet3-xvector-train.cc
new file mode 100644
index 00000000000..a120879e72c
--- /dev/null
+++ b/src/xvectorbin/nnet3-xvector-train.cc
@@ -0,0 +1,94 @@
+// xvectorbin/nnet3-xvector-train.cc
+
+// Copyright 2015  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "nnet3/nnet-training.h"
+#include "xvector/nnet-xvector-training.h"
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace kaldi::nnet3;
+    typedef kaldi::int32 int32;
+    typedef kaldi::int64 int64;
+
+    const char *usage =
+        "Train xvector neural network parameters with backprop and stochastic\n"
+        "gradient descent. Minibatches are to be created by nnet3-merge-egs in\n"
+        "the input pipeline. This training program is single-threaded (best to\n"
+        "use it with a GPU); see nnet3-train-parallel for multi-threaded training\n"
+        "that is better suited to CPUs.\n"
+        "\n"
+        "Usage: nnet3-xvector-train [options] <raw-model-in> "
+        "<training-examples-in> <raw-model-out>\n"
+        "\n"
+        "e.g.:\n"
+        "nnet3-xvector-train 1.raw 'ark:nnet3-merge-egs ark:1.egs ark:-|' 2.raw\n";
+
+    bool binary_write = true;
+    std::string use_gpu = "yes";
+    NnetTrainerOptions train_config;
+
+    ParseOptions po(usage);
+    po.Register("binary", &binary_write, "Write output in binary mode");
+    po.Register("use-gpu", &use_gpu,
+                "yes|no|optional|wait, only has effect if compiled with CUDA");
+
+    train_config.Register(&po);
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 3) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+#if HAVE_CUDA==1
+    CuDevice::Instantiate().SelectGpuId(use_gpu);
+#endif
+
+    std::string nnet_rxfilename = po.GetArg(1),
+        examples_rspecifier = po.GetArg(2),
+        nnet_wxfilename = po.GetArg(3);
+
+    Nnet nnet;
+    ReadKaldiObject(nnet_rxfilename, &nnet);
+
+    NnetXvectorTrainer trainer(train_config, &nnet);
+
+    SequentialNnetExampleReader example_reader(examples_rspecifier);
+
+    for (; !example_reader.Done(); example_reader.Next())
+      trainer.Train(example_reader.Value());
+
+    bool ok = trainer.PrintTotalStats();
+
+#if HAVE_CUDA==1
+    CuDevice::Instantiate().PrintProfile();
+#endif
+    WriteKaldiObject(nnet, nnet_wxfilename, binary_write);
+    KALDI_LOG << "Wrote model to " << nnet_wxfilename;
+    return (ok ? 0 : 1);
+  } catch(const std::exception &e) {
+    std::cerr << e.what() << '\n';
+    return -1;
+  }
+}
+
+
diff --git a/tools/config/common_path.sh b/tools/config/common_path.sh
index 9a7ae2d9b29..4dba58ac929 100644
--- a/tools/config/common_path.sh
+++ b/tools/config/common_path.sh
@@ -22,4 +22,6 @@ ${KALDI_ROOT}/src/rnnlmbin:\
 ${KALDI_ROOT}/src/sgmm2bin:\
 ${KALDI_ROOT}/src/sgmmbin:\
 ${KALDI_ROOT}/src/tfrnnlmbin:\
+${KALDI_ROOT}/src/xvectorbin:\
+${KALDI_ROOT}/src/fvectorbin:\
 $PATH