From d605149ec4fd84536f7a1a08f63200c00798cc8e Mon Sep 17 00:00:00 2001 From: LvHang Date: Mon, 16 Apr 2018 18:04:47 -0400 Subject: [PATCH 1/2] merge raw data codes fix some bugs about raw waveform codes add make_raw_feats script add ApplyMinMaxToWeights fix the scripts to use max/min_param_value On/Off batchnorm in fft component add a nnet3/get_egs_old.sh backup small fix small fix2 --- egs/multi_en/s5/cmd.sh | 8 +- egs/multi_en/s5/conf/queue_no_k20.conf | 16 + egs/multi_en/s5/conf/raw_no_mvn.conf | 7 + egs/wsj/s5/steps/libs/common.py | 51 ++ egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py | 1 + egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py | 6 +- .../s5/steps/libs/nnet3/xconfig/raw_signal.py | 552 ++++++++++++++++++ egs/wsj/s5/steps/make_raw_feats.sh | 129 ++++ egs/wsj/s5/steps/nnet3/chain/get_egs_old.sh | 428 ++++++++++++++ egs/wsj/s5/steps/nnet3/chain/train.py | 27 + src/featbin/Makefile | 2 +- src/featbin/compute-raw-frame-feats.cc | 122 ++++ src/nnet3/nnet-chain-training.cc | 3 + src/nnet3/nnet-component-itf.cc | 46 ++ src/nnet3/nnet-component-itf.h | 15 +- src/nnet3/nnet-convolutional-component.h | 1 + src/nnet3/nnet-general-component.h | 2 + src/nnet3/nnet-simple-component.cc | 449 +++++++++++++- src/nnet3/nnet-simple-component.h | 155 ++++- src/nnet3/nnet-utils.cc | 16 + src/nnet3/nnet-utils.h | 9 + 21 files changed, 2036 insertions(+), 9 deletions(-) create mode 100644 egs/multi_en/s5/conf/queue_no_k20.conf create mode 100644 egs/multi_en/s5/conf/raw_no_mvn.conf create mode 100644 egs/wsj/s5/steps/libs/nnet3/xconfig/raw_signal.py create mode 100755 egs/wsj/s5/steps/make_raw_feats.sh create mode 100755 egs/wsj/s5/steps/nnet3/chain/get_egs_old.sh create mode 100644 src/featbin/compute-raw-frame-feats.cc diff --git a/egs/multi_en/s5/cmd.sh b/egs/multi_en/s5/cmd.sh index ed453ff8416..4f3b12aa700 100755 --- a/egs/multi_en/s5/cmd.sh +++ b/egs/multi_en/s5/cmd.sh @@ -10,7 +10,7 @@ # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
-export train_cmd="queue.pl --mem 2G"
-export decode_cmd="queue.pl --mem 4G"
-export mkgraph_cmd="queue.pl --mem 8G"
-export cuda_cmd="queue.pl --gpu 1"
+export train_cmd="queue.pl --mem 2G --config conf/queue_no_k20.conf --allow-k10-k20 true"
+export decode_cmd="queue.pl --mem 4G --config conf/queue_no_k20.conf --allow-k10-k20 true"
+export mkgraph_cmd="queue.pl --mem 8G --config conf/queue_no_k20.conf --allow-k10-k20 true"
+export cuda_cmd="queue.pl --gpu 1 --config conf/queue_no_k20.conf --allow-k10-k20 true"
diff --git a/egs/multi_en/s5/conf/queue_no_k20.conf b/egs/multi_en/s5/conf/queue_no_k20.conf
new file mode 100644
index 00000000000..e8d19a24ef7
--- /dev/null
+++ b/egs/multi_en/s5/conf/queue_no_k20.conf
@@ -0,0 +1,16 @@
+# Default configuration
+command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -l 'hostname=!a08*&!a09*&!a10*&!c04*&!b18*&!b19*&!b20*'
+option mem=* -l mem_free=$0,ram_free=$0
+option mem=0          # Do not add anything to qsub_opts
+option num_threads=* -pe smp $0
+option num_threads=1  # Do not add anything to qsub_opts
+option max_jobs_run=* -tc $0
+default gpu=0
+option gpu=0 -q all.q
+option gpu=* -l gpu=$0 -q g.q
+default allow_k20=true
+option allow_k20=true
+option allow_k20=false -l 'hostname=!g01*&!g02*&!b06*'
+default allow_k10_k20=true
+option allow_k10_k20=true
+option allow_k10_k20=false -l 'hostname=!b0*&!b10*&!g01*&!g02'
diff --git a/egs/multi_en/s5/conf/raw_no_mvn.conf b/egs/multi_en/s5/conf/raw_no_mvn.conf
new file mode 100644
index 00000000000..90fe4eed9d4
--- /dev/null
+++ b/egs/multi_en/s5/conf/raw_no_mvn.conf
@@ -0,0 +1,7 @@
+# configs for raw wav features
+--sample-frequency=8000
+--remove-dc-offset=false
+--loudness-equalize=false
+--remove-global-dc-offset=false
+--snip-edges=false
+--dither=1
diff --git a/egs/wsj/s5/steps/libs/common.py b/egs/wsj/s5/steps/libs/common.py
index 1e8e2ced6ce..5dffde08145 100644
--- a/egs/wsj/s5/steps/libs/common.py
+++ b/egs/wsj/s5/steps/libs/common.py
@@ -17,6 +17,7 @@
 import subprocess
 import sys
 import threading
+import numpy as np

 logger = logging.getLogger(__name__)
 logger.addHandler(logging.NullHandler())
@@ -517,3 +518,53 @@ def write_idct_matrix(feat_dim, cepstral_lifter, file_path):
         for k in range(0, feat_dim):
             idct_matrix[k].append(0)
     write_kaldi_matrix(file_path, idct_matrix)
+
+
+def compute_sin_cos_transform_matrix(K, N, compute_cosine=True, add_bias=False, half_range=False):
+    assert(K <= N)
+    n_range = (N/2 if half_range is True else N)
+    matrix = [[0] * (K + (1 if add_bias else 0)) for i in range(n_range)]
+    if compute_cosine:
+        for k in range(0, K):
+            for n in range(0, n_range):
+                matrix[n][k] = math.cos(2 * math.pi / float(N) * n * k)
+    else:
+        for k in range(0, K):
+            for n in range(0, n_range):
+                matrix[n][k] = -1.0 * math.sin(2 * math.pi / float(N) * n * k)
+    return matrix
+
+def write_sin_cos_transform_matrix(feat_dim, fft_dim, file_path, compute_cosine=True, add_bias=False, half_range=False):
+    # generate the discrete sine or cosine transform and write it to the file.
+    transform_matrix = compute_sin_cos_transform_matrix(feat_dim, fft_dim,
+        compute_cosine=compute_cosine, add_bias=add_bias, half_range=half_range)
+    write_kaldi_matrix(file_path, transform_matrix)
+
+def write_negate_vector(fft_dim, file_path):
+    scale_vec = [[-1.0] * fft_dim]
+    write_kaldi_matrix(file_path, scale_vec)
+
+# This function computes a transform applying mean-subtraction -> pre-emphasis
+# -> windowing, which can be used at the beginning of the network.
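+# A minimal usage sketch (dims are illustrative, not required): for
+# 80-sample frames,
+#   compute_and_write_preprocess_transform(0.97, 80, 'configs/preprocess.mat')
+# writes an 80 x 81 matrix (the extra column is a zero bias term) equal to
+# window * preemphasis * mean_subtraction, so a single FixedAffineComponent
+# can apply all three steps to each raw frame.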
+def compute_and_write_preprocess_transform(preemph, dim, file_path):
+    preemph_mat = [[0] * dim for i in range(dim)]
+    mean_subtract_mat = [[-1.0/dim] * dim for i in range(dim)]
+    window_mat = [[0] * dim for i in range(dim)]
+    preemph_mat[0][0] = 1.0 - 1.0 * preemph
+    for i in range(dim):
+        if (i > 0):
+            preemph_mat[i][i-1] = -1.0 * preemph
+            preemph_mat[i][i] = 1.0
+        mean_subtract_mat[i][i] = 1.0 - 1.0/dim
+        if (i == 0):
+            i_fl = float(i+1)
+        elif (i == (dim-1)):
+            i_fl = float(dim-2.0)
+        else:
+            i_fl = float(i)
+        window_mat[i][i] = (0.5 - 0.5 * math.cos(2 * math.pi * i_fl / float(dim)))**0.85
+    tot_mat_tmp = np.dot(preemph_mat, mean_subtract_mat)
+    tot_mat = np.dot(window_mat, tot_mat_tmp)
+    bias = np.zeros((dim, 1))
+    biased_mat = np.c_[tot_mat, bias]
+    write_kaldi_matrix(file_path, biased_mat)
diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py
index 32d6e87eba1..0a82ee44615 100644
--- a/egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py
+++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py
@@ -10,3 +10,4 @@
 from .gru import *
 from .stats_layer import *
 from .trivial_layers import *
+from .raw_signal import *
diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py
index a3e3e970311..cccd29ad53d 100644
--- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py
+++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py
@@ -71,7 +71,11 @@
     'renorm-component': xlayers.XconfigRenormComponent,
     'batchnorm-component': xlayers.XconfigBatchnormComponent,
     'no-op-component': xlayers.XconfigNoOpComponent,
-    'linear-component': xlayers.XconfigLinearComponent
+    'linear-component': xlayers.XconfigLinearComponent,
+    'preprocess-fft-abs-norm-lognorm-affine-log-layer': xlayers.XconfigFftFilterLayer,
+    'preprocess-fft-abs-lognorm-affine-log-layer': xlayers.XconfigFftFilterLayer,
+    'preprocess-fft-abs-log-layer': xlayers.XconfigFftFilterLayer,
+    'preprocess-tconv-abs-log-nin-affine-layer': xlayers.XconfigTimeDomainLayer
 }

 # Turn a config line and a list of previous layers into
diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/raw_signal.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/raw_signal.py
new file mode 100644
index 00000000000..d2cffe04cc7
--- /dev/null
+++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/raw_signal.py
@@ -0,0 +1,552 @@
+# Copyright 2017 Pegah Ghahremani
+# Apache 2.0.
+
+""" This module contains layer types for processing raw waveform frames.
+"""
+
+from __future__ import print_function
+import math
+import re
+import sys
+from libs.nnet3.xconfig.basic_layers import XconfigLayerBase
+
+# This class is used for frequency-domain filter learning.
+# It parses lines like
+#   'preprocess-fft-abs-lognorm-affine-log-layer num-filters=100 l2-regularize=0.001'
+# The stages are:
+# preprocess : applies mean subtraction, pre-emphasis and windowing to the
+#              input frames.
+# fft : computes the real and imaginary parts of the discrete Fourier
+#       transform, using fixed sine and cosine transforms.
+# abs : computes the magnitude of the complex FFT output.
+# lognorm : normalizes the input in log-space using batchnorm followed by a
+#           per-element scale and offset.
+# affine : filterbank learned using AffineComponent.
+
+class XconfigFftFilterLayer(XconfigLayerBase):
+    def __init__(self, first_token, key_to_value, prev_names = None):
+        # Here we just list some likely combinations; you can add any
+        # combinations you want to use to this list.
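+        # Example of the dimensions involved (numbers are illustrative only):
+        # with input dim 80 (10ms of 8kHz audio), fft_dim = 2**(79).bit_length()
+        # = 128, the abs stage keeps fft_dim/2 = 64 magnitude bins, and the
+        # affine stage maps them to num-filters (default 100) learned filters.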
+        assert first_token in ['preprocess-fft-abs-lognorm-affine-log-layer',
+                               'preprocess-fft-abs-norm-lognorm-affine-log-layer',
+                               'preprocess-fft-abs-norm-affine-log-layer',
+                               'preprocess-fft-abs-log-layer']
+        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)
+
+    def set_default_configs(self):
+        # l2-regularize, min-param-value and max-param-value affect the
+        # affine (filterbank) layer.
+        self.config = {'input': '[-1]',
+                       'dim': -1,
+                       'max-change': 0.75,
+                       'target-rms': 1.0,
+                       'learning-rate-factor': 1.0,
+                       'max-param-value': 1.0,
+                       'min-param-value': 0.0,
+                       'l2-regularize': 0.005,
+                       'write-init-config': True,
+                       'num-filters': 100,
+                       'sin-transform-file': '',
+                       'cos-transform-file': '',
+                       'scale': 1.0,
+                       'half-fft-range': False}
+
+    def check_configs(self):
+        if self.config['target-rms'] < 0.0:
+            raise RuntimeError("target-rms has invalid value {0}"
+                               .format(self.config['target-rms']))
+        if self.config['learning-rate-factor'] <= 0.0:
+            raise RuntimeError("learning-rate-factor has invalid value {0}"
+                               .format(self.config['learning-rate-factor']))
+        if self.config['max-param-value'] < self.config['min-param-value']:
+            raise RuntimeError("max-param-value {0} should be larger than "
+                               "min-param-value {1}."
+                               "".format(self.config['max-param-value'],
+                                         self.config['min-param-value']))
+
+        if self.config['sin-transform-file'] == '':
+            raise RuntimeError("sin-transform-file must be set.")
+
+        if self.config['cos-transform-file'] == '':
+            raise RuntimeError("cos-transform-file must be set.")
+
+    def output_name(self, auxiliary_output=None):
+        assert auxiliary_output == None
+
+        split_layer_name = self.layer_type.split('-')
+        assert split_layer_name[-1] == 'layer'
+        last_nonlinearity = split_layer_name[-2]
+        return '{0}.{1}'.format(self.name, last_nonlinearity)
+
+
+    def output_dim(self):
+        split_layer_name = self.layer_type.split('-')
+        if 'affine' in split_layer_name:
+            output_dim = self.config['num-filters']
+            if 'norm' in split_layer_name:
+                output_dim = output_dim + 1
+        else:
+            input_dim = self.descriptors['input']['dim']
+            fft_dim = (2**(input_dim-1).bit_length())
+            half_fft_range = self.config['half-fft-range']
+            output_dim = (fft_dim/2 if half_fft_range is True else fft_dim)
+        return output_dim
+
+    def get_full_config(self):
+        ans = []
+        config_lines = self._generate_config()
+
+        for line in config_lines:
+            if len(line) == 2:
+                # an ('init'|'ref'|'final', line) tuple already exists in the
+                # line; these lines correspond to the fft stage, which uses
+                # FixedAffineComponent.
+                assert(line[0] == 'init' or line[0] == 'ref' or line[0] == 'final')
+                ans.append(line)
+            else:
+                for config_name in ['ref', 'final']:
+                    ans.append((config_name, line))
+        return ans
+
+    def _generate_config(self):
+        split_layer_name = self.layer_type.split('-')
+        assert split_layer_name[-1] == 'layer'
+        nonlinearities = split_layer_name[:-1]
+
+        # by 'descriptor_final_string' we mean a string that can appear in
+        # config-files, i.e. it contains the 'final' names of nodes.
+        input_desc = self.descriptors['input']['final-string']
+        input_dim = self.descriptors['input']['dim']
+
+        # the child classes e.g.
tdnn might want to process the input + # before adding the other components + + return self._add_components(input_desc, input_dim, nonlinearities) + + def _add_components(self, input_desc, input_dim, nonlinearities): + dim = self.config['dim'] + min_param_value = self.config['min-param-value'] + max_param_value = self.config['max-param-value'] + target_rms = self.config['target-rms'] + max_change = self.config['max-change'] + #ng_affine_options = self.config['ng-affine-options'] + learning_rate_factor= self.config['learning-rate-factor'] + learning_rate_option=('learning-rate-factor={0}'.format(learning_rate_factor) + if learning_rate_factor != 1.0 else '') + cos_file = self.config['cos-transform-file'] + sin_file = self.config['sin-transform-file'] + num_filters = self.config['num-filters'] + l2_regularize = self.config['l2-regularize'] + half_fft_range = self.config['half-fft-range'] + fft_dim = (2**(input_dim-1).bit_length()) + cur_dim = input_dim + cur_node = input_desc + scale = self.config['scale'] + configs = [] + for nonlinearity in nonlinearities: + if nonlinearity == 'preprocess': + configs.append('component name={0}.preprocess type=ShiftInputComponent ' + 'input-dim={1} output-dim={1} dither=0.0 max-shift=0.0 ' + 'preprocess=true'.format(self.name, cur_dim)) + + configs.append('component-node name={0}.preprocess ' + 'component={0}.preprocess input={1}' + ''.format(self.name, cur_node)) + cur_node = '{0}.preprocess'.format(self.name) + + elif nonlinearity == 'fft': + #if self.config['write-init-config']: + # line = ('output-node name=output input={0}' + # ''.format(input_desc)) + # configs.append(('init', line)) + output_dim = (fft_dim/2 if half_fft_range is True else fft_dim) + line = ('component name={0}.cosine type=FixedAffineComponent ' + 'matrix={1}' + ''.format(self.name, cos_file)) + configs.append(('final', line)) + + line = ('component name={0}.cosine type=FixedAffineComponent ' + 'input-dim={1} output-dim={2}' + ''.format(self.name, cur_dim, output_dim)) + configs.append(('ref', line)) + + line = ('component-node name={0}.cosine component={0}.cosine ' + 'input={1}'.format(self.name, cur_node)) + configs.append(('final', line)) + configs.append(('ref', line)) + + line = ('component name={0}.sine type=FixedAffineComponent ' + 'matrix={1}'.format(self.name, sin_file)) + configs.append(('final', line)) + + line = ('component name={0}.sine type=FixedAffineComponent ' + 'input-dim={1} output-dim={2}' + ''.format(self.name, cur_dim, output_dim)) + configs.append(('ref', line)) + + line = ('component-node name={0}.sine component={0}.sine ' + 'input={1}'.format(self.name, cur_node)) + configs.append(('final', line)) + configs.append(('ref', line)) + + cur_node = [] + if half_fft_range: + cur_node.append('{0}.cosine'.format(self.name)) + cur_node.append('{0}.sine'.format(self.name)) + else: + configs.append('dim-range-node name={0}.sine.half input-node={0}.sine ' + 'dim-offset=0 dim={1}'.format(self.name, fft_dim/2)) + configs.append('dim-range-node name={0}.cosine.half input-node={0}.cosine ' + 'dim-offset=0 dim={1}'.format(self.name, fft_dim/2)) + cur_node.append('{0}.cosine.half'.format(self.name)) + cur_node.append('{0}.sine.half'.format(self.name)) + cur_dim = fft_dim / 2 + elif nonlinearity == 'abs2': + assert(len(cur_node) == 2 and + cur_node[0] == '{0}.cosine'.format(self.name) and + cur_node[1] == '{0}.sine'.format(self.name)) + configs.append('component name={0}.cos.sqr type=ElementwiseProductComponent ' + 'input-dim={1} output-dim={2}' + ''.format(self.name, cur_dim * 
2, cur_dim))
+                configs.append('component-node name={0}.cos.sqr component={0}.cos.sqr '
+                               'input=Append({1},{1})'
+                               ''.format(self.name, cur_node[0]))
+
+                configs.append('component name={0}.sin.sqr type=ElementwiseProductComponent '
+                               'input-dim={1} output-dim={2}'
+                               ''.format(self.name, cur_dim * 2, cur_dim))
+                configs.append('component-node name={0}.sin.sqr component={0}.sin.sqr '
+                               'input=Append({1},{1})'
+                               ''.format(self.name, cur_node[1]))
+                configs.append('component name={0}.abs type=NoOpComponent dim={1}'
+                               ''.format(self.name, cur_dim))
+                configs.append('component-node name={0}.abs component={0}.abs '
+                               'input=Sum({0}.sin.sqr, {0}.cos.sqr)'
+                               ''.format(self.name))
+                cur_node = '{0}.abs'.format(self.name)
+
+            elif nonlinearity == 'abs':
+                assert(len(cur_node) == 2 and
+                       cur_node[0] == '{0}.cosine'.format(self.name) and
+                       cur_node[1] == '{0}.sine'.format(self.name))
+                permute_vec = []
+                for i in range(fft_dim/2):
+                    permute_vec.append(i)
+                    permute_vec.append(i+fft_dim/2)
+                permute_vec_str = ','.join([str(x) for x in permute_vec])
+                configs.append('component name={0}.permute type=PermuteComponent '
+                               'column-map={1}'.format(self.name, permute_vec_str))
+                configs.append('component-node name={0}.permute component={0}.permute '
+                               'input=Append({1},{2})'
+                               ''.format(self.name, cur_node[0], cur_node[1]))
+
+                configs.append('component name={0}.abs type=PnormComponent '
+                               'input-dim={1} output-dim={2}'
+                               ''.format(self.name, fft_dim, fft_dim/2))
+                configs.append('component-node name={0}.abs component={0}.abs '
+                               'input={0}.permute'.format(self.name))
+                cur_node = '{0}.abs'.format(self.name)
+                cur_dim = fft_dim / 2
+            elif nonlinearity == 'norm':
+                assert(isinstance(cur_node, str))
+                configs.append('component name={0}.norm type=NormalizeComponent '
+                               'dim={1} target-rms=1.0 add-log-stddev=true '.format(self.name, cur_dim))
+                configs.append('component-node name={0}.norm component={0}.norm '
+                               'input={1}'.format(self.name, cur_node))
+                configs.append('dim-range-node name={0}.norm.no.energy input-node={0}.norm '
+                               'dim-offset=0 dim={1}'.format(self.name, cur_dim))
+                configs.append('dim-range-node name={0}.norm.energy input-node={0}.norm '
+                               'dim-offset={1} dim=1'.format(self.name, cur_dim))
+                cur_node = '{0}.norm.no.energy'.format(self.name)
+                cur_dim = fft_dim / 2
+            elif nonlinearity == 'lognorm':
+                assert(isinstance(cur_node, str))
+                configs.append('component name={0}.norm.log type=LogComponent '
+                               'dim={1} log-floor=1e-4 additive-offset=false '
+                               ''.format(self.name, cur_dim))
+                configs.append('component-node name={0}.norm.log component={0}.norm.log '
+                               'input={1}'.format(self.name, cur_node))
+                configs.append('component name={0}.norm.batch type=BatchNormComponent '
+                               'dim={1} target-rms={2} '
+                               ''.format(self.name, cur_dim, target_rms))
+                configs.append('component-node name={0}.norm.batch '
+                               'component={0}.norm.batch '
+                               'input={0}.norm.log'.format(self.name))
+                configs.append('component name={0}.norm.so type=ScaleAndOffsetComponent '
+                               'dim={1} max-change=0.5 scale={2}'
+                               ''.format(self.name, cur_dim, scale))
+                configs.append('component-node name={0}.norm.so component={0}.norm.so '
+                               'input={0}.norm.batch '.format(self.name))
+                configs.append('component name={0}.norm.exp type=ExpComponent dim={1} '
+                               ''.format(self.name, cur_dim))
+                configs.append('component-node name={0}.norm.exp component={0}.norm.exp '
+                               'input={0}.norm.so'.format(self.name))
+                cur_node = '{0}.norm.exp'.format(self.name)
+                cur_dim = fft_dim / 2
+
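+                # Net effect of the lognorm stage, as a sketch: if batchnorm
+                # has per-bin mean m and stddev s, and ScaleAndOffsetComponent
+                # learns scale a and offset b, then (up to the target-rms
+                # factor) y = exp(a*(log(x)-m)/s + b) = exp(b)*(x/exp(m))**(a/s),
+                # i.e. a learned per-bin power-law compression of the spectrum.
+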
+            elif nonlinearity == 'lognorm2':
+                configs.append("component name={0}.lognorm type=CompositeComponent "
+                               "num-components=4 "
+                               "component1='type=LogComponent dim={1} log-floor=1e-4 additive-offset=false' "
+                               "component2='type=BatchNormComponent dim={1} target-rms={2}' "
+                               "component3='type=ScaleAndOffsetComponent dim={1} max-change=0.5' "
+                               "component4='type=ExpComponent dim={1}' "
+                               "".format(self.name, cur_dim, target_rms))
+                configs.append('component-node name={0}.lognorm '
+                               'component={0}.lognorm input={1}'
+                               ''.format(self.name, cur_node))
+
+                cur_node = '{0}.lognorm'.format(self.name)
+                cur_dim = fft_dim / 2
+
+            elif nonlinearity == 'affine':
+                configs.append('component name={0}.filterbank type=AffineComponent '
+                               'input-dim={1} output-dim={2} max-change={3} '
+                               'min-param-value={4} max-param-value={5} '
+                               'bias-stddev=0.0 l2-regularize={6}'
+                               ''.format(self.name, cur_dim, num_filters, max_change,
+                                         min_param_value, max_param_value,
+                                         l2_regularize))
+                configs.append('component-node name={0}.filterbank '
+                               'component={0}.filterbank input={1}'
+                               ''.format(self.name, cur_node))
+                cur_node = '{0}.filterbank'.format(self.name)
+                cur_dim = num_filters
+            elif nonlinearity == 'log':
+                configs.append('component name={0}.log type=LogComponent '
+                               'log-floor=1e-4 additive-offset=false dim={1}'
+                               ''.format(self.name, cur_dim))
+
+                if 'norm' in nonlinearities:
+                    configs.append('component-node name={0}.log0 '
+                                   'component={0}.log input={1}'
+                                   ''.format(self.name, cur_node))
+                    configs.append('component name={0}.log.sum type=NoOpComponent '
+                                   'dim={1}'.format(self.name, cur_dim+1))
+                    configs.append('component-node name={0}.log component={0}.log.sum '
+                                   'input=Append({0}.log0, {0}.norm.energy)'
+                                   ''.format(self.name))
+                    cur_dim = fft_dim / 2 + 1
+                else:
+                    configs.append('component-node name={0}.log '
+                                   'component={0}.log input={1}'
+                                   ''.format(self.name, cur_node))
+                    cur_dim = fft_dim / 2
+                cur_node = '{0}.log'.format(self.name)
+
+            else:
+                raise RuntimeError("Unknown nonlinearity type: {0}"
+                                   "".format(nonlinearity))
+        return configs
+
+class XconfigTimeDomainLayer(XconfigLayerBase):
+    def __init__(self, first_token, key_to_value, prev_names = None):
+        assert first_token in ['preprocess-tconv-abs-log-nin-affine-layer']
+        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)
+
+    def set_default_configs(self):
+        self.config = {'input': '[-1]',
+                       'dim': -1,
+                       'frame-dim': 80,
+                       'max-change': 0.75,
+                       'num-filters': 100,
+                       'log-floor': 0.0001,
+                       'nin-mid-dim': 75,
+                       'nin-forward-dim': 500,
+                       'sub-frames-per-frame': 8,
+                       'frames-left-context': 1,
+                       'frames-right-context': 0,
+                       'max-shift': 0.2}
+
+
+    def check_configs(self):
+        if self.config['frames-left-context'] < 0:
+            raise RuntimeError("frames-left-context should be >= 0, got {0}."
+                               "".format(self.config['frames-left-context']))
+        if self.config['frames-right-context'] < 0:
+            raise RuntimeError("frames-right-context should be >= 0, got {0}."
+                               "".format(self.config['frames-right-context']))
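+
+    # Shapes, for orientation (numbers are the defaults, not requirements):
+    # with frame-dim=80 and sub-frames-per-frame=8 the convolution advances
+    # filt-x-step = 80/8 = 10 samples at a time, so each 80-sample frame
+    # yields 8 sub-frame outputs per filter before pooling across context.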
+
+
+    def output_name(self, auxiliary_output=None):
+        assert auxiliary_output == None
+
+        split_layer_name = self.layer_type.split('-')
+        assert split_layer_name[-1] == 'layer'
+        last_nonlinearity = split_layer_name[-2]
+        if last_nonlinearity == 'affine':
+            return '{0}.post.forward'.format(self.name)
+
+    def output_dim(self):
+        split_layer_name = self.layer_type.split('-')
+        assert split_layer_name[-2] == 'affine'
+        return self.config['nin-forward-dim']
+
+    def get_full_config(self):
+        ans = []
+        config_lines = self._generate_config()
+
+        for line in config_lines:
+            if len(line) == 2:
+                # an ('init'|'ref'|'final', line) tuple already exists in the
+                # line; these correspond to components with fixed parameters.
+                assert(line[0] == 'init' or line[0] == 'ref' or line[0] == 'final')
+                ans.append(line)
+            else:
+                for config_name in ['ref', 'final']:
+                    ans.append((config_name, line))
+        return ans
+
+    def _generate_config(self):
+        split_layer_name = self.layer_type.split('-')
+        assert split_layer_name[-1] == 'layer'
+        nonlinearities = split_layer_name[:-1]
+
+        # by 'descriptor_final_string' we mean a string that can appear in
+        # config-files, i.e. it contains the 'final' names of nodes.
+        input_desc = self.descriptors['input']['final-string']
+        input_dim = self.descriptors['input']['dim']
+
+        # the child classes e.g. tdnn might want to process the input
+        # before adding the other components
+        return self._add_components(input_desc, input_dim, nonlinearities)
+
+
+    def _add_components(self, input_desc, input_dim, nonlinearities):
+        dim = self.config['dim']
+        frame_dim = self.config['frame-dim']
+        max_change = self.config['max-change']
+        nin_mid_dim = self.config['nin-mid-dim']
+        pool_left_context = self.config['frames-left-context']
+        pool_right_context = self.config['frames-right-context']
+        nin_forward_dim = self.config['nin-forward-dim']
+        log_floor = self.config['log-floor']
+        num_filters = self.config['num-filters']
+        samples_per_sub_frame = frame_dim / self.config['sub-frames-per-frame']
+        filter_step = samples_per_sub_frame
+        filter_dim = input_dim - (frame_dim if 'preprocess' in nonlinearities else 0) - frame_dim + filter_step
+        cur_node = input_desc
+        cur_dim = input_dim
+        configs = []
+        for nonlinearity in nonlinearities:
+            if nonlinearity == 'preprocess':
+                configs.append('component name={0}.preprocess type=ShiftInputComponent '
+                               'input-dim={1} output-dim={2} dither=0.0 max-shift={3} '
+                               'preprocess=true '.format(self.name, cur_dim,
+                                                         cur_dim - frame_dim,
+                                                         self.config['max-shift']))
+
+                configs.append('component-node name={0}.preprocess '
+                               'component={0}.preprocess input={1}'
+                               ''.format(self.name, cur_node))
+                cur_node = '{0}.preprocess'.format(self.name)
+                cur_dim = cur_dim - frame_dim
+
+            elif nonlinearity == 'tconv':
+                # add ConvolutionComponent and PermuteComponent
+                configs.append('component name={0}.tconv type=ConvolutionComponent '
+                               'input-x-dim={1} input-y-dim=1 input-z-dim=1 '
+                               'filt-x-dim={2} filt-y-dim=1 filt-x-step={3} '
+                               'filt-y-step=1 num-filters={4} '
+                               'input-vectorization-order=zyx param-stddev={5} '
+                               'bias-stddev=0.01 max-change={6}'
+                               ''.format(self.name, cur_dim, filter_dim,
+                                         filter_step, num_filters,
+                                         0.9 / (filter_dim**0.5),
+                                         max_change))
+
+                configs.append('component-node name={0}.tconv '
+                               'component={0}.tconv input={1}'
+                               ''.format(self.name, cur_node))
+
+                # add PermuteComponent and append the filter outputs.
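+                # Tiny example of the column map (illustrative sizes): with
+                # num-filters=2 and conv_output_dim=3 the map is [0,2,4,1,3,5],
+                # i.e. the appended activations are regrouped so that all
+                # time-copies of filter 0 come first, then those of filter 1.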
+ conv_output_dim = self.config['sub-frames-per-frame'] * (pool_left_context + pool_right_context + 1) + permute_vec = [] + for i in range(num_filters): + for j in range(conv_output_dim): + permute_vec.append(i+j*num_filters) + permute_vec_str = ','.join([str(x) for x in permute_vec]) + configs.append('component name={0}.permute type=PermuteComponent ' + 'column-map={1}' + ''.format(self.name, permute_vec_str)) + append_str = ','.join(['Offset({0}.tconv,{1})'.format(self.name, x) for x in range(-1*pool_left_context, pool_right_context+1)]) + configs.append('component-node name={0}.permute ' + 'component={0}.permute input=Append({1})' + ''.format(self.name, append_str)) + + cur_node = '{0}.permute'.format(self.name) + cur_dim = num_filters * conv_output_dim + + elif nonlinearity == 'abs': + configs.append('component name={0}.abs type=PnormComponent ' + 'input-dim={1} output-dim={1}' + ''.format(self.name, cur_dim)) + configs.append('component-node name={0}.abs component={0}.abs ' + 'input={1}'.format(self.name, cur_node)) + + cur_node = '{0}.abs'.format(self.name) + cur_dim = cur_dim + + elif nonlinearity == 'log': + configs.append('component name={0}.log type=LogComponent ' + 'dim={1} log-floor={2} additive-offset=false ' + ''.format(self.name, cur_dim, log_floor)) + configs.append('component-node name={0}.log component={0}.log ' + 'input={1}'.format(self.name, cur_node)) + + cur_node = '{0}.log'.format(self.name) + cur_dim = cur_dim + + elif nonlinearity == 'nin': + configs.append("component name={0}.nin type=CompositeComponent " + "num-components=4 " + "component1='type=RectifiedLinearComponent dim={1} self-repair-scale=1e-05' " + "component2='type=NaturalGradientRepeatedAffineComponent input-dim={1} output-dim={2} num-repeats={3} param-stddev={4} bias-stddev=0' " + "component3='type=RectifiedLinearComponent dim={2} self-repair-scale=1e-05' " + "component4='type=NaturalGradientRepeatedAffineComponent input-dim={2} output-dim={1} num-repeats={3} param-stddev={5} bias-mean=0.1 bias-stddev=0 ' " + "".format(self.name, cur_dim, nin_mid_dim * num_filters, + num_filters, 2.0 / (cur_dim**0.5), + 2.0 / (nin_mid_dim * num_filters)**0.5)) + + configs.append('component-node name={0}.nin component={0}.nin ' + 'input={1}' + ''.format(self.name, cur_node)) + configs.append("component name={0}.post.nin type=CompositeComponent " + "num-components=2 component1='type=RectifiedLinearComponent dim={1} self-repair-scale=1e-05' " + "component2='type=NormalizeComponent dim={1} add-log-stddev=true '" + "".format(self.name, cur_dim)) + configs.append('component-node name={0}.post.nin component={0}.post.nin input={0}.nin' + ''.format(self.name)) + + cur_node= '{0}.post.nin'.format(self.name) + cur_dim = cur_dim + 1 + + elif nonlinearity == 'affine': + configs.append('component name={0}.forward.nin type=NaturalGradientAffineComponent ' + 'input-dim={1} output-dim={2} bias-stddev=0' + ''.format(self.name, cur_dim, nin_forward_dim)) + configs.append('component-node name={0}.forward.nin component={0}.forward.nin ' + 'input={1}'.format(self.name, cur_node)) + configs.append("component name={0}.post.forward type=CompositeComponent num-components=2 " + "component1='type=RectifiedLinearComponent dim={1} self-repair-scale=1e-05' " + "component2='type=NormalizeComponent dim={1}'" + "".format(self.name, nin_forward_dim)) + configs.append('component-node name={0}.post.forward component={0}.post.forward ' + 'input={0}.forward.nin'.format(self.name, cur_node)) + + cur_node = '{0}.post.forward'.format(self.name) + cur_dim = 
nin_forward_dim
+
+            else:
+                raise RuntimeError("Unknown nonlinearity type: {0}"
+                                   "".format(nonlinearity))
+        return configs
diff --git a/egs/wsj/s5/steps/make_raw_feats.sh b/egs/wsj/s5/steps/make_raw_feats.sh
new file mode 100755
index 00000000000..34d6e066626
--- /dev/null
+++ b/egs/wsj/s5/steps/make_raw_feats.sh
@@ -0,0 +1,129 @@
+#!/bin/bash
+
+# Copyright 2016 Pegah Ghahremani
+# Apache 2.0
+# To be run from .. (one directory up from here)
+# see ../run.sh for example
+
+# Begin configuration section.
+nj=4
+cmd=run.pl
+compress=true
+compress_method=3
+raw_config=conf/raw.conf
+write_utt2num_frames=false  # if true, writes utt2num_frames
+# End configuration section.

+echo "$0 $@"  # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+if [ $# -lt 1 ] || [ $# -gt 3 ]; then
+  echo "Usage: $0 [options] <data-dir> [<log-dir> [<feat-dir>]]";
+  echo "e.g.: $0 data/train exp/make_raw_feats/train raw_feats"
+  echo "options: "
+  echo "  --nj <nj>                                        # number of parallel jobs"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  exit 1;
+fi
+
+data=$1
+if [ $# -ge 2 ]; then
+  logdir=$2
+else
+  logdir=$data/log
+fi
+if [ $# -ge 3 ]; then
+  featdir=$3
+else
+  featdir=$data/data
+fi
+
+# make $featdir an absolute pathname.
+featdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $featdir ${PWD}`
+
+# use "name" as part of name of the archive.
+name=`basename $data`
+scp=$data/wav.scp
+
+mkdir -p $featdir || exit 1;
+mkdir -p $logdir || exit 1;
+
+if [ -f $data/feats.scp ]; then
+  mkdir -p $data/.backup
+  echo "$0: moving $data/feats.scp to $data/.backup"
+  mv $data/feats.scp $data/.backup
+fi
+
+required="$scp $raw_config"
+
+for f in $required; do
+  if [ ! -f $f ]; then
+    echo "$0: no such file $f"
+    exit 1;
+  fi
+done
+
+utils/validate_data_dir.sh --no-text --no-feats $data || exit 1;
+
+for n in $(seq $nj); do
+  # the next command does nothing unless $featdir/storage/ exists, see
+  # utils/create_data_link.pl for more info.
+  utils/create_data_link.pl $featdir/raw_wav_feat_$name.$n.ark
+done
+
+if $write_utt2num_frames; then
+  write_num_frames_opt="--write-num-frames=ark,t:$logdir/utt2num_frames.JOB"
+else
+  write_num_frames_opt=
+fi
+
+if [ -f $data/segments ]; then
+  echo "$0 [info]: segments file exists: using that."
+
+  split_segments=""
+  for n in $(seq $nj); do
+    split_segments="$split_segments $logdir/segments.$n"
+  done
+
+  utils/split_scp.pl $data/segments $split_segments || exit 1;
+  rm $logdir/.error 2>/dev/null
+
+  $cmd JOB=1:$nj $logdir/make_raw_feats_${name}.JOB.log \
+    extract-segments scp,p:$scp $logdir/segments.JOB ark:- \| \
+    compute-raw-frame-feats --config=$raw_config ark:- ark:- \| \
+    copy-feats $write_num_frames_opt \
+      --compress=$compress --compression-method=$compress_method ark:- \
+      ark,scp:$featdir/raw_wav_feat_$name.JOB.ark,$featdir/raw_wav_feat_$name.JOB.scp \
+    || exit 1;
+
+else
+  echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance."
+  split_scps=""
+  for n in $(seq $nj); do
+    split_scps="$split_scps $logdir/wav_${name}.$n.scp"
+  done
+
+  utils/split_scp.pl $scp $split_scps || exit 1;
+
+
+  # add ,p to the input rspecifier so that we can just skip over
+  # utterances that have bad wave data.
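+  # For a quick sanity check on a single recording, the same pipeline can be
+  # run by hand (paths here are illustrative):
+  #   head -1 data/train/wav.scp | compute-raw-frame-feats \
+  #     --config=conf/raw_no_mvn.conf scp,p:- ark,t:- | head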
+
+  $cmd JOB=1:$nj $logdir/make_raw_feats_${name}.JOB.log \
+    compute-raw-frame-feats --config=$raw_config \
+      scp,p:$logdir/wav_${name}.JOB.scp ark:- \| \
+    copy-feats $write_num_frames_opt --compress=$compress \
+      --compression-method=$compress_method ark:- \
+      ark,scp:$featdir/raw_wav_feat_$name.JOB.ark,$featdir/raw_wav_feat_$name.JOB.scp \
+    || exit 1;
+fi
+
+# concatenate the .scp files together.
+for n in $(seq $nj); do
+  cat $featdir/raw_wav_feat_$name.$n.scp || exit 1;
+done > $data/feats.scp
+
+rm $logdir/wav_${name}.*.scp $logdir/segments.* 2>/dev/null
+echo "$0: Succeeded generating raw waveform features for $data"
diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs_old.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs_old.sh
new file mode 100755
index 00000000000..daf1ad2f9ec
--- /dev/null
+++ b/egs/wsj/s5/steps/nnet3/chain/get_egs_old.sh
@@ -0,0 +1,428 @@
+#!/bin/bash
+
+# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
+#
+# This script, which will generally be called from other neural-net training
+# scripts, extracts the training examples used to train the 'chain' system
+# (and also the validation examples used for diagnostics), and puts them in
+# separate archives.
+#
+# This script dumps egs with many frames of labels, controlled by the
+# frames_per_eg config variable (default: 25), plus left and right context.
+# Because CTC training involves alignment of data, we can't meaningfully train
+# frame by frame.  The supervision approach involves the time alignment, though--
+# it is just applied in a loose way, where each symbol can appear in the
+# frame-range that it was in in the alignment, extended by a certain margin.
+#
+
+
+# Begin configuration section.
+cmd=run.pl
+frames_per_eg=25   # number of feature frames per example (not counting added
+                   # context).  more->less disk space and less time preparing
+                   # egs, but more I/O during training.
+frames_overlap_per_eg=0  # number of supervised frames of overlap that we aim for per eg.
+                   # can be useful to avoid wasted data if you're using --left-deriv-truncate
+                   # and --right-deriv-truncate.
+frame_subsampling_factor=3  # frames-per-second of features we train on divided
+                   # by frames-per-second at output of chain model
+alignment_subsampling_factor=3  # frames-per-second of input alignments divided
+                   # by frames-per-second at output of chain model
+left_context=4     # amount of left-context per eg (i.e. extra frames of input features
+                   # not present in the output supervision).
+right_context=4    # amount of right-context per eg.
+left_context_initial=-1   # if >=0, left-context for first chunk of an utterance
+right_context_final=-1    # if >=0, right-context for last chunk of an utterance
+compress=true      # set this to false to disable compression (e.g. if you want to see whether
+                   # results are affected).
+
+num_utts_subset=300      # number of utterances in validation and training
+                         # subsets used for shrinkage and diagnostics.
+num_valid_egs_combine=0  # number of validation examples for combination weights at the very end.
+num_train_egs_combine=1000  # number of train examples for the above.
+num_egs_diagnostic=400   # number of egs for the "compute_prob" jobs
+frames_per_iter=400000   # each iteration of training, see this many frames per
+                         # job, measured at the sampling rate of the features
+                         # used.  This is just a guideline; it will pick a number
+                         # that divides the number of samples in the entire data.
+
+right_tolerance=   # CTC right tolerance == max label delay.
+left_tolerance=
+
+transform_dir=     # If supplied, overrides latdir as the place to find fMLLR transforms
+
+stage=0
+nj=15         # This should be set to the maximum number of jobs you are
+              # comfortable to run in parallel; you can increase it if your disk
+              # speed is greater and you have more machines.
+max_shuffle_jobs_run=50  # the shuffle jobs now include the nnet3-chain-normalize-egs command,
+                         # which is fairly CPU intensive, so we can run quite a few at once
+                         # without overloading the disks.
+srand=0     # rand seed for nnet3-chain-get-egs, nnet3-chain-copy-egs and nnet3-chain-shuffle-egs
+online_ivector_dir=  # can be used if we are including speaker information as iVectors.
+cmvn_opts=  # can be used for specifying CMVN options, if feature type is not lda (if lda,
+            # it doesn't make sense to use different options than were used as input to the
+            # LDA transform).  This is used to turn off CMVN in the online-nnet experiments.
+
+echo "$0 $@"  # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+
+if [ $# != 4 ]; then
+  echo "Usage: $0 [opts] <data> <chain-dir> <lattice-dir> <egs-dir>"
+  echo " e.g.: $0 data/train exp/tri4_nnet exp/tri3_lats exp/tri4_nnet/egs"
+  echo ""
+  echo "From <chain-dir>, 0.trans_mdl (the transition-model), tree (the tree)"
+  echo "and normalization.fst (the normalization FST, derived from the denominator FST)"
+  echo "are read."
+  echo ""
+  echo "Main options (for others, see top of script file)"
+  echo "  --config <config-file>                  # config file containing options"
+  echo "  --nj <nj>                               # The maximum number of jobs you want to run in"
+  echo "                                          # parallel (increase this only if you have good disk and"
+  echo "                                          # network speed).  default=15"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --frames-per-iter <#samples;400000>     # Number of frames of data to process per iteration, per"
+  echo "                                          # process."
+  echo "  --frame-subsampling-factor <factor;3>   # factor by which num-frames at nnet output is reduced"
+  echo "  --frames-per-eg <frames;25>             # number of supervised frames per eg on disk"
+  echo "  --frames-overlap-per-eg <frames;0>      # number of supervised frames of overlap between egs"
+  echo "  --left-context <width;4>                # Number of frames on left side to append for feature input"
+  echo "  --right-context <width;4>               # Number of frames on right side to append for feature input"
+  echo "  --left-context-initial <width;-1>       # If >= 0, left-context for first chunk of an utterance"
+  echo "  --right-context-final <width;-1>        # If >= 0, right-context for last chunk of an utterance"
+  echo "  --num-egs-diagnostic <#frames;4000>     # Number of egs used in computing (train,valid) diagnostics"
+  echo "  --num-valid-egs-combine <#frames;10000> # Number of egs used in getting combination weights at the"
+  echo "                                          # very end."
+  echo "  --stage <stage|0>                       # Used to run a partially-completed training process from somewhere in"
+  echo "                                          # the middle."
+
+  exit 1;
+fi
+
+data=$1
+chaindir=$2
+latdir=$3
+dir=$4
+
+# Check some files.
+[ ! -z "$online_ivector_dir" ] && \
+  extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"
+
+for f in $data/feats.scp $latdir/lat.1.gz $latdir/final.mdl \
+         $chaindir/{0.trans_mdl,tree,normalization.fst} $extra_files; do
+  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
+done
+
+sdata=$data/split$nj
+utils/split_data.sh $data $nj
+
+mkdir -p $dir/log $dir/info
+
+num_lat_jobs=$(cat $latdir/num_jobs) || exit 1;
+
+# Get list of validation utterances.
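+# (An utterance passes the length filter below only if it can supply at least
+# one whole example: duration / frame_shift >= frames_per_eg; e.g. with 10ms
+# frames and the default frames_per_eg=25, it must be at least 0.25s long.)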
+ +frame_shift=$(utils/data/get_frame_shift.sh $data) || exit 1 +utils/data/get_utt2dur.sh $data + +cat $data/utt2dur | \ + awk -v min_len=$frames_per_eg -v fs=$frame_shift '{if ($2 * 1/fs >= min_len) print $1}' | \ + utils/shuffle_list.pl | head -$num_utts_subset > $dir/valid_uttlist || exit 1; + +len_uttlist=`wc -l $dir/valid_uttlist | awk '{print $1}'` +if [ $len_uttlist -lt $num_utts_subset ]; then + echo "Number of utterances which have length at least $frames_per_eg is really low. Please check your data." && exit 1; +fi + +if [ -f $data/utt2uniq ]; then # this matters if you use data augmentation. + # because of this stage we can again have utts with lengths less than + # frames_per_eg + echo "File $data/utt2uniq exists, so augmenting valid_uttlist to" + echo "include all perturbed versions of the same 'real' utterances." + mv $dir/valid_uttlist $dir/valid_uttlist.tmp + utils/utt2spk_to_spk2utt.pl $data/utt2uniq > $dir/uniq2utt + cat $dir/valid_uttlist.tmp | utils/apply_map.pl $data/utt2uniq | \ + sort | uniq | utils/apply_map.pl $dir/uniq2utt | \ + awk '{for(n=1;n<=NF;n++) print $n;}' | sort > $dir/valid_uttlist + rm $dir/uniq2utt $dir/valid_uttlist.tmp +fi + +cat $data/utt2dur | \ + awk -v min_len=$frames_per_eg -v fs=$frame_shift '{if ($2 * 1/fs >= min_len) print $1}' | \ + utils/filter_scp.pl --exclude $dir/valid_uttlist | \ + utils/shuffle_list.pl | head -$num_utts_subset > $dir/train_subset_uttlist || exit 1; +len_uttlist=`wc -l $dir/train_subset_uttlist | awk '{print $1}'` +if [ $len_uttlist -lt $num_utts_subset ]; then + echo "Number of utterances which have length at least $frames_per_eg is really low. Please check your data." && exit 1; +fi + +[ -z "$transform_dir" ] && transform_dir=$latdir + +# because we'll need the features with a different number of jobs than $latdir, +# copy to ark,scp. +if [ -f $transform_dir/raw_trans.1 ]; then + echo "$0: using raw transforms from $transform_dir" + if [ $stage -le 0 ]; then + $cmd $dir/log/copy_transforms.log \ + copy-feats "ark:cat $transform_dir/raw_trans.* |" "ark,scp:$dir/trans.ark,$dir/trans.scp" + fi +fi + +## Set up features. +echo "$0: feature type is raw" +###feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- |" +###valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" +###train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" +feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | copy-feats scp:- ark:- |" +valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | copy-feats scp:- ark:- |" +train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | copy-feats scp:- ark:- |" + +echo $cmvn_opts >$dir/cmvn_opts # caution: the top-level nnet training script should copy this to its own dir now. + +if [ -f $dir/trans.scp ]; then + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/trans.scp ark:- ark:- |" + valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" + train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" +fi + +if [ ! 
-z "$online_ivector_dir" ]; then + ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; + echo $ivector_dim > $dir/info/ivector_dim + steps/nnet2/get_ivector_id.sh $online_ivector_dir > $dir/info/final.ie.id || exit 1 + ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; + ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" +else + ivector_opts="" + echo 0 >$dir/info/ivector_dim +fi + +if [ $stage -le 1 ]; then + echo "$0: working out number of frames of training data" + num_frames=$(steps/nnet2/get_num_frames.sh $data) + echo $num_frames > $dir/info/num_frames + echo "$0: working out feature dim" + feats_one="$(echo $feats | sed s/JOB/1/g)" + if ! feat_dim=$(feat-to-dim "$feats_one" - 2>/dev/null); then + echo "Command failed (getting feature dim): feat-to-dim \"$feats_one\"" + exit 1 + fi + echo $feat_dim > $dir/info/feat_dim +else + num_frames=$(cat $dir/info/num_frames) || exit 1; + feat_dim=$(cat $dir/info/feat_dim) || exit 1; +fi + +# the + 1 is to round up, not down... we assume it doesn't divide exactly. +num_archives=$[$num_frames/$frames_per_iter+1] + +# We may have to first create a smaller number of larger archives, with number +# $num_archives_intermediate, if $num_archives is more than the maximum number +# of open filehandles that the system allows per process (ulimit -n). +# This sometimes gives a misleading answer as GridEngine sometimes changes the +# limit, so we limit it to 512. +max_open_filehandles=$(ulimit -n) || exit 1 +[ $max_open_filehandles -gt 512 ] && max_open_filehandles=512 +num_archives_intermediate=$num_archives +archives_multiple=1 +while [ $[$num_archives_intermediate+4] -gt $max_open_filehandles ]; do + archives_multiple=$[$archives_multiple+1] + num_archives_intermediate=$[$num_archives/$archives_multiple] || exit 1; +done +# now make sure num_archives is an exact multiple of archives_multiple. +num_archives=$[$archives_multiple*$num_archives_intermediate] || exit 1; + +echo $num_archives >$dir/info/num_archives +echo $frames_per_eg >$dir/info/frames_per_eg +# Work out the number of egs per archive +egs_per_archive=$[$num_frames/($frames_per_eg*$num_archives)] || exit 1; +! [ $egs_per_archive -le $frames_per_iter ] && \ + echo "$0: script error: egs_per_archive=$egs_per_archive not <= frames_per_iter=$frames_per_iter" \ + && exit 1; + +echo $egs_per_archive > $dir/info/egs_per_archive + +echo "$0: creating $num_archives archives, each with $egs_per_archive egs, with" +echo "$0: $frames_per_eg labels per example, and (left,right) context = ($left_context,$right_context)" +if [ $left_context_initial -ge 0 ] || [ $right_context_final -ge 0 ]; then + echo "$0: ... and (left-context-initial,right-context-final) = ($left_context_initial,$right_context_final)" +fi + + +if [ -e $dir/storage ]; then + # Make soft links to storage directories, if distributing this way.. See + # utils/create_split_dir.pl. 
+ echo "$0: creating data links" + utils/create_data_link.pl $(for x in $(seq $num_archives); do echo $dir/cegs.$x.ark; done) + for x in $(seq $num_archives_intermediate); do + utils/create_data_link.pl $(for y in $(seq $nj); do echo $dir/cegs_orig.$y.$x.ark; done) + done +fi + +if [ $stage -le 2 ]; then + echo "$0: copying training lattices" + + $cmd --max-jobs-run 6 JOB=1:$num_lat_jobs $dir/log/lattice_copy.JOB.log \ + lattice-copy "ark:gunzip -c $latdir/lat.JOB.gz|" ark,scp:$dir/lat.JOB.ark,$dir/lat.JOB.scp || exit 1; + + for id in $(seq $num_lat_jobs); do cat $dir/lat.$id.scp; done > $dir/lat.scp +fi + + +egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress" +[ $left_context_initial -ge 0 ] && egs_opts="$egs_opts --left-context-initial=$left_context_initial" +[ $right_context_final -ge 0 ] && egs_opts="$egs_opts --right-context-final=$right_context_final" + + +chain_supervision_all_opts="--lattice-input=true --frame-subsampling-factor=$alignment_subsampling_factor" +[ ! -z $right_tolerance ] && \ + chain_supervision_all_opts="$chain_supervision_all_opts --right-tolerance=$right_tolerance" + +[ ! -z $left_tolerance ] && \ + chain_supervision_all_opts="$chain_supervision_all_opts --left-tolerance=$left_tolerance" + +echo $left_context > $dir/info/left_context +echo $right_context > $dir/info/right_context +echo $left_context_initial > $dir/info/left_context_initial +echo $right_context_final > $dir/info/right_context_final + +if [ $stage -le 3 ]; then + echo "$0: Getting validation and training subset examples." + rm $dir/.error 2>/dev/null + echo "$0: ... extracting validation and training-subset alignments." + + # do the filtering just once, as lat.scp may be long. + utils/filter_scp.pl <(cat $dir/valid_uttlist $dir/train_subset_uttlist) \ + <$dir/lat.scp >$dir/lat_special.scp + + $cmd $dir/log/create_valid_subset.log \ + utils/filter_scp.pl $dir/valid_uttlist $dir/lat_special.scp \| \ + lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ + chain-get-supervision $chain_supervision_all_opts $chaindir/tree $chaindir/0.trans_mdl \ + ark:- ark:- \| \ + nnet3-chain-get-egs $ivector_opts --srand=$srand \ + $egs_opts $chaindir/normalization.fst \ + "$valid_feats" ark,s,cs:- "ark:$dir/valid_all.cegs" || touch $dir/.error & + $cmd $dir/log/create_train_subset.log \ + utils/filter_scp.pl $dir/train_subset_uttlist $dir/lat_special.scp \| \ + lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ + chain-get-supervision $chain_supervision_all_opts \ + $chaindir/tree $chaindir/0.trans_mdl ark:- ark:- \| \ + nnet3-chain-get-egs $ivector_opts --srand=$srand \ + $egs_opts $chaindir/normalization.fst \ + "$train_subset_feats" ark,s,cs:- "ark:$dir/train_subset_all.cegs" || touch $dir/.error & + wait; + [ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1 + echo "... Getting subsets of validation examples for diagnostics and combination." 
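+  # (valid_combine and train_combine are merged into combine.cegs below and
+  #  used when combining models at the end of training; the *_diagnostic.cegs
+  #  subsets feed the per-iteration compute_prob diagnostics.)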
+ $cmd $dir/log/create_valid_subset_combine.log \ + nnet3-chain-subset-egs --n=$num_valid_egs_combine ark:$dir/valid_all.cegs \ + ark:$dir/valid_combine.cegs || touch $dir/.error & + $cmd $dir/log/create_valid_subset_diagnostic.log \ + nnet3-chain-subset-egs --n=$num_egs_diagnostic ark:$dir/valid_all.cegs \ + ark:$dir/valid_diagnostic.cegs || touch $dir/.error & + + $cmd $dir/log/create_train_subset_combine.log \ + nnet3-chain-subset-egs --n=$num_train_egs_combine ark:$dir/train_subset_all.cegs \ + ark:$dir/train_combine.cegs || touch $dir/.error & + $cmd $dir/log/create_train_subset_diagnostic.log \ + nnet3-chain-subset-egs --n=$num_egs_diagnostic ark:$dir/train_subset_all.cegs \ + ark:$dir/train_diagnostic.cegs || touch $dir/.error & + wait + sleep 5 # wait for file system to sync. + cat $dir/valid_combine.cegs $dir/train_combine.cegs > $dir/combine.cegs + + for f in $dir/{combine,train_diagnostic,valid_diagnostic}.cegs; do + [ ! -s $f ] && echo "No examples in file $f" && exit 1; + done + rm $dir/valid_all.cegs $dir/train_subset_all.cegs $dir/{train,valid}_combine.cegs +fi + +if [ $stage -le 4 ]; then + # create cegs_orig.*.*.ark; the first index goes to $nj, + # the second to $num_archives_intermediate. + + egs_list= + for n in $(seq $num_archives_intermediate); do + egs_list="$egs_list ark:$dir/cegs_orig.JOB.$n.ark" + done + echo "$0: Generating training examples on disk" + + # The examples will go round-robin to egs_list. Note: we omit the + # 'normalization.fst' argument while creating temporary egs: the phase of egs + # preparation that involves the normalization FST is quite CPU-intensive and + # it's more convenient to do it later, in the 'shuffle' stage. Otherwise to + # make it efficient we need to use a large 'nj', like 40, and in that case + # there can be too many small files to deal with, because the total number of + # files is the product of 'nj' by 'num_archives_intermediate', which might be + # quite large. + $cmd JOB=1:$nj $dir/log/get_egs.JOB.log \ + utils/filter_scp.pl $sdata/JOB/utt2spk $dir/lat.scp \| \ + lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ + chain-get-supervision $chain_supervision_all_opts \ + $chaindir/tree $chaindir/0.trans_mdl ark:- ark:- \| \ + nnet3-chain-get-egs $ivector_opts --srand=\$[JOB+$srand] $egs_opts \ + --num-frames-overlap=$frames_overlap_per_eg \ + "$feats" ark,s,cs:- ark:- \| \ + nnet3-chain-copy-egs --random=true --srand=\$[JOB+$srand] ark:- $egs_list || exit 1; +fi + +if [ $stage -le 5 ]; then + echo "$0: recombining and shuffling order of archives on disk" + # combine all the "egs_orig.*.JOB.scp" (over the $nj splits of the data) and + # shuffle the order, writing to the egs.JOB.ark + + # the input is a concatenation over the input jobs. + egs_list= + for n in $(seq $nj); do + egs_list="$egs_list $dir/cegs_orig.$n.JOB.ark" + done + + if [ $archives_multiple == 1 ]; then # normal case. + $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ + nnet3-chain-normalize-egs $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \ + nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] ark:- ark:$dir/cegs.JOB.ark || exit 1; + else + # we need to shuffle the 'intermediate archives' and then split into the + # final archives. 
we create soft links to manage this splitting, because + # otherwise managing the output names is quite difficult (and we don't want + # to submit separate queue jobs for each intermediate archive, because then + # the --max-jobs-run option is hard to enforce). + output_archives="$(for y in $(seq $archives_multiple); do echo ark:$dir/cegs.JOB.$y.ark; done)" + for x in $(seq $num_archives_intermediate); do + for y in $(seq $archives_multiple); do + archive_index=$[($x-1)*$archives_multiple+$y] + # egs.intermediate_archive.{1,2,...}.ark will point to egs.archive.ark + ln -sf cegs.$archive_index.ark $dir/cegs.$x.$y.ark || exit 1 + done + done + $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ + nnet3-chain-normalize-egs $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \ + nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] ark:- ark:- \| \ + nnet3-chain-copy-egs ark:- $output_archives || exit 1; + fi +fi + +if [ $stage -le 6 ]; then + echo "$0: removing temporary archives" + ( + cd $dir + for f in $(ls -l . | grep 'cegs_orig' | awk '{ X=NF-1; Y=NF-2; if ($X == "->") print $Y, $NF; }'); do rm $f; done + # the next statement removes them if we weren't using the soft links to a + # 'storage' directory. + rm cegs_orig.*.ark 2>/dev/null + ) + if [ $archives_multiple -gt 1 ]; then + # there are some extra soft links that we should delete. + for f in $dir/cegs.*.*.ark; do rm $f; done + fi + echo "$0: removing temporary lattices" + rm $dir/lat.* + echo "$0: removing temporary alignments and transforms" + # Ignore errors below because trans.* might not exist. + rm $dir/{ali,trans}.{ark,scp} 2>/dev/null + +fi + +echo "$0: Finished preparing training examples" diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index cf2a1a42b66..77ca29bb0c7 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -99,6 +99,17 @@ def get_args(): dest='left_deriv_truncate', default=None, help="Deprecated. Kept for back compatibility") + parser.add_argument("--chain.fft-feat-dim", type=int, + dest='fft_feat_dim', default=0, + help="""If nonzero, the cosine and sine transformation + with dim fft_feat_dim and closest 2-power of that is + generated as configs/{cos,sin}_transform.mat. 
+                        """)
+    parser.add_argument("--chain.l1-regularize", type=float,
+                        dest='l1_regularize', default=0.0,
+                        help="""Weight of the regularization function, which is
+                        the l1-norm of the fft transform of the convolution
+                        filters in the network.""")

     # trainer options
     parser.add_argument("--trainer.input-model", type=str,
@@ -427,6 +438,22 @@ def train(args, run_opts):
                 rand_prune=args.rand_prune,
                 use_multitask_egs=use_multitask_egs)

+    if (args.l1_regularize != 0) or (args.fft_feat_dim != 0):
+        feat_dim = args.fft_feat_dim
+        add_bias = True
+        num_fft_bins = (2**(feat_dim-1).bit_length())
+        common_lib.write_sin_cos_transform_matrix(feat_dim, num_fft_bins,
+            "{0}/configs/cos_transform.mat".format(args.dir),
+            compute_cosine=True, add_bias=add_bias, half_range=True)
+        common_lib.write_sin_cos_transform_matrix(feat_dim, num_fft_bins,
+            "{0}/configs/sin_transform.mat".format(args.dir),
+            compute_cosine=False, add_bias=add_bias, half_range=True)
+        common_lib.write_negate_vector(num_fft_bins,
+            "{0}/configs/negate.vec".format(args.dir))
+        preemph = 0.97
+        common_lib.compute_and_write_preprocess_transform(preemph, feat_dim,
+            "{0}/configs/preprocess.mat".format(args.dir))
+
     if (args.stage <= -1):
         logger.info("Preparing the initial acoustic model.")
         chain_lib.prepare_initial_acoustic_model(args.dir, run_opts,
diff --git a/src/featbin/Makefile b/src/featbin/Makefile
index 8e72d0f744c..86ff1b0d79c 100644
--- a/src/featbin/Makefile
+++ b/src/featbin/Makefile
@@ -17,7 +17,7 @@ BINFILES = add-deltas add-deltas-sdc append-post-to-feats \
            process-kaldi-pitch-feats process-pitch-feats \
            select-feats shift-feats splice-feats subsample-feats \
            subset-feats transform-feats wav-copy wav-reverberate \
-           wav-to-duration
+           wav-to-duration compute-raw-frame-feats

 OBJFILES =
diff --git a/src/featbin/compute-raw-frame-feats.cc b/src/featbin/compute-raw-frame-feats.cc
new file mode 100644
index 00000000000..95e489ca59c
--- /dev/null
+++ b/src/featbin/compute-raw-frame-feats.cc
@@ -0,0 +1,122 @@
+// featbin/compute-raw-frame-feats.cc
+
+// Copyright 2015 Pegah Ghahremani
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "feat/wave-reader.h"
+#include "feat/feature-functions.h"
+#include "feat/feature-common.h"
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    const char *usage =
+        "Create raw-waveform feature files: starting from wave input, write for\n"
+        "each recording a matrix whose rows are consecutive fixed-duration,\n"
+        "non-overlapping chunks of raw samples.  Some post-processing can be\n"
+        "applied to the output.\n"
+        "Usage: compute-raw-frame-feats [options...] <wav-rspecifier> <feats-wspecifier>\n";
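+    // Example invocation (paths are illustrative, not from this patch):
+    //   compute-raw-frame-feats --config=conf/raw_no_mvn.conf \
+    //     scp:data/train/wav.scp ark:- | feat-to-dim ark:- -
+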
\n"; + + ParseOptions po(usage); + FrameExtractionOptions raw_opts; + raw_opts.frame_shift_ms = 10.0; + raw_opts.frame_length_ms = 10.0; + raw_opts.window_type = "rectangular"; + raw_opts.round_to_power_of_two = false; + raw_opts.remove_dc_offset = false; + raw_opts.preemph_coeff = 0.0; + raw_opts.dither = 0.0; + // Register raw frame extraction options + raw_opts.Register(&po); + bool remove_dc = true, + loudness_equalize = true; + BaseFloat low_rms =0.2, high_rms = 0.2, + scale_wav = 0.0; + po.Register("remove-global-dc-offset", &remove_dc, "If true, subtract mean from waveform on each wave"); + po.Register("loudness-equalize", &loudness_equalize, "If true, variance-normalization " + "is applied on output-wave"); + po.Register("low-rms", &low_rms, "The lowest variance of output-wave, where the variance" + "is randomly set between [low-rms, high-rms], and " + " the loudness of wave is equal to this randomly chosen rms."); + po.Register("high-rms", &high_rms, "The highest variance of output-wave, where the variance" + "is randomly set between [low-rms, high-rms], and " + " the loudness of wave is equal to this randomly chosen rms."); + po.Register("scale-wav", &scale_wav, "If non-zero, the raw waveform scaled using that."); + po.Read(argc, argv); + if (po.NumArgs() != 2) { + po.PrintUsage(); + exit(1); + } + std::string wav_rspecifier = po.GetArg(1); + std::string output_wspecifier = po.GetArg(2); + + SequentialTableReader wave_reader(wav_rspecifier); + BaseFloatMatrixWriter feat_writer(output_wspecifier); + + + int32 num_done = 0, num_err = 0; + for (; !wave_reader.Done(); wave_reader.Next()) { + std::string utt = wave_reader.Key(); + const WaveData &wave_data = wave_reader.Value(); + + // The channel is not configurable and it is better to extract it + // on command line using sox or... + int32 num_chan = wave_data.Data().NumRows(); + if (num_chan != 1) + KALDI_WARN << "You have data with " + << num_chan << " channels; defaulting to zero"; + + SubVector waveform(wave_data.Data(), 0); + Vector input(waveform.Dim()); + input.CopyRowFromMat(wave_data.Data(), 0); + BaseFloat mean = waveform.Sum() / waveform.Dim(); + // compute variance + input.Add(-mean); + BaseFloat variance = std::pow(VecVec(input, input) / waveform.Dim(), 0.5); + + // remove DC offset + if (remove_dc) + waveform.Add(-1.0 * mean); + + // apply variance normalization + BaseFloat target_rms = low_rms + RandUniform() * (high_rms - low_rms); + if (loudness_equalize && variance != 0) + waveform.Scale(target_rms * 1.0 / variance); + + if (scale_wav != 0.0) + waveform.Scale(scale_wav); + + Matrix raw_mat; + try { + FeatureWindowFunction window_function(raw_opts); + int32 rows_out = NumFrames(waveform.Dim(), raw_opts), + cols_out = raw_opts.WindowSize(); + raw_mat.Resize(rows_out, cols_out); + for (int32 frame = 0; frame < rows_out; frame++) { + Vector raw_feat(cols_out); + ExtractWindow(0, waveform, frame, raw_opts, + window_function, &raw_feat); + raw_mat.CopyRowFromVec(raw_feat, frame); + + } + //ComputeAndProcessRawSignal(raw_opts, waveform, &raw_feats); + } catch (...) { + KALDI_WARN << "Failed to extract raw-feats for utterance " + << utt; + num_err++; + continue; + } + + feat_writer.Write(utt, raw_mat); + if (num_done % 50 == 0 && num_done != 0) + KALDI_VLOG(2) << "Processed " << num_done << " utterances"; + num_done++; + } + KALDI_LOG << " Done " << num_done << " utterances, " << num_err + << " with errors."; + return (num_done != 0 ? 
diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc
index 2ec2699ec97..7ecf01aa54f 100644
--- a/src/nnet3/nnet-chain-training.cc
+++ b/src/nnet3/nnet-chain-training.cc
@@ -202,6 +202,9 @@ void NnetChainTrainer::TrainInternal(const NnetChainExample &eg,
                            nnet_config.max_param_change,
                            1.0, 1.0 - nnet_config.momentum, nnet_,
                            &num_max_change_per_component_applied_, &num_max_change_global_applied_);
+  // Apply any configured min/max parameter constraints (e.g. max(W, 0) to
+  // impose positivity on AffineComponent weights).
+  PositiveUpdatableWeights(nnet_);
+
   // Scale down the batchnorm stats (keeps them fresh... this affects what
   // happens when we use the model with batchnorm test-mode set).
   ScaleBatchnormStats(nnet_config.batchnorm_stats_scale, nnet_);
diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc
index c73f3fb921d..2d05f6acc61 100644
--- a/src/nnet3/nnet-component-itf.cc
+++ b/src/nnet3/nnet-component-itf.cc
@@ -174,6 +174,12 @@ Component* Component::NewComponentOfType(const std::string &component_type) {
     ans = new RestrictedAttentionComponent();
   } else if (component_type == "SumBlockComponent") {
     ans = new SumBlockComponent();
+  } else if (component_type == "ShiftInputComponent") {
+    ans = new ShiftInputComponent();
+  } else if (component_type == "LogComponent") {
+    ans = new LogComponent();
+  } else if (component_type == "ExpComponent") {
+    ans = new ExpComponent();
   } else if (component_type == "ScaleAndOffsetComponent") {
     ans = new ScaleAndOffsetComponent();
   }
@@ -218,6 +224,8 @@ UpdatableComponent::UpdatableComponent(const UpdatableComponent &other):
     learning_rate_factor_(other.learning_rate_factor_),
     l2_regularize_(other.l2_regularize_),
     is_gradient_(other.is_gradient_),
+    min_param_value_(other.min_param_value_),
+    max_param_value_(other.max_param_value_),
     max_change_(other.max_change_) { }

@@ -228,6 +236,8 @@ void UpdatableComponent::SetUpdatableConfigs(
   l2_regularize_ = other.l2_regularize_;
   is_gradient_ = other.is_gradient_;
   max_change_ = other.max_change_;
+  max_param_value_ = other.max_param_value_;
+  min_param_value_ = other.min_param_value_;
 }

 // If these defaults are changed, the defaults in the constructor that
@@ -244,6 +254,13 @@ void UpdatableComponent::InitLearningRatesFromConfig(ConfigLine *cfl) {
   if (learning_rate_ < 0.0 || learning_rate_factor_ < 0.0 ||
       max_change_ < 0.0 || l2_regularize_ < 0.0)
     KALDI_ERR << "Bad initializer " << cfl->WholeLine();
+  BaseFloat min_param_value = std::numeric_limits<BaseFloat>::lowest(),
+      max_param_value = std::numeric_limits<BaseFloat>::max();
+  cfl->GetValue("min-param-value", &min_param_value);
+  cfl->GetValue("max-param-value", &max_param_value);
+  KALDI_ASSERT(min_param_value < max_param_value);
+  min_param_value_ = min_param_value;
+  max_param_value_ = max_param_value;
 }

@@ -282,6 +299,21 @@ std::string UpdatableComponent::ReadUpdatableCommon(std::istream &is,
   } else {
     l2_regularize_ = 0.0;
   }
+
+  if (token == "<MaxParamValue>") {
+    ReadBasicType(is, binary, &max_param_value_);
+    ReadToken(is, binary, &token);
+  } else {
+    max_param_value_ = std::numeric_limits<BaseFloat>::max();
+  }
+
+  if (token == "<MinParamValue>") {
+    ReadBasicType(is, binary, &min_param_value_);
+    ReadToken(is, binary, &token);
+  } else {
+    min_param_value_ = std::numeric_limits<BaseFloat>::lowest();
+  }
+
   if (token == "<LearningRate>") {
     ReadBasicType(is, binary, &learning_rate_);
     return "";
   }
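A note on the read/write pattern above: optional fields are written only when they differ from their defaults, and the reader looks at the next token to decide whether a field is present, which keeps older models readable. A hedged, standalone sketch of the idea (plain iostreams, not the Kaldi I/O API):

    #include <iostream>
    #include <limits>
    #include <sstream>
    #include <string>

    int main() {
      float max_param = 1.5f;  // non-default, so it gets written
      std::ostringstream os;
      if (max_param < std::numeric_limits<float>::max())
        os << "<MaxParamValue> " << max_param << " ";
      os << "<LearningRate> " << 0.001f << " ";

      std::istringstream is(os.str());
      std::string token;
      float max_in = std::numeric_limits<float>::max(), lrate = 0.0f;
      is >> token;
      if (token == "<MaxParamValue>") {  // field present: consume, advance
        is >> max_in >> token;
      }                                  // else: keep the default
      if (token == "<LearningRate>") is >> lrate;
      std::cout << "max=" << max_in << " lrate=" << lrate << "\n";
      return 0;
    }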
@@ -312,6 +344,17 @@ void UpdatableComponent::WriteUpdatableCommon(std::ostream &os,
     WriteToken(os, binary, "<L2Regularize>");
     WriteBasicType(os, binary, l2_regularize_);
   }
+
+  if (max_param_value_ < std::numeric_limits<BaseFloat>::max()) {
+    WriteToken(os, binary, "<MaxParamValue>");
+    WriteBasicType(os, binary, max_param_value_);
+  }
+
+  if (min_param_value_ > std::numeric_limits<BaseFloat>::lowest()) {
+    WriteToken(os, binary, "<MinParamValue>");
+    WriteBasicType(os, binary, min_param_value_);
+  }
+
   WriteToken(os, binary, "<LearningRate>");
   WriteBasicType(os, binary, learning_rate_);
 }
@@ -330,6 +373,9 @@ std::string UpdatableComponent::Info() const {
     stream << ", learning-rate-factor=" << learning_rate_factor_;
   if (max_change_ > 0.0)
     stream << ", max-change=" << max_change_;
+
+  stream << ", max-param-value=" << max_param_value_
+         << ", min-param-value=" << min_param_value_;
   return stream.str();
 }
diff --git a/src/nnet3/nnet-component-itf.h b/src/nnet3/nnet-component-itf.h
index 01697353308..6fd94a1738a 100644
--- a/src/nnet3/nnet-component-itf.h
+++ b/src/nnet3/nnet-component-itf.h
@@ -443,6 +443,8 @@ class UpdatableComponent: public Component {
   // InitLearningRatesFromConfig() should be changed too.
   UpdatableComponent(): learning_rate_(0.001), learning_rate_factor_(1.0),
                         l2_regularize_(0.0), is_gradient_(false),
+                        min_param_value_(std::numeric_limits<BaseFloat>::lowest()),
+                        max_param_value_(std::numeric_limits<BaseFloat>::max()),
                         max_change_(0.0) { }

   virtual ~UpdatableComponent() { }
@@ -520,6 +522,13 @@ class UpdatableComponent: public Component {
   virtual void UnVectorize(const VectorBase<BaseFloat> &params) {
     KALDI_ASSERT(0);
   }
+
+  // This function applies the min/max parameter-value constraints by
+  // clamping each parameter to the range [min_param_value_, max_param_value_].
+  virtual void ApplyMinMaxToWeights() = 0;
+
+  BaseFloat MaxParamValue() const { return max_param_value_; }
+
+  BaseFloat MinParamValue() const { return min_param_value_; }

  protected:
   // to be called from child classes, extracts any learning rate information
@@ -538,7 +547,11 @@ class UpdatableComponent: public Component {
   // tag and the learning-rate factor (if not 1.0) and the
   // learning rate;
   void WriteUpdatableCommon(std::ostream &os, bool binary) const;
-
+
+  BaseFloat max_param_value_;  ///< Max parameter value; parameters larger than
+                               ///< this are clamped to it by ApplyMinMaxToWeights().
+  BaseFloat min_param_value_;  ///< Min parameter value; parameters smaller than
+                               ///< this are clamped to it by ApplyMinMaxToWeights().
BaseFloat learning_rate_; ///< learning rate (typically 0.0..0.01) BaseFloat learning_rate_factor_; ///< learning rate factor (normally 1.0, but ///< can be set to another < value so that diff --git a/src/nnet3/nnet-convolutional-component.h b/src/nnet3/nnet-convolutional-component.h index 35cf0de11c9..c39622775d1 100644 --- a/src/nnet3/nnet-convolutional-component.h +++ b/src/nnet3/nnet-convolutional-component.h @@ -276,6 +276,7 @@ class TimeHeightConvolutionComponent: public UpdatableComponent { virtual void Scale(BaseFloat scale); virtual void Add(BaseFloat alpha, const Component &other); virtual void PerturbParams(BaseFloat stddev); + virtual void ApplyMinMaxToWeights() {} virtual BaseFloat DotProduct(const UpdatableComponent &other) const; virtual int32 NumParameters() const; virtual void Vectorize(VectorBase *params) const; diff --git a/src/nnet3/nnet-general-component.h b/src/nnet3/nnet-general-component.h index cff73a55b59..163c42837ab 100644 --- a/src/nnet3/nnet-general-component.h +++ b/src/nnet3/nnet-general-component.h @@ -502,6 +502,7 @@ class BackpropTruncationComponent: public Component { virtual void Scale(BaseFloat scale); virtual void Add(BaseFloat alpha, const Component &other); + virtual void ApplyMinMaxToWeights() {} virtual void Read(std::istream &is, bool binary); // This Read function // requires that the Component has the correct type. /// Write component to stream @@ -675,6 +676,7 @@ class ConstantComponent: public UpdatableComponent { virtual void Scale(BaseFloat scale); virtual void Add(BaseFloat alpha, const Component &other); virtual void PerturbParams(BaseFloat stddev); + virtual void ApplyMinMaxToWeights() {} virtual BaseFloat DotProduct(const UpdatableComponent &other) const; virtual int32 NumParameters() const; virtual void Vectorize(VectorBase *params) const; diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index 4eb078c0fcb..0011b2989c2 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -1007,6 +1007,49 @@ void RectifiedLinearComponent::StoreStats( StoreStatsInternal(out_value, &temp_deriv); } + +void AffineComponent::ApplyMinMaxToWeights() { + BaseFloat max_param_value = MaxParamValue(), + min_param_value = MinParamValue(); + CuMatrix linear_params_diff(linear_params_); + if (min_param_value > std::numeric_limits::lowest()) { + linear_params_.ApplyFloor(min_param_value); + bias_params_.ApplyFloor(min_param_value); + } + int32 tot_dim = InputDim() * OutputDim(); + // percentage of weight, smaller than min-param-value, which mapped to + // min-param-value. + if (GetVerboseLevel() > 0) { + linear_params_diff.AddMat(-1.0, linear_params_); + linear_params_diff.Scale(-1.0); + linear_params_diff.ApplyHeaviside(); + BaseFloat num_min_weights = linear_params_diff.Sum(); + KALDI_LOG << num_min_weights / tot_dim * 100.0 << " % of parameters floored " + << " to min-param-value="<< min_param_value; + } + + linear_params_diff.CopyFromMat(linear_params_); + if (max_param_value < std::numeric_limits::max()) { + // apply min(max_value, w) + linear_params_.ApplyCeiling(max_param_value); + bias_params_.ApplyCeiling(max_param_value); + } + // percentage of weight, larger than max-param-value, which mapped to + // max-param-value. 
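+  // Counting trick used below: after clamping, diff = old - new is positive
+  // exactly where a weight was ceiled, so ApplyHeaviside() turns the
+  // difference matrix into 0/1 indicators and Sum() counts the clamped
+  // entries; dividing by tot_dim gives the fraction that is reported.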
+ if (GetVerboseLevel() > 0) { + linear_params_diff.AddMat(-1.0, linear_params_); + linear_params_diff.ApplyHeaviside(); + BaseFloat num_max_weights = linear_params_diff.Sum(); + KALDI_LOG << num_max_weights / tot_dim * 100.0 << " % of parameters ceiled " + << " to max-param-value="<< max_param_value; + } + + KALDI_ASSERT(linear_params_.Max() <= max_param_value && + linear_params_.Min() >= min_param_value && + bias_params_.Max() <= max_param_value && + bias_params_.Min() >= min_param_value); +} + void AffineComponent::Scale(BaseFloat scale) { if (scale == 0.0) { // If scale == 0.0 we call SetZero() which will get rid of NaN's and inf's. @@ -1094,6 +1137,9 @@ BaseFloat AffineComponent::DotProduct(const UpdatableComponent &other_in) const void AffineComponent::Init(int32 input_dim, int32 output_dim, BaseFloat param_stddev, BaseFloat bias_stddev) { + BaseFloat max_param_value = MaxParamValue(), + min_param_value = MinParamValue(); + KALDI_ASSERT(param_stddev < std::abs(max_param_value)); linear_params_.Resize(output_dim, input_dim); bias_params_.Resize(output_dim); KALDI_ASSERT(output_dim > 0 && input_dim > 0 && param_stddev >= 0.0); @@ -2245,7 +2291,9 @@ void ScaleAndOffsetComponent::InitFromConfig(ConfigLine *cfl) { } cfl->GetValue("rank", &rank); scales_.Resize(block_dim); - scales_.Set(1.0); + BaseFloat scale_value = 1.0; + cfl->GetValue("scale", &scale_value); + scales_.Set(scale_value); offsets_.Resize(block_dim); // offsets are all zero when initialized. if (cfl->HasUnusedValues()) @@ -2942,6 +2990,19 @@ void NaturalGradientAffineComponent::FreezeNaturalGradient(bool freeze) { preconditioner_out_.Freeze(freeze); } +void LinearComponent::ApplyMinMaxToWeights() { + BaseFloat max_param_value = MaxParamValue(), + min_param_value = MinParamValue(); + if (min_param_value > std::numeric_limits::lowest()) + params_.ApplyFloor(min_param_value); + + // apply min(max_value, w) + if (max_param_value < std::numeric_limits::max()) + params_.ApplyCeiling(max_param_value); + + KALDI_ASSERT(params_.Max() <= max_param_value && + params_.Min() >= min_param_value); +} void LinearComponent::Read(std::istream &is, bool binary) { std::string token = ReadUpdatableCommon(is, binary); @@ -3005,6 +3066,9 @@ void LinearComponent::InitFromConfig(ConfigLine *cfl) { if (!ok) KALDI_ERR << "Bad initializer " << cfl->WholeLine(); BaseFloat param_stddev = 1.0 / std::sqrt(input_dim); + BaseFloat max_param_value = MaxParamValue(), + min_param_value = MinParamValue(); + param_stddev = std::min(param_stddev, max_param_value); cfl->GetValue("param-stddev", ¶m_stddev); params_.Resize(output_dim, input_dim); KALDI_ASSERT(output_dim > 0 && input_dim > 0 && param_stddev >= 0.0); @@ -5860,6 +5924,389 @@ void SumBlockComponent::Backprop( } } +//For raw data +ShiftInputComponent::ShiftInputComponent(const ShiftInputComponent &other): + RandomComponent(other), + input_dim_(other.input_dim_), + output_dim_(other.output_dim_), + max_shift_(other.max_shift_), + rand_vol_var_(other.rand_vol_var_), + shift_per_frame_(other.shift_per_frame_), + dither_(other.dither_), + preprocess_(other.preprocess_) { } + +Component* ShiftInputComponent::Copy() const { + ShiftInputComponent *ans = new ShiftInputComponent(*this); + return ans; +} + +std::string ShiftInputComponent::Info() const { + std::stringstream stream; + stream << Type() << ", input-dim=" << input_dim_ + << ", output-dim=" << output_dim_ + << ", max-shift=" << max_shift_ + << ", shift-per-frame=" << shift_per_frame_ + << ", dither=" << dither_ + << ", preprocess=" << preprocess_; + 
return stream.str(); +} + +void ShiftInputComponent::Init(int32 input_dim, int32 output_dim, BaseFloat max_shift, + BaseFloat rand_vol_var, + BaseFloat dither, bool preprocess) { + input_dim_ = input_dim; + output_dim_ = output_dim; + max_shift_ = max_shift; + rand_vol_var_ = rand_vol_var; + dither_ = dither; + preprocess_ = preprocess; + KALDI_ASSERT(input_dim_ - output_dim_ >= 0 && input_dim_ > 0); + KALDI_ASSERT(max_shift >= 0.0 && max_shift <= 1.0); + KALDI_ASSERT(rand_vol_var >= 0.0 && rand_vol_var <= 1.0); +} + +void ShiftInputComponent::InitFromConfig(ConfigLine *cfl) { + bool ok = true, preprocess = false; + test_mode_ = false; + int32 input_dim, output_dim; + BaseFloat max_shift = 1.0, rand_vol_var = 0.0, dither = 0.0; + ok = ok && cfl->GetValue("input-dim", &input_dim); + ok = ok && cfl->GetValue("output-dim", &output_dim); + // It only makes sense to set test-mode in the config for testing purposes. + cfl->GetValue("test-mode", &test_mode_); + if (cfl->GetValue("max-shift", &max_shift)) + KALDI_ASSERT(max_shift >= 0.0 && max_shift <= 1.0); + if (cfl->GetValue("rand-vol-var", &rand_vol_var)) + KALDI_ASSERT(rand_vol_var >= 0 && rand_vol_var <= 1.0); + cfl->GetValue("shift-per-frame", &shift_per_frame_); + cfl->GetValue("dither", &dither); + cfl->GetValue("preprocess", &preprocess); + Init(input_dim, output_dim, max_shift, rand_vol_var, dither, preprocess); +} + +void ShiftInputComponent::Read(std::istream &is, bool binary) { + ExpectOneOrTwoTokens(is, binary, "", ""); + ReadBasicType(is, binary, &input_dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &output_dim_); + std::string token; + ReadToken(is, binary, &token); + if (token == "") { + ReadBasicType(is, binary, &max_shift_); + ReadToken(is, binary, &token); + if (token == "") { + ReadBasicType(is, binary, &rand_vol_var_); + ReadToken(is, binary, &token); + } + } + if (token == "") { + ReadBasicType(is, binary, &shift_per_frame_); + ReadToken(is, binary, &token); + } + if (token == "") { + ReadBasicType(is, binary, &dither_); + ReadToken(is, binary, &token); + } + if (token == "") { + ReadBasicType(is, binary, &preprocess_); + ReadToken(is, binary, &token); + } + + if (token == "") { + ReadBasicType(is, binary, &test_mode_); // read test mode + ExpectToken(is, binary, ""); + } else { + test_mode_ = false; + KALDI_ASSERT(token == ""); + } +} + +void ShiftInputComponent::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, input_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, output_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, max_shift_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, rand_vol_var_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, shift_per_frame_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, dither_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, preprocess_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, test_mode_); + WriteToken(os, binary, ""); +} + +void* ShiftInputComponent::Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const { + // dithering is done on both train and test time. + // it is done to make zero values nonzero on raw frame of signal. 
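+  // dither_ is the standard deviation of the Gaussian noise added below:
+  // SetRandn() fills a matrix with unit-variance samples and Scale(dither_)
+  // sets their magnitude before they are added to the input.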
+ CuMatrix modified_in(in); + if (dither_ != 0.0) { + CuMatrix dither_mat(in.NumRows(), in.NumCols()); + dither_mat.SetRandn(); + dither_mat.Scale(dither_); + modified_in.AddMat(1.0, dither_mat); + } + if (test_mode_) + out->CopyFromMat(modified_in.Range(0,in.NumRows(), 0, output_dim_)); + else { + int32 in_out_diff = input_dim_ - output_dim_, + shift; + KALDI_ASSERT(in_out_diff >= 0); + int32 max_shift_int = static_cast(max_shift_ * in_out_diff); + if (shift_per_frame_) { + int32 block_size = 1024, + num_blocks = out->NumRows() / block_size; + CuMatrix tmp(1, num_blocks, kUndefined); + const_cast&>(random_generator_).RandUniform(&tmp); + tmp.Scale(max_shift_int); + for (int i = 0; i < num_blocks; i++) { + int32 start_row = i * block_size, + num_shifted_row = std::min(block_size, out->NumRows() - start_row); + shift = static_cast(tmp(0,i) + 0.5); + CuSubMatrix out_row = out->Range(start_row, num_shifted_row, + 0, output_dim_); + out_row.CopyFromMat(modified_in.Range(start_row, num_shifted_row, + shift, output_dim_)); + } + } else { + // Generate random shift integer value. + shift = RandInt(0, max_shift_int); + out->CopyFromMat(modified_in.Range(0, in.NumRows(), shift, output_dim_)); + + BaseFloat rand_vol = (1.0 + rand_vol_var_ * + (Rand() % 2 ? -1.0 : 1.0) * RandUniform()); + if (rand_vol != 0 && rand_vol != 1.0) + out->Scale(rand_vol); + } + } + if (preprocess_) + Preprocess(out); + return NULL; +} + +void ShiftInputComponent::Preprocess(CuMatrixBase *preprocessed_in) const { + // removing dc offset + int32 dim = preprocessed_in->NumCols(), + num_rows = preprocessed_in->NumRows(); + BaseFloat scale_w = 1.0 / float(dim), preemph = 0.97; + CuVector mean(num_rows); + mean.AddColSumMat(scale_w, *preprocessed_in, 0); + preprocessed_in->AddVecToCols(-1.0, mean); + + // Doing pre-emphasis + CuMatrix shifted_in(preprocessed_in->ColRange(0, dim-1)); + preprocessed_in->ColRange(1, dim-1).AddMat(-1.0 * preemph, shifted_in); + preprocessed_in->ColRange(0,1).Scale(1.0 - preemph); + + + // Apply windowing + CuVector window(dim); + double a = M_2PI / (dim-1); + for (int32 i = 1; i < dim-1; i++) { + double i_fl = static_cast(i); + window(i) = std::pow(0.5 - 0.5 * cos(a * i_fl), 0.85); + } + window(0) = window(1); + window(dim - 1) = window(dim - 2); + CuMatrix window_mat(preprocessed_in->NumRows(), dim); + window_mat.CopyRowsFromVec(window); + preprocessed_in->MulElements(window_mat); +} + +void ShiftInputComponent::Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &, + const CuMatrixBase &, + const CuMatrixBase &, + void *memo, + Component *, + CuMatrixBase *in_deriv) const { + in_deriv->SetZero(); +} + +std::string LogComponent::Info() const { + std::stringstream stream; + stream << NonlinearComponent::Info() + << ", log-floor=" << log_floor_ + << ", additive-offset=" << additive_offset_; + return stream.str(); +} + +void LogComponent::InitFromConfig(ConfigLine *cfl) { + cfl->GetValue("log-floor", &log_floor_); + cfl->GetValue("additive-offset", &additive_offset_); + NonlinearComponent::InitFromConfig(cfl); +} + +void* LogComponent::Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const { + out->CopyFromMat(in); + if (additive_offset_) { + // Apply log(abs(x)+epsi) + out->ApplyPowAbs(1.0); + out->Add(log_floor_); + out->ApplyLog(); + } else { + // Apply log function (x >= epsi ? log(x) : log(epsi)). 
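+    // The matching derivative, used in Backprop below, is
+    // d/dx log(max(x, epsi)) = (x > epsi ? 1/x : 0), i.e. gradients are
+    // cut off wherever the floor was active.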
+ out->ApplyFloor(log_floor_); + out->ApplyLog(); + } + return NULL; +} + +void LogComponent::Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update, + CuMatrixBase *in_deriv) const { + if (in_deriv != NULL) { + CuMatrix divided_in_value(in_value), floored_in_value(in_value); + divided_in_value.Set(1.0); + floored_in_value.CopyFromMat(in_value); + if (additive_offset_) { + in_deriv->CopyFromMat(in_value); + in_deriv->ApplyHeaviside(); // (x > 0 ? 1 : 0) + in_deriv->Scale(2.0); + in_deriv->Add(-1.0); // (x > 0 ? 1 : -1) + floored_in_value.ApplyPowAbs(1.0); + floored_in_value.Add(log_floor_); // (abs(x) + epsi) + divided_in_value.DivElements(floored_in_value); // 1 / (abs(x) + epsi) + in_deriv->MulElements(divided_in_value); // (dy/dx: x > 0 : 1/(abs(x) + epsi), -1/(abs(x) + epsi)) + in_deriv->MulElements(out_deriv); // dF/dx = dF/dy * dy/dx + } else { + floored_in_value.ApplyFloor(log_floor_); // (x > epsi ? x : epsi) + divided_in_value.DivElements(floored_in_value); // (x > epsi ? 1/x : 1/epsi) + in_deriv->CopyFromMat(in_value); + in_deriv->Add(-1.0 * log_floor_); // (x - epsi) + in_deriv->ApplyHeaviside(); // (x > epsi ? 1 : 0) + in_deriv->MulElements(divided_in_value); // (dy/dx: x > epsi ? 1/x : 0) + in_deriv->MulElements(out_deriv); // dF/dx = dF/dy * dy/dx + } + } +} + +void LogComponent::Read(std::istream &is, bool binary) { + std::ostringstream ostr_beg, ostr_end; + ostr_beg << "<" << Type() << ">"; // e.g. "" + ostr_end << ""; // e.g. "" + ExpectOneOrTwoTokens(is, binary, ostr_beg.str(), ""); + ReadBasicType(is, binary, &dim_); // Read dimension. + ExpectToken(is, binary, ""); + value_sum_.Read(is, binary); + ExpectToken(is, binary, ""); + deriv_sum_.Read(is, binary); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &count_); + value_sum_.Scale(count_); + deriv_sum_.Scale(count_); + + std::string token; + ReadToken(is, binary, &token); + if (token == "") { + ReadBasicType(is, binary, &self_repair_lower_threshold_); + ReadToken(is, binary, &token); + } + if (token == "") { + ReadBasicType(is, binary, &self_repair_upper_threshold_); + ReadToken(is, binary, &token); + } + if (token == "") { + ReadBasicType(is, binary, &self_repair_scale_); + ReadToken(is, binary, &token); + } + if (token == "") { + ReadBasicType(is, binary, &log_floor_); + ReadToken(is, binary, &token); + } + + if (token == "") { + ReadBasicType(is, binary, &additive_offset_); + ReadToken(is, binary, &token); + } + + if (token != ostr_end.str()) { + KALDI_ERR << "Expected token " << ostr_end.str() + << ", got " << token; + } +} + +void LogComponent::Write(std::ostream &os, bool binary) const { + std::ostringstream ostr_beg, ostr_end; + ostr_beg << "<" << Type() << ">"; // e.g. "" + ostr_end << ""; // e.g. "" + WriteToken(os, binary, ostr_beg.str()); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, dim_); + // Write the values and derivatives in a count-normalized way, for + // greater readability in text form. 
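+  // Read() undoes this normalization by scaling the stored averages back up
+  // by the count, so the round trip preserves the raw sums.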
+ WriteToken(os, binary, ""); + Vector temp(value_sum_); + if (count_ != 0.0) temp.Scale(1.0 / count_); + temp.Write(os, binary); + WriteToken(os, binary, ""); + + temp.Resize(deriv_sum_.Dim(), kUndefined); + temp.CopyFromVec(deriv_sum_); + if (count_ != 0.0) temp.Scale(1.0 / count_); + temp.Write(os, binary); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, count_); + if (self_repair_lower_threshold_ != kUnsetThreshold) { + WriteToken(os, binary, ""); + WriteBasicType(os, binary, self_repair_lower_threshold_); + } + if (self_repair_upper_threshold_ != kUnsetThreshold) { + WriteToken(os, binary, ""); + WriteBasicType(os, binary, self_repair_upper_threshold_); + } + if (self_repair_scale_ != 0.0) { + WriteToken(os, binary, ""); + WriteBasicType(os, binary, self_repair_scale_); + } + WriteToken(os, binary, ""); + WriteBasicType(os, binary, log_floor_); + + WriteToken(os, binary, ""); + WriteBasicType(os, binary, additive_offset_); + WriteToken(os, binary, ostr_end.str()); +} + +void* ExpComponent::Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const { + // Applied exp function + out->CopyFromMat(in); + out->ApplyExp(); + return NULL; +} + +void ExpComponent::Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &,//in_value, + const CuMatrixBase &out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update, + CuMatrixBase *in_deriv) const { + if (in_deriv != NULL) { + in_deriv->CopyFromMat(out_value); + in_deriv->MulElements(out_deriv); + } +} + + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index 3929c253aab..b1664473a4e 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -387,6 +387,7 @@ class PerElementOffsetComponent; // AffineComponent. 
class AffineComponent: public UpdatableComponent { public: + virtual void ApplyMinMaxToWeights(); virtual int32 InputDim() const { return linear_params_.NumCols(); } virtual int32 OutputDim() const { return linear_params_.NumRows(); } @@ -482,6 +483,7 @@ class RepeatedAffineComponent; /// num-blocks must divide both input-dim and output-dim class BlockAffineComponent : public UpdatableComponent { public: + virtual void ApplyMinMaxToWeights() {} virtual int32 InputDim() const { return linear_params_.NumCols() * num_blocks_; } virtual int32 OutputDim() const { return linear_params_.NumRows(); } @@ -547,7 +549,7 @@ class BlockAffineComponent : public UpdatableComponent { class RepeatedAffineComponent: public UpdatableComponent { public: - + virtual void ApplyMinMaxToWeights() {} virtual int32 InputDim() const { return linear_params_.NumCols() * num_repeats_; } virtual int32 OutputDim() const { return linear_params_.NumRows() * num_repeats_; } @@ -898,6 +900,7 @@ class NaturalGradientAffineComponent: public AffineComponent { */ class LinearComponent: public UpdatableComponent { public: + virtual void ApplyMinMaxToWeights(); virtual int32 InputDim() const { return params_.NumCols(); } virtual int32 OutputDim() const { return params_.NumRows(); } @@ -1462,6 +1465,7 @@ class PermuteComponent: public Component { */ class PerElementScaleComponent: public UpdatableComponent { public: + virtual void ApplyMinMaxToWeights() {} virtual int32 InputDim() const { return scales_.Dim(); } virtual int32 OutputDim() const { return scales_.Dim(); } @@ -1558,6 +1562,7 @@ class PerElementScaleComponent: public UpdatableComponent { */ class PerElementOffsetComponent: public UpdatableComponent { public: + virtual void ApplyMinMaxToWeights() {} virtual int32 InputDim() const { return dim_; } virtual int32 OutputDim() const { return dim_; } @@ -1622,6 +1627,7 @@ class PerElementOffsetComponent: public UpdatableComponent { // no inputs]. class ConstantFunctionComponent: public UpdatableComponent { public: + virtual void ApplyMinMaxToWeights() {} virtual int32 InputDim() const { return input_dim_; } virtual int32 OutputDim() const { return output_.Dim(); } @@ -1794,6 +1800,7 @@ class NaturalGradientPerElementScaleComponent: public PerElementScaleComponent { */ class ScaleAndOffsetComponent: public UpdatableComponent { public: + virtual void ApplyMinMaxToWeights() {} virtual int32 InputDim() const { return dim_; } virtual int32 OutputDim() const { return dim_; } @@ -1963,6 +1970,7 @@ class ScaleAndOffsetComponent: public UpdatableComponent { */ class ConvolutionComponent: public UpdatableComponent { public: + virtual void ApplyMinMaxToWeights() {} enum TensorVectorizationType { kYzx = 0, kZyx = 1 @@ -2185,6 +2193,7 @@ class ConvolutionComponent: public UpdatableComponent { class LstmNonlinearityComponent: public UpdatableComponent { public: + virtual void ApplyMinMaxToWeights() {} virtual int32 InputDim() const; virtual int32 OutputDim() const; virtual std::string Info() const; @@ -2426,6 +2435,7 @@ class MaxpoolingComponent: public Component { */ class CompositeComponent: public UpdatableComponent { public: + virtual void ApplyMinMaxToWeights() {} virtual int32 InputDim() const; virtual int32 OutputDim() const; @@ -2519,6 +2529,149 @@ class CompositeComponent: public UpdatableComponent { }; +//For raw data +/* + * The shiftedComponent shifts the input using random or constant shift. + * The output y contains the shifted version of input and it is equal to + * x.Range(shift * diff, output_dim_), where 0 <= shift < 1. 
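+ * For example, with input_dim_ = 440 and output_dim_ = 400 (so diff = 40),
+ * max_shift = 0.5 limits the starting offset to the first 20 samples, and
+ * each output frame is a 400-sample window taken from a randomly shifted
+ * position in the 440-sample input.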
+ * The output_dim_ is the target dimension of the output and the input_dim_ is the input + * dim of this component and diff = input_dim_ - output_dim_ and input_dim_ > output_dim_ and the diff should be > = original frame_length. + * This component is useful when we train a DNN using raw-waveform + * and we can shift the input e.g. shift the input by 20% of original frame-length. + * max_shift_ is the max shift used to shift the input(0 <= max_shift_ <= 1, the default is 0.5.) + */ +class ShiftInputComponent: public RandomComponent { + public: + void Init(int32 input_dim, int32 output_dim, BaseFloat max_shift, + BaseFloat rand_vol_var = 0.0, BaseFloat dither = 0.0, bool preprocess = false); + + explicit ShiftInputComponent(const ShiftInputComponent &other); + + explicit ShiftInputComponent(int32 input_dim, int32 output_dim, + BaseFloat max_shift, + BaseFloat rand_vol_var = 0.0, + BaseFloat dither = 0.0, + bool preprocess = false) { + Init(input_dim, output_dim, max_shift, rand_vol_var, dither, preprocess); } + ShiftInputComponent(): input_dim_(0), output_dim_(0), max_shift_(1.0), + rand_vol_var_(0.0), shift_per_frame_(false), dither_(0.0), + preprocess_(false) { } + + virtual std::string Type() const { return "ShiftInputComponent"; } + virtual std::string Info() const; + virtual void InitFromConfig(ConfigLine *cfl); + void SetShiftAndVolume(BaseFloat shift, BaseFloat vol_var) { max_shift_ = shift; + rand_vol_var_ = vol_var; } + virtual int32 InputDim() const { return input_dim_; } + virtual int32 OutputDim() const { return output_dim_; } + virtual int32 Properties() const { + return kSimpleComponent|kRandomComponent; + } + virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const; + + virtual void Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update, + CuMatrixBase *in_deriv) const; + + virtual Component* Copy() const; + + virtual void Read(std::istream &is, bool binary); // This Read function + // requires that the Component has the correct type. + + /// Write component to stream + virtual void Write(std::ostream &os, bool binary) const; + protected: + void Preprocess(CuMatrixBase *preprocessed_in) const; + int32 input_dim_; + int32 output_dim_; + BaseFloat max_shift_; // max shift is the max shift used to shift the input. + // max_shift_ should be between 0 and 1. + BaseFloat rand_vol_var_; // The variance used to generate random volume perturbation value. + bool shift_per_frame_; // If true, different random shift is applied per frame of input. + BaseFloat dither_; // The random vector with stddev of dither_ is added to input before random shift. + // The main reason is to make zero values on raw waveform nonzero. + // This is done on both test and train. + bool preprocess_; // If true, the preemphasis, mean-removal and windowing is applied + // on outputs. 
+}; + +// The ExpComponent outputs the exp of input values as y = Exp(x) +class ExpComponent: public NonlinearComponent { + public: + explicit ExpComponent(const ExpComponent &other): + NonlinearComponent(other) { } + ExpComponent() { } + virtual std::string Type() const { return "ExpComponent"; } + virtual int32 Properties() const { + return kSimpleComponent|kBackpropNeedsOutput|kStoresStats; + } + virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const; + virtual void Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &, + const CuMatrixBase &out_value, + const CuMatrixBase &, + void *memo, + Component *to_update, + CuMatrixBase *in_deriv) const; + + virtual Component* Copy() const { return new ExpComponent(*this); } + private: + ExpComponent &operator = (const ExpComponent &other); // Disallow. +}; + +// The LogComponent outputs the log of input values as y = Log(max(x, epsi)) +class LogComponent: public NonlinearComponent { + public: + //explicit LogComponent(int32 dim): dim_(dim) { } + explicit LogComponent(const LogComponent &other): + NonlinearComponent(other), log_floor_(other.log_floor_), + additive_offset_(other.additive_offset_) {} + LogComponent(): log_floor_(1e-10), additive_offset_(false) { } + virtual std::string Type() const { return "LogComponent"; } + virtual int32 Properties() const { + return kSimpleComponent|kBackpropNeedsInput|kStoresStats; + } + + virtual std::string Info() const; + + virtual void InitFromConfig(ConfigLine *cfl); + + virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const; + virtual void Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update, + CuMatrixBase *in_deriv) const; + + virtual Component* Copy() const { return new LogComponent(*this); } + + virtual void Read(std::istream &is, bool binary); + + virtual void Write(std::ostream &os, bool binary) const; + + private: + LogComponent &operator = (const LogComponent &other); // Disallow. + BaseFloat log_floor_; + bool additive_offset_; // If true, log is computed using abs(x) + log_floor_ + // otherwise it is computed as log(max(x,log_floor_)) +}; + + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index 1d00125a361..8246148abc6 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -2051,6 +2051,22 @@ void ApplyL2Regularization(const Nnet &nnet, } } +bool PositiveUpdatableWeights(Nnet *nnet) { + for (int32 c = 0; c < nnet->NumComponents(); c++) { + Component *comp = nnet->GetComponent(c); + if (comp->Properties() & kUpdatableComponent) { + UpdatableComponent *src_comp = + dynamic_cast(comp); + BaseFloat min_param_value = src_comp->MinParamValue(), + max_param_value = src_comp->MaxParamValue(); + KALDI_ASSERT(min_param_value < max_param_value); + // apply min and max weight constraints to linear and bias parameters. 
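+      // Components with no configured constraints keep the defaults
+      // (lowest()/max() of BaseFloat), and several component types implement
+      // ApplyMinMaxToWeights() as a no-op, so this call only affects
+      // components that opted in via min/max-param-value.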
+ src_comp->ApplyMinMaxToWeights(); + } + } + return true; +} + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h index 3b304b8fb39..00aeb4a1661 100644 --- a/src/nnet3/nnet-utils.h +++ b/src/nnet3/nnet-utils.h @@ -497,6 +497,15 @@ void ConstrainOrthonormal(Nnet *nnet); int32 GetNumNvalues(const std::vector &io_vec, bool exhaustive); +/** + This function is used as part of the regular training workflow, after + UpdateNnetWithMaxChange(). + For each Updatable component c in the neural net, it makes updatable params + less than min_param_value_ to be equal to this value and also params larger + than max_param_value_ to max_param_value_. +*/ +bool PositiveUpdatableWeights(Nnet *nnet); + } // namespace nnet3 } // namespace kaldi From e3b272735d0003bc3b2ca5d0f5479da1028d2c2b Mon Sep 17 00:00:00 2001 From: LvHang Date: Fri, 11 May 2018 15:18:16 -0400 Subject: [PATCH 2/2] Add fvector code add _separate version fvector code --- egs/mini_librispeech/s5/cmd.sh | 7 +- .../s5/conf/queue_no_k20.conf | 16 + .../s5/local/fvector/add_output_node.py | 68 ++++ .../local/fvector/generate_sin_cos_matrix.py | 52 +++ .../s5/local/fvector/run_fvector.sh | 78 +++++ .../s5/local/fvector/run_fvector_separate.sh | 79 +++++ .../s5/local/fvector/run_tdnn.sh | 214 ++++++++++++ egs/mini_librispeech/s5/run.sh | 2 + egs/mini_librispeech/s5/run_fvector.sh | 64 ++++ .../s5/run_fvector_separate.sh | 64 ++++ egs/wsj/s5/steps/nnet3/fvector/get_egs.sh | 262 +++++++++++++++ .../steps/nnet3/fvector/get_egs_separate.sh | 165 ++++++++++ .../nnet3/fvector/make_fvector_feature.sh | 158 +++++++++ egs/wsj/s5/steps/nnet3/xvector/train.sh | 253 ++++++++++++++ .../s5/steps/nnet3/xvector/train_separate.sh | 257 +++++++++++++++ src/Makefile | 13 +- src/cudamatrix/cu-kernels-ansi.h | 9 + src/cudamatrix/cu-kernels.cu | 40 +++ src/cudamatrix/cu-kernels.h | 15 + src/cudamatrix/cu-math.cc | 43 +++ src/cudamatrix/cu-math.h | 31 ++ src/cudamatrix/cu-packed-matrix.cc | 9 + src/cudamatrix/cu-packed-matrix.h | 3 + src/fvector/Makefile | 19 ++ src/fvector/fvector-perturb-test.cc | 86 +++++ src/fvector/fvector-perturb.cc | 289 ++++++++++++++++ src/fvector/fvector-perturb.h | 172 ++++++++++ src/fvectorbin/Makefile | 25 ++ src/fvectorbin/compute-wav-to-rawmatrix.cc | 123 +++++++ src/fvectorbin/fvector-add-noise-block.cc | 61 ++++ src/fvectorbin/fvector-add-noise-separate.cc | 72 ++++ src/fvectorbin/fvector-add-noise.cc | 59 ++++ src/fvectorbin/fvector-chunk-block.cc | 212 ++++++++++++ src/fvectorbin/fvector-chunk-separate.cc | 207 ++++++++++++ src/fvectorbin/fvector-chunk.cc | 195 +++++++++++ .../fvector-debug-check-filter-bank.cc | 64 ++++ src/fvectorbin/fvector-debug-wav-to-vector.cc | 41 +++ src/fvectorbin/fvector-debug-write-to-wav.cc | 52 +++ src/fvectorbin/fvector-get-egs-block.cc | 122 +++++++ src/fvectorbin/fvector-get-egs.cc | 143 ++++++++ src/nnet3/nnet-example-utils.cc | 17 +- src/nnet3/nnet-utils.cc | 63 ++++ src/nnet3/nnet-utils.h | 7 + src/xvector/Makefile | 22 ++ src/xvector/nnet-xvector-compute.cc | 99 ++++++ src/xvector/nnet-xvector-compute.h | 55 ++++ src/xvector/nnet-xvector-diagnostics.cc | 214 ++++++++++++ src/xvector/nnet-xvector-diagnostics.h | 95 ++++++ src/xvector/nnet-xvector-training.cc | 272 +++++++++++++++ src/xvector/nnet-xvector-training.h | 89 +++++ src/xvector/xvector-test.cc | 311 ++++++++++++++++++ src/xvector/xvector.cc | 130 ++++++++ src/xvector/xvector.h | 94 ++++++ src/xvectorbin/Makefile | 28 ++ src/xvectorbin/nnet3-xvector-compute-prob.cc | 81 +++++ 
.../nnet3-xvector-compute-simple.cc | 155 +++++++++ src/xvectorbin/nnet3-xvector-compute.cc | 211 ++++++++++++ .../nnet3-xvector-get-egs-sre-subsample.cc | 264 +++++++++++++++ src/xvectorbin/nnet3-xvector-get-egs-sre.cc | 237 +++++++++++++ src/xvectorbin/nnet3-xvector-get-egs.cc | 244 ++++++++++++++ src/xvectorbin/nnet3-xvector-scoring.cc | 151 +++++++++ src/xvectorbin/nnet3-xvector-show-progress.cc | 158 +++++++++ src/xvectorbin/nnet3-xvector-train.cc | 94 ++++++ tools/config/common_path.sh | 2 + 64 files changed, 6927 insertions(+), 10 deletions(-) mode change 100644 => 100755 egs/mini_librispeech/s5/cmd.sh create mode 100644 egs/mini_librispeech/s5/conf/queue_no_k20.conf create mode 100644 egs/mini_librispeech/s5/local/fvector/add_output_node.py create mode 100644 egs/mini_librispeech/s5/local/fvector/generate_sin_cos_matrix.py create mode 100755 egs/mini_librispeech/s5/local/fvector/run_fvector.sh create mode 100755 egs/mini_librispeech/s5/local/fvector/run_fvector_separate.sh create mode 100755 egs/mini_librispeech/s5/local/fvector/run_tdnn.sh create mode 100755 egs/mini_librispeech/s5/run_fvector.sh create mode 100755 egs/mini_librispeech/s5/run_fvector_separate.sh create mode 100755 egs/wsj/s5/steps/nnet3/fvector/get_egs.sh create mode 100755 egs/wsj/s5/steps/nnet3/fvector/get_egs_separate.sh create mode 100755 egs/wsj/s5/steps/nnet3/fvector/make_fvector_feature.sh create mode 100755 egs/wsj/s5/steps/nnet3/xvector/train.sh create mode 100755 egs/wsj/s5/steps/nnet3/xvector/train_separate.sh create mode 100644 src/fvector/Makefile create mode 100644 src/fvector/fvector-perturb-test.cc create mode 100644 src/fvector/fvector-perturb.cc create mode 100644 src/fvector/fvector-perturb.h create mode 100644 src/fvectorbin/Makefile create mode 100644 src/fvectorbin/compute-wav-to-rawmatrix.cc create mode 100644 src/fvectorbin/fvector-add-noise-block.cc create mode 100644 src/fvectorbin/fvector-add-noise-separate.cc create mode 100644 src/fvectorbin/fvector-add-noise.cc create mode 100644 src/fvectorbin/fvector-chunk-block.cc create mode 100644 src/fvectorbin/fvector-chunk-separate.cc create mode 100644 src/fvectorbin/fvector-chunk.cc create mode 100644 src/fvectorbin/fvector-debug-check-filter-bank.cc create mode 100644 src/fvectorbin/fvector-debug-wav-to-vector.cc create mode 100644 src/fvectorbin/fvector-debug-write-to-wav.cc create mode 100644 src/fvectorbin/fvector-get-egs-block.cc create mode 100644 src/fvectorbin/fvector-get-egs.cc create mode 100644 src/xvector/Makefile create mode 100644 src/xvector/nnet-xvector-compute.cc create mode 100644 src/xvector/nnet-xvector-compute.h create mode 100644 src/xvector/nnet-xvector-diagnostics.cc create mode 100644 src/xvector/nnet-xvector-diagnostics.h create mode 100644 src/xvector/nnet-xvector-training.cc create mode 100644 src/xvector/nnet-xvector-training.h create mode 100644 src/xvector/xvector-test.cc create mode 100644 src/xvector/xvector.cc create mode 100644 src/xvector/xvector.h create mode 100644 src/xvectorbin/Makefile create mode 100644 src/xvectorbin/nnet3-xvector-compute-prob.cc create mode 100644 src/xvectorbin/nnet3-xvector-compute-simple.cc create mode 100644 src/xvectorbin/nnet3-xvector-compute.cc create mode 100644 src/xvectorbin/nnet3-xvector-get-egs-sre-subsample.cc create mode 100644 src/xvectorbin/nnet3-xvector-get-egs-sre.cc create mode 100644 src/xvectorbin/nnet3-xvector-get-egs.cc create mode 100644 src/xvectorbin/nnet3-xvector-scoring.cc create mode 100644 src/xvectorbin/nnet3-xvector-show-progress.cc create mode 
100644 src/xvectorbin/nnet3-xvector-train.cc diff --git a/egs/mini_librispeech/s5/cmd.sh b/egs/mini_librispeech/s5/cmd.sh old mode 100644 new mode 100755 index 71dd849a93b..4f3b12aa700 --- a/egs/mini_librispeech/s5/cmd.sh +++ b/egs/mini_librispeech/s5/cmd.sh @@ -10,6 +10,7 @@ # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -export train_cmd="queue.pl --mem 2G" -export decode_cmd="queue.pl --mem 4G" -export mkgraph_cmd="queue.pl --mem 8G" +export train_cmd="queue.pl --mem 2G --config conf/queue_no_k20.conf --allow-k10-k20 true" +export decode_cmd="queue.pl --mem 4G --config conf/queue_no_k20.conf --allow-k10-k20 true" +export mkgraph_cmd="queue.pl --mem 8G --config conf/queue_no_k20.conf --allow-k10-k20 true" +export cuda_cmd="queue.pl --gpu 1 --config conf/queue_no_k20.conf --allow-k10-k20 true" diff --git a/egs/mini_librispeech/s5/conf/queue_no_k20.conf b/egs/mini_librispeech/s5/conf/queue_no_k20.conf new file mode 100644 index 00000000000..e8d19a24ef7 --- /dev/null +++ b/egs/mini_librispeech/s5/conf/queue_no_k20.conf @@ -0,0 +1,16 @@ +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -l 'hostname=!a08*&!a09*&!a10*&!c04*&!b18*&!b19*&!b20*' +option mem=* -l mem_free=$0,ram_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -pe smp $0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q +option gpu=* -l gpu=$0 -q g.q +default allow_k20=true +option allow_k20=true +option allow_k20=false -l 'hostname=!g01*&!g02*&!b06*' +default allow_k10_k20=true +option allow_k10_k20=true +option allow_k10_k20=false -l 'hostname=!b0*&!b10*&!g01*&!g02' diff --git a/egs/mini_librispeech/s5/local/fvector/add_output_node.py b/egs/mini_librispeech/s5/local/fvector/add_output_node.py new file mode 100644 index 00000000000..c3d446e5db4 --- /dev/null +++ b/egs/mini_librispeech/s5/local/fvector/add_output_node.py @@ -0,0 +1,68 @@ +#!/usr/bin/env/python + +from __future__ import print_function +import argparse +import logging +import os +import pprint +import sys +import shutil +import traceback + +def get_args(): + parser = argparse.ArgumentParser(description="Add the S and b output node " + "which is used in plda object function.", + epilog="Called by local/fvector/run_fvector.sh") + parser.add_argument("--input-dim", type=int, required=True, + help="The input dimension of fvector network.") + parser.add_argument("--output-dim", type=int, required=True, + help="The output dimension of fvector network which is used to " + "compute the dimension of S matrix.") + parser.add_argument("--s-scale", type=float, default=0.2, + help="Scaling factor on the output 's' (s is a symmetric matrix " + "used for scoring).") + parser.add_argument("--b-scale", type=float, default=0.2, + help="Scaling factor on output 'b' (b is a scalar offset used in scoring).") + parser.add_argument("--config-file", type=str, required=True, + help="The file is needed to be modified. 
It is typically configs/final.config.")
+
+    print(' '.join(sys.argv), file=sys.stderr)
+    print(sys.argv, file=sys.stderr)
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = get_args()
+
+    f = open(args.config_file, "a")
+    # The 's' output: a symmetric scoring matrix stored in packed form, so
+    # its dimension is d*(d+1)/2; integer division keeps the config integral.
+    s_dim = args.output_dim * (args.output_dim + 1) // 2
+
+    print('component name=x-s type=ConstantFunctionComponent input-dim={0} output-dim={1} '
+          'output-mean=0 output-stddev=0 '.format(
+              args.input_dim, s_dim), file=f)
+    print('component-node name=x-s component=x-s input=IfDefined(input)',
+          file=f)
+    print('component name=x-s-scale type=FixedScaleComponent dim={0} scale={1}'.format(
+        s_dim, args.s_scale), file=f)
+    print('component-node name=x-s-scale component=x-s-scale input=x-s',
+          file=f)
+    print('output-node name=s input=x-s-scale', file=f)
+
+    # now the 'b' output, which is just a scalar.
+    b_dim = 1
+    print('component name=x-b type=ConstantFunctionComponent input-dim={0} output-dim=1 '
+          'output-mean=0 output-stddev=0 '.format(args.input_dim), file=f)
+    print('component-node name=x-b component=x-b input=IfDefined(input)', file=f)
+    print('component name=x-b-scale type=FixedScaleComponent dim=1 scale={0}'.format(
+        args.b_scale), file=f)
+    print('component-node name=x-b-scale component=x-b-scale input=x-b',
+          file=f)
+    print('output-node name=b input=x-b-scale', file=f)
+    f.close()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/mini_librispeech/s5/local/fvector/generate_sin_cos_matrix.py b/egs/mini_librispeech/s5/local/fvector/generate_sin_cos_matrix.py
new file mode 100644
index 00000000000..45e986723a3
--- /dev/null
+++ b/egs/mini_librispeech/s5/local/fvector/generate_sin_cos_matrix.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+import argparse
+import logging
+import os
+import pprint
+import shutil
+import sys
+import traceback
+
+sys.path.insert(0, 'steps')
+import libs.common as common_lib
+
+def get_args():
+    parser = argparse.ArgumentParser(description="Generate sine_transform.mat "
+                                     "and cosine_transform.mat for the frequency-domain "
+                                     "raw-waveform setup.",
+                                     epilog="Called by local/fvector/run_fvector.sh")
+    parser.add_argument("--feat-dim", type=int, required=True,
+                        help="The dimension of the input.")
+    parser.add_argument("--add-bias", type=str,
+                        help="If true, add a bias column to the fft matrix.",
+                        default="True", choices=["True", "False"])
+    parser.add_argument("--half-range", type=str,
+                        help="If true, generate only the first half of the fft matrix.",
+                        default="True", choices=["True", "False"])
+    parser.add_argument("--dir", type=str, required=True,
+                        help="The output directory.")
+
+    print(' '.join(sys.argv), file=sys.stderr)
+    print(sys.argv, file=sys.stderr)
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = get_args()
+
+    feat_dim = args.feat_dim
+    num_fft_bins = (2**(args.feat_dim-1).bit_length())
+    # Convert the "True"/"False" strings to booleans (both strings would
+    # otherwise be truthy when used as flags).
+    add_bias = (args.add_bias == "True")
+    half_range = (args.half_range == "True")
+
+    common_lib.write_sin_cos_transform_matrix(feat_dim, num_fft_bins,
+        "{0}/configs/cos_transform.mat".format(args.dir),
+        compute_cosine=True, add_bias=add_bias, half_range=half_range)
+    common_lib.write_sin_cos_transform_matrix(feat_dim, num_fft_bins,
+        "{0}/configs/sin_transform.mat".format(args.dir),
+        compute_cosine=False, add_bias=add_bias, half_range=half_range)
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/mini_librispeech/s5/local/fvector/run_fvector.sh b/egs/mini_librispeech/s5/local/fvector/run_fvector.sh
new file mode 100755
index 00000000000..d57c6813428
--- /dev/null
+++
b/egs/mini_librispeech/s5/local/fvector/run_fvector.sh @@ -0,0 +1,78 @@ +#!/bin/bash + +. ./cmd.sh +set -e + +stage=3 +train_stage=-10 +data=data/train_clean_5 +noise_data=data/noise +egs_dir=exp/fvector/egs +fvector_dir=exp/fvector +use_gpu=true + +. ./path.sh +. ./cmd.sh +. ./utils/parse_options.sh + +if [ $stage -le 3 ]; then + #dump egs + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b{11,12,13}/$USER/kaldi-data/egs/minilibrispeech-$(date +'%m_%d_%H_%M')/s5/$egs_dir/storage $egs_dir/storage + fi + + steps/nnet3/fvector/get_egs.sh --cmd "$train_cmd" \ + --nj 8 \ + --stage 0 \ + --egs-per-iter 12500 \ + --egs-per-iter-diagnostic 10000 \ + --num-diagnostic-percent 5 \ + --frame-length 25 \ + --left-padding 1 \ + --right-padding 1 \ + "$data" "$noise_data" "$egs_dir" +fi + +if [ $stage -le 4 ]; then + #prepare configs + echo "$0: creating neural net configs using the xconfig parser"; + #options + input_dim=400 + num_filters=100 + + mkdir -p $fvector_dir/configs + + cat < $fvector_dir/configs/network.xconfig + input dim=$input_dim name=input + # Each eg contains 8 frames, do Frequency-domain feature learning, and then + # use TDNN model split it into one vector + preprocess-fft-abs-lognorm-affine-log-layer name=raw0 cos-transform-file=$fvector_dir/configs/cos_transform.mat sin-transform-file=$fvector_dir/configs/sin_transform.mat num-filters=$num_filters half-fft-range=true + conv-relu-batchnorm-layer name=cnn1 height-in=$num_filters height-out=$[$num_filters/2] time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 height-subsample-out=2 learning-rate-factor=0.34 max-change=0.25 + + relu-batchnorm-layer name=tdnn0 input=cnn1 dim=625 + relu-batchnorm-layer name=tdnn1 input=Append(0,1,2) dim=625 + relu-batchnorm-layer name=tdnn2 input=Append(0,1,2) dim=625 + relu-batchnorm-layer name=tdnn3 input=Append(0,1,2) dim=625 + relu-batchnorm-layer name=tdnn4 input=Append(0,1) dim=625 + output-layer name=output input=tdnn4 dim=200 include-log-softmax=False param-stddev=0.04 bias-stddev=1.0 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $fvector_dir/configs/network.xconfig --config-dir $fvector_dir/configs/ + # Modify the final.config and generate sin.mat/cos.mat manually + python local/fvector/add_output_node.py --input-dim 400 --output-dim 200 --config-file $fvector_dir/configs/final.config + python local/fvector/generate_sin_cos_matrix.py \ + --feat-dim 400 --dir $fvector_dir +fi + +if [ $stage -le 5 ]; then + #training + steps/nnet3/xvector/train.sh --cmd "$train_cmd" \ + --initial-effective-lrate 0.002 \ + --final-effective-lrate 0.0002 \ + --max-param-change 0.2 \ + --minibatch-size 16 \ + --num-epochs 8 --use-gpu $use_gpu --stage $train_stage \ + --num-jobs-initial 1 --num-jobs-final 5 \ + --egs-dir $egs_dir \ + $fvector_dir +fi diff --git a/egs/mini_librispeech/s5/local/fvector/run_fvector_separate.sh b/egs/mini_librispeech/s5/local/fvector/run_fvector_separate.sh new file mode 100755 index 00000000000..17f4e95f667 --- /dev/null +++ b/egs/mini_librispeech/s5/local/fvector/run_fvector_separate.sh @@ -0,0 +1,79 @@ +#!/bin/bash + +. ./cmd.sh +set -e + +stage=5 +train_stage=-10 +data=data/train_clean_5 +noise_data=data/noise +egs_dir=exp/fvector/egs +fvector_dir=exp/fvector +use_gpu=true + +. ./path.sh +. ./cmd.sh +. ./utils/parse_options.sh + +if [ $stage -le 3 ]; then + #dump egs + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b{11,12,13}/$USER/kaldi-data/egs/minilibrispeech-$(date +'%m_%d_%H_%M')/s5/$egs_dir/storage $egs_dir/storage + fi + + steps/nnet3/fvector/get_egs_separate.sh --cmd "$train_cmd" \ + --nj 8 \ + --stage 3 \ + --egs-per-iter 100000 \ + --egs-per-iter-diagnostic 10000 \ + --num-diagnostic-percent 5 \ + "$data" "$noise_data" "$egs_dir" +fi + +if [ $stage -le 4 ]; then + #prepare configs + echo "$0: creating neural net configs using the xconfig parser"; + #options + input_dim=400 + num_filters=200 + + mkdir -p $fvector_dir/configs + + cat < $fvector_dir/configs/network.xconfig + input dim=$input_dim name=input + # Each eg contains 8 frames, do Frequency-domain feature learning, and then + # use TDNN model split it into one vector + preprocess-fft-abs-lognorm-affine-log-layer name=raw0 cos-transform-file=$fvector_dir/configs/cos_transform.mat sin-transform-file=$fvector_dir/configs/sin_transform.mat num-filters=$num_filters half-fft-range=true + conv-relu-batchnorm-layer name=cnn1 height-in=$num_filters height-out=$[$num_filters/2] time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 height-subsample-out=2 learning-rate-factor=0.34 max-change=0.25 + + relu-batchnorm-layer name=tdnn0 input=cnn1 dim=625 + relu-batchnorm-layer name=tdnn1 input=Append(0,1,2) dim=625 + relu-batchnorm-layer name=tdnn2 input=Append(0,1,2) dim=625 + relu-batchnorm-layer name=tdnn3 input=Append(0,1,2) dim=625 + relu-batchnorm-layer name=tdnn4 input=Append(0,1) dim=625 + output-layer name=output input=tdnn4 dim=200 include-log-softmax=False param-stddev=0.04 bias-stddev=1.0 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $fvector_dir/configs/network.xconfig --config-dir $fvector_dir/configs/ + # Modify the final.config and generate sin.mat/cos.mat manually + python local/fvector/add_output_node.py --input-dim 400 --output-dim 200 --config-file $fvector_dir/configs/final.config + python local/fvector/generate_sin_cos_matrix.py \ + --feat-dim 400 --dir $fvector_dir +fi + +if [ $stage -le 5 ]; then + #training + steps/nnet3/xvector/train_separate.sh --cmd "$train_cmd" \ + --initial-effective-lrate 0.002 \ + --final-effective-lrate 0.0002 \ + --max-param-change 0.2 \ + --minibatch-size 16 \ + --left-padding 1 \ + --right-padding 1 \ + --max-snr 20 \ + --min-snr 10 \ + --num-epochs 8 --use-gpu $use_gpu --stage $train_stage \ + --num-jobs-initial 1 --num-jobs-final 3 \ + --egs-dir $egs_dir \ + $fvector_dir +fi diff --git a/egs/mini_librispeech/s5/local/fvector/run_tdnn.sh b/egs/mini_librispeech/s5/local/fvector/run_tdnn.sh new file mode 100755 index 00000000000..a69a26c6bb4 --- /dev/null +++ b/egs/mini_librispeech/s5/local/fvector/run_tdnn.sh @@ -0,0 +1,214 @@ +#!/bin/bash + +# 1e is as 1d but instead of the --proportional-shrink option, using +# the newly added xconfig-layer-specific 'l2-regularize' options. 
+ +# local/chain/compare_wer.sh exp/chain/tdnn1d_sp exp/chain/tdnn1e_sp +# System tdnn1d_sp tdnn1e_sp +#WER dev_clean_2 (tgsmall) 14.21 13.43 +#WER dev_clean_2 (tglarge) 10.41 9.76 +# Final train prob -0.0473 -0.0510 +# Final valid prob -0.0893 -0.0889 +# Final train prob (xent) -1.0757 -1.4148 +# Final valid prob (xent) -1.4222 -1.6640 + + +# steps/info/chain_dir_info.pl exp/chain/tdnn1{d,e}_sp +# exp/chain/tdnn1d_sp: num-iters=17 nj=2..5 num-params=7.5M dim=40+100->2309 combine=-0.063->-0.052 xent:train/valid[10,16,final]=(-1.65,-1.23,-1.08/-1.91,-1.55,-1.42) logprob:train/valid[10,16,final]=(-0.084,-0.057,-0.047/-0.125,-0.100,-0.089) +# exp/chain/tdnn1e_sp: num-iters=17 nj=2..5 num-params=7.5M dim=40+100->2309 combine=-0.061->-0.056 xent:train/valid[10,16,final]=(-1.69,-1.41,-1.41/-1.91,-1.67,-1.66) logprob:train/valid[10,16,final]=(-0.065,-0.055,-0.051/-0.104,-0.095,-0.089) + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=13 +decode_nj=10 +train_set=train_clean_5 +test_sets=dev_clean_2 +gmm=tri3b +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1e_fvector # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training options +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true +reporting_email= + +#decode options +test_online_decoding=true # if true, it will run the last decoding stage. + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=200 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=512 + relu-batchnorm-layer name=tdnn2 $opts dim=512 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn3 $opts dim=512 + relu-batchnorm-layer name=tdnn4 $opts dim=512 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn5 $opts dim=512 + relu-batchnorm-layer name=tdnn6 $opts dim=512 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn7 $opts dim=512 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn8 $opts dim=512 input=Append(-6,-3,0) + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain $opts dim=512 target-rms=0.5 + output-layer name=output include-log-softmax=false $output_opts dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn8 $opts dim=512 target-rms=0.5 + output-layer name=output-xent $output_opts dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=10 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=256,128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l data/noise/utt2dur_fix +fi + +if [ $stage -le 2 ]; then +#generate fvector egs and train model. 
+  local/fvector/run_fvector.sh --data data/train_clean_5 --noise-data data/noise \
+    --egs-dir exp/fvector/egs --fvector-dir exp/fvector
+fi
+
+if [ $stage -le 3 ]; then
+  for part in dev_clean_2_hires train_clean_5_sp_hires; do
+    if [ -e data/${part}_mfcc ]; then
+      if [ -e data/${part} ]; then
+        rm -rf data/${part}
+      fi
+      mv data/${part}_mfcc data/${part}
+    fi
+
+    mv data/${part} data/${part}_mfcc
+    cp -r data/${part}_mfcc data/${part}
+    for f in $(ls data/${part}); do
+      if [ $f != "spk2gender" -a $f != "spk2utt" -a $f != "text" -a $f != "utt2spk" -a $f != "wav.scp" ]; then
+        rm -rf data/$part/$f
+      fi
+    done
+    steps/nnet3/fvector/make_fvector_feature.sh --cmd "$train_cmd" --nj 10 \
+      data/${part} exp/fvector exp/make_fvector/train fvector_feature
+  done
+fi
+
+if [ $stage -le 4 ]; then
+  local/fvector/run_tdnn.sh --stage 14 --train-stage 9
+fi
diff --git a/egs/mini_librispeech/s5/run_fvector_separate.sh b/egs/mini_librispeech/s5/run_fvector_separate.sh
new file mode 100755
index 00000000000..34c0e800aa6
--- /dev/null
+++ b/egs/mini_librispeech/s5/run_fvector_separate.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+# Note: this works only on pre-downloaded data on the CLSP servers
+data=/export/a05/dgalvez/
+
+data_url=www.openslr.org/resources/31
+lm_url=www.openslr.org/resources/11
+
+. ./cmd.sh
+. ./path.sh
+
+stage=3
+. utils/parse_options.sh
+
+set -euo pipefail
+
+mkdir -p $data
+# Stage 1: run run.sh from scratch to generate a chain model.
+if [ $stage -le 0 ]; then
+  run.sh
+fi
+
+# Stage 2: prepare a noise dir (maybe a special noise dataset).  In
+# mini_librispeech we just use the train set directly.
+if [ $stage -le 1 ]; then
+  cp -r data/train_clean_5 data/noise
+  # For the noise dir, we prepare a file utt2dur_fix.  Each line is "utt_id dur-0.2".
+  # This file is used in "fvector-chunk.cc": it is read into a vector in binary
+  # form.  For each target chunk, we randomly select two utt_ids from this
+  # vector, and the corresponding start points.
+  utils/data/get_utt2dur.sh data/noise  # wav-to-duration
+  cat data/noise/utt2dur | awk '{print $1,$2-0.2}' > data/noise/utt2dur_fix
+fi
+
+if [ $stage -le 2 ]; then
+  # generate fvector egs and train the model.
+  local/fvector/run_fvector_separate.sh --data data/train_clean_5 --noise-data data/noise \
+    --egs-dir exp/fvector/egs --fvector-dir exp/fvector
+fi
+
+if [ $stage -le 3 ]; then
+  for part in dev_clean_2_hires train_clean_5_sp_hires; do
+    if [ -e data/${part}_mfcc ]; then
+      if [ -e data/${part} ]; then
+        rm -rf data/${part}
+      fi
+      mv data/${part}_mfcc data/${part}
+    fi
+
+    mv data/${part} data/${part}_mfcc
+    cp -r data/${part}_mfcc data/${part}
+    for f in $(ls data/${part}); do
+      if [ $f != "spk2gender" -a $f != "spk2utt" -a $f != "text" -a $f != "utt2spk" -a $f != "wav.scp" ]; then
+        rm -rf data/$part/$f
+      fi
+    done
+    steps/nnet3/fvector/make_fvector_feature.sh --cmd "$train_cmd" --nj 10 \
+      data/${part} exp/fvector exp/make_fvector/train fvector_feature
+  done
+fi
+
+if [ $stage -le 4 ]; then
+  local/fvector/run_tdnn.sh --stage 14 --train-stage -10
+fi
diff --git a/egs/wsj/s5/steps/nnet3/fvector/get_egs.sh b/egs/wsj/s5/steps/nnet3/fvector/get_egs.sh
new file mode 100755
index 00000000000..1063a51fb32
--- /dev/null
+++ b/egs/wsj/s5/steps/nnet3/fvector/get_egs.sh
@@ -0,0 +1,262 @@
+#!/bin/bash
+
+# This script dumps training examples (egs) for fvector training.  Each eg has
+# at least two "NnetIo"s (data chunks), which come from the same original
+# source signal fragment.  The two data-chunks in each eg will have respectively
+# n=0 and n=1.
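+#
+# Schematically (one plausible reading of the pipeline below, not a format
+# specification), an eg could be pictured as
+#   eg = { NnetIo n=0: the source chunk mixed with one noise chunk,
+#          NnetIo n=1: the same source chunk mixed with another noise chunk }
+# i.e. the two NnetIos would differ only in the additive noise drawn by
+# fvector-add-noise.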
+#
+#
+# This script, which will generally be called from other neural-net training
+# scripts, extracts the training examples used to train the neural net (and also
+# the validation examples used for diagnostics), and puts them in archives.
+
+# Begin configuration section.
+cmd=run.pl
+egs_per_iter=12500 # have this many egs per archive.
+                   # In the xvector setup this is 2 million frames, and each
+                   # frame has 40 dims.  In the fvector case, one eg is about
+                   # 100 ms = 2 NnetIos * 8 frames * (16 kHz * 25 ms) = 6400
+                   # samples, so we use 2 million * 40 / 6400 = 12500.
+                   # If frame-length=10ms, it should be 30000.
+                   # That keeps the capacity of the fvector archives in line
+                   # with the xvector ones.
+egs_per_iter_diagnostic=10000 # have this many egs per archive for the
+                              # archives used for diagnostics.
+num_diagnostic_percent=5 # we want to test the training and validation likelihoods
+                         # on a range of utterance lengths, and this number
+                         # controls how many archives we evaluate on.  We select
+                         # num_diagnostic_percent% of the train data as the
+                         # validation set.
+compress=true
+srand=0
+generate_egs_scp=true
+
+stage=0
+nj=8 # This should be set to the maximum number of jobs you are comfortable
+     # running in parallel
+
+echo "$0 $@"
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+if [ $# != 3 ]; then
+  echo "Usage: $0 [opts] <data-dir> <noise-dir> <egs-dir>"
+  echo " e.g.: $0 data/train data/noise exp/fvector/egs"
+  echo ""
+  echo "Main options (for others, see top of script file)"
+  echo "  --nj <nj|8>                                      # The maximum number of jobs you want to run in"
+  echo "                                                   # parallel (increase this only if you have good disk and"
+  echo "                                                   # network speed). default=8"
+  echo "  --cmd (utils/run.pl;utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --egs-per-iter <#egs|12500>                      # Target number of egs per archive"
+  echo "                                                   # {train_subset,valid}.*.egs"
+  echo "  --stage <stage|0>                                # Used to run a partially-completed training process from"
+  echo "                                                   # somewhere in the middle."
+  echo ""
+
+  exit 1;
+fi
+
+data_dir=$1
+noise_dir=$2
+egs_dir=$3
+
+for f in $data_dir/wav.scp $noise_dir/wav.scp $noise_dir/utt2dur_fix; do
+  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1;
+done
+
+mkdir -p $egs_dir
+mkdir -p $egs_dir/log
+mkdir -p $egs_dir/info
+num_utts=$(cat $data_dir/wav.scp | wc -l)
+num_valid=$[$num_utts*$num_diagnostic_percent/100];
+
+# Assume recording-id == utt-id.
+if [ $stage -le 1 ]; then
+  # Get the list of validation utterances.
+  awk '{print $1}' $data_dir/wav.scp | utils/shuffle_list.pl | head -$num_valid \
+    > ${egs_dir}/info/valid_uttlist
+  cat $data_dir/wav.scp | utils/filter_scp.pl --exclude $egs_dir/info/valid_uttlist | \
+    awk '{print $1}' > ${egs_dir}/info/train_uttlist
+  cat ${egs_dir}/info/train_uttlist | utils/shuffle_list.pl | head -$num_valid \
+    > ${egs_dir}/info/train_diagnostic_uttlist
+fi
+# Get the (120 ms) chunks from wav.scp and noise.scp, and compose 1 source
+# chunk and 2 noise chunks into a matrix.
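+# As a worked illustration of the chunk sizes (hypothetical numbers, assuming
+# 16 kHz audio as in the capacity comment above): a 120 ms chunk is
+# 16000 * 0.120 = 1920 samples, so each matrix written by fvector-chunk below
+# would stack 3 rows of 1920 samples (1 source chunk + 2 noise chunks).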
+if [ $stage -le 2 ]; then + sdata=$data_dir/split$nj + utils/data/split_data.sh $data_dir $nj || exit 1; + $cmd JOB=1:$nj $egs_dir/log/cut_train_wav_into_chunks.JOB.log \ + fvector-chunk --chunk-size=120 "scp:utils/filter_scp.pl --exclude $egs_dir/info/valid_uttlist $sdata/JOB/wav.scp |" \ + scp:$noise_dir/wav.scp $noise_dir/utt2dur_fix \ + ark,scp:$egs_dir/orign_train_chunks.JOB.ark,$egs_dir/orign_train_chunks.JOB.scp + for n in $(seq $nj); do + cat $egs_dir/orign_train_chunks.${n}.scp || exit 1; + done > $data_dir/orign_train_chunks.all.scp + + $cmd $egs_dir/log/cut_valid_wav_into_chunks.log \ + fvector-chunk --chunk-size=120 "scp:utils/filter_scp.pl $egs_dir/info/valid_uttlist $data_dir/wav.scp |" \ + scp:$noise_dir/wav.scp $noise_dir/utt2dur_fix \ + ark,scp:$egs_dir/orign_valid_chunks.ark,$egs_dir/orign_valid_chunks.scp + cp $egs_dir/orign_valid_chunks.scp $data_dir/orign_valid_chunks.scp + + $cmd $egs_dir/log/cut_train_diagnostic_wav_into_chunks.log \ + fvector-chunk --chunk-size=120 "scp:utils/filter_scp.pl $egs_dir/info/train_diagnostic_uttlist $data_dir/wav.scp |" \ + scp:$noise_dir/wav.scp $noise_dir/utt2dur_fix \ + ark,scp:$egs_dir/orign_train_diagnostic_chunks.ark,$egs_dir/orign_train_diagnostic_chunks.scp + cp $egs_dir/orign_train_diagnostic_chunks.scp $data_dir/orign_train_diagnostic_chunks.scp +fi + +echo "$0: Generate the egs for train dataset." + +#each chunk will generate two "NnetIo"s +num_egs=$(cat $data_dir/orign_train_chunks.all.scp | wc -l) +num_archives=$[$num_egs/$egs_per_iter+1] +# We may have to first create a smaller number of larger archives, with number +# $num_archives_intermediate, if $num_archives is more than the maximum number +# of open filehandles that the system allows per process (ulimit -n). +# This sometimes gives a misleading answer as GridEngine sometimes changes that +# somehow, so we limit it to 512. +max_open_filehandles=$(ulimit -n) || exit 1 +[ $max_open_filehandles -gt 512 ] && max_open_filehandles=512 +num_archives_intermediate=$num_archives +archives_multiple=1 +while [ $[$num_archives_intermediate+4] -gt $max_open_filehandles ]; do + archives_multiple=$[$archives_multiple+1] + num_archives_intermediate=$[$num_archives/$archives_multiple+1]; +done +# now make sure num_archives is an exact multiple of archives_multiple. +num_archives=$[$archives_multiple*$num_archives_intermediate] +echo $num_archives > $egs_dir/info/num_archives + +# prepare the dir link +if [ -e $egs_dir/storage ]; then + # Make soft links to storage directories, if distributing this way.. See + # utils/create_split_dir.pl. + echo "$0: creating data links" + utils/create_data_link.pl $(for x in $(seq $num_archives); do echo $egs_dir/egs.$x.ark; done) + for x in $(seq $num_archives_intermediate); do + utils/create_data_link.pl $(for y in $(seq $nj); do echo $egs_dir/egs_orig.$y.$x.ark; done) + done +fi +# Deal with the chunk one-by-one, add the noise. +# convert the chunk data into Nnet3eg +if [ $stage -le 3 ]; then + # create egs_orig.*.*.ark; the first index goes to $nj, + # the second to $num_archives_intermediate. 
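+  # Worked example of the archive arithmetic above (hypothetical numbers):
+  # num_egs=100000 and egs_per_iter=12500 give num_archives = 100000/12500+1 = 9;
+  # since 9+4 does not exceed the filehandle cap of 512, archives_multiple
+  # stays 1 and num_archives_intermediate = 9.  If num_archives were 1000, we
+  # would get archives_multiple=2, num_archives_intermediate = 1000/2+1 = 501,
+  # and num_archives would be rounded up to 2*501 = 1002.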
+  egs_list=
+  for n in $(seq $num_archives_intermediate); do
+    egs_list="$egs_list ark:$egs_dir/egs_orig.JOB.$n.ark"
+  done
+  echo "$0: Do data perturbation and dump egs to disk"
+  # More options could be added to this command line.
+  $cmd JOB=1:$nj $egs_dir/log/do_train_perturbation_and_get_egs.JOB.log \
+    fvector-add-noise --max-snr=20 --min-snr=10 scp:$egs_dir/orign_train_chunks.JOB.scp ark:- \| \
+    fvector-get-egs ark:- ark:- \| \
+    nnet3-copy-egs --random=true --srand=\$[JOB+$srand] ark:- $egs_list || exit 1;
+fi
+
+# num_archives_intermediate acts as a bridge: it is used to convert egs_orig
+# (an nj x num_archives_intermediate grid of archives) into egs
+# (num_archives_intermediate x archives_multiple).  Each shuffle job takes one
+# column of egs_orig and distributes its contents across one row of egs.
+if [ $stage -le 4 ]; then
+  echo "$0: recombining and shuffling order of archives on disk"
+  # combine all the "egs_orig.*.JOB.scp" (over the $nj splits of the data) and
+  # shuffle the order, writing to the egs.JOB.ark
+
+  # the input is a concatenation over the input jobs.
+  egs_list=
+  for n in $(seq $nj); do
+    egs_list="$egs_list $egs_dir/egs_orig.$n.JOB.ark"
+  done
+
+  if [ $archives_multiple == 1 ]; then # normal case.
+    if $generate_egs_scp; then
+      output_archive="ark,scp:$egs_dir/egs.JOB.ark,$egs_dir/egs.JOB.scp"
+    else
+      output_archive="ark:$egs_dir/egs.JOB.ark"
+    fi
+    $cmd --max-jobs-run $nj JOB=1:$num_archives_intermediate $egs_dir/log/shuffle.JOB.log \
+      nnet3-shuffle-egs --srand=\$[JOB+$srand] "ark:cat $egs_list|" $output_archive || exit 1;
+
+    if $generate_egs_scp; then
+      # concatenate the egs.JOB.scp files into a single egs.scp
+      rm $egs_dir/egs.scp 2> /dev/null || true
+      for j in $(seq $num_archives_intermediate); do
+        cat $egs_dir/egs.$j.scp || exit 1;
+      done > $egs_dir/egs.scp || exit 1;
+      for f in $egs_dir/egs.*.scp; do rm $f; done
+    fi
+  else
+    # we need to shuffle the 'intermediate archives' and then split into the
+    # final archives.  we create soft links to manage this splitting, because
+    # otherwise managing the output names is quite difficult (and we don't want
+    # to submit separate queue jobs for each intermediate archive, because then
+    # the --max-jobs-run option is hard to enforce).
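+    # Illustration of the splitting (hypothetical numbers): with
+    # archives_multiple=2 and num_archives_intermediate=3, shuffle job JOB=2
+    # reads the column egs_orig.{1..nj}.2.ark, shuffles it, and splits it into
+    # egs.2.1.ark and egs.2.2.ark, which the soft links below map onto the
+    # final archives egs.3.ark and egs.4.ark (archive_index = (2-1)*2 + y).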
+    if $generate_egs_scp; then
+      output_archives="$(for y in $(seq $archives_multiple); do echo ark,scp:$egs_dir/egs.JOB.$y.ark,$egs_dir/egs.JOB.$y.scp; done)"
+    else
+      output_archives="$(for y in $(seq $archives_multiple); do echo ark:$egs_dir/egs.JOB.$y.ark; done)"
+    fi
+    for x in $(seq $num_archives_intermediate); do
+      for y in $(seq $archives_multiple); do
+        archive_index=$[($x-1)*$archives_multiple+$y]
+        # egs.intermediate_archive.{1,2,...}.ark will point to egs.archive.ark
+        ln -sf egs.$archive_index.ark $egs_dir/egs.$x.$y.ark || exit 1
+      done
+    done
+    $cmd --max-jobs-run $nj JOB=1:$num_archives_intermediate $egs_dir/log/shuffle.JOB.log \
+      nnet3-shuffle-egs --srand=\$[JOB+$srand] "ark:cat $egs_list|" ark:- \| \
+      nnet3-copy-egs ark:- $output_archives || exit 1;
+
+    if $generate_egs_scp; then
+      # concatenate the egs.JOB.*.scp files into a single egs.scp
+      rm $egs_dir/egs.scp 2> /dev/null || true
+      for j in $(seq $num_archives_intermediate); do
+        for y in $(seq $archives_multiple); do
+          cat $egs_dir/egs.$j.$y.scp || exit 1;
+        done
+      done > $egs_dir/egs.scp || exit 1;
+      for f in $egs_dir/egs.*.*.scp; do rm $f; done
+    fi
+  fi
+fi
+# At this point we have the egs.*.ark archives
+# (num_archives_intermediate * archives_multiple of them) and egs.scp.
+
+echo "$0: Generate the egs for the valid dataset"
+if [ $stage -le 5 ]; then
+  $cmd $egs_dir/log/do_valid_perturbation_and_get_egs.log \
+    fvector-add-noise --max-snr=20 --min-snr=10 scp:$egs_dir/orign_valid_chunks.scp ark:- \| \
+    fvector-get-egs ark:- ark:- \| \
+    nnet3-copy-egs --random=true --srand=$srand ark:- ark:$egs_dir/valid.egs || exit 1;
+  # get the valid.egs
+  cp $egs_dir/valid.egs $egs_dir/valid_diagnostic_egs.1.ark
+fi
+
+echo "$0: Generate the egs for the train diagnostic"
+if [ $stage -le 6 ]; then
+  $cmd $egs_dir/log/do_train_diagnostic_perturbation_and_get_egs.log \
+    fvector-add-noise --max-snr=20 --min-snr=10 scp:$egs_dir/orign_train_diagnostic_chunks.scp ark:- \| \
+    fvector-get-egs ark:- ark:- \| \
+    nnet3-copy-egs --random=true --srand=$srand ark:- ark:$egs_dir/train_diagnostic.egs || exit 1;
+  # get the train_diagnostic.egs
+  cp $egs_dir/train_diagnostic.egs $egs_dir/train_diagnostic_egs.1.ark
+  echo "1" > $egs_dir/info/num_diagnostic_archives
+fi
+
+# remove unnecessary arks and links.
+if [ $stage -le 7 ]; then
+  echo "$0: removing temporary archives"
+  for x in $(seq $nj); do
+    for y in $(seq $num_archives_intermediate); do
+      file=$egs_dir/egs_orig.$x.$y.ark
+      [ -L $file ] && rm $(utils/make_absolute.sh $file)
+      rm $file
+    done
+  done
+  if [ $archives_multiple -gt 1 ]; then
+    # there are some extra soft links that we should delete.
+    for f in $egs_dir/egs.*.*.ark; do rm $f; done
+  fi
+fi
+echo "$0: Finished preparing fvector training examples"
diff --git a/egs/wsj/s5/steps/nnet3/fvector/get_egs_separate.sh b/egs/wsj/s5/steps/nnet3/fvector/get_egs_separate.sh
new file mode 100755
index 00000000000..3040b269f87
--- /dev/null
+++ b/egs/wsj/s5/steps/nnet3/fvector/get_egs_separate.sh
@@ -0,0 +1,165 @@
+#!/bin/bash
+
+# This script dumps training examples (egs) for fvector training.  Each eg has
+# at least two "NnetIo"s (data chunks), which come from the same original
+# source signal fragment.  The two data-chunks in each eg will have respectively
+# n=0 and n=1.
+#
+#
+# This script, which will generally be called from other neural-net training
+# scripts, extracts the training examples used to train the neural net (and also
+# the validation examples used for diagnostics), and puts them in archives.
+
+# Begin configuration section.
+cmd=run.pl
+egs_per_iter=12500 # have this many egs per archive.
+egs_per_iter_diagnostic=10000 # have this many egs per archive for the
+                              # archives used for diagnostics.
+num_diagnostic_percent=5 # we want to test the training and validation likelihoods
+                         # on a range of utterance lengths, and this number
+                         # controls how many archives we evaluate on.  We select
+                         # num_diagnostic_percent% of the train data as the
+                         # validation set.
+chunk_size=120
+compress=true
+srand=0
+generate_egs_scp=true
+
+stage=0
+nj=8 # This should be set to the maximum number of jobs you are comfortable
+     # running in parallel
+
+echo "$0 $@"
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+if [ $# != 3 ]; then
+  echo "Usage: $0 [opts] <data-dir> <noise-dir> <egs-dir>"
+  echo " e.g.: $0 data/train data/noise exp/fvector/egs"
+  echo ""
+  echo "Main options (for others, see top of script file)"
+  echo "  --nj <nj|8>                                      # The maximum number of jobs you want to run in"
+  echo "                                                   # parallel (increase this only if you have good disk and"
+  echo "                                                   # network speed). default=8"
+  echo "  --cmd (utils/run.pl;utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --egs-per-iter <#egs|12500>                      # Target number of egs per archive"
+  echo "                                                   # {train_subset,valid}.*.egs"
+  echo "  --stage <stage|0>                                # Used to run a partially-completed training process from"
+  echo "                                                   # somewhere in the middle."
+  echo ""
+
+  exit 1;
+fi
+
+data_dir=$1
+noise_dir=$2
+egs_dir=$3
+
+for f in $data_dir/wav.scp $noise_dir/wav.scp $noise_dir/utt2dur_fix; do
+  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1;
+done
+
+mkdir -p $egs_dir
+mkdir -p $egs_dir/log
+mkdir -p $egs_dir/info
+num_utts=$(cat $data_dir/wav.scp | wc -l)
+num_valid=$[$num_utts*$num_diagnostic_percent/100];
+
+# Assume recording-id == utt-id.
+if [ $stage -le 1 ]; then
+  # Get the list of validation utterances.
+  awk '{print $1}' $data_dir/wav.scp | utils/shuffle_list.pl | head -$num_valid \
+    > ${egs_dir}/info/valid_uttlist
+  cat $data_dir/wav.scp | utils/filter_scp.pl --exclude $egs_dir/info/valid_uttlist | \
+    awk '{print $1}' > ${egs_dir}/info/train_uttlist
+  cat ${egs_dir}/info/train_uttlist | utils/shuffle_list.pl | head -$num_valid \
+    > ${egs_dir}/info/train_diagnostic_uttlist
+fi
+# Get the chunks (120 ms by default) from wav.scp and noise.scp, and compose
+# 1 source chunk and 2 noise chunks into a matrix.
+if [ $stage -le 2 ]; then + sdata=$data_dir/split$nj + utils/data/split_data.sh $data_dir $nj || exit 1; + $cmd JOB=1:$nj $egs_dir/log/cut_train_wav_into_chunks.JOB.log \ + fvector-chunk-separate --chunk-size=$chunk_size "scp:utils/filter_scp.pl --exclude $egs_dir/info/valid_uttlist $sdata/JOB/wav.scp |" \ + scp:$noise_dir/wav.scp $noise_dir/utt2dur_fix \ + ark,scp:$egs_dir/orign_train_chunks.JOB.ark,$egs_dir/orign_train_chunks.JOB.scp \ + ark,scp:$egs_dir/orign_train_noise_chunks.JOB.ark,$egs_dir/orign_train_noise_chunks.JOB.scp + + for n in $(seq $nj); do + cat $egs_dir/orign_train_chunks.${n}.scp || exit 1; + done > $data_dir/orign_train_chunks.all.scp + for n in $(seq $nj); do + cat $egs_dir/orign_train_noise_chunks.${n}.scp || exit 1; + done > $data_dir/orign_train_noise_chunks.all.scp + cp $data_dir/orign_train_chunks.all.scp $egs_dir/orign_train_chunks.all.scp + cp $data_dir/orign_train_noise_chunks.all.scp $egs_dir/orign_train_noise_chunks.all.scp + + $cmd $egs_dir/log/cut_valid_wav_into_chunks.log \ + fvector-chunk-separate --chunk-size=$chunk_size "scp:utils/filter_scp.pl $egs_dir/info/valid_uttlist $data_dir/wav.scp |" \ + scp:$noise_dir/wav.scp $noise_dir/utt2dur_fix \ + ark,scp:$egs_dir/orign_valid_chunks.ark,$egs_dir/orign_valid_chunks.scp \ + ark,scp:$egs_dir/orign_valid_noise_chunks.ark,$egs_dir/orign_valid_noise_chunks.scp + cp $egs_dir/orign_valid_chunks.scp $data_dir/orign_valid_chunks.scp + cp $egs_dir/orign_valid_noise_chunks.scp $data_dir/orign_valid_noise_chunks.scp + + $cmd $egs_dir/log/cut_train_diagnostic_wav_into_chunks.log \ + fvector-chunk-separate --chunk-size=$chunk_size "scp:utils/filter_scp.pl $egs_dir/info/train_diagnostic_uttlist $data_dir/wav.scp |" \ + scp:$noise_dir/wav.scp $noise_dir/utt2dur_fix \ + ark,scp:$egs_dir/orign_train_diagnostic_chunks.ark,$egs_dir/orign_train_diagnostic_chunks.scp \ + ark,scp:$egs_dir/orign_train_diagnostic_noise_chunks.ark,$egs_dir/orign_train_diagnostic_noise_chunks.scp + cp $egs_dir/orign_train_diagnostic_chunks.scp $data_dir/orign_train_diagnostic_chunks.scp + cp $egs_dir/orign_train_diagnostic_noise_chunks.scp $data_dir/orign_train_diagnostic_noise_chunks.scp +fi + +echo "$0: Generate the egs for train dataset." 
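+# Each source chunk below is paired with two noise chunks (hence the factor of
+# 2 in the head/tail arithmetic in stage 3).  Worked illustration (hypothetical
+# counts): if archive 1 holds 3 source chunks and archive 2 holds 2, then
+# egs.noise.1.scp.tmp receives noise lines 1-6 and egs.noise.2.scp.tmp
+# receives lines 7-10 of the shuffled noise scp.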
+
+num_egs=$(cat $data_dir/orign_train_chunks.all.scp | wc -l)
+num_archives=$[$num_egs/$egs_per_iter+1]
+echo $num_archives > $egs_dir/info/num_archives
+
+if [ -e $egs_dir/storage ]; then
+  echo "$0: creating data links"
+  utils/create_data_link.pl $(for x in $(seq $num_archives); do echo $egs_dir/egs.$x.ark; done)
+  utils/create_data_link.pl $(for x in $(seq $num_archives); do echo $egs_dir/egs.noise.$x.ark; done)
+fi
+
+if [ $stage -le 3 ]; then
+  echo "$0: shuffle and recombine the train set"
+  egs_scp_list=
+  for n in $(seq $num_archives); do
+    egs_scp_list="$egs_scp_list $egs_dir/egs.$n.scp.tmp"
+  done
+  utils/shuffle_list.pl $egs_dir/orign_train_chunks.all.scp > $egs_dir/orign_train_chunks.all.scp.shuffled
+  utils/split_scp.pl $egs_dir/orign_train_chunks.all.scp.shuffled $egs_scp_list
+
+  utils/shuffle_list.pl $egs_dir/orign_train_noise_chunks.all.scp > $egs_dir/orign_train_noise_chunks.all.scp.shuffled
+  count=0
+  for n in $(seq $num_archives); do
+    current_count=$(cat $egs_dir/egs.$n.scp.tmp | wc -l)
+    count=$[$count+2*$current_count]
+    cat $egs_dir/orign_train_noise_chunks.all.scp.shuffled | head -n $count | tail -n $[2*$current_count] > $egs_dir/egs.noise.$n.scp.tmp
+  done
+  $cmd JOB=1:$num_archives $egs_dir/log/get_egs.JOB.log \
+    copy-vector scp:$egs_dir/egs.JOB.scp.tmp ark,scp:$egs_dir/egs.JOB.ark,$egs_dir/egs.JOB.scp || exit 1;
+  $cmd JOB=1:$num_archives $egs_dir/log/get_egs_noise.JOB.log \
+    copy-vector scp:$egs_dir/egs.noise.JOB.scp.tmp ark,scp:$egs_dir/egs.noise.JOB.ark,$egs_dir/egs.noise.JOB.scp || exit 1;
+fi
+
+if [ $stage -le 4 ]; then
+  echo "$0: shuffle and recombine the valid set"
+  $cmd $egs_dir/log/get_egs_valid.log \
+    copy-vector scp:$egs_dir/orign_valid_chunks.scp ark,scp:$egs_dir/valid_diagnostic_egs.1.ark,$egs_dir/valid_diagnostic_egs.1.scp || exit 1;
+  $cmd $egs_dir/log/get_egs_valid_noise.log \
+    copy-vector scp:$egs_dir/orign_valid_noise_chunks.scp ark,scp:$egs_dir/valid_diagnostic_egs.noise.1.ark,$egs_dir/valid_diagnostic_egs.noise.1.scp || exit 1;
+fi
+
+if [ $stage -le 5 ]; then
+  echo "$0: shuffle and recombine the train_diagnostic set"
+  $cmd $egs_dir/log/get_egs_train_diagnostic.log \
+    copy-vector scp:$egs_dir/orign_train_diagnostic_chunks.scp ark,scp:$egs_dir/train_diagnostic_egs.1.ark,$egs_dir/train_diagnostic_egs.1.scp || exit 1;
+  $cmd $egs_dir/log/get_egs_train_diagnostic_noise.log \
+    copy-vector scp:$egs_dir/orign_train_diagnostic_noise_chunks.scp ark,scp:$egs_dir/train_diagnostic_egs.noise.1.ark,$egs_dir/train_diagnostic_egs.noise.1.scp || exit 1;
+  echo "1" > $egs_dir/info/num_diagnostic_archives
+fi
+echo "$0: Finished preparing fvector training examples"
diff --git a/egs/wsj/s5/steps/nnet3/fvector/make_fvector_feature.sh b/egs/wsj/s5/steps/nnet3/fvector/make_fvector_feature.sh
new file mode 100755
index 00000000000..bf6faa5391d
--- /dev/null
+++ b/egs/wsj/s5/steps/nnet3/fvector/make_fvector_feature.sh
@@ -0,0 +1,158 @@
+#!/bin/bash
+
+# Copyright 2012-2016  Johns Hopkins University (Author: Daniel Povey)
+# Apache 2.0
+# To be run from .. (one directory up from here)
+# see ../run.sh for example
+
+# Begin configuration section.
+nj=4
+cmd=run.pl
+compress=true
+write_utt2num_frames=false  # if true writes utt2num_frames
+# End configuration section.
+
+echo "$0 $@"  # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+if [ $# -lt 2 ] || [ $# -gt 4 ]; then
+  echo "Usage: $0 [options] <data-dir> <fvector-dir> [<log-dir> [<feature-dir>] ]";
+  echo "e.g.: $0 data/train exp/fvector exp/make_fvector/train fvector_feature"
+  echo "Note: <log-dir> defaults to <data-dir>/log, and <feature-dir> defaults to <data-dir>/data"
+  echo "Options: "
+  echo "  --nj <nj|4>                                      # number of parallel jobs"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --write-utt2num-frames <true|false>              # If true, write utt2num_frames file."
+  exit 1;
+fi
+
+data=$1
+fvectordir=$2
+if [ $# -ge 3 ]; then
+  logdir=$3
+else
+  logdir=$data/log
+fi
+if [ $# -ge 4 ]; then
+  feadir=$4
+else
+  feadir=$data/data
+fi
+
+# make $feadir an absolute pathname.
+feadir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $feadir ${PWD}`
+
+# use "name" as part of name of the archive.
+name=`basename $data`
+
+mkdir -p $feadir || exit 1;
+mkdir -p $logdir || exit 1;
+
+if [ -f $data/feats.scp ]; then
+  mkdir -p $data/.backup
+  echo "$0: moving $data/feats.scp to $data/.backup"
+  mv $data/feats.scp $data/.backup
+fi
+
+scp=$data/wav.scp
+
+required="$scp $fvectordir/final.raw"
+
+for f in $required; do
+  if [ ! -f $f ]; then
+    echo "make_fvector_feature.sh: no such file $f"
+    exit 1;
+  fi
+done
+utils/validate_data_dir.sh --no-text --no-feats $data || exit 1;
+
+for n in $(seq $nj); do
+  # the next command does nothing unless $feadir/storage/ exists, see
+  # utils/create_data_link.pl for more info.
+  utils/create_data_link.pl $feadir/raw_fvector_$name.$n.ark
+done
+
+
+if $write_utt2num_frames; then
+  write_num_frames_opt="--write-num-frames=ark,t:$logdir/utt2num_frames.JOB"
+else
+  write_num_frames_opt=
+fi
+
+
+if [ -f $data/segments ]; then
+  echo "$0 [info]: segments file exists: using that."
+
+  split_segments=""
+  for n in $(seq $nj); do
+    split_segments="$split_segments $logdir/segments.$n"
+  done
+
+  utils/split_scp.pl $data/segments $split_segments || exit 1;
+  rm $logdir/.error 2>/dev/null
+
+  $cmd JOB=1:$nj $logdir/make_fvector_${name}.JOB.log \
+    extract-segments scp,p:$scp $logdir/segments.JOB ark:- \| \
+    compute-wav-to-rawmatrix ark:- ark:- \| \
+    nnet3-compute --use-gpu=no $fvectordir/final.raw ark:- ark:- \| \
+    copy-feats --compress=$compress $write_num_frames_opt ark:- \
+      ark,scp:$feadir/raw_fvector_$name.JOB.ark,$feadir/raw_fvector_$name.JOB.scp \
+    || exit 1;
+
+else
+  echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance."
+  split_scps=""
+  for n in $(seq $nj); do
+    split_scps="$split_scps $logdir/wav_${name}.$n.scp"
+  done
+
+  utils/split_scp.pl $scp $split_scps || exit 1;
+
+
+  # add ,p to the input rspecifier so that we can just skip over
+  # utterances that have bad wave data.
+
+  $cmd JOB=1:$nj $logdir/make_fvector_${name}.JOB.log \
+    compute-wav-to-rawmatrix scp,p:$logdir/wav_${name}.JOB.scp ark:- \| \
+    nnet3-compute --use-gpu=no $fvectordir/final.raw ark:- ark:- \| \
+    copy-feats $write_num_frames_opt --compress=$compress ark:- \
+      ark,scp:$feadir/raw_fvector_$name.JOB.ark,$feadir/raw_fvector_$name.JOB.scp \
+    || exit 1;
+fi
+
+
+if [ -f $logdir/.error.$name ]; then
+  echo "Error producing fvector features for $name:"
+  tail $logdir/make_fvector_${name}.1.log
+  exit 1;
+fi
+
+# concatenate the .scp files together.
+for n in $(seq $nj); do
+  cat $feadir/raw_fvector_$name.$n.scp || exit 1;
+done > $data/feats.scp || exit 1
+
+if $write_utt2num_frames; then
+  for n in $(seq $nj); do
+    cat $logdir/utt2num_frames.$n || exit 1;
+  done > $data/utt2num_frames || exit 1
+  rm $logdir/utt2num_frames.*
+fi
+
+rm $logdir/wav_${name}.*.scp $logdir/segments.* 2>/dev/null
+
+nf=`cat $data/feats.scp | wc -l`
+nu=`cat $data/utt2spk | wc -l`
+if [ $nf -ne $nu ]; then
+  echo "It seems not all of the feature files were successfully processed ($nf != $nu);"
+  echo "consider using utils/fix_data_dir.sh $data"
+fi
+
+if [ $nf -lt $[$nu - ($nu/20)] ]; then
+  echo "Less than 95% of the features were successfully generated.  Probably a serious error."
+  exit 1;
+fi
+
+echo "Succeeded creating fvector features for $name"
diff --git a/egs/wsj/s5/steps/nnet3/xvector/train.sh b/egs/wsj/s5/steps/nnet3/xvector/train.sh
new file mode 100755
index 00000000000..9e148ea3ab0
--- /dev/null
+++ b/egs/wsj/s5/steps/nnet3/xvector/train.sh
@@ -0,0 +1,253 @@
+#!/bin/bash
+
+# Copyright 2016  Johns Hopkins University (Author: Daniel Povey).
+# Apache 2.0.
+
+
+# Begin configuration section.
+cmd=run.pl
+num_epochs=4 # Number of epochs of training;
+             # the number of iterations is worked out from this.
+num_shifts=1
+initial_effective_lrate=0.0003
+final_effective_lrate=0.00003
+num_jobs_initial=2 # Number of neural net jobs to run in parallel at the start of training
+num_jobs_final=8   # Number of neural net jobs to run in parallel at the end of training
+stage=-3
+diagnostic_period=5
+compute_accuracy=true
+
+
+shuffle_buffer_size=1000 # This "buffer_size" variable controls randomization of the samples
+                # on each iter.  You could set it to 0 or to a large value for complete
+                # randomization, but this would both consume memory and cause spikes in
+                # disk I/O.  Smaller is easier on disk and memory but less random.  It's
+                # not a huge deal though, as samples are anyway randomized right at the start.
+                # (The point of this is to get data in different minibatches on different
+                # iterations, since in the preconditioning method, 2 samples in the same
+                # minibatch can affect each others' gradients.)
+
+max_param_change=0.2 # max param change per minibatch to use eventually
+                     # (for first epoch we use half this)
+minibatch_size=256   # minibatch size to use eventually
+                     # (for first epoch we use half this)
+
+use_gpu=true # if true, we run on GPU.
+egs_dir=
+cleanup=true # if true, remove models from earlier iterations as we go.
+
+# End configuration section.
+
+trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM
+
+echo "$0 $@" # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+if [ $# != 1 ]; then
+  echo "Usage: $0 [opts] <exp-dir>"
+  echo " e.g.: $0 exp/fvector"
+  echo "This script trains the xvector system; see egs/swbd/s5c/local/xvector/train.sh for"
+  echo "example (you have to create the nnet configs and the egs first)."
+  echo ""
+  echo "Main options (for others, see top of script file)"
+  echo "  --config <config-file>                           # config file containing options"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --num-epochs <#epochs|4>                         # Number of epochs of training"
+  echo "  --initial-effective-lrate <lrate|0.0003>         # effective learning rate at start of training."
+  echo "  --final-effective-lrate <lrate|0.00003>          # effective learning rate at end of training."
+  echo "                                                   # data, 0.00025 for large data"
+  echo "  --num-jobs-initial <num-jobs|2>                  # Number of parallel jobs to use for neural net training, at the start."
+ echo " --num-jobs-final # Number of parallel jobs to use for neural net training, at the end" + echo " --egs-dir # If supplied, overrides /egs as location of egs" + echo " --stage # Used to run a partially-completed training process from somewhere in" + echo " # the middle." + exit 1; +fi + +dir=$1 + +[ -z $egs_dir ] && egs_dir=$dir/egs + +if [ ! -d $egs_dir/info ]; then + echo "$0: expected $egs_dir/info to exist: did you run steps/nnet3/xvector/get_egs.sh first?" + exit 1 +fi +if [ ! -f $dir/configs/final.config ]; then + echo "$0: expected $dir/configs/final.config to exist (e.g. run steps/nnet3/xvector/make_jesus_configs.py first)" + exit 1 +fi + + +num_archives=$(cat $egs_dir/info/num_archives) +num_diagnostic_archives=$(cat $egs_dir/info/num_diagnostic_archives) + + + +[ $num_jobs_initial -gt $num_jobs_final ] && \ + echo "$0: --initial-num-jobs cannot exceed --final-num-jobs" && exit 1; + +[ $num_jobs_final -gt $num_archives ] && \ + echo "$0: --final-num-jobs cannot exceed #archives $num_archives." && exit 1; + +# set num_iters so that as close as possible, we process the data $num_epochs +# times $num_shifts times, times, i.e. $num_iters*$avg_num_jobs) == +# $num_epochs*$num_archives*$num_shifts, where +# avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. +num_archives_to_process=$[$num_epochs*$num_archives*$num_shifts] +num_archives_processed=0 +num_iters=$[($num_archives_to_process*2)/($num_jobs_initial+$num_jobs_final)] + +echo "$0: Will train for $num_epochs epochs = $num_iters iterations" + +if $use_gpu; then + parallel_suffix="" + train_queue_opt="--gpu 1" + parallel_train_opts= + if ! cuda-compiled; then + echo "$0: WARNING: you are running with one thread but you have not compiled" + echo " for CUDA. You may be running a setup optimized for GPUs. If you have" + echo " GPUs and have nvcc installed, go to src/ and do ./configure; make" + exit 1 + fi +else + echo "$0: without using a GPU this will be very slow. nnet3 does not yet support multiple threads." + parallel_train_opts="--use-gpu=no" +fi + +if [ $stage -le -1 ]; then + $cmd $dir/log/nnet_init.log \ + nnet3-init $dir/configs/final.config $dir/0.raw || exit 1 +fi + + +x=0 + +while [ $x -lt $num_iters ]; do + + this_num_jobs=$(perl -e "print int(0.5+$num_jobs_initial+($num_jobs_final-$num_jobs_initial)*$x/$num_iters);") + + ilr=$initial_effective_lrate; flr=$final_effective_lrate; np=$num_archives_processed; nt=$num_archives_to_process; + this_effective_learning_rate=$(perl -e "print ($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt));"); + this_learning_rate=$(perl -e "print ($this_effective_learning_rate*$this_num_jobs);"); + + if [ $stage -le $x ]; then + echo "On iteration $x, learning rate is $this_learning_rate" + raw="nnet3-copy --learning-rate=$this_learning_rate $dir/$x.raw - |" + + if [ $[$x%$diagnostic_period] == 0 ]; then + # Set off jobs doing some diagnostics, in the background. 
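+      # (With the default diagnostic_period=5 this fires on iterations 0, 5, 10, ...
+      # As a worked illustration of the learning-rate schedule computed above,
+      # with the hypothetical defaults ilr=0.0003 and flr=0.00003, halfway through
+      # training (np/nt = 0.5) the effective rate is 0.0003 * exp(0.5 * ln(0.1))
+      # ~= 9.5e-5, which is then multiplied by the current number of jobs.)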
+ $cmd JOB=1:$num_diagnostic_archives $dir/log/compute_prob_valid.$x.JOB.log \ + nnet3-xvector-compute-prob --compute-accuracy=${compute_accuracy} $dir/$x.raw \ + "ark:nnet3-merge-egs --measure-output-frames=false ark:$egs_dir/valid_diagnostic_egs.JOB.ark ark:- |" & + $cmd JOB=1:$num_diagnostic_archives $dir/log/compute_prob_train.$x.JOB.log \ + nnet3-xvector-compute-prob --compute-accuracy=${compute_accuracy} $dir/$x.raw \ + "ark:nnet3-merge-egs --measure-output-frames=false ark:$egs_dir/train_diagnostic_egs.JOB.ark ark:- |" & + fi + if [ $x -gt 0 ]; then + $cmd $dir/log/progress.$x.log \ + nnet3-info $dir/$x.raw '&&' \ + nnet3-show-progress --use-gpu=no $dir/$[$x-1].raw $dir/$x.raw & + fi + + echo "Training neural net (pass $x)" + + if [ $x -le 1 ]; then + do_average=false # for the first 2 iters, don't do averaging, pick the best. + else + do_average=true + fi + + rm $dir/.error 2>/dev/null + + + ( # this sub-shell is so that when we "wait" below, + # we only wait for the training jobs that we just spawned, + # not the diagnostic jobs that we spawned above. + + # We cannot easily use a single parallel SGE job to do the main training, + # because the computation of which archive and which --frame option + # to use for each job is a little complex, so we spawn each one separately. + # this is no longer true for RNNs as we use do not use the --frame option + # but we use the same script for consistency with FF-DNN code + + for n in $(seq $this_num_jobs); do + k=$[$num_archives_processed + $n - 1]; # k is a zero-based index that we will derive + # the other indexes from. + archive=$[($k%$num_archives)+1]; # work out the 1-based archive index. + frame_shift=$[($k/$num_archives)%$num_shifts]; + + this_max_param_change=$max_param_change + this_minibatch_size=$minibatch_size + # for the first 20 iterations or the first epoch, whichever comes earlier, + # use a smaller minibatch size and max-param-change. + if [ $k -lt $[$num_archives*$num_shifts] ] && [ $x -lt 20 ]; then + # if we're the first epoch, use half the minibatch size and half the + # max-param-change. + this_minibatch_size=$[$minibatch_size/2] + this_max_param_change=$(perl -e "print ($max_param_change / 2.0);") + fi + + $cmd $train_queue_opt $dir/log/train.$x.$n.log \ + nnet3-xvector-train $parallel_train_opts --print-interval=10 \ + --max-param-change=$this_max_param_change "$raw" \ + "ark:nnet3-copy-egs --frame-shift=$frame_shift ark:$egs_dir/egs.$archive.ark ark:- | nnet3-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-merge-egs --measure-output-frames=false --minibatch-size=$this_minibatch_size --discard-partial-minibatches=true ark:- ark:- |" \ + $dir/$[$x+1].$n.raw || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: error detected on iteration $x of training" + exit 1 + fi + ) + # the error message below is not that informative, but $cmd will + # have printed a more specific one. + [ -f $dir/.error ] && echo "$0: error on iteration $x of training" && exit 1; + + models_to_average=$(steps/nnet3/get_successful_models.py $this_num_jobs $dir/log/train.$x.%.log) + nnets_list= + for n in $models_to_average; do + nnets_list="$nnets_list $dir/$[$x+1].$n.raw" + done + + if $do_average; then + # average the output of the different jobs. + $cmd $dir/log/average.$x.log \ + nnet3-average $nnets_list $dir/$[$x+1].raw || exit 1; + else + # choose the best from the different jobs. 
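+      # (A sketch of what the embedded perl below does: for each job n it scans
+      # $dir/log/train.$x.$n.log for the last "log-prob-per-frame=" value and
+      # prints the 1-based index of the job with the highest value; that model
+      # is then copied to $dir/$[$x+1].raw.)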
+      n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) {
+          $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn";
+          undef $logprob; while (<F>) { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } }
+          close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob;
+          $best_n=$n; } } print "$best_n\n"; ' $this_num_jobs $dir/log/train.$x.%d.log) || exit 1;
+      [ -z "$n" ] && echo "Error getting best model" && exit 1;
+      $cmd $dir/log/select.$x.log \
+        cp $dir/$[$x+1].$n.raw $dir/$[$x+1].raw || exit 1;
+    fi
+
+    nnets_list=
+    for n in `seq 1 $this_num_jobs`; do
+      nnets_list="$nnets_list $dir/$[$x+1].$n.raw"
+    done
+
+    rm $nnets_list
+    [ ! -f $dir/$[$x+1].raw ] && exit 1;
+    if [ -f $dir/$[$x-1].raw ] && $cleanup && \
+       [ $[($x-1)%100] -ne 0 ]; then
+      rm $dir/$[$x-1].raw
+    fi
+  fi
+  rm $dir/cache.$x 2>/dev/null
+  x=$[$x+1]
+  num_archives_processed=$[$num_archives_processed+$this_num_jobs]
+done
+
+
+cp $dir/$x.raw $dir/final.raw
+
+# don't bother with combination for now - it makes very little difference.
+
+sleep 2
+
+echo Done
diff --git a/egs/wsj/s5/steps/nnet3/xvector/train_separate.sh b/egs/wsj/s5/steps/nnet3/xvector/train_separate.sh
new file mode 100755
index 00000000000..5388af1e21a
--- /dev/null
+++ b/egs/wsj/s5/steps/nnet3/xvector/train_separate.sh
@@ -0,0 +1,257 @@
+#!/bin/bash
+
+# Copyright 2016  Johns Hopkins University (Author: Daniel Povey).
+# Apache 2.0.
+
+
+# Begin configuration section.
+cmd=run.pl
+num_epochs=4 # Number of epochs of training;
+             # the number of iterations is worked out from this.
+num_shifts=1
+initial_effective_lrate=0.0003
+final_effective_lrate=0.00003
+num_jobs_initial=2 # Number of neural net jobs to run in parallel at the start of training
+num_jobs_final=8   # Number of neural net jobs to run in parallel at the end of training
+stage=-3
+diagnostic_period=5
+compute_accuracy=true
+
+
+shuffle_buffer_size=1000 # This "buffer_size" variable controls randomization of the samples
+                # on each iter.  You could set it to 0 or to a large value for complete
+                # randomization, but this would both consume memory and cause spikes in
+                # disk I/O.  Smaller is easier on disk and memory but less random.  It's
+                # not a huge deal though, as samples are anyway randomized right at the start.
+                # (The point of this is to get data in different minibatches on different
+                # iterations, since in the preconditioning method, 2 samples in the same
+                # minibatch can affect each others' gradients.)
+
+max_param_change=0.2 # max param change per minibatch to use eventually
+                     # (for first epoch we use half this)
+minibatch_size=256   # minibatch size to use eventually
+                     # (for first epoch we use half this)
+
+use_gpu=true # if true, we run on GPU.
+egs_dir=
+max_snr=20
+min_snr=10
+left_padding=1
+right_padding=1
+cleanup=true # if true, remove models from earlier iterations as we go.
+
+# End configuration section.
+
+trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM
+
+echo "$0 $@" # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+if [ $# != 1 ]; then
+  echo "Usage: $0 [opts] <exp-dir>"
+  echo " e.g.: $0 exp/fvector"
+  echo "This script trains the xvector system; see egs/swbd/s5c/local/xvector/train.sh for"
+  echo "example (you have to create the nnet configs and the egs first)."
+  echo ""
+  echo "Main options (for others, see top of script file)"
+  echo "  --config <config-file>                           # config file containing options"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+ echo " --num-epochs <#epochs|10> # Number of epochs of training" + echo " --initial-effective-lrate # effective learning rate at start of training." + echo " --final-effective-lrate # effective learning rate at end of training." + echo " # data, 0.00025 for large data" + echo " --num-jobs-initial # Number of parallel jobs to use for neural net training, at the start." + echo " --num-jobs-final # Number of parallel jobs to use for neural net training, at the end" + echo " --egs-dir # If supplied, overrides /egs as location of egs" + echo " --stage # Used to run a partially-completed training process from somewhere in" + echo " # the middle." + exit 1; +fi + +dir=$1 + +[ -z $egs_dir ] && egs_dir=$dir/egs + +if [ ! -d $egs_dir/info ]; then + echo "$0: expected $egs_dir/info to exist: did you run steps/nnet3/xvector/get_egs.sh first?" + exit 1 +fi +if [ ! -f $dir/configs/final.config ]; then + echo "$0: expected $dir/configs/final.config to exist (e.g. run steps/nnet3/xvector/make_jesus_configs.py first)" + exit 1 +fi + + +num_archives=$(cat $egs_dir/info/num_archives) +num_diagnostic_archives=$(cat $egs_dir/info/num_diagnostic_archives) + + + +[ $num_jobs_initial -gt $num_jobs_final ] && \ + echo "$0: --initial-num-jobs cannot exceed --final-num-jobs" && exit 1; + +[ $num_jobs_final -gt $num_archives ] && \ + echo "$0: --final-num-jobs cannot exceed #archives $num_archives." && exit 1; + +# set num_iters so that as close as possible, we process the data $num_epochs +# times $num_shifts times, times, i.e. $num_iters*$avg_num_jobs) == +# $num_epochs*$num_archives*$num_shifts, where +# avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. +num_archives_to_process=$[$num_epochs*$num_archives*$num_shifts] +num_archives_processed=0 +num_iters=$[($num_archives_to_process*2)/($num_jobs_initial+$num_jobs_final)] + +echo "$0: Will train for $num_epochs epochs = $num_iters iterations" + +if $use_gpu; then + parallel_suffix="" + train_queue_opt="--gpu 1" + parallel_train_opts= + if ! cuda-compiled; then + echo "$0: WARNING: you are running with one thread but you have not compiled" + echo " for CUDA. You may be running a setup optimized for GPUs. If you have" + echo " GPUs and have nvcc installed, go to src/ and do ./configure; make" + exit 1 + fi +else + echo "$0: without using a GPU this will be very slow. nnet3 does not yet support multiple threads." + parallel_train_opts="--use-gpu=no" +fi + +if [ $stage -le -1 ]; then + $cmd $dir/log/nnet_init.log \ + nnet3-init $dir/configs/final.config $dir/0.raw || exit 1 +fi + + +x=0 + +while [ $x -lt $num_iters ]; do + + this_num_jobs=$(perl -e "print int(0.5+$num_jobs_initial+($num_jobs_final-$num_jobs_initial)*$x/$num_iters);") + + ilr=$initial_effective_lrate; flr=$final_effective_lrate; np=$num_archives_processed; nt=$num_archives_to_process; + this_effective_learning_rate=$(perl -e "print ($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt));"); + this_learning_rate=$(perl -e "print ($this_effective_learning_rate*$this_num_jobs);"); + + if [ $stage -le $x ]; then + echo "On iteration $x, learning rate is $this_learning_rate" + raw="nnet3-copy --learning-rate=$this_learning_rate $dir/$x.raw - |" + + if [ $[$x%$diagnostic_period] == 0 ]; then + # Set off jobs doing some diagnostics, in the background. 
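+      # (The diagnostic pipelines below mix noise into the held-out chunks on the
+      # fly: fvector-add-noise-separate pairs each source chunk from the first scp
+      # with noise chunks from the second scp at an SNR between --min-snr and
+      # --max-snr (as the flag names suggest); fvector-get-egs then turns the
+      # padded chunks into egs, and nnet3-merge-egs batches them for
+      # nnet3-xvector-compute-prob.)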
+ $cmd JOB=1:$num_diagnostic_archives $dir/log/compute_prob_valid.$x.JOB.log \ + nnet3-xvector-compute-prob --compute-accuracy=${compute_accuracy} $dir/$x.raw \ + "ark:fvector-add-noise-separate --max-snr=$max_snr --min-snr=$min_snr scp:$egs_dir/valid_diagnostic_egs.JOB.scp scp:$egs_dir/valid_diagnostic_egs.noise.JOB.scp ark:- | fvector-get-egs --left-padding=$left_padding --right-padding=$right_padding ark:- ark:- | nnet3-merge-egs --measure-output-frames=false ark:- ark:- |" & + $cmd JOB=1:$num_diagnostic_archives $dir/log/compute_prob_train.$x.JOB.log \ + nnet3-xvector-compute-prob --compute-accuracy=${compute_accuracy} $dir/$x.raw \ + "ark:fvector-add-noise-separate --max-snr=$max_snr --min-snr=$min_snr scp:$egs_dir/train_diagnostic_egs.JOB.scp scp:$egs_dir/train_diagnostic_egs.noise.JOB.scp ark:- | fvector-get-egs --left-padding=$left_padding --right-padding=$right_padding ark:- ark:- | nnet3-merge-egs --measure-output-frames=false ark:- ark:- |" & + fi + if [ $x -gt 0 ]; then + $cmd $dir/log/progress.$x.log \ + nnet3-info $dir/$x.raw '&&' \ + nnet3-show-progress --use-gpu=no $dir/$[$x-1].raw $dir/$x.raw & + fi + + echo "Training neural net (pass $x)" + + if [ $x -le 1 ]; then + do_average=false # for the first 2 iters, don't do averaging, pick the best. + else + do_average=true + fi + + rm $dir/.error 2>/dev/null + + + ( # this sub-shell is so that when we "wait" below, + # we only wait for the training jobs that we just spawned, + # not the diagnostic jobs that we spawned above. + + # We cannot easily use a single parallel SGE job to do the main training, + # because the computation of which archive and which --frame option + # to use for each job is a little complex, so we spawn each one separately. + # this is no longer true for RNNs as we use do not use the --frame option + # but we use the same script for consistency with FF-DNN code + + for n in $(seq $this_num_jobs); do + k=$[$num_archives_processed + $n - 1]; # k is a zero-based index that we will derive + # the other indexes from. + archive=$[($k%$num_archives)+1]; # work out the 1-based archive index. + frame_shift=$[($k/$num_archives)%$num_shifts]; + + this_max_param_change=$max_param_change + this_minibatch_size=$minibatch_size + # for the first 20 iterations or the first epoch, whichever comes earlier, + # use a smaller minibatch size and max-param-change. + if [ $k -lt $[$num_archives*$num_shifts] ] && [ $x -lt 20 ]; then + # if we're the first epoch, use half the minibatch size and half the + # max-param-change. + this_minibatch_size=$[$minibatch_size/2] + this_max_param_change=$(perl -e "print ($max_param_change / 2.0);") + fi + + $cmd $train_queue_opt $dir/log/train.$x.$n.log \ + fvector-add-noise-separate --max-snr=$max_snr --min-snr=$min_snr scp:"utils/shuffle_list.pl --srand $k $egs_dir/egs.$archive.scp |" scp:"utils/shuffle_list.pl --srand $k $egs_dir/egs.noise.$archive.scp |" ark:- \| \ + fvector-get-egs --left-padding=$left_padding --right-padding=$right_padding ark:- ark:- \| \ + nnet3-merge-egs --measure-output-frames=false --minibatch-size=$this_minibatch_size --discard-partial-minibatches=true ark:- ark:- \| \ + nnet3-xvector-train $parallel_train_opts --print-interval=10 --max-param-change=$this_max_param_change "$raw" ark:- $dir/$[$x+1].$n.raw || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: error detected on iteration $x of training" + exit 1 + fi + ) + # the error message below is not that informative, but $cmd will + # have printed a more specific one. 
+    [ -f $dir/.error ] && echo "$0: error on iteration $x of training" && exit 1;
+
+    models_to_average=$(steps/nnet3/get_successful_models.py $this_num_jobs $dir/log/train.$x.%.log)
+    nnets_list=
+    for n in $models_to_average; do
+      nnets_list="$nnets_list $dir/$[$x+1].$n.raw"
+    done
+
+    if $do_average; then
+      # average the output of the different jobs.
+      $cmd $dir/log/average.$x.log \
+        nnet3-average $nnets_list $dir/$[$x+1].raw || exit 1;
+    else
+      # choose the best from the different jobs.
+      n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) {
+          $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn";
+          undef $logprob; while (<F>) { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } }
+          close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob;
+          $best_n=$n; } } print "$best_n\n"; ' $this_num_jobs $dir/log/train.$x.%d.log) || exit 1;
+      [ -z "$n" ] && echo "Error getting best model" && exit 1;
+      $cmd $dir/log/select.$x.log \
+        cp $dir/$[$x+1].$n.raw $dir/$[$x+1].raw || exit 1;
+    fi
+
+    nnets_list=
+    for n in `seq 1 $this_num_jobs`; do
+      nnets_list="$nnets_list $dir/$[$x+1].$n.raw"
+    done
+
+    rm $nnets_list
+    [ ! -f $dir/$[$x+1].raw ] && exit 1;
+    if [ -f $dir/$[$x-1].raw ] && $cleanup && \
+       [ $[($x-1)%100] -ne 0 ]; then
+      rm $dir/$[$x-1].raw
+    fi
+  fi
+  rm $dir/cache.$x 2>/dev/null
+  x=$[$x+1]
+  num_archives_processed=$[$num_archives_processed+$this_num_jobs]
+done
+
+
+cp $dir/$x.raw $dir/final.raw
+
+# don't bother with combination for now - it makes very little difference.
+
+sleep 2
+
+echo Done
diff --git a/src/Makefile b/src/Makefile
index 6dfd146e3d5..f58e69408a3 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -9,13 +9,15 @@ SUBDIRS = base matrix util feat tree gmm transform \
         fstext hmm lm decoder lat kws cudamatrix nnet \
         bin fstbin gmmbin fgmmbin featbin \
         nnetbin latbin sgmm2 sgmm2bin nnet2 nnet3 rnnlm chain nnet3bin nnet2bin kwsbin \
-        ivector ivectorbin online2 online2bin lmbin chainbin rnnlmbin
+        ivector ivectorbin online2 online2bin lmbin chainbin rnnlmbin \
+        xvector xvectorbin fvector fvectorbin
 
 MEMTESTDIRS = base matrix util feat tree gmm transform \
         fstext hmm lm decoder lat nnet kws chain \
         bin fstbin gmmbin fgmmbin featbin \
         nnetbin latbin sgmm2 nnet2 nnet3 rnnlm nnet2bin nnet3bin sgmm2bin kwsbin \
-        ivector ivectorbin online2 online2bin lmbin
+        ivector ivectorbin online2 online2bin lmbin \
+        xvector xvectorbin fvector fvectorbin
 
 CUDAMEMTESTDIR = cudamatrix
 
@@ -149,9 +151,10 @@ $(EXT_SUBDIRS) : mklibdir ext_depend
 ### Dependency list ###
 # this is necessary for correct parallel compilation
 #1)The tools depend on all the libraries
-bin fstbin gmmbin fgmmbin sgmm2bin featbin nnetbin nnet2bin nnet3bin chainbin latbin ivectorbin lmbin kwsbin online2bin rnnlmbin: \
+bin fstbin gmmbin fgmmbin sgmm2bin featbin nnetbin nnet2bin nnet3bin chainbin latbin ivectorbin lmbin kwsbin online2bin rnnlmbin xvectorbin fvectorbin: \
 base matrix util feat tree gmm transform sgmm2 fstext hmm \
-lm decoder lat cudamatrix nnet nnet2 nnet3 ivector chain kws online2 rnnlm
+lm decoder lat cudamatrix nnet nnet2 nnet3 ivector chain kws online2 rnnlm \
+xvector fvector
 
 #2)The libraries have inter-dependencies
 base: base/.depend.mk
@@ -174,6 +177,8 @@ nnet3: base util matrix lat gmm hmm tree transform cudamatrix chain fstext
 rnnlm: base util matrix cudamatrix nnet3 lm hmm
 chain: lat hmm tree fstext matrix cudamatrix util base
 ivector: base util matrix transform tree gmm
+xvector: base util matrix cudamatrix nnet3
+fvector: base util matrix cudamatrix nnet3
 
 #3)Dependencies for optional parts of Kaldi
 onlinebin: base matrix util feat tree gmm transform sgmm2 fstext hmm lm decoder lat cudamatrix nnet nnet2 online
 # python-kaldi-decoding: base matrix util feat tree gmm transform sgmm2 fstext hmm decoder lat online
diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h
index 6b99a77e73b..232b274a344 100644
--- a/src/cudamatrix/cu-kernels-ansi.h
+++ b/src/cudamatrix/cu-kernels-ansi.h
@@ -789,6 +789,15 @@ void cuda_uncompress_uint8(dim3 Gr, dim3 Bl, BaseFloat *dest, MatrixDim dim,
                            const uint8_t *src, int src_stride, float scale);
 
+/// For Xvector
+void cudaD_compute_xvector_objf(dim3 Gr, dim3 Bl, const double *scores,
+                                MatrixDim scores_dim, double *objf_terms,
+                                MatrixDim objf_dim, double *objf_derivs,
+                                MatrixDim derivs_dim);
+void cudaF_compute_xvector_objf(dim3 Gr, dim3 Bl, const float *scores,
+                                MatrixDim scores_dim, float *objf_terms,
+                                MatrixDim objf_dim, float *objf_derivs,
+                                MatrixDim derivs_dim);
 
 } // extern "C"
diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index 934a860a055..b31bd92d760 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -5445,3 +5445,43 @@ void cuda_uncompress_int16(dim3 Gr, dim3 Bl, BaseFloat *dest, MatrixDim dim,
                            const int16_t *src, int src_stride, float scale) {
   _cuda_uncompress<<<Gr, Bl>>>(dest, dim, src, src_stride, scale);
 }
+/// For Xvector
+template<typename Real>
+__global__
+static void _compute_xvector_objf(const Real* scores, MatrixDim scores_dim,
+                                  Real* objf_terms, MatrixDim objf_dim,
+                                  Real* objf_derivs, MatrixDim derivs_dim) {
+  int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x;
+  int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y;
+  int32_cuda scores_index = i + j * scores_dim.stride;
+  int32_cuda objf_index = i + j * objf_dim.stride;
+  int32_cuda derivs_index = i + j * derivs_dim.stride;
+  Real K = 1.0 / (scores_dim.rows - 2.0);
+  if (i < scores_dim.cols && j < scores_dim.rows) {
+    // Only read 'scores' once we know this thread is inside the matrix.
+    Real L = scores[scores_index];
+    if (i + 1 == j && i % 2 == 0) {
+      objf_terms[objf_index] = L < -15 ? L : -log(1.0 + exp(-L));
+      objf_derivs[derivs_index] = L > 15 ? 0.0 : 1.0 / (1.0 + exp(L));
+    } else if (i < j) {
+      objf_terms[objf_index] = K * (L > 15 ? -L : -log(1.0 + exp(L)));
+      objf_derivs[derivs_index] = L < -15 ? 0 : -K / (1.0 + exp(-L));
+void cudaD_compute_xvector_objf(dim3 Gr, dim3 Bl, const double *scores,
+                                MatrixDim scores_dim, double *objf_terms,
+                                MatrixDim objf_dim, double *objf_derivs,
+                                MatrixDim derivs_dim) {
+  _compute_xvector_objf<<<Gr, Bl>>>(scores, scores_dim, objf_terms, objf_dim,
+                                    objf_derivs, derivs_dim);
+}
+void cudaF_compute_xvector_objf(dim3 Gr, dim3 Bl, const float *scores,
+                                MatrixDim scores_dim, float *objf_terms,
+                                MatrixDim objf_dim, float *objf_derivs,
+                                MatrixDim derivs_dim) {
+  _compute_xvector_objf<<<Gr, Bl>>>(scores, scores_dim, objf_terms, objf_dim,
+                                    objf_derivs, derivs_dim);
+}
diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h
index 8f719a8c4a1..27804c9339f 100644
--- a/src/cudamatrix/cu-kernels.h
+++ b/src/cudamatrix/cu-kernels.h
@@ -1546,6 +1546,21 @@ inline void cuda_mat_uncompress(dim3 Gr, dim3 Bl, BaseFloat *dest,
                                 int src_stride, float scale) {
   cuda_uncompress_uint16(Gr, Bl, dest, dim, src, src_stride, scale);
 }
+/// For Xvector
+inline void cuda_compute_xvector_objf(dim3 Gr, dim3 Bl, const float *scores,
+                                      MatrixDim scores_dim, float *objf_terms,
+                                      MatrixDim objf_dim, float *objf_derivs,
+                                      MatrixDim derivs_dim) {
+  cudaF_compute_xvector_objf(Gr, Bl, scores, scores_dim, objf_terms, objf_dim,
+                             objf_derivs, derivs_dim);
+}
+inline void cuda_compute_xvector_objf(dim3 Gr, dim3 Bl, const double *scores,
+                                      MatrixDim scores_dim, double *objf_terms,
+                                      MatrixDim objf_dim, double *objf_derivs,
+                                      MatrixDim derivs_dim) {
+  cudaD_compute_xvector_objf(Gr, Bl, scores, scores_dim, objf_terms, objf_dim,
+                             objf_derivs, derivs_dim);
+}
 
 } // namespace kaldi
 
diff --git a/src/cudamatrix/cu-math.cc b/src/cudamatrix/cu-math.cc
index 3fbeff3a470..17e1864ec63 100644
--- a/src/cudamatrix/cu-math.cc
+++ b/src/cudamatrix/cu-math.cc
@@ -991,6 +991,49 @@ void BackpropLstmNonlinearity(const CuMatrixBase<double> &input,
                               CuMatrixBase<double> *value_sum_out,
                               CuMatrixBase<double> *deriv_sum_out,
                               CuMatrixBase<double> *self_repair_sum_out);
 
+// For Xvector
+void ComputeXvectorObjfFromScores(const CuMatrixBase<BaseFloat> &scores,
+                                  CuMatrixBase<BaseFloat> *objf_terms,
+                                  CuMatrixBase<BaseFloat> *objf_derivs) {
+  KALDI_ASSERT(SameDim(*objf_terms, *objf_derivs) &&
+               SameDim(*objf_terms, scores) &&
+               scores.NumRows() == scores.NumCols());
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) {
+    CuTimer tim;
+    dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
+    dim3 dimGrid(n_blocks(scores.NumCols(), CU2DBLOCK),
+                 n_blocks(scores.NumRows(), CU2DBLOCK));
+
+    cuda_compute_xvector_objf(dimGrid, dimBlock, scores.Data(), scores.Dim(),
+                              objf_terms->Data(), objf_terms->Dim(),
+                              objf_derivs->Data(), objf_derivs->Dim());
+    CU_SAFE_CALL(cudaGetLastError());
+
+    CuDevice::Instantiate().AccuProfile(__func__, tim);
+  } else
+#endif
+  {
+    // Compute the xvector objective function and its derivatives on the CPU.
+    int32 num_rows = scores.NumRows();
+    BaseFloat K = 1.0 / (num_rows - 2.0);
+    for (int32 i = 0; i < num_rows; i++) {
+      for (int32 j = 0; j < num_rows; j++) {
+        BaseFloat L = scores(i, j);
+        if (i + 1 == j && i % 2 == 0) {
+          (*objf_terms)(i, j) = L < -15 ? L : -log(1.0 + exp(-L));
+          (*objf_derivs)(i, j) = L > 15 ? 0.0 : 1.0 / (1.0 + exp(L));
+        } else if (i < j) {
+          (*objf_terms)(i, j) = K * (L > 15 ? -L : -log(1.0 + exp(L)));
+          (*objf_derivs)(i, j) = L < -15 ? 0 : -K / (1.0 + exp(-L));
+        } else {
+          (*objf_terms)(i, j) = 0;
+          (*objf_derivs)(i, j) = 0;
+        }
+      }
+    }
+  }
+}
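To make the pairing convention concrete, here is a minimal CPU-only sketch of the same objective math on a toy 4x4 score matrix, in plain C++ rather than Kaldi's matrix types; the score values are illustrative only.

    #include <cmath>
    #include <cstdio>

    // Same rule as the kernel: rows (2k, 2k+1) are same-class pairs; only the
    // upper triangle (i < j) is scored; K down-weights the "different" pairs.
    int main() {
      const int n = 4;
      double scores[4][4] = {{ 0.0,  2.0, -1.0,  0.5},
                             { 2.0,  0.0,  0.3, -0.7},
                             {-1.0,  0.3,  0.0,  1.5},
                             { 0.5, -0.7,  1.5,  0.0}};
      double K = 1.0 / (n - 2.0), objf = 0.0;
      for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
          double L = scores[i][j];
          if (i + 1 == j && i % 2 == 0)   // (0,1) and (2,3): same class
            objf += -log(1.0 + exp(-L));
          else if (i < j)                 // e.g. (0,2), (1,3): different class
            objf += K * -log(1.0 + exp(L));
        }
      }
      std::printf("objf for this minibatch: %f\n", objf);
      return 0;
    }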
diff --git a/src/cudamatrix/cu-math.h b/src/cudamatrix/cu-math.h
index df533dd73ff..3ec4a693e6f 100644
--- a/src/cudamatrix/cu-math.h
+++ b/src/cudamatrix/cu-math.h
@@ -297,6 +297,37 @@ void DiffNormalizePerRow(const CuMatrixBase<Real> &in_value,
                          const CuMatrixBase<Real> &out_deriv,
                          const Real target_rms, const bool add_log_stddev,
                          CuMatrixBase<Real>* in_deriv);
 
+// For Xvector
+/*
+  This function is used in computing the objective function and derivatives
+  in xvector training.
+  @param [in] scores  'scores' is a symmetric matrix of scores which are to
+  be interpreted as log-odds (according to the model) of pairs coming from the
+  same class, so scores(i, j) is the model's log p(same/different) for
+  elements i and j of the original minibatch of input. We assume that the data
+  in 'scores' has been arranged in such a way that pairs of indexes of the form
+  (2k, 2k+1), e.g. (0, 1), (2, 3), (4, 5), etc., are from the same class, but
+  indexes of any other form, such as (0, 2), (1, 2), etc., are from different
+  classes.
+  @param [out] objf_terms  'objf_terms' is a matrix of the same dimension as
+  'scores' whose elements we will sum to get the objective function for this
+  minibatch. This function computes the appropriate contributions to the
+  objective function, as follows (only the upper triangle i < j is scored):
+    if j == i + 1 and i is even (i.e. (i, j) is a same-class pair (2k, 2k+1)):
+      objf_terms(i, j) = log(p(same))
+                       = -log(1 + exp(-scores(i, j)))
+    else if i < j:
+      objf_terms(i, j) = 1 / (scores.NumRows() - 2) * log(p(different))
+                       = -1 / (scores.NumRows() - 2) * log(1 + exp(scores(i, j)))
+    else:
+      objf_terms(i, j) = 0
+  @param [out] objf_derivs  Element (i, j) of this matrix is the derivative
+  of objf_terms(i, j) with respect to scores(i, j).
+*/
+void ComputeXvectorObjfFromScores(const CuMatrixBase<BaseFloat> &scores,
+                                  CuMatrixBase<BaseFloat> *objf_terms,
+                                  CuMatrixBase<BaseFloat> *objf_derivs);
+
 } // namespace cu
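A plausible way a training computation would consume this function, as a sketch built only on the declarations in this patch; the XvectorObjf wrapper name is an assumption, not code from the patch.

    #include "cudamatrix/cu-matrix.h"
    #include "cudamatrix/cu-math.h"

    namespace kaldi {
    // Sums the per-pair terms into a single minibatch objective; 'derivs' can
    // then be backpropagated through whatever produced 'scores'.
    BaseFloat XvectorObjf(const CuMatrixBase<BaseFloat> &scores,
                          CuMatrix<BaseFloat> *derivs) {
      CuMatrix<BaseFloat> terms(scores.NumRows(), scores.NumCols());
      derivs->Resize(scores.NumRows(), scores.NumCols());
      cu::ComputeXvectorObjfFromScores(scores, &terms, derivs);
      return terms.Sum();
    }
    }  // namespace kaldi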
+*/ +void ComputeXvectorObjfFromScores(const CuMatrixBase &scores, + CuMatrixBase *objf_terms, + CuMatrixBase *objf_derivs); + } // namespace cu diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index 64f8afe0616..972e3b03b73 100644 --- a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -194,6 +194,15 @@ void CuPackedMatrix::CopyToPacked(PackedMatrix *dst) const { } } +//For Xvector +template +void CuPackedMatrix::CopyFromVec(const CuVectorBase &vec) { + MatrixIndexT size = (NumRows() * (NumRows() + 1)) / 2; + KALDI_ASSERT(vec.Dim() == size); + CuSubVector this_as_vec(data_, (num_rows_ * (num_rows_ + 1)) / 2); + this_as_vec.CopyFromVec(vec); +} + /* template void CuPackedMatrix::CopyRowsFromPacked(int32 r, const CuPackedMatrix &src, int32 src_ro, int32 dst_ro) { diff --git a/src/cudamatrix/cu-packed-matrix.h b/src/cudamatrix/cu-packed-matrix.h index 0131ba6c101..499506c694c 100644 --- a/src/cudamatrix/cu-packed-matrix.h +++ b/src/cudamatrix/cu-packed-matrix.h @@ -99,6 +99,9 @@ class CuPackedMatrix { void CopyFromPacked(const PackedMatrix &src); void CopyToPacked(PackedMatrix *dst) const; + // For Xvector + void CopyFromVec(const CuVectorBase &vec); + void Read(std::istream &in, bool binary); void Write(std::ostream &out, bool binary) const; diff --git a/src/fvector/Makefile b/src/fvector/Makefile new file mode 100644 index 00000000000..882336a20c1 --- /dev/null +++ b/src/fvector/Makefile @@ -0,0 +1,19 @@ + +all: + +OPENFST_CXXFLAGS = +OPENFST_LDLIBS = +include ../kaldi.mk + +TESTFILES = fvector-perturb-test + +OBJFILES = fvector-perturb.o + +LIBNAME = kaldi-fvector + +ADDLIBS = ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \ + ../tree/kaldi-tree.a ../util/kaldi-util.a \ + ../matrix/kaldi-matrix.a ../base/kaldi-base.a ../feat/kaldi-feat.a + + +include ../makefiles/default_rules.mk diff --git a/src/fvector/fvector-perturb-test.cc b/src/fvector/fvector-perturb-test.cc new file mode 100644 index 00000000000..3e12797aba2 --- /dev/null +++ b/src/fvector/fvector-perturb-test.cc @@ -0,0 +1,86 @@ +// fvector/fvector-perturb-test.cc + +#include +#include +#include "fvector/fvector-perturb.h" +#include "feat/wave-reader.h" + +using namespace kaldi; + +static void UnitTestSpeedPerturb() { + std::cout << "=== UnitTestSpeedPerturb ===" << std::endl; + Vector input, output; + BaseFloat sample_freq; + { + std::ifstream is("./test_data/test.wav", std::ios_base::binary); + WaveData wave; + wave.Read(is); + const Matrix data(wave.Data()); + KALDI_ASSERT(data.NumRows() == 1); + input.Resize(data.NumCols()); + input.CopyFromVec(data.Row(0)); + sample_freq = wave.SampFreq(); + } + BaseFloat speed_factor = 1.2; + FvectorPerturbOptions opts; + FvectorPerturb perturb(opts); + output.Resize(static_cast(ceil(input.Dim()/speed_factor))); + perturb.SpeedPerturbation(input, sample_freq, speed_factor, &output); + { + std::ofstream os("./test_data/test_speedperturbed.wav.txt", std::ios::out); + output.Write(os, false); + } + std::cout << "With fvector class, The dim of input is: " << input.Dim() << std::endl; + std::cout << "With fvector class, The dim of output is: " << output.Dim() << std::endl; + // Write the perturbed data into wav format + { + Matrix output_matrix(1, output.Dim()); + output_matrix.CopyRowFromVec(output, 0); + WaveData perturbed_wave(sample_freq, output_matrix); + std::ofstream os("./test_data/test_speedperturbed.wav", std::ios::out); + perturbed_wave.Write(os); + } + // print the wav data which is dealed by sox. 
diff --git a/src/fvector/Makefile b/src/fvector/Makefile
new file mode 100644
index 00000000000..882336a20c1
--- /dev/null
+++ b/src/fvector/Makefile
@@ -0,0 +1,19 @@
+
+all:
+
+OPENFST_CXXFLAGS =
+OPENFST_LDLIBS =
+include ../kaldi.mk
+
+TESTFILES = fvector-perturb-test
+
+OBJFILES = fvector-perturb.o
+
+LIBNAME = kaldi-fvector
+
+ADDLIBS = ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \
+          ../tree/kaldi-tree.a ../util/kaldi-util.a \
+          ../matrix/kaldi-matrix.a ../base/kaldi-base.a ../feat/kaldi-feat.a
+
+
+include ../makefiles/default_rules.mk
diff --git a/src/fvector/fvector-perturb-test.cc b/src/fvector/fvector-perturb-test.cc
new file mode 100644
index 00000000000..3e12797aba2
--- /dev/null
+++ b/src/fvector/fvector-perturb-test.cc
@@ -0,0 +1,86 @@
+// fvector/fvector-perturb-test.cc
+
+#include <iostream>
+#include <fstream>
+#include "fvector/fvector-perturb.h"
+#include "feat/wave-reader.h"
+
+using namespace kaldi;
+
+static void UnitTestSpeedPerturb() {
+  std::cout << "=== UnitTestSpeedPerturb ===" << std::endl;
+  Vector<BaseFloat> input, output;
+  BaseFloat sample_freq;
+  {
+    std::ifstream is("./test_data/test.wav", std::ios_base::binary);
+    WaveData wave;
+    wave.Read(is);
+    const Matrix<BaseFloat> data(wave.Data());
+    KALDI_ASSERT(data.NumRows() == 1);
+    input.Resize(data.NumCols());
+    input.CopyFromVec(data.Row(0));
+    sample_freq = wave.SampFreq();
+  }
+  BaseFloat speed_factor = 1.2;
+  FvectorPerturbOptions opts;
+  FvectorPerturb perturb(opts);
+  output.Resize(static_cast<MatrixIndexT>(ceil(input.Dim() / speed_factor)));
+  perturb.SpeedPerturbation(input, sample_freq, speed_factor, &output);
+  {
+    std::ofstream os("./test_data/test_speedperturbed.wav.txt", std::ios::out);
+    output.Write(os, false);
+  }
+  std::cout << "With the fvector class, the dim of input is: " << input.Dim() << std::endl;
+  std::cout << "With the fvector class, the dim of output is: " << output.Dim() << std::endl;
+  // Write the perturbed data in wav format.
+  {
+    Matrix<BaseFloat> output_matrix(1, output.Dim());
+    output_matrix.CopyRowFromVec(output, 0);
+    WaveData perturbed_wave(sample_freq, output_matrix);
+    std::ofstream os("./test_data/test_speedperturbed.wav", std::ios::out);
+    perturbed_wave.Write(os);
+  }
+  // Print the wav data that was processed by sox.
+  // Command: sox -t wav test.wav -t wav test_speed12.wav speed 1.2
+  Vector<BaseFloat> sox_input;
+  BaseFloat sox_sample_freq;
+  {
+    std::ifstream is("./test_data/test_speed12.wav", std::ios_base::binary);
+    WaveData wave;
+    wave.Read(is);
+    const Matrix<BaseFloat> data(wave.Data());
+    KALDI_ASSERT(data.NumRows() == 1);
+    sox_input.Resize(data.NumCols());
+    sox_input.CopyFromVec(data.Row(0));
+    sox_sample_freq = wave.SampFreq();
+    std::ofstream os("./test_data/test_sox.wav.txt", std::ios::out);
+    sox_input.Write(os, false);
+  }
+  KALDI_ASSERT(sample_freq == sox_sample_freq);
+  if (output.ApproxEqual(sox_input, 0.01)) {
+    std::cout << "Equal" << std::endl;
+  } else {
+    std::cout << "Not Equal" << std::endl;
+    BaseFloat prod_output = VecVec(output, output);
+    BaseFloat prod_sox = VecVec(sox_input, sox_input);
+    BaseFloat cross_prod = VecVec(output, sox_input);
+    std::cout << "The cosine distance is: "
+              << cross_prod / (sqrt(prod_output) * sqrt(prod_sox))
+              << std::endl;
+  }
+  std::cout << "=== UnitTestSpeedPerturb finish ===" << std::endl;
+}
+
+static void UnitTestFvectorPerturb() {
+  UnitTestSpeedPerturb();
+}
+
+int main() {
+  try {
+    UnitTestFvectorPerturb();
+    std::cout << "Tests succeeded." << std::endl;
+  } catch (const std::exception &e) {
+    std::cerr << e.what();
+    return 1;
+  }
+}
diff --git a/src/fvector/fvector-perturb.cc b/src/fvector/fvector-perturb.cc
new file mode 100644
index 00000000000..9ed6fb2b9c7
--- /dev/null
+++ b/src/fvector/fvector-perturb.cc
@@ -0,0 +1,289 @@
+#include "fvector/fvector-perturb.h"
+
+namespace kaldi {
+
+void FvectorPerturb::ApplyPerturbation(const MatrixBase<BaseFloat>& input_chunk,
+                                       Matrix<BaseFloat>* perturbed_chunk) {
+  // The original_dim_matrix is a matrix whose dimension is the same as
+  // input_chunk. Assuming sample_frequency = 8kHz and an original length of
+  // 120ms, it will be a (4, 960) matrix.
+  Matrix<BaseFloat> original_dim_matrix(input_chunk);
+  // First, we add additive noise with some probability.
+  AddNoise(opts_.add_noise, &original_dim_matrix);
+  // We do Resize() here, because Resize() belongs to Matrix<> rather than
+  // MatrixBase<>.
+  original_dim_matrix.Resize(2, original_dim_matrix.NumCols(), kCopyData);
+  KALDI_ASSERT(original_dim_matrix.NumRows() == 2);
+  // After AddNoise(), the shape of original_dim_matrix is (2, original_dim).
+  if (opts_.volume_perturbation) {
+    VolumePerturbation(&original_dim_matrix);
+  }
+  // The expected_dim_matrix is a matrix of size (original_dim_matrix.NumRows(),
+  // expected-chunk-length * sample_frequency / 1000), e.g. a (2, 800) matrix.
+  Matrix<BaseFloat> expected_dim_matrix(original_dim_matrix.NumRows(),
+      opts_.expected_chunk_length * opts_.sample_frequency / 1000);
+  if (opts_.speed_perturbation) {
+    //1. generate the speed perturb factor randomly for each line (notice: the
+    //expected_length is always smaller than the original_length).
+    //(1) a = min{original_length/expected_length - 1, max-speed-perturb-rate}
+    //(2) the range of the factor is (1-a, 1+a)
+    BaseFloat boundary = std::min(static_cast<BaseFloat>(
+        (original_dim_matrix.NumCols() * 1.0 / opts_.sample_frequency)
+        * 1000 / opts_.expected_chunk_length - 1), opts_.max_speed_perturb_rate);
+    for (MatrixIndexT i = 0; i < original_dim_matrix.NumRows(); ++i) {
+      // calculate the speed factor
+      BaseFloat factor = static_cast<BaseFloat>(RandInt(
+          (int)((1-boundary)*100), (int)((1+boundary)*100)) * 1.0 / 100.0);
+
+      Vector<BaseFloat> speed_input_vector(original_dim_matrix.Row(i));
+
+      MatrixIndexT speed_output_dim = static_cast<MatrixIndexT>(
+          ceil(original_dim_matrix.NumCols() / factor));
+      KALDI_ASSERT(speed_output_dim >=
+                   opts_.expected_chunk_length * opts_.sample_frequency / 1000);
+      Vector<BaseFloat> speed_output_vector(speed_output_dim);
+
+      SpeedPerturbation(speed_input_vector, opts_.sample_frequency, factor,
+                        &speed_output_vector);
+
+      Vector<BaseFloat> time_shifted_vector(expected_dim_matrix.NumCols());
+      if (opts_.time_shift) {
+        TimeShift(speed_output_vector, &time_shifted_vector);
+      } else {
+        time_shifted_vector.CopyFromVec(
+            speed_output_vector.Range(0, expected_dim_matrix.NumCols()));
+      }
+      expected_dim_matrix.CopyRowFromVec(time_shifted_vector, i);
+    }
+  } else {  // no speed perturbation
+    if (opts_.time_shift) {
+      for (MatrixIndexT i = 0; i < original_dim_matrix.NumRows(); ++i) {
+        Vector<BaseFloat> input_vector(original_dim_matrix.Row(i));
+        Vector<BaseFloat> time_shifted_vector(expected_dim_matrix.NumCols());
+        TimeShift(input_vector, &time_shifted_vector);
+        expected_dim_matrix.CopyRowFromVec(time_shifted_vector, i);
+      }
+    } else {
+      expected_dim_matrix.CopyFromMat(
+          original_dim_matrix.Range(0, expected_dim_matrix.NumRows(),
+                                    0, expected_dim_matrix.NumCols()));
+    }
+  }
+  // Now we operate on the "expected_dim_matrix".
+  perturbed_chunk->Resize(2, expected_dim_matrix.NumCols());
+  MatrixIndexT indices[2] = {0, 1};
+  perturbed_chunk->CopyRows(expected_dim_matrix, indices);
+}
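+// A usage sketch (illustrative, not called anywhere in this patch): the
+// caller assembles a 4-row chunk (S1, S2, N1, N2) and receives back two
+// perturbed copies of the same source signal.
+//
+//   FvectorPerturbOptions opts;            // defaults: 16kHz, 100ms output
+//   opts.sample_frequency = 8000;
+//   FvectorPerturb perturb(opts);
+//   Matrix<BaseFloat> chunk(4, 960);       // 120ms at 8kHz per row
+//   chunk.SetRandn();                      // stand-in for real wave data
+//   Matrix<BaseFloat> perturbed;
+//   perturb.ApplyPerturbation(chunk, &perturbed);
+//   // perturbed is now (2, 800): two 100ms variants of the same source.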
+
+void FvectorPerturb::VolumePerturbation(MatrixBase<BaseFloat>* chunk) {
+  //1. randomly generate 2 numbers from (1-max-volume-variance, 1+max-volume-variance)
+  std::vector<BaseFloat> volume_factors;
+  for (MatrixIndexT i = 0; i < chunk->NumRows(); ++i) {
+    BaseFloat factor = static_cast<BaseFloat>(
+        RandInt((int)((1-opts_.max_volume_variance)*100),
+                (int)((1+opts_.max_volume_variance)*100)) / 100.0);
+    volume_factors.push_back(factor);
+  }
+  //2. scale each line respectively.
+  for (MatrixIndexT i = 0; i < chunk->NumRows(); ++i) {
+    chunk->Row(i).Scale(volume_factors[i]);
+  }
+}
+
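+// Worked example of the resampling arithmetic below (illustrative numbers):
+// with samp_freq = 8000 and speed_factor = 1.2, a 960-sample (120ms) input
+// maps to ceil(960 / 1.2) = 800 output samples, and output sample i is taken
+// at time speed_factor * i / samp_freq, i.e. the signal is read 1.2x faster.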
+// We stretch the signal from beginning to end:
+// y(t) = x(s*t) for t = 0,...,n. If s>1, the output will be shorter than the
+// input, which represents speeding up; and vice versa.
+// Use ArbitraryResample to deal with each line.
+//
+// In ArbitraryResample, according to num_zeros and filter_cutoff, it generates
+// the "filter_width". Then each output_sample(t) corresponds to the few
+// input_samples from (t-filter_width) to (t+filter_width), whose positions are
+// stored in "first_index_". And "weights_" will be adjusted by a Hanning
+// window in the function FilterFunc.
+// In brief, you can think of each output sample as a weighted sum of a few
+// input samples.
+void FvectorPerturb::SpeedPerturbation(VectorBase<BaseFloat>& input_vector,
+                                       BaseFloat samp_freq,
+                                       BaseFloat speed_factor,
+                                       VectorBase<BaseFloat>* output_vector) {
+  if (speed_factor == 1.0) {
+    output_vector->CopyFromVec(input_vector);
+  } else {
+    Vector<BaseFloat> in_vec(input_vector),
+        out_vec(output_vector->Dim());
+    int32 input_dim = in_vec.Dim(),
+        output_dim = out_vec.Dim();
+    Vector<BaseFloat> samp_points_secs(output_dim);
+    int32 num_zeros = 4;  // Number of zeros of the sinc function that the window extends out to.
+    // Lowpass frequency that's lower than 95% of the Nyquist.
+    BaseFloat filter_cutoff_hz = samp_freq * 0.475;
+    for (int32 i = 0; i < output_dim; i++) {
+      samp_points_secs(i) = static_cast<BaseFloat>(speed_factor * i / samp_freq);
+    }
+    ArbitraryResample time_resample(input_dim, samp_freq,
+                                    filter_cutoff_hz,
+                                    samp_points_secs,
+                                    num_zeros);
+    time_resample.Resample(in_vec, &out_vec);
+    output_vector->CopyFromVec(out_vec);
+  }
+}
+
+void FvectorPerturb::TimeShift(VectorBase<BaseFloat>& input_vector,
+                               VectorBase<BaseFloat>* output_vector) {
+  //1. generate the start point randomly; its range is
+  //   [0, input_vector.Dim() - output_vector->Dim()]
+  int32 start_point = static_cast<int32>(
+      RandInt(0, input_vector.Dim() - output_vector->Dim()));
+  //2. take the successive expected_chunk_length * sample_frequency data.
+  output_vector->CopyFromVec(input_vector.Range(start_point, output_vector->Dim()));
+}
+
+void FvectorPerturb::AddNoise(BaseFloat probability_threshold,
+                              MatrixBase<BaseFloat>* chunk) {
+  //1. generate 2 SNRs from (min-snr, max-snr)
+  //2. add N1 (line 3) to S1 (line 1) with snr1, with probability;
+  //   add N2 (line 4) to S2 (line 2) with snr2, with probability.
+  for (MatrixIndexT i = 0; i < 2; i++) {
+    BaseFloat probability = static_cast<BaseFloat>(RandInt(0, 100) / 100.0);
+    if (probability <= probability_threshold) {
+      Vector<BaseFloat> source(chunk->Row(i));
+      Vector<BaseFloat> noise(chunk->Row(i+2));
+      BaseFloat source_power = VecVec(source, source) / source.Dim();
+      BaseFloat noise_power = VecVec(noise, noise) / noise.Dim();
+      int32 snr = RandInt(opts_.min_snr, opts_.max_snr);
+      // Use floating-point division here: with integer division, -snr/10
+      // would truncate and distort the target SNR.
+      BaseFloat scale_factor = sqrt(pow(10.0, -snr / 10.0) * source_power / noise_power);
+      //BaseFloat source_energy = VecVec(source, source);
+      //BaseFloat noise_energy = VecVec(noise, noise);
+      // The smaller the value, the greater the snr.
+      //BaseFloat scale_factor = sqrt(source_energy / noise_energy / pow(10.0, snr / 20.0));
+      chunk->Row(i).AddVec(scale_factor, noise);
+    }
+  }
+}
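+// Worked example of the SNR scaling above (illustrative numbers): suppose
+// snr = 10 dB, source_power = 0.04 and noise_power = 0.01. Then
+//   scale_factor = sqrt(10^(-10/10) * 0.04 / 0.01) = sqrt(0.4) ~= 0.632,
+// and the scaled noise has power 0.632^2 * 0.01 = 0.004, so
+//   10 * log10(source_power / scaled_noise_power) = 10 * log10(10) = 10 dB,
+// i.e. the mix comes out at exactly the requested SNR.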
+
+// The following functions belong to the class FvectorPerturbBlock.
+
+void FvectorPerturbBlock::ApplyPerturbationBlock(Matrix<BaseFloat>* perturbed_chunk) {
+  // 1. Add additive noise with probability.
+  AddNoiseBlock(opts_.add_noise, perturbed1, noise1);
+  AddNoiseBlock(opts_.add_noise, perturbed2, noise2);
+
+  // 2. After AddNoise(), conduct volume perturbation.
+  if (opts_.volume_perturbation) {
+    VolumePerturbationBlock(perturbed1);
+    VolumePerturbationBlock(perturbed2);
+  }
+
+  // 3. Conduct the speed perturbation and time shift together. At the end,
+  // the NumCols of each perturbed matrix equals expected_chunk_length (e.g. 100ms).
+  if (opts_.speed_perturbation) {
+    SpeedPerturbationBlock(perturbed1);
+    SpeedPerturbationBlock(perturbed2);
+    if (opts_.time_shift) {
+      TimeShiftBlock(perturbed1);
+      TimeShiftBlock(perturbed2);
+    } else {
+      // Note: divide by 1000 because expected_chunk_length is in milliseconds.
+      int32 output_cols = static_cast<int32>(
+          opts_.expected_chunk_length * opts_.sample_frequency / 1000);
+      KALDI_ASSERT(perturbed1.NumRows() == perturbed2.NumRows());
+      int32 output_rows = perturbed1.NumRows();
+      perturbed1.Resize(output_rows, output_cols, kCopyData);
+      perturbed2.Resize(output_rows, output_cols, kCopyData);
+    }
+  } else {
+    if (opts_.time_shift) {
+      TimeShiftBlock(perturbed1);
+      TimeShiftBlock(perturbed2);
+    } else {
+      int32 output_cols = static_cast<int32>(
+          opts_.expected_chunk_length * opts_.sample_frequency / 1000);
+      KALDI_ASSERT(perturbed1.NumRows() == perturbed2.NumRows());
+      int32 output_rows = perturbed1.NumRows();
+      perturbed1.Resize(output_rows, output_cols, kCopyData);
+      perturbed2.Resize(output_rows, output_cols, kCopyData);
+    }
+  }
+
+  // 4. Finally, compose the two perturbed matrices into one matrix.
+  // Each two consecutive rows come from the same original source signal.
+  KALDI_ASSERT(perturbed1.NumRows() == perturbed2.NumRows());
+  int32 output_rows = perturbed1.NumRows() * 2;
+  KALDI_ASSERT(perturbed1.NumCols() == perturbed2.NumCols());
+  int32 output_cols = perturbed1.NumCols();
+  perturbed_chunk->Resize(output_rows, output_cols);
+  for (MatrixIndexT i = 0; i < output_rows / 2; i++) {
+    perturbed_chunk->Row(2*i).CopyFromVec(perturbed1.Row(i));
+    perturbed_chunk->Row(2*i+1).CopyFromVec(perturbed2.Row(i));
+  }
+}
+
+// For each row of source, we add additive noise to it, with a random SNR,
+// with some probability.
+void FvectorPerturbBlock::AddNoiseBlock(BaseFloat probability_threshold,
+                                        MatrixBase<BaseFloat>& source,
+                                        MatrixBase<BaseFloat>& noise) {
+  KALDI_ASSERT(source.NumRows() == noise.NumRows());
+  for (MatrixIndexT i = 0; i < source.NumRows(); i++) {
+    // Divide by 10000.0 so that 'probability' lies in [0, 1], the same range
+    // as probability_threshold.
+    BaseFloat probability = static_cast<BaseFloat>(RandInt(0, 10000) / 10000.0);
+    if (probability <= probability_threshold) {
+      Vector<BaseFloat> source_signal(source.Row(i));
+      Vector<BaseFloat> noise_signal(noise.Row(i));
+      BaseFloat source_energy = VecVec(source_signal, source_signal);
+      BaseFloat noise_energy = VecVec(noise_signal, noise_signal);
+      // The smaller the value, the greater the snr.
+      int32 snr = RandInt(opts_.min_snr, opts_.max_snr);
+      BaseFloat scale_factor = sqrt(source_energy / noise_energy / pow(10.0, snr / 20.0));
+      source.Row(i).AddVec(scale_factor, noise_signal);
+    }
+  }
+}
+
+// For the whole block, we use a uniform scale factor.
+void FvectorPerturbBlock::VolumePerturbationBlock(MatrixBase<BaseFloat>& block) {
+  BaseFloat factor = static_cast<BaseFloat>(
+      RandInt((int)((1-opts_.max_volume_variance)*100),
+              (int)((1+opts_.max_volume_variance)*100)) / 100.0);
+  block.Scale(factor);
+}
+
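+// Note on the row layout produced by ApplyPerturbationBlock above
+// (illustrative): if perturbed1 holds rows a0, a1 and perturbed2 holds rows
+// b0, b1, the composed output is a0, b0, a1, b1, so rows (0,1) and (2,3) are
+// same-source pairs -- exactly the (2k, 2k+1) convention that
+// cu::ComputeXvectorObjfFromScores assumes for same-class pairs.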
+// It is similar to FvectorPerturb::SpeedPerturbation(); it uses ArbitraryResample.
+// For the whole block, we use one uniform random speed factor.
+void FvectorPerturbBlock::SpeedPerturbationBlock(Matrix<BaseFloat>& block) {
+  //1. generate the speed perturb factor randomly (notice: the expected_length
+  //is always smaller than the original_length).
+  //(1) a = min{original_length/expected_length - 1, max-speed-perturb-rate}
+  //(2) the range of the factor is (1-a, 1+a)
+  BaseFloat boundary = std::min(static_cast<BaseFloat>(
+      (block.NumCols() * 1.0 / opts_.sample_frequency) * 1000
+      / opts_.expected_chunk_length - 1), opts_.max_speed_perturb_rate);
+  // calculate the speed factor
+  BaseFloat speed_factor = static_cast<BaseFloat>(RandInt(
+      (int)((1-boundary)*100), (int)((1+boundary)*100)) * 1.0 / 100.0);
+  MatrixIndexT output_dim = static_cast<MatrixIndexT>(ceil(block.NumCols() / speed_factor));
+  KALDI_ASSERT(output_dim >= opts_.expected_chunk_length * opts_.sample_frequency / 1000);
+
+  if (speed_factor == 1.0) {
+    // leave the block unchanged
+  } else {
+    int32 input_dim = block.NumCols();
+    Vector<BaseFloat> samp_points_secs(output_dim);
+    int32 num_zeros = 4;  // Number of zeros of the sinc function that the window extends out to.
+    // Lowpass frequency that's lower than 95% of the Nyquist.
+    BaseFloat filter_cutoff_hz = opts_.sample_frequency * 0.475;
+    for (int32 i = 0; i < output_dim; i++) {
+      samp_points_secs(i) = static_cast<BaseFloat>(speed_factor * i / opts_.sample_frequency);
+    }
+    ArbitraryResample time_resample(input_dim, opts_.sample_frequency,
+                                    filter_cutoff_hz,
+                                    samp_points_secs,
+                                    num_zeros);
+    Matrix<BaseFloat> tmp_block(block.NumRows(), output_dim);
+    time_resample.Resample(block, &tmp_block);
+    block.Resize(tmp_block.NumRows(), tmp_block.NumCols());
+    block.CopyFromMat(tmp_block);
+  }
+}
+
+// Choose a uniform random start_point in the range
+// [0, block.NumCols() - expected_chunk_length * sample_frequency / 1000], and
+// take the successive expected_chunk_length * sample_frequency / 1000 samples.
+void FvectorPerturbBlock::TimeShiftBlock(Matrix<BaseFloat>& block) {
+  int32 output_cols = static_cast<int32>(
+      opts_.expected_chunk_length * opts_.sample_frequency / 1000);
+  int32 output_rows = block.NumRows();
+  int32 start_point = static_cast<int32>(RandInt(0, block.NumCols() - output_cols));
+  Matrix<BaseFloat> tmp_block(output_rows, output_cols);
+  tmp_block.CopyFromMat(block.Range(0, output_rows, start_point, output_cols));
+  block.Resize(output_rows, output_cols);
+  block.CopyFromMat(tmp_block);
+}
+
+} // end of namespace kaldi
diff --git a/src/fvector/fvector-perturb.h b/src/fvector/fvector-perturb.h
new file mode 100644
index 00000000000..c7dc5674820
--- /dev/null
+++ b/src/fvector/fvector-perturb.h
@@ -0,0 +1,172 @@
+#ifndef KALDI_FVECTOR_PERTURB_H_
+#define KALDI_FVECTOR_PERTURB_H_
+
+#include <string>
+#include <vector>
+#include <algorithm>
+#include <cmath>
+
+#include "base/kaldi-error.h"
+#include "matrix/matrix-lib.h"
+#include "util/common-utils.h"
+
+#include "feat/resample.h"
+#include "matrix/matrix-functions.h"
+
+namespace kaldi {
+
+// Options class for distorting signals in egs.
+struct FvectorPerturbOptions {
+  BaseFloat sample_frequency;
+  BaseFloat expected_chunk_length;
+  BaseFloat max_speed_perturb_rate;
+  BaseFloat max_volume_variance;
+  BaseFloat max_snr;
+  BaseFloat min_snr;
+  bool volume_perturbation;
+  bool speed_perturbation;
+  bool time_shift;
+  BaseFloat add_noise;
+
+  FvectorPerturbOptions(): sample_frequency(16000),
+                           expected_chunk_length(100),
+                           max_speed_perturb_rate(0.1),
+                           max_volume_variance(0.03),
+                           max_snr(20),
+                           min_snr(0),
+                           volume_perturbation(true),
+                           speed_perturbation(true),
+                           time_shift(true),
+                           add_noise(0.85) { }
+
+  void Register(OptionsItf *opts) {
+    opts->Register("sample-frequency", &sample_frequency, "The sample frequency "
+                   "of the wav signal.");
+    opts->Register("expected-chunk-length", &expected_chunk_length, "The "
+                   "expected length of an output chunk, e.g. 100ms: the length "
+                   "of the output will correspond to 100ms. It also "
+                   "affects the speed perturbation: the speed factor is drawn "
+                   "from a range bounded by min{original-length/expected-length - 1, "
+                   "max-speed-perturb-rate}.");
+    opts->Register("max-speed-perturb-rate", &max_speed_perturb_rate,
+                   "Max speed perturbation applied to the matrix. It works together "
+                   "with expected-chunk-length. E.g. 0.1 means we generate the "
+                   "speed_factor randomly from the range (1-a, 1+a), where a = "
+                   "min{original_length/expected_length - 1, 0.1}.");
+    opts->Register("max-volume-variance", &max_volume_variance, "The volume "
+                   "is scaled by a factor drawn randomly from "
+                   "1-max-volume-variance to 1+max-volume-variance.");
+    opts->Register("max-snr", &max_snr, "Specify an upper bound on the Signal-to-Noise "
+                   "Ratio. We scale the noise according to the original signal and the "
+                   "SNR. Normally, it's a non-zero number between -30 and 30.");
+    opts->Register("min-snr", &min_snr, "Specify a lower bound on the Signal-to-Noise "
+                   "Ratio. We scale the noise according to the original signal and the "
+                   "SNR. Normally, it's a non-zero number between -30 and 30.");
+    opts->Register("volume-perturbation", &volume_perturbation, "If true, we will "
+                   "apply variations in volume.");
+    opts->Register("speed-perturbation", &speed_perturbation, "If true, we will "
+                   "apply variations in speed.");
+    opts->Register("time-shift", &time_shift, "If true, we will "
+                   "apply a time shift: randomly select the start point from the "
+                   "range [0, input.NumCols() - expected_chunk_length], and then "
+                   "take the successive 'expected_chunk_length' data. Otherwise, we take "
+                   "the data from the head.");
+    opts->Register("add-noise", &add_noise, "The probability of adding additive noise "
+                   "to a source chunk. E.g. 0.85 means we add noise with 85 percent "
+                   "probability, and leave the chunk as-is with 15 percent probability.");
+  }
+};
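+// Example invocation of a binary that registers these options (the flags are
+// as registered above; the archive names and values are illustrative only):
+//
+//   fvector-add-noise --sample-frequency=8000 --expected-chunk-length=100 \
+//       --max-speed-perturb-rate=0.1 --min-snr=0 --max-snr=20 --add-noise=0.85 \
+//       ark:chunks.ark ark:perturbed.ark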
+
+/* This class is used to apply up to four kinds of perturbation operations to
+ * an fvector chunk. According to the FvectorPerturbOptions, we choose which
+ * of them to apply.
+ * The input is always a Matrix which contains four rows (S1, S2, N1, N2),
+ * where S1 == S2. We then call the different perturbation methods. (For
+ * details, see the comments on FvectorPerturbOptions.)
+ * For details about the four kinds of perturbation operations, please see
+ * the documentation in fvector-perturb.cc.
+ */
+class FvectorPerturb {
+ public:
+  FvectorPerturb(FvectorPerturbOptions opts) { opts_ = opts; }
+  void ApplyPerturbation(const MatrixBase<BaseFloat>& input_chunk,
+                         Matrix<BaseFloat>* perturbed_chunk);
+
+  // Randomly generate 2 scale factors and scale each row respectively.
+  void VolumePerturbation(MatrixBase<BaseFloat>* chunk);
+
+  // Use ArbitraryResample. For each row, randomly generate a speed factor,
+  // then stretch the time axis. As the speed factor differs per row, we deal
+  // with each vector separately. The output dimension is larger than
+  // expected_chunk_length (ms).
+  void SpeedPerturbation(VectorBase<BaseFloat>& input_vector,
+                         BaseFloat samp_freq,
+                         BaseFloat speed_factor,
+                         VectorBase<BaseFloat>* output_vector);
+
+  // Randomly choose an expected_chunk_length (ms) segment.
+  void TimeShift(VectorBase<BaseFloat>& input_vector,
+                 VectorBase<BaseFloat>* output_vector);
+
+  // The input is a matrix containing four consecutive rows, (S1, S2, N1, N2).
+  // Each row is original_chunk_length (ms) long (e.g. 960 dims = 120ms).
+  // Adds N1 to S1 and N2 to S2 with random SNRs, each with probability
+  // probability_threshold. After that, only the first two rows are
+  // meaningful; they represent two perturbed signals from the same source
+  // waveform signal.
+  // After calling this function, you may need to resize the output.
+  // (Notice: Resize() belongs to Matrix<> rather than MatrixBase<>.)
+  void AddNoise(BaseFloat probability_threshold,
+                MatrixBase<BaseFloat>* chunk);
+
+ private:
+  FvectorPerturbOptions opts_;
+};
+
+
+/* This class applies up to four kinds of perturbation operations to an
+ * fvector chunk. According to the FvectorPerturbOptions, we choose which of
+ * them to apply.
+ * It is the block version: it processes a whole matrix at a time. Unlike
+ * class FvectorPerturb, this class applies the perturbation operations to its
+ * private members (perturbed1, perturbed2, noise1 and noise2).
+ * We call the different perturbation methods. (For details, see the comments
+ * on FvectorPerturbOptions.)
+ * For details about the four kinds of perturbation operations, please see
+ * the documentation in fvector-perturb.cc.
+ */
+class FvectorPerturbBlock {
+ public:
+  FvectorPerturbBlock(FvectorPerturbOptions opts,
+                      const MatrixBase<BaseFloat> &source,
+                      const MatrixBase<BaseFloat> &noise1,
+                      const MatrixBase<BaseFloat> &noise2) : opts_(opts),
+      perturbed1(source), perturbed2(source), noise1(noise1), noise2(noise2) {}
+
+  // The interface that applies the different perturbation operations. First,
+  // the function conducts the different perturbation operations; then it
+  // composes the final matrices, perturbed1 and perturbed2, together.
+  void ApplyPerturbationBlock(Matrix<BaseFloat>* perturbed_chunk);
+
+  // The input is two matrices: a source matrix (e.g. perturbed1) and a noise
+  // matrix (e.g. noise1). Each row is original_chunk_length (ms) long
+  // (e.g. 960 dims = 120ms). Adds noise row-by-row with a random SNR, with
+  // probability probability_threshold. After that, the source signal has been
+  // perturbed by the noise signal.
+  void AddNoiseBlock(BaseFloat probability_threshold,
+                     MatrixBase<BaseFloat>& source,
+                     MatrixBase<BaseFloat>& noise);
+
+  // Randomly generate a scale factor and scale the whole matrix.
+  void VolumePerturbationBlock(MatrixBase<BaseFloat>& block);
+
+  // Use ArbitraryResample. Generate one random speed factor for the whole
+  // matrix, then stretch the time axis. The output dimension is larger than
+  // expected_chunk_length (ms).
+  void SpeedPerturbationBlock(Matrix<BaseFloat>& block);
+
+  // Randomly choose an expected_chunk_length (ms) segment.
+ void TimeShiftBlock(Matrix& block); + + private: + FvectorPerturbOptions opts_; + Matrix perturbed1, perturbed2, noise1, noise2; + +}; + +} // end of namespace kaldi +#endif // KALDI_FVECTOR_PERTURB_H_ diff --git a/src/fvectorbin/Makefile b/src/fvectorbin/Makefile new file mode 100644 index 00000000000..ea54a572168 --- /dev/null +++ b/src/fvectorbin/Makefile @@ -0,0 +1,25 @@ + +all: + +EXTRA_CXXFLAGS = -Wno-sign-compare +include ../kaldi.mk + +BINFILES = fvector-add-noise fvector-chunk fvector-get-egs \ + fvector-chunk-block fvector-add-noise-block fvector-get-egs-block \ + compute-wav-to-rawmatrix fvector-debug-write-to-wav \ + fvector-debug-wav-to-vector fvector-debug-check-filter-bank \ + fvector-chunk-separate fvector-add-noise-separate + +OBJFILES = + + + +TESTFILES = + +ADDLIBS = ../lat/kaldi-lat.a ../lm/kaldi-lm.a ../fstext/kaldi-fstext.a \ + ../hmm/kaldi-hmm.a ../tree/kaldi-tree.a ../util/kaldi-util.a \ + ../matrix/kaldi-matrix.a ../fvector/kaldi-fvector.a \ + ../feat/kaldi-feat.a ../base/kaldi-base.a ../nnet3/kaldi-nnet3.a \ + ../cudamatrix/kaldi-cudamatrix.a + +include ../makefiles/default_rules.mk diff --git a/src/fvectorbin/compute-wav-to-rawmatrix.cc b/src/fvectorbin/compute-wav-to-rawmatrix.cc new file mode 100644 index 00000000000..6f4ba1b60c2 --- /dev/null +++ b/src/fvectorbin/compute-wav-to-rawmatrix.cc @@ -0,0 +1,123 @@ +// featbin/compute-mfcc-feats.cc + +// Copyright 2009-2012 Microsoft Corporation +// Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "feat/feature-mfcc.h" +#include "feat/wave-reader.h" + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + const char *usage = + "Convert wav to rawmatrix.\n" + "Usage: compute-wav-to-rawmatrix [options...] \n"; + + // construct all the global objects + ParseOptions po(usage); + FrameExtractionOptions extraction_opts; + int32 channel = -1; + BaseFloat min_duration = 0.0; + // Register the MFCC option struct + extraction_opts.Register(&po); + + // Register the options + po.Register("channel", &channel, "Channel to extract (-1 -> expect mono, " + "0 -> left, 1 -> right)"); + po.Register("min-duration", &min_duration, "Minimum duration of segments " + "to process (in seconds)."); + + po.Read(argc, argv); + + if (po.NumArgs() != 2) { + po.PrintUsage(); + exit(1); + } + + std::string wav_rspecifier = po.GetArg(1); + std::string output_wspecifier = po.GetArg(2); + + SequentialTableReader reader(wav_rspecifier); + BaseFloatMatrixWriter kaldi_writer; // typedef to TableWriter. 
+ + if (!kaldi_writer.Open(output_wspecifier)) { + KALDI_ERR << "Could not initialize output with wspecifier " + << output_wspecifier; + } + + int32 num_utts = 0, num_success = 0; + for (; !reader.Done(); reader.Next()) { + num_utts++; + std::string utt = reader.Key(); + const WaveData &wave_data = reader.Value(); + if (wave_data.Duration() < min_duration) { + KALDI_WARN << "File: " << utt << " is too short (" + << wave_data.Duration() << " sec): producing no output."; + continue; + } + int32 num_chan = wave_data.Data().NumRows(), this_chan = channel; + { // This block works out the channel (0=left, 1=right...) + KALDI_ASSERT(num_chan > 0); // should have been caught in + // reading code if no channels. + if (channel == -1) { + this_chan = 0; + if (num_chan != 1) + KALDI_WARN << "Channel not specified but you have data with " + << num_chan << " channels; defaulting to zero"; + } else { + if (this_chan >= num_chan) { + KALDI_WARN << "File with id " << utt << " has " + << num_chan << " channels but you specified channel " + << channel << ", producing no output."; + continue; + } + } + } + SubVector waveform(wave_data.Data(), this_chan); + Matrix features; + try { + int32 rows_out = NumFrames(waveform.Dim(), extraction_opts); + int32 cols_out = extraction_opts.WindowSize(); + features.Resize(rows_out, cols_out); + for (int32 i = 0; i < rows_out; i++) { + features.CopyRowFromVec( + SubVector(waveform, i*extraction_opts.WindowShift(), + extraction_opts.WindowSize()), i); + } + } catch (...) { + KALDI_WARN << "Failed to compute features for utterance " + << utt; + continue; + } + kaldi_writer.Write(utt, features); + if (num_utts % 10 == 0) + KALDI_LOG << "Processed " << num_utts << " utterances"; + KALDI_VLOG(2) << "Processed features for key " << utt; + num_success++; + } + KALDI_LOG << " Done " << num_success << " out of " << num_utts + << " utterances."; + return (num_success != 0 ? 0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} + diff --git a/src/fvectorbin/fvector-add-noise-block.cc b/src/fvectorbin/fvector-add-noise-block.cc new file mode 100644 index 00000000000..f59babf3078 --- /dev/null +++ b/src/fvectorbin/fvector-add-noise-block.cc @@ -0,0 +1,61 @@ +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "fvector/fvector-perturb.h" + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + const char *usage = + "Perturb the chunk data. Each time the input is a source chunk block and\n" + "two noise chunk block. The two noise blocks are added to source block separately,\n" + "and then we maybe do volume perturbate, speed perturb or time shift.\n" + "At last, the output is a matrix. Each two consecutive rows of the matrix\n" + "come from same source wave, but were used different perturbation method.\n" + "Usage: fvector-add-noise-block [options...] 
" + " \n"; + + // construct all the global objects + ParseOptions po(usage); + FvectorPerturbOptions perturb_opts; + perturb_opts.Register(&po); + + po.Read(argc, argv); + if (po.NumArgs() != 4) { + po.PrintUsage(); + exit(1); + } + + std::string source_chunk_rspecifier = po.GetArg(1), + noise1_chunk_rspecifier = po.GetArg(2), + noise2_chunk_rspecifier = po.GetArg(3), + perturbed_chunk_rspecifier = po.GetArg(4); + + SequentialBaseFloatMatrixReader source_chunk_reader(source_chunk_rspecifier); + RandomAccessBaseFloatMatrixReader noise1_chunk_reader(noise1_chunk_rspecifier); + RandomAccessBaseFloatMatrixReader noise2_chunk_reader(noise2_chunk_rspecifier); + BaseFloatMatrixWriter perturbed_chunk_writer(perturbed_chunk_rspecifier); + + int64 num_read = 0, num_written = 0; + for (; !source_chunk_reader.Done(); source_chunk_reader.Next(), num_read++) { + std::string key = source_chunk_reader.Key(); + // get source and 2 noise matrices. + const Matrix &source_input = source_chunk_reader.Value(); + const Matrix &noise1_input = noise1_chunk_reader.Value(key); + const Matrix &noise2_input = noise2_chunk_reader.Value(key); + + // the class FvectorPerturbBlock conduct the different perturb operation. + FvectorPerturbBlock perturb_fvector_block(perturb_opts, source_input, + noise1_input, noise2_input); + Matrix perturbed_chunk; + perturb_fvector_block.ApplyPerturbationBlock(&perturbed_chunk); + perturbed_chunk_writer.Write(key, perturbed_chunk); + num_written++; + } + KALDI_LOG << " Done " << num_written << " out of " << num_read + << " utterances."; + return (num_written != 0 ? 0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/src/fvectorbin/fvector-add-noise-separate.cc b/src/fvectorbin/fvector-add-noise-separate.cc new file mode 100644 index 00000000000..3d0652ef01e --- /dev/null +++ b/src/fvectorbin/fvector-add-noise-separate.cc @@ -0,0 +1,72 @@ +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "fvector/fvector-perturb.h" + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + const char *usage = + "Perturb the chunk data. We read in one source chunk and two noise chunks separately\n" + "According to the setup, use (0-4) kinds of perturbation opertation, and then each output chunk \n" + "is a 2 consecutive rows of output matrix.\n" + "The two rows come from the same source wavform signal, but now they are different.\n" + "Usage: fvector-add-noise [options...] \n"; + + // construct all the global objects + ParseOptions po(usage); + FvectorPerturbOptions perturb_opts; + perturb_opts.Register(&po); + + po.Read(argc, argv); + if (po.NumArgs() != 3) { + po.PrintUsage(); + exit(1); + } + + std::string chunk_rspecifier = po.GetArg(1), + noise_chunk_rspecifier = po.GetArg(2), + perturbed_chunk_rspecifier = po.GetArg(3); + + SequentialBaseFloatVectorReader chunk_reader(chunk_rspecifier); + SequentialBaseFloatVectorReader noise_chunk_reader(noise_chunk_rspecifier); + BaseFloatMatrixWriter perturbed_chunk_writer(perturbed_chunk_rspecifier); + + int64 num_read = 0, num_written = 0; + for (; !chunk_reader.Done(); chunk_reader.Next(), num_read++) { + // Read 2 noise chunks + if (noise_chunk_reader.Done()) { + KALDI_ERR << "Noise chunk is too short to enough"; + } + const Vector noise1_chunk(noise_chunk_reader.Value()); + noise_chunk_reader.Next(); + const Vector noise2_chunk(noise_chunk_reader.Value()); + noise_chunk_reader.Next(); + + std::string key = chunk_reader.Key(); + // input_chunk has 3 lines. 
+ const Vector &input_chunk = chunk_reader.Value(); + // whole_chunk has 4 lines, it copies the first line and will be operate. + Matrix whole_chunk(4, input_chunk.Dim()); + // For here, we copy the first line. So in the "whole_chunk" the first + // two lines come from the same source wavform signal. And the third/forth + // line is the random noise. + whole_chunk.CopyRowFromVec(input_chunk, 0); + whole_chunk.CopyRowFromVec(input_chunk, 1); + whole_chunk.CopyRowFromVec(noise1_chunk, 2); + whole_chunk.CopyRowFromVec(noise2_chunk, 3); + Matrix perturbed_chunk; + + // the class FvectorPerturb conduct the different perturb operation. + FvectorPerturb perturb_fvector(perturb_opts); + perturb_fvector.ApplyPerturbation(whole_chunk, &perturbed_chunk); + perturbed_chunk_writer.Write(key, perturbed_chunk); + num_written++; + } + KALDI_LOG << " Done " << num_written << " out of " << num_read + << " utterances."; + return (num_written != 0 ? 0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/src/fvectorbin/fvector-add-noise.cc b/src/fvectorbin/fvector-add-noise.cc new file mode 100644 index 00000000000..be5438fea7f --- /dev/null +++ b/src/fvectorbin/fvector-add-noise.cc @@ -0,0 +1,59 @@ +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "fvector/fvector-perturb.h" + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + const char *usage = + "Perturb the chunk data. Each input chunk is a four consecutive rows matrix(S1, S2, N1, N2).\n" + "According to the setup, use (0-4) kinds of perturbation opertation, and then each output chunk \n" + "is a 2 consecutive rows of output matrix.\n" + "The two rows come from the same source wavform signal, but now they are different.\n" + "Usage: fvector-add-noise [options...] \n"; + + // construct all the global objects + ParseOptions po(usage); + FvectorPerturbOptions perturb_opts; + perturb_opts.Register(&po); + + po.Read(argc, argv); + if (po.NumArgs() != 2) { + po.PrintUsage(); + exit(1); + } + + std::string chunk_rspecifier = po.GetArg(1), + perturbed_chunk_rspecifier = po.GetArg(2); + + SequentialBaseFloatMatrixReader chunk_reader(chunk_rspecifier); + BaseFloatMatrixWriter perturbed_chunk_writer(perturbed_chunk_rspecifier); + + int64 num_read = 0, num_written = 0; + for (; !chunk_reader.Done(); chunk_reader.Next(), num_read++) { + std::string key = chunk_reader.Key(); + // input_chunk has 3 lines. + const Matrix &input_chunk = chunk_reader.Value(); + // whole_chunk has 4 lines, it copies the first line and will be operate. + Matrix whole_chunk(4, input_chunk.NumCols()); + // For here, we copy the first line. So in the "whole_chunk" the first + // two lines come from the same source wavform signal. And the third/forth + // line is the random noise. + MatrixIndexT indices[4] = {0, 0, 1, 2}; + whole_chunk.CopyRows(input_chunk, indices); + Matrix perturbed_chunk; + + // the class FvectorPerturb conduct the different perturb operation. + FvectorPerturb perturb_fvector(perturb_opts); + perturb_fvector.ApplyPerturbation(whole_chunk, &perturbed_chunk); + perturbed_chunk_writer.Write(key, perturbed_chunk); + num_written++; + } + KALDI_LOG << " Done " << num_written << " out of " << num_read + << " utterances."; + return (num_written != 0 ? 
0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/src/fvectorbin/fvector-chunk-block.cc b/src/fvectorbin/fvector-chunk-block.cc new file mode 100644 index 00000000000..ab8daa93f0b --- /dev/null +++ b/src/fvectorbin/fvector-chunk-block.cc @@ -0,0 +1,212 @@ +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "feat/wave-reader.h" + +namespace kaldi { +//Randomly select two groups (uttid, startpoint) from noise list, respectively. +//The length of selected utterance is bigger than chunk_size, which guarantees +//we can get a complete noise chunk. At the same time, the startpoint is randomly +//selected from [0, len(utt)-chunk_size]. +void RandomSelectTwoNoiseUtt(const std::vector>& utt2dur_list, + const int32& utt2dur_len, + const int32& chunk_size, + std::vector>* output) { + for(int32 index = 0; index < 2; ++index) { + int32 r_index = -1; + do { + // r_index indicate the random index of utt2dur_list + r_index = RandInt(0, utt2dur_len-1); + } while (utt2dur_list[r_index].second > chunk_size); + // random number in [0, utt2dur] + float start_point = RandInt(0, (int)(utt2dur_list[r_index].second)*100) * 1.0 / 100; + output->push_back(std::make_pair(utt2dur_list[r_index].first, start_point)); + } + KALDI_ASSERT(output->size() == 2); +} + + +} //The end of namespace kaldi + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + const char *usage = + "Get the data chunks. We sequentially read the wav files. And cut them\n" + "into 'chunk_size' length fragment. And we randomly select two 'chunk_size'\n" + "length fragments from noise-list. Then we the store the 'source' chunk and\n" + "'noise' chunks into the corresponding matrix separately." + "Usage: fvector-chunk [options...] " + " \n"; + + // construct all the global objects + ParseOptions po(usage); + int32 chunk_size = 120; + int32 channel = -1; + int32 shift_time = 60; + BaseFloat min_duration = 0.0; + int32 srand_seed = 1; + int32 block_size = 32; + BaseFloat samp_freq = 8000; + + po.Register("channel", &channel, "Channel to extract (-1 -> expect mono, " + "0 -> left, 1 -> right)"); + po.Register("chunk-size", &chunk_size, "The expected length of the chunk."); + po.Register("shift-time", &shift_time, "Time shift, which decide the overlap " + "of two adjacent chunks in the same utterance."); + po.Register("min-duration", &min_duration, "Minimum duration of segments " + "to process (in seconds)."); + po.Register("srand", &srand_seed, "Seed for random number generator."); + po.Register("block-size",&block_size, "Specify the number of lines of feature " + "block; the number of lines of noise block will be twice."); + po.Register("sample-frequency", &samp_freq, "Specify the sample frequency. " + "(default=8000)"); + + po.Read(argc, argv); + + if (po.NumArgs() != 6) { + po.PrintUsage(); + exit(1); + } + + srand(srand_seed); + + std::string wav_rspecifier = po.GetArg(1); + std::string noise_rspecifier = po.GetArg(2); + std::string utt2dur_rxfilename = po.GetArg(3); + std::string output_feature_wspecifier = po.GetArg(4); + std::string output_noise1_wspecifier = po.GetArg(5); + std::string output_noise2_wspecifier = po.GetArg(6); + + + SequentialTableReader reader(wav_rspecifier); + RandomAccessTableReader noise_reader(noise_rspecifier); + Input ki(utt2dur_rxfilename); + BaseFloatMatrixWriter feature_writer; // typedef to TableWriter. 
+ BaseFloatMatrixWriter noise1_writer; + BaseFloatMatrixWriter noise2_writer; + + //Read the utt2dur file + //the vector--utt2dur is used to randomly select the noise chunk. + std::vector> utt2dur; + std::string line; + while (std::getline(ki.Stream(), line)) { + std::vector split_line; + // Split the line by space or tab and check the number of fields in each + // line. There must be 2 fields--segment utt_id and duration + SplitStringToVector(line, " \t\r", true, &split_line); + if (split_line.size() != 2) { + KALDI_WARN << "Invalid line in segments file: " << line; + continue; + } + std::string utt = split_line[0], + duration_str = split_line[1]; + + double duration; + if (!ConvertStringToReal(duration_str, &duration)) { + KALDI_WARN << "Invalid line in utt2dur file: " << line; + continue; + } + utt2dur.push_back(std::make_pair(utt, duration)); + } + //random number in [0, utt2dur_len), so we get variable "utt2dur_len" + int32 utt2dur_len = utt2dur.size(); + + // Start to chunk the data, each source chunk and 2 corresponding noise + // chunks were store into corresping block matrix. When counter == block_size, + // write one source block and two noise blocks. + int32 num_utts = 0, num_success = 0; + int32 counter = 0; + int32 dim = static_cast(samp_freq * chunk_size / 1000); + Matrix feature_block(block_size, dim), + noise_block1(block_size, dim), + noise_block2(block_size, dim); + + for (; !reader.Done(); reader.Next()) { + num_utts++; + std::string utt = reader.Key(); + const WaveData &wave_data = reader.Value(); + if (wave_data.Duration() < min_duration) { + KALDI_WARN << "File: " << utt << " is too short (" + << wave_data.Duration() << " sec): producing no output."; + continue; + } + int32 num_chan = wave_data.Data().NumRows(), this_chan = channel; + { // This block works out the channel (0=left, 1=right...) + KALDI_ASSERT(num_chan > 0); // should have been caught in + // reading code if no channels. + if (channel == -1) { + this_chan = 0; + if (num_chan != 1) + KALDI_WARN << "Channel not specified but you have data with " + << num_chan << " channels; defaulting to zero"; + } else { + if (this_chan >= num_chan) { + KALDI_WARN << "File with id " << utt << " has " + << num_chan << " channels but you specified channel " + << channel << ", producing no output."; + continue; + } + } + } + + KALDI_ASSERT(wave_data.SampFreq() == samp_freq); + SubVector waveform(wave_data.Data(), this_chan); + //e.g. A "waveform" is 285ms, chunk_size is 120ms, shift_time is 70ms. At last, the chunks + //will be 0-120ms, 70-190ms, 140-260ms. So num_chunk = 3 + int32 num_chunk = (int)((waveform.Dim() / wave_data.SampFreq() - chunk_size ) / shift_time) + 1; + try { + for (int32 index = 0; index < num_chunk; ++index) { + int32 source_start = wave_data.SampFreq() * (index * shift_time); + feature_block.CopyRowFromVec(SubVector(waveform, source_start, dim), counter); + //1. Generate 2 random number form [0, utt2dur_len) + //2. From vector utt2dur, get the 2 pairs + //3. Generate 2 random "start point" number from [0, utt2dur[x][1]) + //The three steps is implemented by function--"RandomSelectTwoNoiseUtt" + //The output vector, "two_random_uttid", contains two pairs. For each + //pair, its content is + std::vector> two_random_uttid; + RandomSelectTwoNoiseUtt(utt2dur, utt2dur_len, chunk_size/1000, + &two_random_uttid); + //4. According to the utt2dur[x][0]--utt_id and startpoint form RandomAccessTable + // read noise chunk. + //5. The features matrix has 3 lines: source, nosie1, noise2. 
+ const WaveData &noise_wav1 = noise_reader.Value(two_random_uttid[0].first); + KALDI_ASSERT(wave_data.SampFreq() == noise_wav1.SampFreq()); + SubVector noise1(noise_wav1.Data(), 0); + noise_block1.CopyRowFromVec(SubVector(noise1, two_random_uttid[0].second, dim), counter); + + const WaveData &noise_wav2 = noise_reader.Value(two_random_uttid[1].first); + KALDI_ASSERT(wave_data.SampFreq() == noise_wav2.SampFreq()); + SubVector noise2(noise_wav2.Data(), 0); + noise_block2.CopyRowFromVec(SubVector(noise2, two_random_uttid[1].second, dim), counter); + counter++; + + // when "counter == block_size", store the matrices. + if (counter == block_size) { + std::ostringstream utt_id_new; + utt_id_new << utt << '_' << index; + feature_writer.Write(utt_id_new.str(), feature_block); + noise1_writer.Write(utt_id_new.str(), noise_block1); + noise2_writer.Write(utt_id_new.str(), noise_block2); + counter = 0; + } + } + } catch (...) { + KALDI_WARN << "Failed to compute features for utterance " + << utt; + continue; + } + + if (num_utts % 10 == 0) + KALDI_LOG << "Processed " << num_utts << " utterances"; + KALDI_VLOG(2) << "Processed features for key " << utt; + num_success++; + } + KALDI_LOG << " Done " << num_success << " out of " << num_utts + << " utterances."; + return (num_success != 0 ? 0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/src/fvectorbin/fvector-chunk-separate.cc b/src/fvectorbin/fvector-chunk-separate.cc new file mode 100644 index 00000000000..f9a00f880f3 --- /dev/null +++ b/src/fvectorbin/fvector-chunk-separate.cc @@ -0,0 +1,207 @@ +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "feat/wave-reader.h" + +namespace kaldi { +//Randomly select two groups (uttid, startpoint) from noise list, respectively. +//The length of selected utterance is bigger than chunk_size, which guarantees +//we can get a complete noise chunk. At the same time, the startpoint is randomly +//selected from [0, len(utt)-chunk_size]. +void RandomSelectTwoNoiseUtt(const std::vector>& utt2dur_list, + const int32& utt2dur_len, + const BaseFloat& sample_frequency, + const BaseFloat& chunk_size, + std::vector>* output) { + for(int32 index = 0; index < 2; ++index) { + int32 r_index = -1; + do { + // r_index indicate the random index of utt2dur_list + r_index = RandInt(0, utt2dur_len-1); + } while (utt2dur_list[r_index].second < chunk_size); + // random number in [0, utt2dur] + int boundary = (int)((utt2dur_list[r_index].second - chunk_size) * 1000); + float start_point = RandInt(0, boundary); + output->push_back(std::make_pair(utt2dur_list[r_index].first, start_point)); + } + KALDI_ASSERT(output->size() == 2); +} + + +} //The end of namespace kaldi + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + const char *usage = + "Get the data chunks. We sequentially read the wav files. And cut them\n" + "into 'chunk_size' length fragment. And we randomly select two 'chunk_size'\n" + "length fragments from noise-list. We call the three chunks (S1,N1,N2)\n" + "separately. Then we store the S1 into and sotre the\n" + "(N1,N2) into separately\n" + "Usage: fvector-chunk-separate [options...] 
" + " \n"; + + // construct all the global objects + ParseOptions po(usage); + int32 chunk_size = 120; + int32 channel = -1; + int32 shift_time = 60; + BaseFloat min_duration = 0.0; + int32 srand_seed = 1; + + po.Register("channel", &channel, "Channel to extract (-1 -> expect mono, " + "0 -> left, 1 -> right)"); + po.Register("chunk-size", &chunk_size, "The expected length of the chunk."); + po.Register("shift-time", &shift_time, "Time shift, which decide the overlap " + "of two adjacent chunks in the same utterance."); + po.Register("min-duration", &min_duration, "Minimum duration of segments " + "to process (in seconds)."); + po.Register("srand", &srand_seed, "Seed for random number generator."); + + po.Read(argc, argv); + + if (po.NumArgs() != 5) { + po.PrintUsage(); + exit(1); + } + + srand(srand_seed); + + std::string wav_rspecifier = po.GetArg(1); + std::string noise_rspecifier = po.GetArg(2); + std::string utt2dur_rxfilename = po.GetArg(3); + std::string output_wspecifier = po.GetArg(4); + std::string noise_wspecifier = po.GetArg(5); + + + SequentialTableReader reader(wav_rspecifier); + RandomAccessTableReader noise_reader(noise_rspecifier); + Input ki(utt2dur_rxfilename); + BaseFloatVectorWriter source_writer; // typedef to TableWriter. + BaseFloatVectorWriter noise_writer; + + if (!source_writer.Open(output_wspecifier)) { + KALDI_ERR << "Could not initialize output with wspecifier " + << output_wspecifier; + } + if (!noise_writer.Open(noise_wspecifier)) { + KALDI_ERR << "Could not initialize output with wspecifier " + << noise_wspecifier; + } + //Read the utt2dur file + //the vector--utt2dur is used to randomly select the noise chunk. + std::vector> utt2dur; + std::string line; + while (std::getline(ki.Stream(), line)) { + std::vector split_line; + // Split the line by space or tab and check the number of fields in each + // line. There must be 2 fields--segment utt_id and duration + SplitStringToVector(line, " \t\r", true, &split_line); + if (split_line.size() != 2) { + KALDI_WARN << "Invalid line in segments file: " << line; + continue; + } + std::string utt = split_line[0], + duration_str = split_line[1]; + + double duration; + if (!ConvertStringToReal(duration_str, &duration)) { + KALDI_WARN << "Invalid line in utt2dur file: " << line; + continue; + } + utt2dur.push_back(std::make_pair(utt, duration)); + } + //random number in [0, utt2dur_len), so we get variable "utt2dur_len" + int32 utt2dur_len = utt2dur.size(); + + // Start to chunk the data, compose 1 source chunk and 2 noise chunks into + // a matrix. + int32 num_utts = 0, num_success = 0; + for (; !reader.Done(); reader.Next()) { + num_utts++; + std::string utt = reader.Key(); + const WaveData &wave_data = reader.Value(); + if (wave_data.Duration() < min_duration) { + KALDI_WARN << "File: " << utt << " is too short (" + << wave_data.Duration() << " sec): producing no output."; + continue; + } + int32 num_chan = wave_data.Data().NumRows(), this_chan = channel; + { // This block works out the channel (0=left, 1=right...) + KALDI_ASSERT(num_chan > 0); // should have been caught in + // reading code if no channels. 
+ if (channel == -1) { + this_chan = 0; + if (num_chan != 1) + KALDI_WARN << "Channel not specified but you have data with " + << num_chan << " channels; defaulting to zero"; + } else { + if (this_chan >= num_chan) { + KALDI_WARN << "File with id " << utt << " has " + << num_chan << " channels but you specified channel " + << channel << ", producing no output."; + continue; + } + } + } + SubVector waveform(wave_data.Data(), this_chan); + //e.g. A "waveform" is 285ms, chunk_size is 120ms, shift_time is 70ms. At last, the chunks + //will be 0-120ms, 70-190ms, 140-260ms. So num_chunk = 3 + int32 num_chunk = (int)(((waveform.Dim() * 1.0 / wave_data.SampFreq()) * 1000 - chunk_size ) / shift_time) + 1; + int32 dim = wave_data.SampFreq() * chunk_size / 1000; + try { + for (int32 index = 0; index < num_chunk; ++index) { + Matrix features(3, dim); + int32 source_start = static_cast(wave_data.SampFreq() * (index * shift_time / 1000.0)); + features.CopyRowFromVec(SubVector(waveform, source_start, dim), 0); + //1. Generate 2 random number form [0, utt2dur_len) + //2. From vector utt2dur, get the 2 pairs + //3. Generate 2 random "start point" number from [0, utt2dur[x][1]) + //The three steps is implemented by function--"RandomSelectTwoNoiseUtt" + //The output vector, "two_random_uttid", contains two pairs. For each + //pair, its content is + std::vector> two_random_uttid; + RandomSelectTwoNoiseUtt(utt2dur, utt2dur_len, wave_data.SampFreq(), chunk_size/1000.0, + &two_random_uttid); + //4. According to the utt2dur[x][0]--utt_id and startpoint form RandomAccessTable + // read noise chunk. + //5. The features matrix has 3 lines: source, nosie1, noise2. + const WaveData &noise_wav1 = noise_reader.Value(two_random_uttid[0].first); + KALDI_ASSERT(wave_data.SampFreq() == noise_wav1.SampFreq()); + SubVector noise1(noise_wav1.Data(), 0); + features.CopyRowFromVec(SubVector(noise1, two_random_uttid[0].second, dim), 1); + + const WaveData &noise_wav2 = noise_reader.Value(two_random_uttid[1].first); + KALDI_ASSERT(wave_data.SampFreq() == noise_wav2.SampFreq()); + SubVector noise2(noise_wav2.Data(), 0); + features.CopyRowFromVec(SubVector(noise2, two_random_uttid[1].second, dim), 2); + + std::ostringstream source_id; + source_id << utt << '_' << index << "_s"; + source_writer.Write(source_id.str(), Vector(SubVector(features, 0))); + std::ostringstream noise1_id; + noise1_id << utt << '_' << index << "_n1"; + noise_writer.Write(noise1_id.str(), Vector(SubVector(features, 1))); + std::ostringstream noise2_id; + noise2_id << utt << '_' << index << "_n2"; + noise_writer.Write(noise2_id.str(), Vector(SubVector(features, 2))); + } + } catch (...) { + KALDI_WARN << "Failed to compute features for utterance " + << utt; + continue; + } + + if (num_utts % 10 == 0) + KALDI_LOG << "Processed " << num_utts << " utterances"; + KALDI_VLOG(2) << "Processed features for key " << utt; + num_success++; + } + KALDI_LOG << " Done " << num_success << " out of " << num_utts + << " utterances."; + return (num_success != 0 ? 0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/src/fvectorbin/fvector-chunk.cc b/src/fvectorbin/fvector-chunk.cc new file mode 100644 index 00000000000..ad280cf9577 --- /dev/null +++ b/src/fvectorbin/fvector-chunk.cc @@ -0,0 +1,195 @@ +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "feat/wave-reader.h" + +namespace kaldi { +//Randomly select two groups (uttid, startpoint) from noise list, respectively. 
+// Each selected utterance is longer than chunk_size, which guarantees that a
+// complete noise chunk can be cut from it. The start point is chosen
+// uniformly from [0, len(utt) - chunk_size].
+void RandomSelectTwoNoiseUtt(
+    const std::vector<std::pair<std::string, BaseFloat> > &utt2dur_list,
+    const int32 &utt2dur_len,
+    const BaseFloat &sample_frequency,
+    const BaseFloat &chunk_size,
+    std::vector<std::pair<std::string, int32> > *output) {
+  for (int32 index = 0; index < 2; ++index) {
+    int32 r_index = -1;
+    do {
+      // r_index is a random index into utt2dur_list.
+      r_index = RandInt(0, utt2dur_len - 1);
+    } while (utt2dur_list[r_index].second < chunk_size);
+    // Random start point (in samples) in [0, dur - chunk_size].
+    int32 boundary = static_cast<int32>(
+        (utt2dur_list[r_index].second - chunk_size) * sample_frequency);
+    int32 start_point = RandInt(0, boundary);
+    output->push_back(std::make_pair(utt2dur_list[r_index].first, start_point));
+  }
+  KALDI_ASSERT(output->size() == 2);
+}
+
+
+}  // The end of namespace kaldi.
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    const char *usage =
+        "Get the data chunks. We sequentially read the wav files and cut them\n"
+        "into 'chunk_size'-length fragments; we also randomly select two\n"
+        "'chunk_size'-length fragments from the noise list. Then we compose\n"
+        "the three vectors into a matrix, which we call a 'chunk'. So each\n"
+        "item in the output file is a matrix with 3 rows (S1, N1, N2).\n"
+        "Usage: fvector-chunk [options...] <wav-rspecifier> <noise-rspecifier>\n"
+        "<utt2dur-rxfilename> <chunks-wspecifier>\n";
+
+    // Construct all the global objects.
+    ParseOptions po(usage);
+    int32 chunk_size = 120;
+    int32 channel = -1;
+    int32 shift_time = 60;
+    BaseFloat min_duration = 0.0;
+    int32 srand_seed = 1;
+
+    po.Register("channel", &channel, "Channel to extract (-1 -> expect mono, "
+                "0 -> left, 1 -> right)");
+    po.Register("chunk-size", &chunk_size, "The expected length of the chunk.");
+    po.Register("shift-time", &shift_time, "Time shift, which decides the "
+                "overlap of two adjacent chunks in the same utterance.");
+    po.Register("min-duration", &min_duration, "Minimum duration of segments "
+                "to process (in seconds).");
+    po.Register("srand", &srand_seed, "Seed for random number generator.");
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 4) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    srand(srand_seed);
+
+    std::string wav_rspecifier = po.GetArg(1);
+    std::string noise_rspecifier = po.GetArg(2);
+    std::string utt2dur_rxfilename = po.GetArg(3);
+    std::string output_wspecifier = po.GetArg(4);
+
+    SequentialTableReader<WaveHolder> reader(wav_rspecifier);
+    RandomAccessTableReader<WaveHolder> noise_reader(noise_rspecifier);
+    Input ki(utt2dur_rxfilename);
+    BaseFloatMatrixWriter kaldi_writer;  // typedef to TableWriter.
+
+    if (!kaldi_writer.Open(output_wspecifier)) {
+      KALDI_ERR << "Could not initialize output with wspecifier "
+                << output_wspecifier;
+    }
+    // Read the utt2dur file; the vector utt2dur is used to randomly select
+    // the noise chunks.
+    std::vector<std::pair<std::string, BaseFloat> > utt2dur;
+    std::string line;
+    while (std::getline(ki.Stream(), line)) {
+      std::vector<std::string> split_line;
+      // Split the line by space or tab and check the number of fields in
+      // each line: there must be 2 fields, the utterance id and its duration.
+      SplitStringToVector(line, " \t\r", true, &split_line);
+      if (split_line.size() != 2) {
+        KALDI_WARN << "Invalid line in utt2dur file: " << line;
+        continue;
+      }
+      std::string utt = split_line[0],
+          duration_str = split_line[1];
+
+      double duration;
+      if (!ConvertStringToReal(duration_str, &duration)) {
+        KALDI_WARN << "Invalid line in utt2dur file: " << line;
+        continue;
+      }
+      utt2dur.push_back(std::make_pair(utt, static_cast<BaseFloat>(duration)));
+    }
+    // We will draw random indexes in [0, utt2dur_len), so cache the size.
+    int32 utt2dur_len = utt2dur.size();
+
+    // Start to chunk the data; compose 1 source chunk and 2 noise chunks
+    // into a matrix.
+    int32 num_utts = 0, num_success = 0;
+    for (; !reader.Done(); reader.Next()) {
+      num_utts++;
+      std::string utt = reader.Key();
+      const WaveData &wave_data = reader.Value();
+      if (wave_data.Duration() < min_duration) {
+        KALDI_WARN << "File: " << utt << " is too short ("
+                   << wave_data.Duration() << " sec): producing no output.";
+        continue;
+      }
+      int32 num_chan = wave_data.Data().NumRows(), this_chan = channel;
+      {  // This block works out the channel (0=left, 1=right...)
+        KALDI_ASSERT(num_chan > 0);  // should have been caught in
+                                     // reading code if no channels.
+        if (channel == -1) {
+          this_chan = 0;
+          if (num_chan != 1)
+            KALDI_WARN << "Channel not specified but you have data with "
+                       << num_chan << " channels; defaulting to zero";
+        } else {
+          if (this_chan >= num_chan) {
+            KALDI_WARN << "File with id " << utt << " has "
+                       << num_chan << " channels but you specified channel "
+                       << channel << ", producing no output.";
+            continue;
+          }
+        }
+      }
+      SubVector<BaseFloat> waveform(wave_data.Data(), this_chan);
+      // E.g. if a waveform is 285 ms, chunk_size is 120 ms and shift_time is
+      // 70 ms, the chunks will be 0-120 ms, 70-190 ms and 140-260 ms, so
+      // num_chunk = 3.
+      int32 num_chunk = static_cast<int32>(
+          ((waveform.Dim() * 1.0 / wave_data.SampFreq()) * 1000 - chunk_size)
+          / shift_time) + 1;
+      int32 dim = wave_data.SampFreq() * chunk_size / 1000;
+      try {
+        for (int32 index = 0; index < num_chunk; ++index) {
+          Matrix<BaseFloat> features(3, dim);
+          int32 source_start = static_cast<int32>(
+              wave_data.SampFreq() * (index * shift_time / 1000.0));
+          features.CopyRowFromVec(
+              SubVector<BaseFloat>(waveform, source_start, dim), 0);
+          // 1. Generate 2 random numbers from [0, utt2dur_len).
+          // 2. Look up the corresponding 2 pairs in the vector utt2dur.
+          // 3. Generate 2 random start points from
+          //    [0, utt2dur[x][1] - chunk_size].
+          // These three steps are implemented by RandomSelectTwoNoiseUtt().
+          // The output vector "two_random_uttid" contains two pairs; each
+          // pair is (utt_id, start point in samples).
+          std::vector<std::pair<std::string, int32> > two_random_uttid;
+          RandomSelectTwoNoiseUtt(utt2dur, utt2dur_len, wave_data.SampFreq(),
+                                  chunk_size / 1000.0, &two_random_uttid);
+          // 4. Using utt2dur[x][0] (the utt_id) and the start point, read the
+          //    noise chunk from the RandomAccessTableReader.
+          // 5. The features matrix has 3 rows: source, noise1, noise2.
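+          // A concrete example (hypothetical numbers): with
+          //   utt2dur = { ("noise_a", 3.2), ("noise_b", 0.09) }
+          // and chunk_size = 0.12 s at 8 kHz, "noise_b" is always rejected as
+          // too short, so both draws come from "noise_a"; a possible result is
+          //   two_random_uttid = { ("noise_a", 18400), ("noise_a", 960) },
+          // where the second member is a start offset in samples drawn from
+          //   [0, (3.2 - 0.12) * 8000] = [0, 24640].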
+          const WaveData &noise_wav1 =
+              noise_reader.Value(two_random_uttid[0].first);
+          KALDI_ASSERT(wave_data.SampFreq() == noise_wav1.SampFreq());
+          SubVector<BaseFloat> noise1(noise_wav1.Data(), 0);
+          features.CopyRowFromVec(
+              SubVector<BaseFloat>(noise1, two_random_uttid[0].second, dim), 1);
+
+          const WaveData &noise_wav2 =
+              noise_reader.Value(two_random_uttid[1].first);
+          KALDI_ASSERT(wave_data.SampFreq() == noise_wav2.SampFreq());
+          SubVector<BaseFloat> noise2(noise_wav2.Data(), 0);
+          features.CopyRowFromVec(
+              SubVector<BaseFloat>(noise2, two_random_uttid[1].second, dim), 2);
+
+          std::ostringstream utt_id_new;
+          utt_id_new << utt << '_' << index;
+          kaldi_writer.Write(utt_id_new.str(), features);
+        }
+      } catch (...) {
+        KALDI_WARN << "Failed to compute features for utterance " << utt;
+        continue;
+      }
+
+      if (num_utts % 10 == 0)
+        KALDI_LOG << "Processed " << num_utts << " utterances";
+      KALDI_VLOG(2) << "Processed features for key " << utt;
+      num_success++;
+    }
+    KALDI_LOG << " Done " << num_success << " out of " << num_utts
+              << " utterances.";
+    return (num_success != 0 ? 0 : 1);
+  } catch(const std::exception &e) {
+    std::cerr << e.what();
+    return -1;
+  }
+}
diff --git a/src/fvectorbin/fvector-debug-check-filter-bank.cc b/src/fvectorbin/fvector-debug-check-filter-bank.cc
new file mode 100644
index 00000000000..67a8140d5f9
--- /dev/null
+++ b/src/fvectorbin/fvector-debug-check-filter-bank.cc
@@ -0,0 +1,64 @@
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "fvector/fvector-perturb.h"
+#include "feat/wave-reader.h"
+#include "nnet3/nnet-utils.h"
+#include "nnet3/nnet-simple-component.h"
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace kaldi::nnet3;
+    typedef kaldi::int32 int32;
+    typedef kaldi::int64 int64;
+
+    const char *usage =
+        "This binary is used to check the filter bank which is modeled by an\n"
+        "affine component. It computes the bandwidth of each learned filter.\n"
+        "Usage: fvector-debug-check-filter-bank [options...] <nnet-rxfilename>\n"
+        "<component-name> <stats-wxfilename>\n";
+
+    ParseOptions po(usage);
+
+    po.Read(argc, argv);
+    if (po.NumArgs() != 3) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string nnet_rxfilename = po.GetArg(1),
+        component_name = po.GetArg(2),
+        stats_wxfilename = po.GetArg(3);
+
+    Nnet nnet;
+    ReadKaldiObject(nnet_rxfilename, &nnet);
+    int32 component_index = nnet.GetComponentIndex(component_name);
+    Matrix<BaseFloat> filter_bank(
+        dynamic_cast<AffineComponent*>(
+            nnet.GetComponent(component_index))->LinearParams());
+    std::ofstream out;
+    out.open(stats_wxfilename, std::ios::out);
+    if (!out.is_open()) {
+      std::cout << "File open error." << std::endl;
+      return -1;
+    }
+    int32 num_rows = filter_bank.NumRows();
+    int32 num_columns = filter_bank.NumCols();
+    out << "Number of rows: " << num_rows << std::endl;
+    out << "Number of columns: " << num_columns << std::endl;
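+    // The statistic printed below is a rough effective-width measure: for a
+    // filter w it is sum_i(w_i^2) / max_i(w_i^2), i.e. the filter's energy
+    // relative to its largest tap. For example (hypothetical numbers), taps
+    // (0.1, 0.9, 0.2) give (0.01 + 0.81 + 0.04) / 0.81 ~= 1.06, close to the
+    // minimum of 1.0 scored by a single-tap (maximally narrow) filter, while
+    // a filter with k equal taps scores exactly k.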
+    // Each row can be regarded as a filter.
+    for (MatrixIndexT i = 0; i < num_rows; i++) {
+      const SubVector<BaseFloat> current_row = filter_bank.Row(i);
+      BaseFloat current_sum_2 = VecVec(current_row, current_row);
+      BaseFloat current_max_2 = current_row.Max() * current_row.Max();
+      BaseFloat band_width = current_sum_2 / current_max_2;
+      out << "Filter " << i+1 << ": Quadratic sum is " << current_sum_2
+          << " ;The square of max value is " << current_max_2
+          << " ;Bandwidth is " << band_width << std::endl;
+    }
+
+    out.close();
+    return 0;
+  } catch(const std::exception &e) {
+    std::cerr << e.what();
+    return -1;
+  }
+}
diff --git a/src/fvectorbin/fvector-debug-wav-to-vector.cc b/src/fvectorbin/fvector-debug-wav-to-vector.cc
new file mode 100644
index 00000000000..ad202b88742
--- /dev/null
+++ b/src/fvectorbin/fvector-debug-wav-to-vector.cc
@@ -0,0 +1,41 @@
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "fvector/fvector-perturb.h"
+#include "feat/wave-reader.h"
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    const char *usage =
+        "Usage: fvector-wav-to-vector [options...] <wav-rspecifier>\n"
+        "<vector-wspecifier>\n";
+
+    ParseOptions po(usage);
+    BaseFloat sample_freq = 16000;
+    po.Register("sample-frequency", &sample_freq,
+                "sample-frequency of the wave.");
+    po.Read(argc, argv);
+    if (po.NumArgs() != 2) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string wav_rspecifier = po.GetArg(1),
+        output_wspecifier = po.GetArg(2);
+
+    SequentialTableReader<WaveHolder> reader(wav_rspecifier);
+    BaseFloatVectorWriter kaldi_writer(output_wspecifier);
+
+    int64 num_read = 0, num_written = 0;
+    for (; !reader.Done(); reader.Next(), num_read++) {
+      std::string utt = reader.Key();
+      const WaveData &wave_data = reader.Value();
+      Vector<BaseFloat> waveform(SubVector<BaseFloat>(wave_data.Data(), 0));
+      kaldi_writer.Write(utt, waveform);
+      num_written++;
+    }
+    KALDI_LOG << " Done " << num_written << " out of " << num_read
+              << " utterances.";
+    return (num_written != 0 ? 0 : 1);
+  } catch(const std::exception &e) {
+    std::cerr << e.what();
+    return -1;
+  }
+}
diff --git a/src/fvectorbin/fvector-debug-write-to-wav.cc b/src/fvectorbin/fvector-debug-write-to-wav.cc
new file mode 100644
index 00000000000..6385a6c5a1b
--- /dev/null
+++ b/src/fvectorbin/fvector-debug-write-to-wav.cc
@@ -0,0 +1,52 @@
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "fvector/fvector-perturb.h"
+#include "feat/wave-reader.h"
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    const char *usage =
+        "Usage: fvector-write-to-wav [options...] <chunk-rspecifier>\n"
+        "<wave-path>\n";
+
+    ParseOptions po(usage);
+    BaseFloat sample_freq = 16000;
+    po.Register("sample-frequency", &sample_freq,
+                "sample-frequency of the wave.");
+    po.Read(argc, argv);
+    if (po.NumArgs() != 2) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string chunk_rspecifier = po.GetArg(1),
+        wave_path = po.GetArg(2);
+
+    SequentialBaseFloatMatrixReader chunk_reader(chunk_rspecifier);
+
+    int64 num_read = 0, num_written = 0;
+    for (; !chunk_reader.Done(); chunk_reader.Next(), num_read++) {
+      std::string key = chunk_reader.Key();
+      // input_chunk has 3 rows.
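+      // Each 3-row chunk (source, noise1, noise2) is written out as three
+      // separate mono wav files under <wave-path>; the
+      // "<wave-path>.<key>.<row>.wav" naming used below is only illustrative,
+      // any scheme that keeps the rows distinguishable would do.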
+      const Matrix<BaseFloat> &input_chunk = chunk_reader.Value();
+      for (int32 i = 0; i < input_chunk.NumRows(); i++) {
+        std::ostringstream path;
+        path << wave_path << "." << key << "." << i << ".wav";
+        Output os(path.str(), true, false);  // binary, no Kaldi header.
+        Matrix<BaseFloat> temp(1, input_chunk.NumCols());
+        temp.CopyRowFromVec(input_chunk.Row(i), 0);
+        WaveData wave(sample_freq, temp);
+        wave.Write(os.Stream());
+        num_written++;
+      }
+    }
+    KALDI_LOG << " Done " << num_written << " out of " << num_read
+              << " chunks.";
+    return (num_written != 0 ? 0 : 1);
+  } catch(const std::exception &e) {
+    std::cerr << e.what();
+    return -1;
+  }
+}
diff --git a/src/fvectorbin/fvector-get-egs-block.cc b/src/fvectorbin/fvector-get-egs-block.cc
new file mode 100644
index 00000000000..acc98827e46
--- /dev/null
+++ b/src/fvectorbin/fvector-get-egs-block.cc
@@ -0,0 +1,122 @@
+// fvectorbin/fvector-get-egs-block.cc
+
+// Copyright 2012-2016 Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sstream>
+#include "util/common-utils.h"
+#include "nnet3/nnet-example.h"
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace kaldi::nnet3;
+    typedef kaldi::int32 int32;
+
+    const char *usage =
+        "Get examples for training an nnet3 neural network for the fvector\n"
+        "system. Each output example contains a pair of feature chunks.\n"
+        "Different from fvector-get-egs, the input is a matrix block which\n"
+        "contains several pairs rather than one pair.\n"
+        "Usage: fvector-get-egs-block [options] <features-rspecifier>\n"
+        "<egs-wspecifier>\n"
+        "For example:\n"
+        "fvector-get-egs-block scp:perturbed_chunks.scp ark:egs.ark";
+
+    bool compress = true;
+    BaseFloat frame_length_ms = 25;  // in milliseconds
+    BaseFloat frame_shift_ms = 10;   // in milliseconds
+    BaseFloat samp_freq = 16000;
+
+    ParseOptions po(usage);
+    po.Register("compress", &compress, "If true, write egs in "
+                "compressed format.");
+    po.Register("frame-length", &frame_length_ms,
+                "Frame length in milliseconds");
+    po.Register("frame-shift", &frame_shift_ms,
+                "Frame shift in milliseconds");
+    po.Register("sample-frequency", &samp_freq,
+                "Waveform data sample frequency ("
+                "must match the waveform file, if specified there)");
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 2) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string feature_rspecifier = po.GetArg(1);
+    NnetExampleWriter example_writer(po.GetArg(2));
+
+    SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
+
+    int32 num_read = 0,
+        num_egs_written = 0;
+    for (; !feature_reader.Done(); feature_reader.Next(), num_read++) {
+      std::string key = feature_reader.Key();
+      const Matrix<BaseFloat> &feats = feature_reader.Value();
+      // Note: 'feats' is a matrix block generated by fvector-add-noise-block.
+      // Each two consecutive rows of the matrix represent two perturbed
+      // vectors (e.g. 100 ms of waveform) which come from the same source
+      // signal; chunk1 and chunk2 each correspond to one row.
+      for (MatrixIndexT i = 0; i < feats.NumRows() / 2; i++) {
+        SubVector<BaseFloat> chunk1(feats, 2 * i),
+            chunk2(feats, 2 * i + 1);
+        // According to frame_length and frame_shift, cut the chunk into
+        // frames so that it resembles the normal feature-extraction
+        // procedure.
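+        // Worked example (hypothetical numbers): a 100 ms chunk at
+        // samp_freq = 16000 with 25 ms frames and a 10 ms shift gives
+        //   num_rows = (100 - 25) / 10 + 1 = 8 frames,
+        //   num_cols = 16000 / 1000 * 25 = 400 samples per frame,
+        // and frame j starts at sample j * 10 * 16 = 160 * j.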
+        int32 num_rows = static_cast<int32>(
+            ((chunk1.Dim() * 1.0 / samp_freq) * 1000 - frame_length_ms) /
+            frame_shift_ms) + 1;
+        int32 num_cols =
+            static_cast<int32>(samp_freq / 1000.0 * frame_length_ms);
+        Matrix<BaseFloat> chunk1_matrix(num_rows, num_cols),
+            chunk2_matrix(num_rows, num_cols);
+        for (MatrixIndexT j = 0; j < num_rows; j++) {
+          MatrixIndexT offset =
+              static_cast<MatrixIndexT>(j * frame_shift_ms * samp_freq / 1000);
+          chunk1_matrix.Row(j).CopyFromVec(chunk1.Range(offset, num_cols));
+          chunk2_matrix.Row(j).CopyFromVec(chunk2.Range(offset, num_cols));
+        }
+        // Generate the NnetIo.
+        NnetIo nnet_io1 = NnetIo("input", 0, chunk1_matrix),
+            nnet_io2 = NnetIo("input", 0, chunk2_matrix);
+        // Modify the n index, so that in a merged minibatch NnetExample the
+        // adjacent two NnetIos come from the same source signal.
+        for (std::vector<Index>::iterator indx_it = nnet_io1.indexes.begin();
+             indx_it != nnet_io1.indexes.end(); ++indx_it) {
+          indx_it->n = 0;
+        }
+        for (std::vector<Index>::iterator indx_it = nnet_io2.indexes.begin();
+             indx_it != nnet_io2.indexes.end(); ++indx_it) {
+          indx_it->n = 1;
+        }
+        NnetExample eg;
+        eg.io.push_back(nnet_io1);
+        eg.io.push_back(nnet_io2);
+        if (compress) {
+          eg.Compress();
+        }
+        std::ostringstream os;
+        os << key << "-" << i;
+        std::string key_new = os.str();
+        example_writer.Write(key_new, eg);
+        num_egs_written += 1;
+      }
+    }
+    KALDI_LOG << "Finished generating examples; successfully converted "
+              << num_egs_written << " chunks into examples out of "
+              << num_read << " chunks.";
+    return (num_egs_written == 0);
+  } catch(const std::exception &e) {
+    std::cerr << e.what() << '\n';
+    return -1;
+  }
+}
diff --git a/src/fvectorbin/fvector-get-egs.cc b/src/fvectorbin/fvector-get-egs.cc
new file mode 100644
index 00000000000..c5b59e02c3c
--- /dev/null
+++ b/src/fvectorbin/fvector-get-egs.cc
@@ -0,0 +1,143 @@
+// fvectorbin/fvector-get-egs.cc
+
+// Copyright 2012-2016 Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sstream>
+#include "util/common-utils.h"
+#include "nnet3/nnet-example.h"
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace kaldi::nnet3;
+    typedef kaldi::int32 int32;
+
+    const char *usage =
+        "Get examples for training an nnet3 neural network for the fvector\n"
+        "system.
Each output example contains a pair of feature chunks.\n" + "Usage: fvector-get-egs [options] \n" + "For example:\n" + "fvector-get-egs scp:perturbed_chunks.scp ark:egs.ark"; + + bool compress = true; + BaseFloat frame_length_ms = 25; // in milliseconds + BaseFloat frame_shift_ms = 10; // in milliseconds + BaseFloat samp_freq=16000; + int left_padding=0; + int right_padding=0; + + ParseOptions po(usage); + po.Register("compress", &compress, "If true, write egs in " + "compressed format."); + po.Register("frame-length", &frame_length_ms, "Frame length in milliseconds"); + po.Register("frame-shift", &frame_shift_ms, "Frame shift in milliseconds"); + po.Register("sample-frequency", &samp_freq, "Waveform data sample frequency (" + "must match the waveform file, if specified there)"); + po.Register("left-padding", &left_padding, "When we use convolutional NN," + "we tend to pad on the time axis with repeats of the first frame."); + po.Register("right-padding", &right_padding, "When we use convolutional NN," + "we tend to pad on the time axis with repeats of the last frame."); + + + po.Read(argc, argv); + + if (po.NumArgs() != 2) { + po.PrintUsage(); + exit(1); + } + + std::string feature_rspecifier = po.GetArg(1); + NnetExampleWriter example_writer(po.GetArg(2)); + + SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); + + int32 num_read = 0, + num_egs_written = 0; + for (; !feature_reader.Done(); feature_reader.Next(), num_read++) { + std::string key = feature_reader.Key(); + const Matrix &feats = feature_reader.Value(); + //Please take care. Here, the 'feats' is a 2-lines matrix which is generated + //by fvector-add-noise.cc. The 2-lines matrix represents two perturbed + //vectors(e.g 100ms wavform) which come from the same source signal. + //chunk1 and chunk2 corresponds to one line respectively. + SubVector chunk1(feats, 0), + chunk2(feats, 1); + + //According to frame_length and frame_shift, cut the chunk into few pieces + //so that it is similiar with normal feature extract procedure. 
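+      // With left/right padding (used for convolutional models), the first
+      // and last frames are repeated below: e.g. (hypothetical numbers)
+      // num_rows = 8, left_padding = 2 and right_padding = 1 yield 11 output
+      // rows copied from input rows 0,0,0,1,...,7,7, and the NnetIo 't'
+      // index starts at -left_padding = -2 so the unpadded frames keep
+      // t = 0..7.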
+ int num_rows = ((int)(((chunk1.Dim() * 1.0 / samp_freq) * 1000 - frame_length_ms) / + frame_shift_ms) + 1); + int num_cols = (int)(samp_freq / 1000.0 * frame_length_ms); + Matrix chunk1_matrix(num_rows, num_cols), + chunk2_matrix(num_rows, num_cols); + for (MatrixIndexT i = 0; i < num_rows; i++) { + chunk1_matrix.Row(i).CopyFromVec(chunk1.Range(i*frame_shift_ms*samp_freq/1000, num_cols)); + chunk2_matrix.Row(i).CopyFromVec(chunk2.Range(i*frame_shift_ms*samp_freq/1000, num_cols)); + } + Matrix chunk1_matrix_out(chunk1_matrix), + chunk2_matrix_out(chunk2_matrix); + if((left_padding !=0) || (right_padding != 0)) { + int32 tot_num_rows = num_rows+left_padding+right_padding; + chunk1_matrix_out.Resize(tot_num_rows, num_cols, kUndefined); + chunk2_matrix_out.Resize(tot_num_rows, num_cols, kUndefined); + for(int32 row = 0; row < tot_num_rows; row++) { + int32 row_in = row - left_padding; + if (row_in < 0) { + row_in = 0; + } else if (row_in >= num_rows ) { + row_in = num_rows -1; + } + SubVector vec_chunk1_in(chunk1_matrix, row_in), + vec_chunk1_out(chunk1_matrix_out, row), + vec_chunk2_in(chunk2_matrix, row_in), + vec_chunk2_out(chunk2_matrix_out, row); + vec_chunk1_out.CopyFromVec(vec_chunk1_in); + vec_chunk2_out.CopyFromVec(vec_chunk2_in); + } + } + //generate the NnetIo + NnetIo nnet_io1 = NnetIo("input", -left_padding, chunk1_matrix_out), + nnet_io2 = NnetIo("input", -left_padding, chunk2_matrix_out); + //modify the n index, so that in a mini-batch Nnet3Example, the adjacent + //two NnetIos come from the same source signal. + for (std::vector::iterator indx_it = nnet_io1.indexes.begin(); + indx_it != nnet_io1.indexes.end(); ++indx_it) { + indx_it->n = 0; + } + for (std::vector::iterator indx_it = nnet_io2.indexes.begin(); + indx_it != nnet_io2.indexes.end(); ++indx_it) { + indx_it->n = 1; + } + NnetExample eg; + eg.io.push_back(nnet_io1); + eg.io.push_back(nnet_io2); + if (compress) { + eg.Compress(); + } + example_writer.Write(key, eg); + num_egs_written += 1; + } + KALDI_LOG << "Finished generating examples, " + << "successfully convert " << num_egs_written << " chunks into examples out of " + << num_read << " chunks"; + return (num_egs_written == 0); + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} diff --git a/src/nnet3/nnet-example-utils.cc b/src/nnet3/nnet-example-utils.cc index cc5fe3cc050..63cdb93b813 100644 --- a/src/nnet3/nnet-example-utils.cc +++ b/src/nnet3/nnet-example-utils.cc @@ -113,7 +113,9 @@ static void MergeIo(const std::vector &src, names_end = names.end(); std::vector::const_iterator eg_iter = src.begin(), eg_end = src.end(); + int32 n_offset = 0; for (int32 n = 0; eg_iter != eg_end; ++eg_iter, ++n) { + int32 max_source_n = 0; std::vector::const_iterator io_iter = eg_iter->io.begin(), io_end = eg_iter->io.end(); for (; io_iter != io_end; ++io_iter) { @@ -139,12 +141,21 @@ static void MergeIo(const std::vector &src, for (int32 i = this_offset; i < this_offset + this_size; i++) { // we could easily support merging already-merged egs, but I don't see a // need for it right now. - KALDI_ASSERT(output_iter[i].n == 0 && - "Merging already-merged egs? Not currentlysupported."); - output_iter[i].n = n; + + // For fvector, the NnetIos in the same NnetExample may have the same + // name, however the index.ns of them are different. + //KALDI_ASSERT(output_iter[i].n == 0 && + // "Merging already-merged egs? 
Not currentlysupported."); + //output_iter[i].n = n; + KALDI_ASSERT(output_iter[i].n >= 0); + if (output_iter[i].n > max_source_n) { + max_source_n = output_iter[i].n; + } + output_iter[i].n += n_offset; } this_offset += this_size; // note: this_offset is a reference. } + n_offset += max_source_n + 1; } KALDI_ASSERT(cur_size == sizes); for (int32 f = 0; f < num_feats; f++) { diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index 8246148abc6..650e8b5aecf 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -2067,6 +2067,69 @@ bool PositiveUpdatableWeights(Nnet *nnet) { return true; } +/// For Xvector +void GetConstantOutput(const Nnet &nnet_const, const std::string &output_name, + Vector *output) { + Nnet nnet(nnet_const); + std::string input_name = "input"; + int32 left_context, + right_context, + input_node_index = nnet.GetNodeIndex(input_name), + output_node_index = nnet.GetNodeIndex(output_name); + if (output_node_index == -1 && !nnet.IsOutputNode(output_node_index)) + KALDI_ERR << "No output node called '" << output_name + << "' in the network."; + if (input_node_index == -1 && nnet.IsInputNode(input_node_index)) + KALDI_ERR << "No input node called '" << input_name + << "' in the network."; + KALDI_ASSERT(output->Dim() == nnet.OutputDim(output_name)); + ComputeSimpleNnetContext(nnet, &left_context, &right_context); + + // It's difficult to get the output of the node + // directly. Instead, we can create some fake input, + // propagate it through the network, and read out the + // output. + CuMatrix cu_feats(left_context + right_context + 1, + nnet.InputDim(input_name)); + Matrix feats(cu_feats); + + ComputationRequest request; + NnetIo nnet_io = NnetIo(input_name, 0, feats); + request.inputs.clear(); + request.outputs.clear(); + request.inputs.resize(1); + request.outputs.resize(1); + request.need_model_derivative = false; + request.store_component_stats = false; + + std::vector output_indexes; + request.inputs[0].name = input_name; + request.inputs[0].indexes = nnet_io.indexes; + request.inputs[0].has_deriv = false; + output_indexes.resize(1); + output_indexes[0].n = 0; + output_indexes[0].t = 0; + request.outputs[0].name = output_name; + request.outputs[0].indexes = output_indexes; + request.outputs[0].has_deriv = false; + + CachingOptimizingCompiler compiler(nnet, NnetOptimizeOptions()); + std::shared_ptr computation = compiler.Compile(request); + NnetComputer computer(NnetComputeOptions(), *computation, + nnet, &nnet); + + // check to see if something went wrong. + if (request.inputs.empty()) + KALDI_ERR << "No input in computation request."; + if (request.outputs.empty()) + KALDI_ERR << "No output in computation request."; + + computer.AcceptInput("input", &cu_feats); + computer.Run(); + const CuMatrixBase &output_mat = computer.GetOutput(output_name); + CuSubVector output_vec(output_mat, 0); + output->CopyFromVec(output_vec); +} } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h index 00aeb4a1661..932d0b8ed06 100644 --- a/src/nnet3/nnet-utils.h +++ b/src/nnet3/nnet-utils.h @@ -506,6 +506,13 @@ int32 GetNumNvalues(const std::vector &io_vec, */ bool PositiveUpdatableWeights(Nnet *nnet); +/// For Xvector +/// This function assumes that the node named in 'output_node' is a constant +/// function of the input features (e.g, a ConstantFunctionComponent is +/// its input) and returns it in 'out'. 
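+/// A minimal usage sketch (assuming the nnet has a constant output node
+/// called "const"; the node name is illustrative):
+///
+///   Vector<BaseFloat> c(nnet.OutputDim("const"));
+///   GetConstantOutput(nnet, "const", &c);
+///
+/// Internally this propagates a dummy all-zero input through the network and
+/// reads the value of the named output at frame t = 0.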
+void GetConstantOutput(const Nnet &nnet, const std::string &output_name, + Vector *out); + } // namespace nnet3 } // namespace kaldi diff --git a/src/xvector/Makefile b/src/xvector/Makefile new file mode 100644 index 00000000000..093dd68f1a8 --- /dev/null +++ b/src/xvector/Makefile @@ -0,0 +1,22 @@ + +all: + +OPENFST_CXXFLAGS = +OPENFST_LDLIBS = +include ../kaldi.mk + +LDFLAGS += $(CUDA_LDFLAGS) +LDLIBS += $(CUDA_LDLIBS) + +TESTFILES = xvector-test + +OBJFILES = xvector.o nnet-xvector-training.o nnet-xvector-diagnostics.o nnet-xvector-compute.o + +LIBNAME = kaldi-xvector + +ADDLIBS = ../gmm/kaldi-gmm.a ../tree/kaldi-tree.a ../transform/kaldi-transform.a \ + ../nnet3/kaldi-nnet3.a ../cudamatrix/kaldi-cudamatrix.a \ + ../matrix/kaldi-matrix.a ../base/kaldi-base.a \ + ../util/kaldi-util.a + +include ../makefiles/default_rules.mk diff --git a/src/xvector/nnet-xvector-compute.cc b/src/xvector/nnet-xvector-compute.cc new file mode 100644 index 00000000000..a83e6e063f0 --- /dev/null +++ b/src/xvector/nnet-xvector-compute.cc @@ -0,0 +1,99 @@ +// xvector/nnet-xvector-compute.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) +// 2016 David Snyder +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
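+// A minimal usage sketch (assumptions: "final.raw" holds the trained nnet,
+// 'feats' is one chunk of input features, and the "output" node yields one
+// xvector per chunk):
+//
+//   NnetSimpleComputationOptions opts;
+//   Nnet nnet;
+//   ReadKaldiObject("final.raw", &nnet);
+//   NnetXvectorComputer computer(opts, &nnet);
+//   Vector<BaseFloat> xvector(nnet.OutputDim("output"));
+//   computer.ComputeXvector(feats, &xvector);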
+ +#include "xvector/nnet-xvector-compute.h" +#include "nnet3/nnet-utils.h" + +namespace kaldi { +namespace nnet3 { + +NnetXvectorComputer::NnetXvectorComputer( + const NnetSimpleComputationOptions &config, + Nnet *nnet): + nnet_(nnet), + config_(config), + compiler_(*nnet, config.optimize_config) { +} + +void NnetXvectorComputer::ComputeXvector(const MatrixBase &feats, + Vector *xvector) { + + ComputationRequest request; + GetComputationRequest(feats, &request); + std::shared_ptr computation = compiler_.Compile(request); + NnetComputer computer(config_.compute_config, *computation, + *nnet_, + nnet_); + std::string input_name = "input"; + CuMatrix cu_feats(feats); + computer.AcceptInput(input_name, &cu_feats); + computer.Run(); + const CuMatrixBase &output = computer.GetOutput("output"); + KALDI_ASSERT(output.NumRows() == 1 && output.NumCols() == xvector->Dim()); + CuSubVector xvector_tmp(output, 0); + xvector->CopyFromVec(xvector_tmp); +} + +void NnetXvectorComputer::GetComputationRequest( + const MatrixBase &feats, + ComputationRequest *request) { + std::string input_name = "input", + output_name = "output"; + NnetIo nnet_io = NnetIo(input_name, 0, feats); + request->inputs.clear(); + request->outputs.clear(); + request->inputs.resize(1); + request->outputs.resize(1); + request->need_model_derivative = false; + request->store_component_stats = false; + + int32 input_node_index = nnet_->GetNodeIndex(input_name); + + if (input_node_index == -1 && !nnet_->IsInputNode(input_node_index)) + KALDI_ERR << "No input node called '" << input_name + << "' in the network."; + + request->inputs[0].name = input_name; + request->inputs[0].indexes = nnet_io.indexes; + request->inputs[0].has_deriv = false; + + // We only need the output on frame t=0. + std::vector output_indexes; + output_indexes.resize(1); + output_indexes[0].n = 0; + output_indexes[0].t = 0; + + // Add an io_spec for the output node. + int32 output_node_index = nnet_->GetNodeIndex(output_name); + if (!nnet_->IsOutputNode(output_node_index)) + KALDI_ERR << "No output node called '" << output_name + << "' in the network."; + request->outputs[0].name = output_name; + request->outputs[0].indexes = output_indexes; + request->outputs[0].has_deriv = false; + + // check to see if something went wrong. + if (request->inputs.empty()) + KALDI_ERR << "No input in computation request."; + if (request->outputs.empty()) + KALDI_ERR << "No output in computation request."; +} + +} // namespace nnet3 +} // namespace kaldi diff --git a/src/xvector/nnet-xvector-compute.h b/src/xvector/nnet-xvector-compute.h new file mode 100644 index 00000000000..b551c24e76c --- /dev/null +++ b/src/xvector/nnet-xvector-compute.h @@ -0,0 +1,55 @@ +// xvector/nnet-xvector-compute.h + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) +// 2016 David Snyder +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
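+// Note on the computation request built by this class: the input
+// IoSpecification copies the indexes of the whole feature chunk, while the
+// output requests the single index (n = 0, t = 0), so the compiled
+// computation emits exactly one xvector row per chunk regardless of the
+// chunk length.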
+ +#ifndef KALDI_XVECTOR_NNET_XVECTOR_COMPUTE_H_ +#define KALDI_XVECTOR_NNET_XVECTOR_COMPUTE_H_ + +#include "nnet3/nnet-am-decodable-simple.h" // For NnetSimpleComputationOptions +#include "nnet3/nnet-computation.h" +#include "nnet3/nnet-compute.h" +#include "xvector/xvector.h" + +namespace kaldi { +namespace nnet3 { + +/** + class NnetXvectorComputer is responsible for extracting xvectors from + feature chunks. +**/ +class NnetXvectorComputer { + public: + /// Constructor. + NnetXvectorComputer(const NnetSimpleComputationOptions &opts, + Nnet *nnet); + /// Extracts an xvector given input features. + void ComputeXvector(const MatrixBase &feats, + Vector *xvector); + private: + Nnet *nnet_; + const NnetSimpleComputationOptions config_; + CachingOptimizingCompiler compiler_; + + /// Creates a computation request from the input features. + void GetComputationRequest(const MatrixBase &feats, + ComputationRequest *request); +}; +} // namespace nnet3 +} // namespace kaldi + +#endif // diff --git a/src/xvector/nnet-xvector-diagnostics.cc b/src/xvector/nnet-xvector-diagnostics.cc new file mode 100644 index 00000000000..071ed65a241 --- /dev/null +++ b/src/xvector/nnet-xvector-diagnostics.cc @@ -0,0 +1,214 @@ +// xvector/nnet-xvector-diagnostics.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) +// Copyright 2016 Pegah Ghahremani +// David Snyder +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "nnet3/nnet-diagnostics.h" +#include "xvector/nnet-xvector-diagnostics.h" +#include "nnet3/nnet-utils.h" + +namespace kaldi { +namespace nnet3 { + +NnetXvectorComputeProb::NnetXvectorComputeProb(const NnetComputeProbOptions + &config, + const Nnet &nnet): + config_(config), + nnet_(nnet), + deriv_nnet_(NULL), + compiler_(nnet), + num_minibatches_processed_(0) { + if (config_.compute_deriv) { + deriv_nnet_ = new Nnet(nnet_); + ScaleNnet(0.0, deriv_nnet_); // force simple update + SetNnetAsGradient(deriv_nnet_); + } +} + +const Nnet &NnetXvectorComputeProb::GetDeriv() const { + if (deriv_nnet_ == NULL) + KALDI_ERR << "GetDeriv() called when no derivatives were requested."; + return *deriv_nnet_; +} + +NnetXvectorComputeProb::~NnetXvectorComputeProb() { + delete deriv_nnet_; // delete does nothing if pointer is NULL. 
+} + +void NnetXvectorComputeProb::Reset() { + num_minibatches_processed_ = 0; + objf_info_.clear(); + if (deriv_nnet_) { + ScaleNnet(0.0, deriv_nnet_); + SetNnetAsGradient(deriv_nnet_); + } +} + +void NnetXvectorComputeProb::Compute(const NnetExample &eg) { + bool need_model_derivative = config_.compute_deriv, + store_component_stats = false; + ComputationRequest request; + GetComputationRequestXvector(nnet_, eg, need_model_derivative, + store_component_stats, + &request); + std::shared_ptr computation = compiler_.Compile(request); + NnetComputer computer(config_.compute_config, *computation, + nnet_, deriv_nnet_); + // give the inputs to the computer object. + computer.AcceptInputs(nnet_, eg.io); + computer.Run(); + this->ProcessOutputs(&computer); + if (config_.compute_deriv) + computer.Run(); +} + +void NnetXvectorComputeProb::ProcessOutputs(NnetComputer *computer) { + for (int32 node_index = 0; node_index < nnet_.NumNodes(); node_index++) { + if (nnet_.IsOutputNode(node_index)) { + std::string xvector_name = nnet_.GetNodeName(node_index), + s_name = "s", b_name = "b"; + if (nnet_.GetNodeIndex(s_name) == -1 || nnet_.GetNodeIndex(b_name) == -1) + KALDI_ERR << "The nnet expected to have two output nodes with " + << "name s and b."; + + if (xvector_name == "output") { + const CuMatrixBase &xvector_pairs + = computer->GetOutput(xvector_name), + &xvec_s = computer->GetOutput(s_name), + &xvec_b = computer->GetOutput(b_name); + int32 num_rows = xvector_pairs.NumRows(), + num_cols = xvector_pairs.NumCols(); + CuMatrix xvector_deriv(num_rows, num_cols, kUndefined), + raw_scores(num_rows, num_rows, kUndefined); + int32 s_dim = num_cols * (num_cols + 1) / 2; + + // convert CuVector to CuSpMatrix + CuSpMatrix xvec_s_sp(num_cols); + xvec_s_sp.CopyFromVec(xvec_s.Row(0)); + CuVector deriv_s(s_dim); + + BaseFloat xvec_b_val = xvec_b(0,0), deriv_b; + BaseFloat tot_weight, tot_objf; + bool supply_deriv = config_.compute_deriv; + bool compute_accuracy = config_.compute_accuracy; + ComputeXvectorObjfAndDeriv(xvector_pairs, xvec_s_sp, xvec_b_val, + (supply_deriv ? &xvector_deriv : NULL), + (supply_deriv ? &deriv_s : NULL), + (supply_deriv ? &deriv_b : NULL), + (compute_accuracy ? &raw_scores : NULL), + &tot_objf, + &tot_weight); + if (supply_deriv) { + CuMatrix deriv_s_mat(1, s_dim), + deriv_b_mat(1,1); + deriv_b_mat(0,0) = deriv_b; + deriv_s_mat.CopyRowsFromVec(deriv_s); + computer->AcceptInput(xvector_name, &xvector_deriv); + computer->AcceptInput(s_name, &deriv_s_mat); + computer->AcceptInput(b_name, &deriv_b_mat); + + } + SimpleObjectiveInfo &totals = objf_info_[xvector_name]; + totals.tot_weight += tot_weight; + totals.tot_objective += tot_objf; + if (compute_accuracy) { + BaseFloat tot_acc; + SimpleObjectiveInfo &acc_totals = acc_info_[xvector_name]; + ComputeAccuracy(raw_scores, &tot_acc); + acc_totals.tot_objective += tot_weight * tot_acc; + acc_totals.tot_weight += tot_weight; + } + } + num_minibatches_processed_++; + } + } +} + +bool NnetXvectorComputeProb::PrintTotalStats() const { + bool ans = false; + unordered_map::const_iterator + iter, end; + { // First print regular objectives + iter = objf_info_.begin(); + end = objf_info_.end(); + for (; iter != end; ++iter) { + const std::string &name = iter->first; + int32 node_index = nnet_.GetNodeIndex(name); + KALDI_ASSERT(node_index >= 0); + ObjectiveType obj_type = nnet_.GetNode(node_index).u.objective_type; + const SimpleObjectiveInfo &info = iter->second; + KALDI_LOG << "Overall " + << (obj_type == kLinear ? 
"log-likelihood" : "objective") + << " for '" << name << "' is " + << (info.tot_objective / info.tot_weight) << " per chunk" + << ", over " << info.tot_weight << " chunks."; + if (info.tot_weight > 0) + ans = true; + } + } + if (config_.compute_accuracy) { // Now print the accuracy. + iter = acc_info_.begin(); + end = acc_info_.end(); + for (; iter != end; ++iter) { + const std::string &name = iter->first; + const SimpleObjectiveInfo &info = iter->second; + KALDI_LOG << "Overall accuracy for '" << name << "' is " + << (info.tot_objective / info.tot_weight) + << " per chunk" + << ", over " << ceil(info.tot_weight) << " chunks."; + } + } + return ans; +} + +void NnetXvectorComputeProb::ComputeAccuracy( + const CuMatrixBase &raw_scores, + BaseFloat *tot_accuracy_out) { + int32 num_rows = raw_scores.NumRows(); + BaseFloat K = 1.0 / (num_rows - 2.0), + threshold = 0; // Corresponds to prob_same(u,v) = 0.5. + BaseFloat count = 0, + error = 0; + for (int32 i = 0; i < num_rows; i++) { + for (int32 j = 0; j < num_rows; j++) { + if (i + 1 == j && i % 2 == 0) { + if (raw_scores(i, j) < threshold) + error++; + count++; + } else if (i < j) { + if (raw_scores(i, j) >= threshold) + error += K; + count += K; + } + } + } + (*tot_accuracy_out) = 1.0 - error / count; +} + +const SimpleObjectiveInfo* NnetXvectorComputeProb::GetObjective( + const std::string &output_name) const { + unordered_map::const_iterator + iter = objf_info_.find(output_name); + if (iter != objf_info_.end()) + return &(iter->second); + else + return NULL; +} + +} // namespace nnet3 +} // namespace kaldi diff --git a/src/xvector/nnet-xvector-diagnostics.h b/src/xvector/nnet-xvector-diagnostics.h new file mode 100644 index 00000000000..6a2de6b38bd --- /dev/null +++ b/src/xvector/nnet-xvector-diagnostics.h @@ -0,0 +1,95 @@ +// xvector/nnet-xvector-diagnostics.h + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) +// Copyright 2016 Pegah Ghahremani + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_XVECTOR_NNET_XVECTOR_DIAGNOSTICS_H_ +#define KALDI_XVECTOR_NNET_XVECTOR_DIAGNOSTICS_H_ + +#include "nnet3/nnet-example.h" +#include "nnet3/nnet-computation.h" +#include "nnet3/nnet-compute.h" +#include "nnet3/nnet-optimize.h" +#include "nnet3/nnet-example-utils.h" +#include "nnet3/nnet-training.h" +#include "xvector/nnet-xvector-training.h" +#include "xvector/xvector.h" + +namespace kaldi { +namespace nnet3 { + + + +/** This class is for computing cross-entropy values in a neural + network with xvector as output and unsupervised objective, for diagnostics. + Note: because we put a "logsoftmax" component in the nnet, the actual + objective function becomes linear at the output, but the printed messages + reflect the fact that it's the cross-entropy objective. 
+ + TODO: In future we plan to check that the same values are returned whether + we run the computation with or without optimization. + */ +class NnetXvectorComputeProb { + public: + // does not store a reference to 'config' but does store one to 'nnet'. + NnetXvectorComputeProb(const NnetComputeProbOptions &config, + const Nnet &nnet); + + // Reset the likelihood stats, and the derivative stats (if computed). + void Reset(); + + // compute objective on one minibatch. + void Compute(const NnetExample &eg); + + // Prints out the final stats, and return true if there was a nonzero count. + bool PrintTotalStats() const; + + + // returns the objective-function info for this output name (e.g. "output"), + // or NULL if there is no such info. + const SimpleObjectiveInfo *GetObjective(const std::string &output_name) const; + + // if config.compute_deriv == true, returns a reference to the + // computed derivative. Otherwise crashes. + const Nnet &GetDeriv() const; + + ~NnetXvectorComputeProb(); + private: + void ProcessOutputs(NnetComputer *computer); + // Computes the accuracy for this minibatch. + void ComputeAccuracy(const CuMatrixBase &raw_scores, + BaseFloat *tot_accuracy_out); + NnetComputeProbOptions config_; + const Nnet &nnet_; + + Nnet *deriv_nnet_; + CachingOptimizingCompiler compiler_; + + // this is only for diagnostics. + int32 num_minibatches_processed_; + + unordered_map objf_info_; + unordered_map acc_info_; + +}; + + +} // namespace nnet3 +} // namespace kaldi + +#endif // KALDI_XVECTOR_NNET_XVECTOR_DIAGNOSTICS_H_ diff --git a/src/xvector/nnet-xvector-training.cc b/src/xvector/nnet-xvector-training.cc new file mode 100644 index 00000000000..8fc9423df1b --- /dev/null +++ b/src/xvector/nnet-xvector-training.cc @@ -0,0 +1,272 @@ +// xvector/nnet-xvector-training.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) +// 2015 Xiaohui Zhang +// Copyright 2016 Pegah Ghahremani +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "xvector/nnet-xvector-training.h" +#include "nnet3/nnet-utils.h" + +namespace kaldi { +namespace nnet3 { + +NnetXvectorTrainer::NnetXvectorTrainer(const NnetTrainerOptions &config, + Nnet *nnet): + config_(config), + nnet_(nnet), + compiler_(*nnet, config_.optimize_config), + num_minibatches_processed_(0) { + if (config_.zero_component_stats) + ZeroComponentStats(nnet); + if (config_.momentum == 0.0 && + config_.max_param_change == 0.0) { + delta_nnet_= NULL; + } else { + KALDI_ASSERT(config_.momentum >= 0.0 && + config_.max_param_change >= 0.0); + delta_nnet_ = nnet_->Copy(); + bool is_gradient = false; // setting this to true would disable the + // natural-gradient updates. 
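+  // delta_nnet_ accumulates the parameter change of one minibatch, scaled by
+  // (1 - momentum); Train() below adds it into nnet_ and then rescales it by
+  // momentum, so with e.g. momentum = 0.9 each update contributes 10% new
+  // change and carries 90% over from earlier minibatches. If the 2-norm of
+  // the change exceeds --max-param-change, it is scaled down: a change of
+  // norm 4.0 against max-param-change=2.0 is halved before being applied.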
+ if (is_gradient) { + ScaleNnet(0.0, delta_nnet_); + SetNnetAsGradient(delta_nnet_); + } else { + ScaleNnet(0.0, delta_nnet_); + } + } + if (config_.read_cache != "") { + bool binary; + try { + Input ki(config_.read_cache, &binary); + compiler_.ReadCache(ki.Stream(), binary); + } catch (...) { + KALDI_WARN << "Could not open cached computation. " + "Probably this is the first training iteration."; + } + } +} + + +void NnetXvectorTrainer::Train(const NnetExample &eg) { + bool need_model_derivative = true; + ComputationRequest request; + GetComputationRequestXvector(*nnet_, eg, need_model_derivative, + config_.store_component_stats, + &request); + std::shared_ptr computation = compiler_.Compile(request); + + NnetComputer computer(config_.compute_config, *computation, + *nnet_, + (delta_nnet_ == NULL ? nnet_ : delta_nnet_)); + // give the inputs to the computer object. + computer.AcceptInputs(*nnet_, eg.io); + computer.Run(); + + this->ProcessOutputs(&computer); + computer.Run(); + + if (delta_nnet_ != NULL) { + BaseFloat scale = (1.0 - config_.momentum); + if (config_.max_param_change != 0.0) { + BaseFloat param_delta = + std::sqrt(DotProduct(*delta_nnet_, *delta_nnet_)) * scale; + if (param_delta > config_.max_param_change) { + if (param_delta - param_delta != 0.0) { + KALDI_WARN << "Infinite parameter change, will not apply."; + ScaleNnet(0.0, delta_nnet_); + } else { + scale *= config_.max_param_change / param_delta; + KALDI_LOG << "Parameter change too big: " << param_delta << " > " + << "--max-param-change=" << config_.max_param_change + << ", scaling by " << config_.max_param_change / param_delta; + } + } + } + AddNnet(*delta_nnet_, scale, nnet_); + // impose positivity for AffineComponent max(W,0) + PositiveUpdatableWeights(nnet_); + + ScaleNnet(config_.momentum, delta_nnet_); + } + if (config_.write_cache != "") { + Output ko(config_.write_cache, + config_.binary_write_cache); + compiler_.WriteCache(ko.Stream(), config_.binary_write_cache); + } +} + +void NnetXvectorTrainer::ProcessOutputs(NnetComputer *computer) { + for (int32 node_index = 0; node_index < nnet_->NumNodes(); node_index++) { + if (nnet_->IsOutputNode(node_index)) { + BaseFloat tot_weight, tot_objf; + bool supply_deriv = true; + // For each xvector output node, we expect two output nodes with name "s" + // and "b", which store symmetric affine transformation and bias term + // for xvector-objective computation. + std::string xvector_name = nnet_->GetNodeName(node_index), + s_name = "s", b_name = "b"; + if (nnet_->GetNodeIndex(s_name) == -1 || nnet_->GetNodeIndex(b_name) == -1) + KALDI_ERR << "The nnet expected to have two output nodes with name s and b."; + + if (xvector_name == "output") { + const CuMatrixBase &xvector_pairs = computer->GetOutput(xvector_name), + &xvec_s = computer->GetOutput(s_name), + &xvec_b = computer->GetOutput(b_name); + CuMatrix xvector_deriv(xvector_pairs.NumRows(), xvector_pairs.NumCols(), + kUndefined); + int32 s_dim = xvector_pairs.NumCols() * (xvector_pairs.NumCols() + 1) / 2; + + // convert CuVector to CuSpMatrix + CuSpMatrix xvec_s_sp(xvector_pairs.NumCols()); + xvec_s_sp.CopyFromVec(xvec_s.Row(0)); + + CuVector deriv_s(s_dim); + BaseFloat xvec_b_val = xvec_b(0,0), deriv_b; + ComputeXvectorObjfAndDeriv(xvector_pairs, xvec_s_sp, xvec_b_val, + (supply_deriv ? &xvector_deriv : NULL), + (supply_deriv ? &deriv_s : NULL), + (supply_deriv ? 
&deriv_b : NULL), + NULL, // The raw scores aren't needed + &tot_objf, + &tot_weight); + + if (supply_deriv) { + CuMatrix deriv_s_mat(1, s_dim), + deriv_b_mat(1,1); + deriv_b_mat(0,0) = deriv_b; + deriv_s_mat.CopyRowsFromVec(deriv_s); + computer->AcceptInput(xvector_name, &xvector_deriv); + computer->AcceptInput(s_name, &deriv_s_mat); + computer->AcceptInput(b_name, &deriv_b_mat); + } + + objf_info_[xvector_name].UpdateStats(xvector_name, + config_.print_interval, + num_minibatches_processed_++, + tot_weight, tot_objf); + } + } + } +} + +bool NnetXvectorTrainer::PrintTotalStats() const { + unordered_map::const_iterator + iter = objf_info_.begin(), + end = objf_info_.end(); + std::vector > all_pairs; + for (; iter != end; ++iter) + all_pairs.push_back(std::pair( + iter->first, &(iter->second))); + // ensure deterministic order of these names (this will matter in situations + // where a script greps for the objective from the log). + std::sort(all_pairs.begin(), all_pairs.end()); + bool ans = false; + for (size_t i = 0; i < all_pairs.size(); i++) { + const std::string &name = all_pairs[i].first; + const ObjectiveFunctionInfo &info = *(all_pairs[i].second); + bool ok = info.PrintTotalStats(name); + ans = ans || ok; + } + return ans; +} + +NnetXvectorTrainer::~NnetXvectorTrainer() { + delete delta_nnet_; +} + +void GetComputationRequestXvector(const Nnet &nnet, + const NnetExample &eg, + bool need_model_derivative, + bool store_component_stats, + ComputationRequest *request) { + request->inputs.clear(); + request->inputs.reserve(eg.io.size()); + request->outputs.clear(); + request->outputs.reserve(eg.io.size()); + request->need_model_derivative = need_model_derivative; + request->store_component_stats = store_component_stats; + + // Examples for xvectors have no outputs. + for (size_t i = 0; i < eg.io.size(); i++) { + const NnetIo &io = eg.io[i]; + const std::string &name = io.name; + int32 node_index = nnet.GetNodeIndex(name); + + if (node_index == -1 && + !nnet.IsInputNode(node_index)) + KALDI_ERR << "xvector example has input named '" << name + << "', but no such input node is in the network."; + + std::vector &dest = request->inputs; + dest.resize(dest.size() + 1); + IoSpecification &io_spec = dest.back(); + io_spec.name = name; + io_spec.indexes = io.indexes; + io_spec.has_deriv = false; + } + + // We only need the output on frame t=0 for each n. + // So the output index for the output node is (n, 0, 0) + // for n=0 to max(n). + // Indexes for "s" and "b" output nodes are equal to (0,0,0). + int32 io_index_size = request->inputs[0].indexes.size(), + n_indx_size = 0; + std::vector output_indexes, + affine_output_indexes; + affine_output_indexes.resize(1); + affine_output_indexes[0].n = 0; + affine_output_indexes[0].t = 0; + + for (int32 indx = 0; indx < io_index_size; indx++) + n_indx_size = std::max(n_indx_size, + request->inputs[0].indexes[indx].n + 1); + + output_indexes.resize(n_indx_size); + for (int32 indx = 0; indx < n_indx_size; indx++) { + output_indexes[indx].n = indx; + output_indexes[indx].t = 0; + } + + // In order to generate computation request for output nodes, + // we should find output nodes and add io_spec for each one. 
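+  // Example of the resulting index layout (hypothetical minibatch): if the
+  // input indexes cover n = 0..3 (two merged pairs), the "output" node is
+  // requested at (n=0,t=0), (1,0), (2,0), (3,0), i.e. one xvector per
+  // sequence, while the "s" and "b" nodes are requested only at (n=0,t=0)
+  // since they are constant across the minibatch.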
+ int32 num_nodes = nnet.NumNodes(); + for (size_t node_index = 0; node_index < num_nodes; node_index++) { + if (nnet.IsOutputNode(node_index)) { + std::vector &dest = request->outputs; + dest.resize(dest.size() + 1); + IoSpecification &io_spec = dest.back(); + io_spec.name = nnet.GetNodeName(node_index); + if (nnet.GetNodeName(node_index) == "s" || + nnet.GetNodeName(node_index) == "b") + io_spec.indexes = affine_output_indexes; + else + io_spec.indexes = output_indexes; + io_spec.has_deriv = need_model_derivative; + } + } + + // check to see if something went wrong. + if (request->inputs.empty()) + KALDI_ERR << "No inputs in computation request."; + if (request->outputs.empty()) + KALDI_ERR << "No outputs in computation request."; +} + + + +} // namespace nnet3 +} // namespace kaldi diff --git a/src/xvector/nnet-xvector-training.h b/src/xvector/nnet-xvector-training.h new file mode 100644 index 00000000000..121abcf0380 --- /dev/null +++ b/src/xvector/nnet-xvector-training.h @@ -0,0 +1,89 @@ +// xvector/nnet-xvector-training.h + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) +// 2016 Xiaohui Zhang +// Copyright 2016 Pegah Ghahremani +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_XVECTOR_NNET_XVECTOR_TRAINING_H_ +#define KALDI_XVECTOR_NNET_XVECTOR_TRAINING_H_ + +#include "nnet3/nnet-example.h" +#include "nnet3/nnet-computation.h" +#include "nnet3/nnet-compute.h" +#include "nnet3/nnet-optimize.h" +#include "nnet3/nnet-example-utils.h" +#include "xvector/xvector.h" +#include "nnet3/nnet-training.h" + +namespace kaldi { +namespace nnet3 { + + +/** This class is for single-threaded training of neural nets using + standard objective functions such as cross-entropy (implemented with + logsoftmax nonlinearity and a linear objective function) and quadratic loss. + + Something that we should do in the future is to make it possible to have + two different threads, one for the compilation, and one for the computation. + This would only improve efficiency in the cases where the structure of the + input example was different each time, which isn't what we expect to see in + speech-recognition training. (If the structure is the same each time, + the CachingOptimizingCompiler notices this and uses the computation from + last time). + */ +class NnetXvectorTrainer { + public: + NnetXvectorTrainer(const NnetTrainerOptions &config, + Nnet *nnet); + + // train on one minibatch. + void Train(const NnetExample &eg); + + // Prints out the final stats, and return true if there was a nonzero count. + bool PrintTotalStats() const; + + ~NnetXvectorTrainer(); + private: + void ProcessOutputs(NnetComputer *computer); + + const NnetTrainerOptions config_; + Nnet *nnet_; + Nnet *delta_nnet_; // Only used if momentum != 0.0. 
nnet representing + // accumulated parameter-change (we'd call this + // gradient_nnet_, but due to natural-gradient update, + // it's better to consider it as a delta-parameter nnet. + CachingOptimizingCompiler compiler_; + + // This code supports multiple output layers, even though in the + // normal case there will be just one output layer named "output". + // So we store the objective functions per output layer. + int32 num_minibatches_processed_; + + unordered_map objf_info_; +}; + + + +void GetComputationRequestXvector(const Nnet &nnet, + const NnetExample &eg, + bool need_model_derivative, + bool store_component_stats, + ComputationRequest *request); +} // namespace nnet3 +} // namespace kaldi + +#endif // diff --git a/src/xvector/xvector-test.cc b/src/xvector/xvector-test.cc new file mode 100644 index 00000000000..873e382851f --- /dev/null +++ b/src/xvector/xvector-test.cc @@ -0,0 +1,311 @@ +// ivector/xvector-test.cc + +// Copyright 2016 David Snyder + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "xvector/xvector.h" +#include "util/kaldi-io.h" +#include "cudamatrix/cu-matrix-lib.h" + +namespace kaldi { +BaseFloat TestSimilarityScore(const CuVector &v, + const CuVector &w, const CuSpMatrix &S, + BaseFloat b); + +void TestGetDeriv(const CuVector &v, + const CuVector &w, const CuSpMatrix &S, + BaseFloat b, bool is_same, BaseFloat similarity_score, + CuVector *deriv_v, CuVector *deriv_w, + CuVector *deriv_S, BaseFloat *deriv_b); + +void TestComputeXvectorObjfAndDeriv( + const CuMatrixBase &xvector_pairs, + const CuSpMatrix &S, + BaseFloat b, CuMatrixBase *deriv_xvector, + CuVector *deriv_S, BaseFloat *deriv_b, BaseFloat *tot_objf, + BaseFloat *tot_weight); + +bool TestXvectorExtractorDerivative(BaseFloat perturb_delta) { + int32 xvector_dim = RandInt(4, 100), + num_rows = 2 * RandInt(2, 10); // The number of rows must be even + // and greater than 2. + int32 num_rows_subset = RandInt(1, num_rows); + CuSpMatrix S(xvector_dim); + S.SetRandn(); + // Necessary to keep the similarity scores from getting too large or small. + S.Scale(1.0e-01); + BaseFloat b = RandInt(-100, 100) / 10.0, + tot_weight, + tot_objf, + deriv_b; + int32 S_dim = S.NumCols() * (S.NumCols() + 1) / 2; + CuMatrix xvector_pairs(num_rows, xvector_dim, kSetZero), + deriv_xvector(num_rows, xvector_dim, kSetZero); + CuVector deriv_S(S_dim, kSetZero); + xvector_pairs.SetRandn(); + ComputeXvectorObjfAndDeriv(xvector_pairs, S, b, &deriv_xvector, + &deriv_S, &deriv_b, NULL, &tot_objf, &tot_weight); + CuVector deriv_xvector_vec(xvector_dim); + + // Sum over the derivatives for xvector input. + deriv_xvector_vec.AddRowSumMat(1.0, deriv_xvector.RowRange(0, num_rows_subset), + 0.0); + BaseFloat l2_xvector = 0, + l2_S = 0, + l2_b = 0; + + // Compare the xvector derivatives calculated above with a numerical + // approximation. 
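+  // The checks below use a symmetric finite difference: each coordinate is
+  // perturbed by +/- perturb_delta and the derivative is approximated as
+  //   (f(x + delta) - f(x - delta)) / (2 * delta),
+  // whose error is O(delta^2); the squared differences from the analytic
+  // derivatives are accumulated and required to stay below 1.0e-03.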
+ for (int32 i = 0; i < xvector_dim; i++) { + CuMatrix xvector_pairs_p(xvector_pairs); + CuMatrix xvector_pairs_n(xvector_pairs); + for (int32 j = 0; j < num_rows_subset; j++) { + xvector_pairs_p(j, i) += perturb_delta; + xvector_pairs_n(j, i) += -perturb_delta; + } + BaseFloat tot_objf_p, + tot_objf_n; + ComputeXvectorObjfAndDeriv(xvector_pairs_p, S, b, NULL, + NULL, NULL, NULL, &tot_objf_p, &tot_weight); + ComputeXvectorObjfAndDeriv(xvector_pairs_n, S, b, NULL, + NULL, NULL, NULL, &tot_objf_n, &tot_weight); + BaseFloat delta = (tot_objf_p - tot_objf_n) + * 1.0 / (2.0 * perturb_delta); + l2_xvector += pow(deriv_xvector_vec(i) - delta, 2); + } + + // Compare the S derivative calculated above with a numerical + // approximation. + for (int32 i = 0; i < S_dim; i++) { + CuSpMatrix S_p(S); + CuSpMatrix S_n(S); + CuSubVector S_p_vec(S_p.Data(), S_dim); + CuSubVector S_n_vec(S_n.Data(), S_dim); + S_p_vec(i) += perturb_delta; + S_n_vec(i) += -perturb_delta; + BaseFloat tot_objf_p, + tot_objf_n; + ComputeXvectorObjfAndDeriv(xvector_pairs, S_p, b, NULL, + NULL, NULL, NULL, &tot_objf_p, &tot_weight); + ComputeXvectorObjfAndDeriv(xvector_pairs, S_n, b, NULL, + NULL, NULL, NULL, &tot_objf_n, &tot_weight); + BaseFloat delta = (tot_objf_p - tot_objf_n) + * 1.0 / (2.0 * perturb_delta); + l2_S += pow(deriv_S(i) - delta, 2); + } + + // Compare the b derivative calculated above with a numerical + // approximation. + BaseFloat b_p = b + perturb_delta; + BaseFloat b_n = b - perturb_delta; + BaseFloat tot_objf_p; + BaseFloat tot_objf_n; + ComputeXvectorObjfAndDeriv(xvector_pairs, S, b_p, NULL, + NULL, NULL, NULL, &tot_objf_p, &tot_weight); + ComputeXvectorObjfAndDeriv(xvector_pairs, S, b_n, NULL, + NULL, NULL, NULL, &tot_objf_n, &tot_weight); + BaseFloat delta = (tot_objf_p - tot_objf_n) + * 1.0 / (2.0 * perturb_delta); + l2_b = pow(deriv_b - delta, 2); + KALDI_ASSERT(l2_xvector < 1.0e-03); + KALDI_ASSERT(l2_S < 1.0e-03); + KALDI_ASSERT(l2_b < 1.0e-03); + return true; +} + +bool TestXvectorComputeObjf() { + int32 xvector_dim = RandInt(4, 100), + num_rows = 2 * RandInt(2, 10); // The number of rows must be even + // and greater than 2. + CuSpMatrix S(xvector_dim); + S.SetRandn(); + // Necessary to keep the similarity scores from getting too large or small. + S.Scale(1.0e-01); + BaseFloat b = RandInt(-200, 200) / 10.0, + tot_weight, + tot_weight_test, + tot_objf, + tot_objf_test, + deriv_b, + deriv_b_test; + int32 S_dim = S.NumCols() * (S.NumCols() + 1) / 2; + CuMatrix xvector_pairs(num_rows, xvector_dim, kSetZero), + deriv_xvector(num_rows, xvector_dim, kSetZero), + deriv_xvector_test(num_rows, xvector_dim, kSetZero); + CuVector deriv_S(S_dim, kSetZero), + deriv_S_test(S_dim, kSetZero); + xvector_pairs.SetRandn(); + + ComputeXvectorObjfAndDeriv(xvector_pairs, S, b, &deriv_xvector, + &deriv_S, &deriv_b, NULL, &tot_objf, &tot_weight); + TestComputeXvectorObjfAndDeriv(xvector_pairs, S, b, &deriv_xvector_test, + &deriv_S_test, &deriv_b_test, &tot_objf_test, &tot_weight_test); + + CuVector deriv_xvector_vec(xvector_dim); + deriv_xvector_vec.AddRowSumMat(1.0, deriv_xvector, 0.0); + CuVector deriv_xvector_vec_test(xvector_dim); + deriv_xvector_vec_test.AddRowSumMat(1.0, deriv_xvector_test, 0.0); + KALDI_ASSERT(deriv_xvector.ApproxEqual(deriv_xvector_test, 0.01)); + + // Verify that the objfs are the same. + KALDI_ASSERT(ApproxEqual(tot_objf, tot_objf_test, 0.001)); + + // Also verify that the gradients are the same. 
+ for (int32 i = 0; i < deriv_xvector_vec.Dim(); i++)
+ KALDI_ASSERT(ApproxEqual(deriv_xvector_vec(i),
+ deriv_xvector_vec_test(i), 0.001));
+
+ // Verify that the S derivatives are the same.
+ for (int32 i = 0; i < deriv_S.Dim(); i++)
+ KALDI_ASSERT(ApproxEqual(deriv_S(i), deriv_S_test(i), 0.001));
+
+ // Verify that the b derivatives are the same.
+ KALDI_ASSERT(ApproxEqual(deriv_b, deriv_b_test, 0.001));
+ return true;
+}
+
+void TestComputeXvectorObjfAndDeriv(
+ const CuMatrixBase<BaseFloat> &xvector_pairs,
+ const CuSpMatrix<BaseFloat> &S,
+ BaseFloat b, CuMatrixBase<BaseFloat> *deriv_xvector,
+ CuVector<BaseFloat> *deriv_S, BaseFloat *deriv_b, BaseFloat *tot_objf,
+ BaseFloat *tot_weight) {
+
+ int32 N = xvector_pairs.NumRows();
+ BaseFloat same_objf = 0,
+ diff_objf = 0;
+ BaseFloat K = 1.0 / (N - 2.0);
+ (*deriv_b) = 0;
+ // Handle the portion of the objf corresponding to pairs of xvectors
+ // from the same class.
+ for (int32 i = 0; i < N/2; i++) {
+ const CuVector<BaseFloat> &v(xvector_pairs.Row(2 * i)),
+ &w(xvector_pairs.Row(2 * i + 1));
+ CuVector<BaseFloat> deriv_v,
+ deriv_w,
+ deriv_S_part;
+ BaseFloat similarity_score = TestSimilarityScore(v, w, S, b),
+ deriv_b_part = 0;
+ same_objf += Log(1 + Exp(-similarity_score));
+ TestGetDeriv(v, w, S, b, true, similarity_score, &deriv_v,
+ &deriv_w, &deriv_S_part, &deriv_b_part);
+ deriv_xvector->Row(2 * i).AddVec(1.0, deriv_v);
+ deriv_xvector->Row(2 * i + 1).AddVec(1.0, deriv_w);
+ deriv_S->AddVec(1.0, deriv_S_part);
+ (*deriv_b) += deriv_b_part;
+ }
+
+ // Handle the portion of the objf corresponding to pairs of xvectors
+ // from different classes.
+ for (int32 i = 0; i < N; i++) {
+ for (int32 j = 2 * std::ceil((i + 1) / 2.0); j < N; j++) {
+ const CuVector<BaseFloat> &v(xvector_pairs.Row(i)),
+ &w(xvector_pairs.Row(j));
+ CuVector<BaseFloat> deriv_v,
+ deriv_w,
+ deriv_S_part;
+ BaseFloat similarity_score = TestSimilarityScore(v, w, S, b),
+ deriv_b_part = 0;
+ diff_objf += Log(1 + Exp(similarity_score));
+ TestGetDeriv(v, w, S, b, false, similarity_score, &deriv_v,
+ &deriv_w, &deriv_S_part, &deriv_b_part);
+ deriv_xvector->Row(i).AddVec(K, deriv_v);
+ deriv_xvector->Row(j).AddVec(K, deriv_w);
+ deriv_S->AddVec(K, deriv_S_part);
+ (*deriv_b) += K * deriv_b_part;
+ }
+ }
+ // Scale the same and different portions of the objective function
+ // so that both contribute a weight of N.
+ (*tot_objf) = -same_objf - K * diff_objf;
+ (*tot_weight) = N;
+}
+
+
+void TestGetDeriv(const CuVector<BaseFloat> &v,
+ const CuVector<BaseFloat> &w, const CuSpMatrix<BaseFloat> &S,
+ BaseFloat b, bool is_same, BaseFloat similarity_score,
+ CuVector<BaseFloat> *deriv_v, CuVector<BaseFloat> *deriv_w,
+ CuVector<BaseFloat> *deriv_S, BaseFloat *deriv_b) {
+ int32 d = is_same ? 1 : -1,
+ S_dim = S.NumCols() * (S.NumCols() + 1) / 2;
+ deriv_v->Resize(v.Dim(), kSetZero);
+ deriv_w->Resize(v.Dim(), kSetZero);
+ deriv_S->Resize(S_dim, kSetZero);
+
+ // This scalar is common to the different derivatives.
+ BaseFloat deriv_coef = -d * Exp(-1 * d * similarity_score)
+ / (1 + Exp(-1 * d * similarity_score));
+
+ // Handle derivative with respect to v and w.
+ deriv_v->CopyFromVec(w);
+ deriv_w->CopyFromVec(v);
+ deriv_v->AddSpVec(2.0, S, v, -1.0);
+ deriv_w->AddSpVec(2.0, S, w, -1.0);
+ deriv_v->Scale(deriv_coef);
+ deriv_w->Scale(deriv_coef);
+
+ // Handle derivative with respect to S.
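(Editorial note on the S-derivative code below, not part of the patch: for a
symmetric S in packed triangular storage, each off-diagonal entry s_ij with
i != j occupies two positions of the full matrix, so
d(v' S v)/d s_ij = 2 v_i v_j off the diagonal, while d(v' S v)/d s_ii = v_i^2
on it. The AddVec2(2.0, .) calls followed by halving the diagonal therefore
build exactly the packed gradient entries of v' S v + w' S w, which are then
scaled by the common factor deriv_coef.)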
+ CuSpMatrix deriv_S_mat(S.NumCols(), kSetZero); + deriv_S_mat.AddVec2(2.0, v); + deriv_S_mat.AddVec2(2.0, w); + for (int32 i = 0; i < S.NumCols(); i++) + deriv_S_mat(i, i) = 0.5 * deriv_S_mat(i, i); + CuSubVector deriv_S_vec(deriv_S_mat.Data(), S_dim); + deriv_S->AddVec(deriv_coef, deriv_S_vec); + + // Handle derivative with respect to b. + (*deriv_b) = -deriv_coef; +} + +BaseFloat TestSimilarityScore(const CuVector &v, + const CuVector &w, const CuSpMatrix &S, + BaseFloat b) { + CuVector Sv(v.Dim()); + Sv.AddSpVec(1.0, S, v, 0); + CuVector Sw(w.Dim()); + Sw.AddSpVec(1.0, S, w, 0); + BaseFloat L = VecVec(v, w) - VecVec(v, Sv) - VecVec(w, Sw) + b; + return L; +} + +void UnitTestXvectorExtractor() { + if (!TestXvectorComputeObjf()) + KALDI_ERR << "Xvector objf test failed"; + if (!TestXvectorExtractorDerivative(1.0e-02) && + !TestXvectorExtractorDerivative(1.0e-03) && + !TestXvectorExtractorDerivative(1.0e-04) && + !TestXvectorExtractorDerivative(1.0e-05)) + KALDI_ERR << "Xvector derivative test failed"; +} + +} // namespace kaldi + +int main() { + using namespace kaldi; + for (int32 i = 0; i < 2; i++) { +#if HAVE_CUDA == 1 + if (i == 0) + CuDevice::Instantiate().SelectGpuId("no"); + else + CuDevice::Instantiate().SelectGpuId("yes"); +#endif + UnitTestXvectorExtractor(); + } + std::cout << "Xvector tests succeeded.\n"; + return 0; +} diff --git a/src/xvector/xvector.cc b/src/xvector/xvector.cc new file mode 100644 index 00000000000..10e05f8eef6 --- /dev/null +++ b/src/xvector/xvector.cc @@ -0,0 +1,130 @@ +// xvector/xvector.cc + +// Copyright 2016 David Snyder + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "xvector/xvector.h" + +namespace kaldi { + +void ComputeXvectorObjfAndDeriv( + const CuMatrixBase &xvector_pairs, + const CuSpMatrix &S, + BaseFloat b, CuMatrixBase *deriv_xvector, + CuVector *deriv_S, BaseFloat *deriv_b, + CuMatrixBase *scores_out, + BaseFloat *tot_objf, + BaseFloat *tot_weight) { + + int32 S_dim = S.NumCols() * (S.NumCols() + 1) / 2, + N = xvector_pairs.NumRows(), + xvector_dim = xvector_pairs.NumCols(); + (*tot_objf) = 0; + + if (deriv_xvector == NULL) + KALDI_ASSERT(deriv_S == NULL && deriv_b == NULL); + else { + KALDI_ASSERT(deriv_xvector->NumCols() == xvector_dim); + KALDI_ASSERT(deriv_xvector->NumRows() == N); + KALDI_ASSERT(deriv_S->Dim() == S_dim); + deriv_xvector->SetZero(); + deriv_S->SetZero(); + } + + + CuMatrix S_tmp(S), + P(N, xvector_dim), + Q(N, N), + R(N, N), + scores(N, N), // The raw scores. + objf_terms(N, N, kUndefined), + scores_deriv(N, N, // Derivative of the + kUndefined); // objf w.r.t. the scores. 
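(Editorial sketch, not part of the patch: the vectorized block below fills
'scores' so that scores(i, j) = v_i' v_j - v_i' S v_i - v_j' S v_j + b, where
v_i is row i of xvector_pairs. A naive quadratic-time reference, with a
hypothetical name and mirroring the Kaldi calls used elsewhere in this file,
might look like:)

void NaiveScores(const CuMatrixBase<BaseFloat> &xvector_pairs,
                 const CuSpMatrix<BaseFloat> &S, BaseFloat b,
                 CuMatrix<BaseFloat> *scores) {
  int32 N = xvector_pairs.NumRows();
  CuVector<BaseFloat> r(N);  // r(i) = v_i' S v_i, computed once per row.
  for (int32 i = 0; i < N; i++) {
    CuVector<BaseFloat> v(xvector_pairs.Row(i)), Sv(v.Dim());
    Sv.AddSpVec(1.0, S, v, 0.0);
    r(i) = VecVec(v, Sv);
  }
  for (int32 i = 0; i < N; i++) {
    CuVector<BaseFloat> v(xvector_pairs.Row(i));
    for (int32 j = 0; j < N; j++) {
      CuVector<BaseFloat> w(xvector_pairs.Row(j));
      (*scores)(i, j) = VecVec(v, w) - r(i) - r(j) + b;
    }
  }
}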
+ CuVector r(N); + + P.AddMatMat(1.0, xvector_pairs, kNoTrans, S_tmp, kNoTrans, 0.0); + r.AddDiagMatMat(1.0, xvector_pairs, kNoTrans, P, kTrans, 0.0); + R.AddVecToRows(1.0, r); + Q.SymAddMat2(1.0, xvector_pairs, kNoTrans, 0.0); + Q.CopyLowerToUpper(); + scores.AddMat(1.0, Q, kNoTrans); + scores.AddMat(-1.0, R, kTrans); + scores.AddMat(-1.0, R, kNoTrans); + scores.Add(b); + if (scores_out != NULL) { + KALDI_ASSERT(scores_out->NumCols() == scores.NumCols() + && scores_out->NumRows() == scores.NumRows()); + scores_out->CopyFromMat(scores); + } + + cu::ComputeXvectorObjfFromScores(scores, &objf_terms, &scores_deriv); + CuVector objf_terms_vec(N); + objf_terms_vec.AddRowSumMat(1.0, objf_terms); + (*tot_objf) = objf_terms_vec.Sum(); + + if (deriv_xvector != NULL) { + // compute the derivatives of tot_objf w.r.t the inputs. + CuMatrix scores_deriv_plus_trans(scores_deriv, kTrans); + scores_deriv_plus_trans.AddMat(1.0, scores_deriv, kNoTrans); + CuVector r_deriv(N); + r_deriv.AddRowSumMat(-1.0, scores_deriv_plus_trans, 0.0); + + // Compute derivative of the objf with respect to the xvectors. + deriv_xvector->AddDiagVecMat(2.0, r_deriv, P, kNoTrans, 0.0); + deriv_xvector->AddMatMat(1.0, scores_deriv_plus_trans, kNoTrans, + xvector_pairs, kNoTrans, 1.0); + + // Compute derivative of the objf with respect to the symmetric matrix S: + // S_deriv += xvector_pairs' * diag(r_deriv) * xvector_pairs + CuMatrix S_deriv_mat(xvector_dim, xvector_dim); + // we don't need P any more so re-use it temporarily + // rderiv_xvector_pairs is the product of diag(r_deriv) times xvector_pairs. + CuMatrix &rderiv_xvector_pairs(P); + rderiv_xvector_pairs.AddDiagVecMat(1.0, r_deriv, xvector_pairs, kNoTrans, 0.0); + S_deriv_mat.AddMatMat(1.0, xvector_pairs, kTrans, rderiv_xvector_pairs, kNoTrans, 0.0); + CuSpMatrix S_deriv_sp(xvector_dim); + S_deriv_sp.CopyFromMat(S_deriv_mat, kTakeLower); + + // at this point S_deriv_sp represents the deriv w.r.t. S represented as a + // symmetric matrix; but we need the deriv w.r.t. S represented as a packed + // vector, which is a little different because each off-diagonal element is + // only represented once in the packed vector. This means we need + // to scale the off-diag elements by 2. + S_deriv_sp.Scale(2.0); + S_deriv_sp.ScaleDiag(0.5); + deriv_S->CopyFromVec(CuSubVector(S_deriv_sp.Data(), + S_dim)); + + // Compute derivative of objf with respect to the scalar offset b. + (*deriv_b) = scores_deriv.Sum(); + } + (*tot_weight) = N; +} + +BaseFloat SimilarityScore(const Vector &v, + const Vector &w, const SpMatrix &S, + BaseFloat b) { + KALDI_ASSERT(v.Dim() == w.Dim() && v.Dim() == S.NumRows()); + Vector Sv(v.Dim()); + Sv.AddSpVec(1.0, S, v, 0); + Vector Sw(w.Dim()); + Sw.AddSpVec(1.0, S, w, 0); + BaseFloat L = VecVec(v, w) - VecVec(v, Sv) - VecVec(w, Sw) + b; + return L; +} + +} // namespace kaldi diff --git a/src/xvector/xvector.h b/src/xvector/xvector.h new file mode 100644 index 00000000000..fa6c580ab43 --- /dev/null +++ b/src/xvector/xvector.h @@ -0,0 +1,94 @@ +// xvector/xvector.h + +// Copyright 2016 Johns Hopkins University (Author: Daniel Povey) +// 2016 David Snyder + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_XVECTOR_XVECTOR_H_
+#define KALDI_XVECTOR_XVECTOR_H_
+
+#include <vector>
+#include "base/kaldi-common.h"
+#include "cudamatrix/cu-matrix-lib.h"
+#include "itf/options-itf.h"
+#include "util/common-utils.h"
+#include "matrix/matrix-lib.h"
+
+namespace kaldi {
+ /*
+ Computes the training objective function and the derivatives for
+ the xvector. Let N = xvector_pairs.NumRows() be the number of
+ xvectors. There are N(N-1)/2 pairs in total and N/2 from the same
+ class. Let v(n) be the n'th row of the matrix xvector_pairs.
+ The total objective function written to 'tot_objf' is
+ \sum_{n=0}^{N/2 - 1} log p_same(v(2n), v(2n+1))
+ + 1/(N-2) \sum_{n=0}^{N-1} \sum_{m=2*ceil((n+1)/2)}^{N-1}
+ log p_different(v(n), v(m)),
+ and the normalizer N, written to 'tot_weight', equals the total
+ (weighted) number of samples over which the objective function is
+ computed. It is useful for displaying the objective function correctly.
+ Let the log-odds L(v,w) [interpreted as log(p_same(v,w) / p_different(v,w))]
+ be defined as:
+ L(v, w) = v' w - v' S v - w' S w + b
+ then log p_same(v, w) = -log(1 + exp(-L(v, w))), and
+ log p_different(v, w) = log(1 - p_same(v, w)) = -log(1 + exp(L(v, w))).
+
+ @param [in] xvector_pairs Each row of 'xvector_pairs' is an xvector
+ extracted by the network for one sample, and the assumption is that
+ pairs of the form (2*k, 2*k+1), e.g., (0, 1), (2, 3), (4, 5), etc,
+ are from the same class, but any other pairs, e.g., (0, 2), (1, 2),
+ (2, 4), etc, are from different classes.
+ @param [out] deriv_xvector If non-NULL, the derivative of the objective
+ function with respect to the xvectors is written here.
+ @param [out] deriv_S If non-NULL, the derivative of the objective
+ function with respect to the parameter S is written here.
+ @param [out] deriv_b If the other derivatives are non-NULL, the derivative
+ of the objective function with respect to the parameter b is written here.
+ @param [out] scores_out If non-NULL, the matrix of raw scores
+ L(v(i), v(j)) is written here.
+ @param [out] tot_objf The total objective function described above.
+ @param [out] tot_weight The total normalizing factor for the objective
+ function, equal to xvector_pairs.NumRows().
+ */
+ void ComputeXvectorObjfAndDeriv(const CuMatrixBase<BaseFloat> &xvector_pairs,
+ const CuSpMatrix<BaseFloat> &S,
+ BaseFloat b,
+ CuMatrixBase<BaseFloat> *deriv_xvector,
+ CuVector<BaseFloat> *deriv_S,
+ BaseFloat *deriv_b,
+ CuMatrixBase<BaseFloat> *scores_out,
+ BaseFloat *tot_objf,
+ BaseFloat *tot_weight);
+
+ /*
+ Compute the similarity score between two input xvectors. The score is
+ defined as:
+ L(v, w) = v' w - v' S v - w' S w + b
+ @param [in] v The first xvector.
+ @param [in] w The second xvector.
+ @param [in] S A symmetric matrix, usually a constant output of the
+ Nnet the xvectors came from.
+ @param [in] b A scalar offset, usually a constant output of the Nnet
+ the xvectors came from.
+ @return The score between vectors v and w.
+ */
+ BaseFloat SimilarityScore(const Vector<BaseFloat> &v,
+ const Vector<BaseFloat> &w, const SpMatrix<BaseFloat> &S,
+ BaseFloat b);
+
+} // namespace kaldi
+
+#endif // KALDI_XVECTOR_XVECTOR_H_
diff --git a/src/xvectorbin/Makefile b/src/xvectorbin/Makefile
new file mode 100644
index 00000000000..63b78a36880
--- /dev/null
+++ b/src/xvectorbin/Makefile
@@ -0,0 +1,28 @@
+
+all:
+EXTRA_CXXFLAGS = -Wno-sign-compare
+include ../kaldi.mk
+
+LDFLAGS += $(CUDA_LDFLAGS)
+LDLIBS += $(CUDA_LDLIBS)
+
+BINFILES = nnet3-xvector-get-egs nnet3-xvector-compute-prob \
+ nnet3-xvector-show-progress nnet3-xvector-train \
+ nnet3-xvector-compute nnet3-xvector-scoring nnet3-xvector-get-egs-sre \
+ nnet3-xvector-get-egs-sre-subsample nnet3-xvector-compute-simple
+
+OBJFILES =
+
+# Add this dependency to force cuda-compiled.o to be rebuilt when we reconfigure.
+cuda-compiled.o: ../kaldi.mk
+
+TESTFILES =
+
+ADDLIBS = ../xvector/kaldi-xvector.a ../nnet3/kaldi-nnet3.a ../gmm/kaldi-gmm.a \
+ ../decoder/kaldi-decoder.a ../lat/kaldi-lat.a ../hmm/kaldi-hmm.a \
+ ../transform/kaldi-transform.a ../tree/kaldi-tree.a \
+ ../cudamatrix/kaldi-cudamatrix.a \
+ ../matrix/kaldi-matrix.a ../fstext/kaldi-fstext.a \
+ ../util/kaldi-util.a ../base/kaldi-base.a
+
+include ../makefiles/default_rules.mk
diff --git a/src/xvectorbin/nnet3-xvector-compute-prob.cc b/src/xvectorbin/nnet3-xvector-compute-prob.cc
new file mode 100644
index 00000000000..fb3975b259d
--- /dev/null
+++ b/src/xvectorbin/nnet3-xvector-compute-prob.cc
@@ -0,0 +1,81 @@
+// xvectorbin/nnet3-xvector-compute-prob.cc
+
+// Copyright 2015 Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "nnet3/nnet-diagnostics.h"
+#include "xvector/nnet-xvector-diagnostics.h"
+
+
+int main(int argc, char *argv[]) {
+ try {
+ using namespace kaldi;
+ using namespace kaldi::nnet3;
+ typedef kaldi::int32 int32;
+ typedef kaldi::int64 int64;
+
+ const char *usage =
+ "Computes and prints in logging messages the average log-prob per frame of\n"
+ "the given data with an nnet3 neural net. The input of this is the output of\n"
+ "e.g. nnet3-xvector-get-egs | nnet3-merge-egs.\n"
+ "\n"
+ "Usage: nnet3-xvector-compute-prob [options] <raw-model-in> <training-examples-in>\n"
+ "e.g.: nnet3-xvector-compute-prob 0.raw ark:valid.egs\n";
+
+
+ // This program doesn't support using a GPU, because these probabilities are
+ // used for diagnostics, and you can just compute them with a small enough
+ // amount of data that a CPU can do it within reasonable time.
+ + NnetComputeProbOptions opts; + + ParseOptions po(usage); + + opts.Register(&po); + + po.Read(argc, argv); + + if (po.NumArgs() != 2) { + po.PrintUsage(); + exit(1); + } + + std::string raw_nnet_rxfilename = po.GetArg(1), + examples_rspecifier = po.GetArg(2); + + Nnet nnet; + ReadKaldiObject(raw_nnet_rxfilename, &nnet); + + NnetXvectorComputeProb prob_computer(opts, nnet); + + SequentialNnetExampleReader example_reader(examples_rspecifier); + + for (; !example_reader.Done(); example_reader.Next()) + prob_computer.Compute(example_reader.Value()); + + bool ok = prob_computer.PrintTotalStats(); + + return (ok ? 0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} + + diff --git a/src/xvectorbin/nnet3-xvector-compute-simple.cc b/src/xvectorbin/nnet3-xvector-compute-simple.cc new file mode 100644 index 00000000000..e588edd90b7 --- /dev/null +++ b/src/xvectorbin/nnet3-xvector-compute-simple.cc @@ -0,0 +1,155 @@ +// nnet3bin/nnet3-compute.cc + +// Copyright 2012-2015 Johns Hopkins University (author: Daniel Povey) +// 2016 David Snyder + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "base/timer.h" +#include "nnet3/nnet-utils.h" +#include "xvector/nnet-xvector-compute.h" + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "Propagate the features through the network and write the output\n" + "xvectors. By default, xvectors are extracted once every\n" + "--xvector-period using --chunk-size frames and output as an archive\n" + "of matrices. If --repeat=true, the xvectors are copied between\n" + "periods, so that the output matrix has the same number of rows as\n" + "the input. If --output-as-vector=true, the xvectors are averaged\n" + "across periods, and the output is a single vector for each utterance.\n" + "\n" + "Usage: nnet3-xvector-compute [options] " + " \n" + " e.g.: nnet3-xvector-compute --xvector-period=50 final.raw " + "scp:feats.scp ark:xvectors.ark\n"; + + ParseOptions po(usage); + Timer timer; + + NnetSimpleComputationOptions opts; + std::string use_gpu = "yes"; + int32 chunk_size = 100; + + opts.Register(&po); + + po.Register("use-gpu", &use_gpu, + "yes|no|optional|wait, only has effect if compiled with CUDA"); + po.Register("chunk-size", &chunk_size, + "Feature chunk size over which the xvector is computed. 
" + "If not set, defaults to xvector-period."); + po.Read(argc, argv); + + if (po.NumArgs() != 3) { + po.PrintUsage(); + exit(1); + } + +#if HAVE_CUDA==1 + CuDevice::Instantiate().SelectGpuId(use_gpu); +#endif + + KALDI_ASSERT(chunk_size > 0); + + std::string nnet_rxfilename = po.GetArg(1), + feat_rspecifier = po.GetArg(2), + vector_wspecifier = po.GetArg(3); + Nnet nnet; + ReadKaldiObject(nnet_rxfilename, &nnet); + NnetXvectorComputer nnet_computer(opts, &nnet); + + BaseFloatVectorWriter vector_writer(vector_wspecifier); + + int32 num_success = 0, + num_fail = 0, + left_context, + right_context, + xvector_dim = nnet.OutputDim("output"); + int32 min_chunk_size = 100; + int64 frame_count = 0; + SequentialBaseFloatMatrixReader feat_reader(feat_rspecifier); + for (; !feat_reader.Done(); feat_reader.Next()) { + std::string utt = feat_reader.Key(); + const Matrix &feats (feat_reader.Value()); + int32 num_rows = feats.NumRows(), + feat_dim = feats.NumCols(), + this_chunk_size = chunk_size; + + if (num_rows < min_chunk_size) { + KALDI_WARN << "Minimum chunk size of " << min_chunk_size + << " is greater than the number of rows " + << "in utterance: " << utt; + num_fail++; + continue; + } else if (num_rows < this_chunk_size) { + KALDI_LOG << "Chunk size of " << this_chunk_size << " is greater than " + << "the number of rows in utterance: " << utt + << ", using chunk size of " << num_rows; + this_chunk_size = num_rows; + } + + int32 num_chunks = ceil(num_rows / static_cast(chunk_size)); + + Vector xvector_avg(xvector_dim, kSetZero); + BaseFloat tot_weight = 0.0; + + // Iterate over the feature chunks. + for (int32 chunk_indx = 0; chunk_indx < num_chunks; chunk_indx++) { + // If we're nearing the end of the input, we may need to shift the + // offset back so that we can get this_chunk_size frames of input to + // the nnet. + int32 offset = std::min(chunk_size, num_rows - chunk_indx * chunk_size); + if (offset < min_chunk_size) + continue; + SubMatrix sub_feats(feats, chunk_indx * chunk_size, offset, + 0, feat_dim); + Vector xvector(xvector_dim); + nnet_computer.ComputeXvector(sub_feats, &xvector); + tot_weight += offset; + xvector_avg.AddVec(offset, xvector); + } + + // If output is a vector, scale it by the total weight. + xvector_avg.Scale(1.0 / tot_weight); + vector_writer.Write(utt, xvector_avg); + + frame_count += feats.NumRows(); + num_success++; + } + + double elapsed = timer.Elapsed(); + KALDI_LOG << "Time taken "<< elapsed + << "s: real-time factor assuming 100 frames/sec is " + << (elapsed*100.0/frame_count); + KALDI_LOG << "Done " << num_success << " utterances, failed for " + << num_fail; + + if (num_success != 0) return 0; + else return 1; + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/src/xvectorbin/nnet3-xvector-compute.cc b/src/xvectorbin/nnet3-xvector-compute.cc new file mode 100644 index 00000000000..c2d16c867ca --- /dev/null +++ b/src/xvectorbin/nnet3-xvector-compute.cc @@ -0,0 +1,211 @@ +// nnet3bin/nnet3-compute.cc + +// Copyright 2012-2015 Johns Hopkins University (author: Daniel Povey) +// 2016 David Snyder + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "base/timer.h" +#include "nnet3/nnet-utils.h" +#include "xvector/nnet-xvector-compute.h" + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "Propagate the features through the network and write the output\n" + "xvectors. By default, xvectors are extracted once every\n" + "--xvector-period using --chunk-size frames and output as an archive\n" + "of matrices. If --repeat=true, the xvectors are copied between\n" + "periods, so that the output matrix has the same number of rows as\n" + "the input. If --output-as-vector=true, the xvectors are averaged\n" + "across periods, and the output is a single vector for each utterance.\n" + "\n" + "Usage: nnet3-xvector-compute [options] " + " \n" + " e.g.: nnet3-xvector-compute --xvector-period=50 final.raw " + "scp:feats.scp ark:xvectors.ark\n"; + + ParseOptions po(usage); + Timer timer; + + NnetSimpleComputationOptions opts; + std::string use_gpu = "yes"; + int32 xvector_period = 10, + chunk_size = -1; + bool output_as_vector = false, + repeat = false; + + opts.Register(&po); + + po.Register("use-gpu", &use_gpu, + "yes|no|optional|wait, only has effect if compiled with CUDA"); + po.Register("xvector-period", &xvector_period, + "Extract a new xvector once for each period."); + po.Register("chunk-size", &chunk_size, + "Feature chunk size over which the xvector is computed. " + "If not set, defaults to xvector-period."); + po.Register("output-as-vector", &output_as_vector, + "If true, average the chunk-level xvectors and output as an " + "archive of vectors."); + po.Register("repeat", &repeat, "If true, the xvectors are copied between " + "periods so that the output has the same number of rows as the input."); + po.Read(argc, argv); + + if (po.NumArgs() != 3) { + po.PrintUsage(); + exit(1); + } + +#if HAVE_CUDA==1 + CuDevice::Instantiate().SelectGpuId(use_gpu); +#endif + + if (output_as_vector && repeat) + KALDI_ERR << "Options --output-as-vector and --repeat cannot both " + << "be true."; + if (chunk_size == -1) + chunk_size = xvector_period; + + KALDI_ASSERT(chunk_size > 0 && xvector_period > 0); + + std::string nnet_rxfilename = po.GetArg(1), + feat_rspecifier = po.GetArg(2), + vector_wspecifier = po.GetArg(3); + Nnet nnet; + ReadKaldiObject(nnet_rxfilename, &nnet); + NnetXvectorComputer nnet_computer(opts, &nnet); + + BaseFloatMatrixWriter matrix_writer(output_as_vector + ? "" : vector_wspecifier); + BaseFloatVectorWriter vector_writer(output_as_vector + ? 
vector_wspecifier : ""); + + int32 num_success = 0, + num_fail = 0, + left_context, + right_context, + xvector_dim = nnet.OutputDim("output"); + ComputeSimpleNnetContext(nnet, &left_context, &right_context); + int32 min_chunk_size = left_context + right_context; + int64 frame_count = 0; + + SequentialBaseFloatMatrixReader feat_reader(feat_rspecifier); + for (; !feat_reader.Done(); feat_reader.Next()) { + std::string utt = feat_reader.Key(); + const Matrix &feats (feat_reader.Value()); + int32 num_rows = feats.NumRows(), + feat_dim = feats.NumCols(), + this_chunk_size = chunk_size; + + if (num_rows < min_chunk_size) { + KALDI_WARN << "Minimum chunk size of " << min_chunk_size + << " is greater than the number of rows " + << "in utterance: " << utt; + num_fail++; + continue; + } else if (num_rows < this_chunk_size) { + KALDI_LOG << "Chunk size of " << this_chunk_size << " is greater than " + << "the number of rows in utterance: " << utt + << ", using chunk size of " << num_rows; + this_chunk_size = num_rows; + } + + int32 num_chunks = ceil((num_rows - this_chunk_size) + / static_cast(xvector_period)) + 1; + int32 num_xvectors = repeat ? num_rows : num_chunks; + + // The number of frames by which the last two chunks overlap. + int32 overlap = std::max(0, (num_chunks - 1) * xvector_period + - num_rows + this_chunk_size); + BaseFloat total_chunk_weight = 0.0; + Vector xvector_avg; + Matrix xvector_mat; + + // Create the output xvector vector or matrix. Only allocate memory + // for the one we're going to output. + if (output_as_vector) + xvector_avg.Resize(xvector_dim); + else + xvector_mat.Resize(num_xvectors, xvector_dim); + + // Iterate over the feature chunks. + for (int32 chunk_indx = 0; chunk_indx < num_chunks; chunk_indx++) { + // If we're nearing the end of the input, we may need to shift the + // offset back so that we can get this_chunk_size frames of input to + // the nnet. + int32 offset = std::min(chunk_indx * xvector_period, + num_rows - this_chunk_size); + SubMatrix sub_feats(feats, offset, this_chunk_size, + 0, feat_dim); + Vector xvector(xvector_dim); + nnet_computer.ComputeXvector(sub_feats, &xvector); + + if (output_as_vector) { + // The second to last chunk may have extra overlap with the + // final chunk. We need to reduce the weight on these + // chunks, so that the overlapping portion isn't counted twice. + BaseFloat weight; + if (chunk_indx < num_chunks - 2) + weight = this_chunk_size; + else + weight = this_chunk_size - 0.5 * overlap; + total_chunk_weight += weight; + xvector_avg.AddVec(weight, xvector); + // Cases for outputting as a matrix: + } else if (repeat) { + int32 out_offset = chunk_indx * xvector_period; + for (int32 i = out_offset; + i < std::min(out_offset + xvector_period, num_rows); i++) + xvector_mat.Row(i).CopyFromVec(xvector); + } else { + xvector_mat.Row(chunk_indx).CopyFromVec(xvector); + } + } + + // If output is a vector, scale it by the total weight. 
+ if (output_as_vector) { + xvector_avg.Scale(1.0 / total_chunk_weight); + vector_writer.Write(utt, xvector_avg); + } else { + matrix_writer.Write(utt, xvector_mat); + } + + frame_count += feats.NumRows(); + num_success++; + } + + double elapsed = timer.Elapsed(); + KALDI_LOG << "Time taken "<< elapsed + << "s: real-time factor assuming 100 frames/sec is " + << (elapsed*100.0/frame_count); + KALDI_LOG << "Done " << num_success << " utterances, failed for " + << num_fail; + + if (num_success != 0) return 0; + else return 1; + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/src/xvectorbin/nnet3-xvector-get-egs-sre-subsample.cc b/src/xvectorbin/nnet3-xvector-get-egs-sre-subsample.cc new file mode 100644 index 00000000000..685279f356a --- /dev/null +++ b/src/xvectorbin/nnet3-xvector-get-egs-sre-subsample.cc @@ -0,0 +1,264 @@ +// xvectorbin/nnet3-xvector-get-egs.cc + +// Copyright 2012-2016 Johns Hopkins University (author: Daniel Povey) +// 2016 David Snyder + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "util/common-utils.h" +#include "nnet3/nnet-example.h" + +namespace kaldi { +namespace nnet3 { + +// A struct for holding information about the position and +// duration of each pair of chunks. +struct ChunkPairInfo { + std::string pair_name; + std::string utt1; + std::string utt2; + int32 output_archive_id; + int32 start_frame1; + int32 start_frame2; + int32 num_frames1; + int32 num_frames2; +}; + +// Process the range input file and store it as a map from utterance +// name to vector of ChunkPairInfo structs. 
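(Editorial worked example for the parser below; the utterance names are
hypothetical. The 8-field line

  utt1 utt2 3 13 0 65 112 110

is parsed as utt1 = "utt1", utt2 = "utt2", output_archive_id = 3, fields[3]
("13") unused by this variant, start_frame1 = 0, num_frames1 = 65,
start_frame2 = 112, num_frames2 = 110; each parsed pair is appended to the
flat 'pairs' vector under the pair_name "utt1-0-65-utt2-112-110".)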
+static void ProcessRangeFile(const std::string &range_rxfilename,
+ std::vector<ChunkPairInfo *> *pairs) {
+ Input range_input(range_rxfilename);
+ if (!range_rxfilename.empty()) {
+ std::string line;
+ while (std::getline(range_input.Stream(), line)) {
+ ChunkPairInfo *pair = new ChunkPairInfo();
+ std::vector<std::string> fields;
+ SplitStringToVector(line, " \t\n\r", true, &fields);
+ if (fields.size() != 8)
+ KALDI_ERR << "Expected 8 fields in line of range file, got "
+ << fields.size() << " instead.";
+
+ std::string utt1 = fields[0],
+ utt2 = fields[1],
+ start_frame1_str = fields[4],
+ num_frames1_str = fields[5],
+ start_frame2_str = fields[6],
+ num_frames2_str = fields[7];
+ pair->utt1 = utt1;
+ pair->utt2 = utt2;
+ if (!ConvertStringToInteger(fields[2], &(pair->output_archive_id))
+ || !ConvertStringToInteger(start_frame1_str, &(pair->start_frame1))
+ || !ConvertStringToInteger(start_frame2_str, &(pair->start_frame2))
+ || !ConvertStringToInteger(num_frames1_str, &(pair->num_frames1))
+ || !ConvertStringToInteger(num_frames2_str, &(pair->num_frames2)))
+ KALDI_ERR << "Expected integers for the archive index and frame "
+ << "fields of the range file.";
+ pair->pair_name = utt1 + "-" + start_frame1_str + "-" + num_frames1_str
+ + "-" + utt2
+ + "-" + start_frame2_str + "-" + num_frames2_str;
+ pairs->push_back(pair);
+ }
+ }
+}
+
+static void WriteExample(const MatrixBase<BaseFloat> &feat1,
+ const MatrixBase<BaseFloat> &feat2,
+ const ChunkPairInfo *pair,
+ int32 subsample,
+ bool compress,
+ int32 *num_egs_written,
+ std::vector<NnetExampleWriter *> *example_writers) {
+ NnetExample eg;
+ int32 num_rows1 = feat1.NumRows(),
+ feat_dim1 = feat1.NumCols(),
+ num_rows2 = feat2.NumRows(),
+ feat_dim2 = feat2.NumCols();
+ std::string utt1 = pair->utt1,
+ utt2 = pair->utt2;
+
+ KALDI_ASSERT(feat_dim1 == feat_dim2);
+
+ if (num_rows1 < pair->num_frames1) {
+ KALDI_WARN << "Unable to create examples for utterance "
+ << utt1
+ << ". Requested chunk size of "
+ << pair->num_frames1
+ << " but utterance has only " << num_rows1 << " frames.";
+ return;
+ }
+ if (num_rows2 < pair->num_frames2) {
+ KALDI_WARN << "Unable to create examples for utterance "
+ << utt2
+ << ". Requested chunk size of "
+ << pair->num_frames2
+ << " but utterance has only " << num_rows2 << " frames.";
+ return;
+ }
+ // The requested chunk positions are approximate. It's possible
+ // that they slightly exceed the number of frames in the utterance.
+ // If that occurs, we can shift the chunk's location back slightly.
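(For instance, editorial: with num_rows1 = 500, start_frame1 = 450 and
num_frames1 = 100, shift1 = min(0, 500 - 450 - 100) = -50, so the chunk is
taken from frame 400 and still contains the requested 100 frames.)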
+ int32 shift1 = std::min(0, num_rows1 - pair->start_frame1
+ - pair->num_frames1),
+ shift2 = std::min(0, num_rows2 - pair->start_frame2
+ - pair->num_frames2);
+
+ SubMatrix<BaseFloat> chunk1_sub(feat1, pair->start_frame1 + shift1,
+ pair->num_frames1, 0, feat_dim1),
+ chunk2_sub(feat2, pair->start_frame2 + shift2,
+ pair->num_frames2, 0, feat_dim2);
+ Matrix<BaseFloat> chunk1_tmp(chunk1_sub);
+ Matrix<BaseFloat> chunk2_tmp(chunk2_sub);
+
+ int32 real_chunk_size1 = chunk1_tmp.NumRows() / subsample;
+ int32 real_chunk_size2 = chunk2_tmp.NumRows() / subsample;
+ Matrix<BaseFloat> chunk1(real_chunk_size1, chunk1_tmp.NumCols());
+ Matrix<BaseFloat> chunk2(real_chunk_size2, chunk2_tmp.NumCols());
+
+ std::vector<int32> index_vector1;
+ for (int32 i = 0; i < chunk1_tmp.NumRows(); i++)
+ index_vector1.push_back(i);
+
+ std::vector<int32> index_vector2;
+ for (int32 i = 0; i < chunk2_tmp.NumRows(); i++)
+ index_vector2.push_back(i);
+
+ // Keep a random 1/subsample of the frames of each chunk: after shuffling
+ // the frame indexes, copy the rows selected by the first real_chunk_size
+ // entries of the shuffled list.
+ std::random_shuffle(index_vector1.begin(), index_vector1.end());
+ for (int32 i = 0; i < real_chunk_size1; i++)
+ chunk1.Row(i).CopyFromVec(chunk1_tmp.Row(index_vector1[i]));
+
+ std::random_shuffle(index_vector2.begin(), index_vector2.end());
+ for (int32 i = 0; i < real_chunk_size2; i++)
+ chunk2.Row(i).CopyFromVec(chunk2_tmp.Row(index_vector2[i]));
+
+ NnetIo nnet_io1 = NnetIo("input", 0, chunk1),
+ nnet_io2 = NnetIo("input", 0, chunk2);
+ for (std::vector<Index>::iterator indx_it = nnet_io1.indexes.begin();
+ indx_it != nnet_io1.indexes.end(); ++indx_it)
+ indx_it->n = 0;
+ for (std::vector<Index>::iterator indx_it = nnet_io2.indexes.begin();
+ indx_it != nnet_io2.indexes.end(); ++indx_it)
+ indx_it->n = 1;
+
+ eg.io.push_back(nnet_io1);
+ eg.io.push_back(nnet_io2);
+ if (compress)
+ eg.Compress();
+
+ if (pair->output_archive_id >= example_writers->size())
+ KALDI_ERR << "Requested output index exceeds number of specified "
+ << "output files.";
+ (*example_writers)[pair->output_archive_id]->Write(
+ pair->pair_name, eg);
+ (*num_egs_written) += 1;
+}
+
+// Delete the dynamically allocated memory.
+static void Cleanup(std::vector<ChunkPairInfo *> *pairs,
+ std::vector<NnetExampleWriter *> *writers) {
+ for (std::vector<ChunkPairInfo *>::iterator
+ vec_it = pairs->begin(); vec_it != pairs->end();
+ ++vec_it)
+ delete *vec_it;
+ for (std::vector<NnetExampleWriter *>::iterator
+ it = writers->begin(); it != writers->end(); ++it)
+ delete *it;
+}
+
+} // namespace nnet3
+} // namespace kaldi
+
+int main(int argc, char *argv[]) {
+ try {
+ using namespace kaldi;
+ using namespace kaldi::nnet3;
+ typedef kaldi::int32 int32;
+
+ const char *usage =
+ "Get examples for training an nnet3 neural network for the xvector\n"
+ "system. Each output example contains a pair of feature chunks from\n"
+ "the same utterance. The location and length of the feature chunks\n"
+ "are specified in the 'ranges' file. Each line is interpreted as\n"
+ "follows:\n"
+ " <source-utterance1> <source-utterance2> <relative-output-archive-index> "
+ " <absolute-archive-index> <start-frame-index1> <num-frames1> "
+ " <start-frame-index2> <num-frames2>\n"
+ "where <relative-output-archive-index> is interpreted as a zero-based\n"
+ "index into the wspecifiers specified on the command line (<egs-0-out>\n"
+ "and so on), and <absolute-archive-index> is ignored by this program.\n"
+ "For example:\n"
+ " utt1 utt2 3 13 0 65 112 110\n"
+ " utt3 utt4 0 10 160 50 214 180\n"
+ " utt5 ...\n"
+ "\n"
+ "Usage: nnet3-xvector-get-egs [options] <ranges-filename> <feats-rspecifier> "
+ " ... 
\n" + "\n" + "For example:\n" + "nnet3-xvector-get-egs ranges.1 \"$feats\" ark:egs_temp.1.ark" + " ark:egs_temp.2.ark ark:egs_temp.3.ark\n"; + + bool compress = true; + int32 subsample = 5; + + ParseOptions po(usage); + po.Register("compress", &compress, "If true, write egs in " + "compressed format."); + po.Register("subsample", &subsample, "TODO"); + + po.Read(argc, argv); + + if (po.NumArgs() < 3) { + po.PrintUsage(); + exit(1); + } + + std::string + range_rspecifier = po.GetArg(1), + feature_rspecifier = po.GetArg(2); + std::vector example_writers; + + for (int32 i = 3; i <= po.NumArgs(); i++) + example_writers.push_back(new NnetExampleWriter(po.GetArg(i))); + + std::vector pairs; + ProcessRangeFile(range_rspecifier, &pairs); + RandomAccessBaseFloatMatrixReader feat_reader1(feature_rspecifier); + RandomAccessBaseFloatMatrixReader feat_reader2(feature_rspecifier); + int32 num_done = 0, + num_err = 0, + num_egs_written = 0; + for (int32 i = 0; i < pairs.size(); i++) { + ChunkPairInfo *pair = pairs[i]; + const Matrix &feat1(feat_reader1.Value(pair->utt1)); + const Matrix &feat2(feat_reader2.Value(pair->utt2)); + WriteExample(feat1, feat2, pair, subsample, compress, &num_egs_written, + &example_writers); + num_done++; + } + Cleanup(&pairs, &example_writers); + + KALDI_LOG << "Finished generating examples, " + << "successfully processed " << num_done + << " feature files, wrote " << num_egs_written << " examples; " + << num_err << " files had errors."; + return (num_egs_written == 0 || num_err > num_done ? 1 : 0); + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} diff --git a/src/xvectorbin/nnet3-xvector-get-egs-sre.cc b/src/xvectorbin/nnet3-xvector-get-egs-sre.cc new file mode 100644 index 00000000000..28fde0fbf36 --- /dev/null +++ b/src/xvectorbin/nnet3-xvector-get-egs-sre.cc @@ -0,0 +1,237 @@ +// xvectorbin/nnet3-xvector-get-egs.cc + +// Copyright 2012-2016 Johns Hopkins University (author: Daniel Povey) +// 2016 David Snyder + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "util/common-utils.h" +#include "nnet3/nnet-example.h" + +namespace kaldi { +namespace nnet3 { + +// A struct for holding information about the position and +// duration of each pair of chunks. +struct ChunkPairInfo { + std::string pair_name; + std::string utt1; + std::string utt2; + int32 output_archive_id; + int32 start_frame1; + int32 start_frame2; + int32 num_frames1; + int32 num_frames2; +}; + +// Process the range input file and store it as a map from utterance +// name to vector of ChunkPairInfo structs. 
+static void ProcessRangeFile(const std::string &range_rxfilename, + std::vector *pairs) { + Input range_input(range_rxfilename); + if (!range_rxfilename.empty()) { + std::string line; + while (std::getline(range_input.Stream(), line)) { + ChunkPairInfo *pair = new ChunkPairInfo(); + std::vector fields; + SplitStringToVector(line, " \t\n\r", true, &fields); + if (fields.size() != 8) + KALDI_ERR << "Expected 7 fields in line of range file, got " + << fields.size() << " instead."; + + std::string utt1 = fields[0], + utt2 = fields[1], + start_frame1_str = fields[4], + num_frames1_str = fields[5], + start_frame2_str = fields[6], + num_frames2_str = fields[7]; + pair->utt1 = utt1; + pair->utt2 = utt2; + if (!ConvertStringToInteger(fields[2], &(pair->output_archive_id)) + || !ConvertStringToInteger(start_frame1_str, &(pair->start_frame1)) + || !ConvertStringToInteger(start_frame2_str, &(pair->start_frame2)) + || !ConvertStringToInteger(num_frames1_str, &(pair->num_frames1)) + || !ConvertStringToInteger(num_frames2_str, &(pair->num_frames2))) + KALDI_ERR << "Expected integer for output archive in range file."; + pair->pair_name = utt1 + "-" + start_frame1_str + "-" + num_frames1_str + + "-" + utt2 + + "-" + start_frame2_str + "-" + num_frames2_str; + pairs->push_back(pair); + } + } +} + +static void WriteExample(const MatrixBase &feat1, + const MatrixBase &feat2, + const ChunkPairInfo *pair, + bool compress, + int32 *num_egs_written, + std::vector *example_writers) { + NnetExample eg; + int32 num_rows1 = feat1.NumRows(), + feat_dim1 = feat1.NumCols(), + num_rows2 = feat2.NumRows(), + feat_dim2 = feat2.NumCols(); + std::string utt1 = pair->utt1, + utt2 = pair->utt2; + + KALDI_ASSERT(feat_dim1 == feat_dim2); + + if (num_rows1 < pair->num_frames1) { + KALDI_WARN << "Unable to create examples for utterance " + << utt1 + << ". Requested chunk size of " + << pair->num_frames1 + << " but utterance has only " << num_rows1 << " frames."; + return; + } + if (num_rows2 < pair->num_frames2) { + KALDI_WARN << "Unable to create examples for utterance " + << utt2 + << ". Requested chunk size of " + << pair->num_frames2 + << " but utterance has only " << num_rows2 << " frames."; + return; + } + // The requested chunk positions are approximate. It's possible + // that they slightly exceed the number of frames in the utterance. + // If that occurs, we can shift the chunks location back slightly. + int32 shift1 = std::min(0, num_rows1 - pair->start_frame1 + - pair->num_frames1), + shift2 = std::min(0, num_rows2 - pair->start_frame2 + - pair->num_frames2); + + SubMatrix chunk1(feat1, pair->start_frame1 + shift1, + pair->num_frames1, 0, feat_dim1), + chunk2(feat2, pair->start_frame2 + shift2, + pair->num_frames2, 0, feat_dim2); + NnetIo nnet_io1 = NnetIo("input", 0, chunk1), + nnet_io2 = NnetIo("input", 0, chunk2); + for (std::vector::iterator indx_it = nnet_io1.indexes.begin(); + indx_it != nnet_io1.indexes.end(); ++indx_it) + indx_it->n = 0; + for (std::vector::iterator indx_it = nnet_io2.indexes.begin(); + indx_it != nnet_io2.indexes.end(); ++indx_it) + indx_it->n = 1; + + eg.io.push_back(nnet_io1); + eg.io.push_back(nnet_io2); + if (compress) + eg.Compress(); + + if (pair->output_archive_id >= example_writers->size()) + KALDI_ERR << "Requested output index exceeds number of specified " + << "output files."; + (*example_writers)[pair->output_archive_id]->Write( + pair->pair_name, eg); + (*num_egs_written) += 1; +} + +// Delete the dynamically allocated memory. 
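(Editorial aside: the explicit Cleanup() below is needed because the pair and
writer vectors hold raw owning pointers. A minimal sketch of the same
ownership pattern with std::unique_ptr, assuming C++11 were available, under
which the manual deletes become unnecessary:)

#include <memory>
#include <vector>

void OwnershipSketch() {
  // An owning container: when it goes out of scope, every element is freed,
  // so no explicit cleanup pass is required.
  std::vector<std::unique_ptr<int> > owned;
  owned.push_back(std::unique_ptr<int>(new int(42)));
}  // all elements deleted here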
+static void Cleanup(std::vector *pairs, + std::vector *writers) { + for (std::vector::iterator + vec_it = pairs->begin(); vec_it != pairs->end(); + ++vec_it) + delete *vec_it; + for (std::vector::iterator + it = writers->begin(); it != writers->end(); ++it) + delete *it; +} + +} // namespace nnet3 +} // namespace kaldi + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + typedef kaldi::int32 int32; + + const char *usage = + "Get examples for training an nnet3 neural network for the xvector\n" + "system. Each output example contains a pair of feature chunks from\n" + "the same utterance. The location and length of the feature chunks\n" + "are specified in the 'ranges' file. Each line is interpreted as\n" + "follows:\n" + " " + " " + " \n" + "where is interpreted as a zero-based\n" + "index into the wspecifiers specified on the command line (\n" + "and so on), and is ignored by this program.\n" + "For example:\n" + " utt1 3 13 0 65 112 110\n" + " utt1 0 10 160 50 214 180\n" + " utt2 ...\n" + "\n" + "Usage: nnet3-xvector-get-egs [options] " + " ... \n" + "\n" + "For example:\n" + "nnet3-xvector-get-egs ranges.1 \"$feats\" ark:egs_temp.1.ark" + " ark:egs_temp.2.ark ark:egs_temp.3.ark\n"; + + bool compress = true; + + ParseOptions po(usage); + po.Register("compress", &compress, "If true, write egs in " + "compressed format."); + + po.Read(argc, argv); + + if (po.NumArgs() < 3) { + po.PrintUsage(); + exit(1); + } + + std::string + range_rspecifier = po.GetArg(1), + feature_rspecifier = po.GetArg(2); + std::vector example_writers; + + for (int32 i = 3; i <= po.NumArgs(); i++) + example_writers.push_back(new NnetExampleWriter(po.GetArg(i))); + + std::vector pairs; + ProcessRangeFile(range_rspecifier, &pairs); + RandomAccessBaseFloatMatrixReader feat_reader1(feature_rspecifier); + RandomAccessBaseFloatMatrixReader feat_reader2(feature_rspecifier); + int32 num_done = 0, + num_err = 0, + num_egs_written = 0; + for (int32 i = 0; i < pairs.size(); i++) { + ChunkPairInfo *pair = pairs[i]; + const Matrix &feat1(feat_reader1.Value(pair->utt1)); + const Matrix &feat2(feat_reader2.Value(pair->utt2)); + WriteExample(feat1, feat2, pair, compress, &num_egs_written, + &example_writers); + num_done++; + } + Cleanup(&pairs, &example_writers); + + KALDI_LOG << "Finished generating examples, " + << "successfully processed " << num_done + << " feature files, wrote " << num_egs_written << " examples; " + << num_err << " files had errors."; + return (num_egs_written == 0 || num_err > num_done ? 1 : 0); + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} diff --git a/src/xvectorbin/nnet3-xvector-get-egs.cc b/src/xvectorbin/nnet3-xvector-get-egs.cc new file mode 100644 index 00000000000..ab9a020e839 --- /dev/null +++ b/src/xvectorbin/nnet3-xvector-get-egs.cc @@ -0,0 +1,244 @@ +// xvectorbin/nnet3-xvector-get-egs.cc + +// Copyright 2012-2016 Johns Hopkins University (author: Daniel Povey) +// 2016 David Snyder + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "util/common-utils.h" +#include "nnet3/nnet-example.h" + +namespace kaldi { +namespace nnet3 { + +// A struct for holding information about the position and +// duration of each pair of chunks. +struct ChunkPairInfo { + std::string pair_name; + int32 output_archive_id; + int32 start_frame1; + int32 start_frame2; + int32 num_frames1; + int32 num_frames2; +}; + +// Process the range input file and store it as a map from utterance +// name to vector of ChunkPairInfo structs. +static void ProcessRangeFile(const std::string &range_rxfilename, + unordered_map > *utt_to_pairs) { + Input range_input(range_rxfilename); + if (!range_rxfilename.empty()) { + std::string line; + while (std::getline(range_input.Stream(), line)) { + ChunkPairInfo *pair = new ChunkPairInfo(); + std::vector fields; + SplitStringToVector(line, " \t\n\r", true, &fields); + if (fields.size() != 7) + KALDI_ERR << "Expected 7 fields in line of range file, got " + << fields.size() << " instead."; + + std::string utt = fields[0], + start_frame1_str = fields[3], + num_frames1_str = fields[4], + start_frame2_str = fields[5], + num_frames2_str = fields[6]; + + if (!ConvertStringToInteger(fields[1], &(pair->output_archive_id)) + || !ConvertStringToInteger(start_frame1_str, &(pair->start_frame1)) + || !ConvertStringToInteger(start_frame2_str, &(pair->start_frame2)) + || !ConvertStringToInteger(num_frames1_str, &(pair->num_frames1)) + || !ConvertStringToInteger(num_frames2_str, &(pair->num_frames2))) + KALDI_ERR << "Expected integer for output archive in range file."; + pair->pair_name = utt + "-" + start_frame1_str + "-" + num_frames1_str + + "-" + start_frame2_str + "-" + num_frames2_str; + unordered_map >::iterator + got = utt_to_pairs->find(utt); + if (got == utt_to_pairs->end()) { + std::vector pairs; + pairs.push_back(pair); + utt_to_pairs->insert(std::pair > (utt, pairs)); + } else { + got->second.push_back(pair); + } + } + } +} + +static void WriteExamples(const MatrixBase &feats, + const std::vector &pairs, + const std::string &utt, + bool compress, + int32 *num_egs_written, + std::vector *example_writers) { + for (std::vector::const_iterator it = pairs.begin(); + it != pairs.end(); ++it) { + ChunkPairInfo *pair = *it; + NnetExample eg; + int32 num_rows = feats.NumRows(), + feat_dim = feats.NumCols(); + if (num_rows < std::max(pair->num_frames1, pair->num_frames2)) { + KALDI_WARN << "Unable to create examples for utterance " << utt + << ". Requested chunk size of " + << std::max(pair->num_frames1, pair->num_frames2) + << " but utterance has only " << num_rows << " frames."; + } else { + // The requested chunk positions are approximate. It's possible + // that they slightly exceed the number of frames in the utterance. + // If that occurs, we can shift the chunks location back slightly. 
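(Editorial note on the block below, not part of the patch: after clamping the
chunk positions, the two chunks of a pair are packed into one NnetExample as
two NnetIo objects both named "input"; the loops over 'indexes' relabel the
example index so the first chunk carries n = 0 and the second n = 1, which is
how downstream code tells the two members of a pair apart.)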
+ int32 shift1 = std::min(0, num_rows - pair->start_frame1 + - pair->num_frames1), + shift2 = std::min(0, num_rows - pair->start_frame2 + - pair->num_frames2); + SubMatrix chunk1(feats, pair->start_frame1 + shift1, + pair->num_frames1, 0, feat_dim), + chunk2(feats, pair->start_frame2 + shift2, + pair->num_frames2, 0, feat_dim); + NnetIo nnet_io1 = NnetIo("input", 0, chunk1), + nnet_io2 = NnetIo("input", 0, chunk2); + for (std::vector::iterator indx_it = nnet_io1.indexes.begin(); + indx_it != nnet_io1.indexes.end(); ++indx_it) + indx_it->n = 0; + for (std::vector::iterator indx_it = nnet_io2.indexes.begin(); + indx_it != nnet_io2.indexes.end(); ++indx_it) + indx_it->n = 1; + + NnetExample eg; + eg.io.push_back(nnet_io1); + eg.io.push_back(nnet_io2); + if (compress) + eg.Compress(); + + if (pair->output_archive_id >= example_writers->size()) + KALDI_ERR << "Requested output index exceeds number of specified " + << "output files."; + (*example_writers)[pair->output_archive_id]->Write( + pair->pair_name, eg); + (*num_egs_written) += 1; + } + } +} + +// Delete the dynamically allocated memory. +static void Cleanup(unordered_map > *utt_to_pairs, + std::vector *writers) { + for (unordered_map >::iterator + map_it = utt_to_pairs->begin(); + map_it != utt_to_pairs->end(); ++map_it) + for (std::vector::iterator + vec_it = map_it->second.begin(); vec_it != map_it->second.end(); + ++vec_it) + delete *vec_it; + for (std::vector::iterator + it = writers->begin(); it != writers->end(); ++it) + delete *it; +} + +} // namespace nnet3 +} // namespace kaldi + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + typedef kaldi::int32 int32; + + const char *usage = + "Get examples for training an nnet3 neural network for the xvector\n" + "system. Each output example contains a pair of feature chunks from\n" + "the same utterance. The location and length of the feature chunks\n" + "are specified in the 'ranges' file. Each line is interpreted as\n" + "follows:\n" + " " + " " + " \n" + "where is interpreted as a zero-based\n" + "index into the wspecifiers specified on the command line (\n" + "and so on), and is ignored by this program.\n" + "For example:\n" + " utt1 3 13 0 65 112 110\n" + " utt1 0 10 160 50 214 180\n" + " utt2 ...\n" + "\n" + "Usage: nnet3-xvector-get-egs [options] " + " ... 
\n" + "\n" + "For example:\n" + "nnet3-xvector-get-egs ranges.1 \"$feats\" ark:egs_temp.1.ark" + " ark:egs_temp.2.ark ark:egs_temp.3.ark\n"; + + bool compress = true; + + ParseOptions po(usage); + po.Register("compress", &compress, "If true, write egs in " + "compressed format."); + + po.Read(argc, argv); + + if (po.NumArgs() < 3) { + po.PrintUsage(); + exit(1); + } + + std::string + range_rspecifier = po.GetArg(1), + feature_rspecifier = po.GetArg(2); + std::vector example_writers; + + for (int32 i = 3; i <= po.NumArgs(); i++) + example_writers.push_back(new NnetExampleWriter(po.GetArg(i))); + + unordered_map > utt_to_pairs; + ProcessRangeFile(range_rspecifier, &utt_to_pairs); + SequentialBaseFloatMatrixReader feat_reader(feature_rspecifier); + + int32 num_done = 0, + num_err = 0, + num_egs_written = 0; + + for (; !feat_reader.Done(); feat_reader.Next()) { + std::string key = feat_reader.Key(); + const Matrix &feats = feat_reader.Value(); + unordered_map >::iterator + got = utt_to_pairs.find(key); + if (got == utt_to_pairs.end()) { + KALDI_WARN << "Could not create examples from utterance " + << key << " because it has no entry in the ranges " + << "input file."; + num_err++; + } else { + std::vector pairs = got->second; + WriteExamples(feats, pairs, key, compress, &num_egs_written, + &example_writers); + num_done++; + } + } + Cleanup(&utt_to_pairs, &example_writers); + + KALDI_LOG << "Finished generating examples, " + << "successfully processed " << num_done + << " feature files, wrote " << num_egs_written << " examples; " + << num_err << " files had errors."; + return (num_egs_written == 0 || num_err > num_done ? 1 : 0); + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} diff --git a/src/xvectorbin/nnet3-xvector-scoring.cc b/src/xvectorbin/nnet3-xvector-scoring.cc new file mode 100644 index 00000000000..0b2512df83d --- /dev/null +++ b/src/xvectorbin/nnet3-xvector-scoring.cc @@ -0,0 +1,151 @@ +// xvectorbin/nnet3-xvector-scoring.cc + +// Copyright 2013 Daniel Povey +// 2016 David Snyder + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "nnet3/nnet-utils.h"
+#include "xvector/xvector.h"
+
+
+int main(int argc, char *argv[]) {
+  using namespace kaldi;
+  using namespace kaldi::nnet3;
+  typedef kaldi::int32 int32;
+  typedef kaldi::int64 int64;
+  try {
+    const char *usage =
+        "Computes scores between pairs of xvectors.\n"
+        "The 'trials-file' has lines of the form\n"
+        "  <key1> <key2>\n"
+        "and the output will have the form\n"
+        "  <key1> <key2> [<score>]\n"
+        "(if either key could not be found, the score field in the output\n"
+        "will be absent, and this program will print a warning)\n"
+        "\n"
+        "Usage: nnet3-xvector-scoring [options] <nnet-in> <trials-in> "
+        "<xvector1-rspecifier> <xvector2-rspecifier> <scores-out>\n"
+        "e.g.: \n"
+        " nnet3-xvector-scoring nnet.final trials scp:spk_xvectors.scp "
+        "scp:test_xvectors.scp trials.scored\n"
+        "See also: ivector-plda-scoring and ivector-compute-dot-products\n";
+
+    ParseOptions po(usage);
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 5) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string nnet_rxfilename = po.GetArg(1),
+        trials_rxfilename = po.GetArg(2),
+        xvector1_rspecifier = po.GetArg(3),
+        xvector2_rspecifier = po.GetArg(4),
+        scores_wxfilename = po.GetArg(5);
+
+
+    int64 num_done = 0, num_err = 0;
+    Nnet nnet;
+    ReadKaldiObject(nnet_rxfilename, &nnet);
+    // We need to ensure that the Nnet has outputs called 's' and 'b'
+    // and that 'b' is a scalar and 's' can be interpreted as a symmetric
+    // matrix.
+    int32 s_index = nnet.GetNodeIndex("s"),
+        b_index = nnet.GetNodeIndex("b");
+    if (s_index == -1 || b_index == -1)
+      KALDI_ERR << "The input Nnet cannot be used for xvector scoring "
+                << "because it has no output called 's' or 'b'.";
+    if (!nnet.IsOutputNode(s_index) || !nnet.IsOutputNode(b_index))
+      KALDI_ERR << "The nodes 's' and 'b' must be output nodes.";
+
+    int32 s_dim = nnet.OutputDim("s"),
+        b_dim = nnet.OutputDim("b");
+    if (b_dim != 1)
+      KALDI_ERR << "The output 'b' should be a scalar offset, but the "
Input Nnet has an" + << "output called 'b' but it has a dimension of " << b_dim; + int32 d = (0.5) * (1 + sqrt(1 + 8 * s_dim)) - 1; + if (((d + 1) * d) / 2 != s_dim) + KALDI_ERR << "Output 's' cannot be interpretedas a symmetric matrix."; + Vector s_vec(s_dim); + Vector b_vec(1); + GetConstantOutput(nnet, "s", &s_vec); + GetConstantOutput(nnet, "b", &b_vec); + SpMatrix S(d); + SubVector s_vec_sub(s_vec, 0, s_dim); + S.CopyFromVec(s_vec_sub); + BaseFloat b = b_vec(0); + + RandomAccessBaseFloatVectorReader xvector1_reader(xvector1_rspecifier); + RandomAccessBaseFloatVectorReader xvector2_reader(xvector2_rspecifier); + + Input ki(trials_rxfilename); + + bool binary = false; + Output ko(scores_wxfilename, binary); + double sum = 0.0, sumsq = 0.0; + + std::string line; + while (std::getline(ki.Stream(), line)) { + std::vector fields; + SplitStringToVector(line, " \t\n\r", true, &fields); + if (fields.size() != 2) { + KALDI_ERR << "Bad line " << (num_done + num_err) << " in input " + << "(expected two fields: key1 key2): " << line; + } + std::string key1 = fields[0], key2 = fields[1]; + if (!xvector1_reader.HasKey(key1)) { + KALDI_WARN << "Key " << key1 << " not present in 1st table of xvectors."; + num_err++; + continue; + } + if (!xvector2_reader.HasKey(key2)) { + KALDI_WARN << "Key " << key2 << " not present in 2nd table of xvectors."; + num_err++; + continue; + } + const Vector &xvector1 = xvector1_reader.Value(key1), + &xvector2 = xvector2_reader.Value(key2); + // The following will crash if the dimensions differ, but + // they would likely also differ for all the xvectors so it's probably + // best to just crash. + BaseFloat score = SimilarityScore(xvector1, xvector2, S, b); + sum += score; + sumsq += score * score; + num_done++; + ko.Stream() << key1 << ' ' << key2 << ' ' << score << std::endl; + } + + if (num_done != 0) { + BaseFloat mean = sum / num_done, scatter = sumsq / num_done, + variance = scatter - mean * mean, stddev = sqrt(variance); + KALDI_LOG << "Mean score was " << mean << ", standard deviation was " + << stddev; + } + KALDI_LOG << "Processed " << num_done << " trials " << num_err + << " had errors."; + return (num_done != 0 ? 0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/src/xvectorbin/nnet3-xvector-show-progress.cc b/src/xvectorbin/nnet3-xvector-show-progress.cc new file mode 100644 index 00000000000..951a7b1eb3a --- /dev/null +++ b/src/xvectorbin/nnet3-xvector-show-progress.cc @@ -0,0 +1,158 @@ +// xvectorbin/nnet3-xvector-show-progress.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) +// 2015 Xingyu Na + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "hmm/transition-model.h"
+#include "nnet3/nnet-utils.h"
+#include "nnet3/nnet-diagnostics.h"
+#include "xvector/nnet-xvector-diagnostics.h"
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace kaldi::nnet3;
+    typedef kaldi::int32 int32;
+    typedef kaldi::int64 int64;
+
+    const char *usage =
+        "Given an old and a new 'raw' nnet3 network and some training examples\n"
+        "(possibly held-out), show the average objective function given the\n"
+        "mean of the two networks, and the breakdown by component of why this\n"
+        "happened (computed from derivative information). Also shows parameter\n"
+        "differences per layer. If training examples are not provided, only\n"
+        "shows parameter differences per layer.\n"
+        "\n"
+        "Usage: nnet3-xvector-show-progress [options] <old-net-in> "
+        "<new-net-in> [<training-examples-in>]\n"
+        "e.g.: nnet3-xvector-show-progress 1.nnet 2.nnet ark:valid.egs\n";
+
+    ParseOptions po(usage);
+
+    int32 num_segments = 1;
+    std::string use_gpu = "no";
+    NnetComputeProbOptions compute_prob_opts;
+    compute_prob_opts.compute_deriv = true;
+
+    po.Register("num-segments", &num_segments,
+                "Number of line segments used for computing derivatives");
+    po.Register("use-gpu", &use_gpu,
+                "yes|no|optional|wait, only has effect if compiled with CUDA");
+    compute_prob_opts.Register(&po);
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() < 2 || po.NumArgs() > 3) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+#if HAVE_CUDA==1
+    CuDevice::Instantiate().SelectGpuId(use_gpu);
+#endif
+
+    std::string nnet1_rxfilename = po.GetArg(1),
+        nnet2_rxfilename = po.GetArg(2),
+        examples_rspecifier = po.GetOptArg(3);
+
+    Nnet nnet1, nnet2;
+    ReadKaldiObject(nnet1_rxfilename, &nnet1);
+    ReadKaldiObject(nnet2_rxfilename, &nnet2);
+
+    if (NumParameters(nnet1) != NumParameters(nnet2)) {
+      KALDI_WARN << "Parameter-dim mismatch, cannot show progress.";
+      exit(0);
+    }
+
+    if (!examples_rspecifier.empty()) {
+      std::vector<NnetExample> examples;
+      SequentialNnetExampleReader example_reader(examples_rspecifier);
+      for (; !example_reader.Done(); example_reader.Next())
+        examples.push_back(example_reader.Value());
+
+      int32 num_examples = examples.size();
+
+      if (num_examples == 0)
+        KALDI_ERR << "No examples read.";
+
+      int32 num_updatable = NumUpdatableComponents(nnet1);
+      Vector<BaseFloat> diff(num_updatable);
+
+      for (int32 s = 0; s < num_segments; s++) {
+        // start and end segments of the line between 0 and 1
+        BaseFloat start = (s + 0.0) / num_segments,
+            end = (s + 1.0) / num_segments, middle = 0.5 * (start + end);
+        Nnet interp_nnet(nnet2);
+        ScaleNnet(middle, &interp_nnet);
+        AddNnet(nnet1, 1.0 - middle, &interp_nnet);
+
+        NnetXvectorComputeProb prob_computer(compute_prob_opts, interp_nnet);
+        std::vector<NnetExample>::const_iterator eg_iter = examples.begin(),
+            eg_end = examples.end();
+        for (; eg_iter != eg_end; ++eg_iter)
+          prob_computer.Compute(*eg_iter);
+        const SimpleObjectiveInfo *objf_info =
+            prob_computer.GetObjective("output");
+        double objf_per_frame = objf_info->tot_objective / objf_info->tot_weight;
+        const Nnet &nnet_gradient = prob_computer.GetDeriv();
+        KALDI_LOG << "At position " << middle
+                  << ", objf per frame is " << objf_per_frame;
+
+        Vector<BaseFloat> old_dotprod(num_updatable), new_dotprod(num_updatable);
+        ComponentDotProducts(nnet_gradient, nnet1, &old_dotprod);
+        ComponentDotProducts(nnet_gradient, nnet2, &new_dotprod);
+        old_dotprod.Scale(1.0 / objf_info->tot_weight);
+        new_dotprod.Scale(1.0 / objf_info->tot_weight);
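+        // Comment added for clarity: new_dotprod and old_dotprod now hold,
+        // per updatable component, the gradient dotted with nnet2 and nnet1
+        // respectively (normalized per frame by tot_weight). Their
+        // difference, scaled by 1 / num_segments, is a first-order
+        // (Taylor-series) estimate of this segment's contribution to the
+        // objective change along the line from nnet1 to nnet2; the next two
+        // calls accumulate it into 'diff'.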
+        diff.AddVec(1.0 / num_segments, new_dotprod);
+        diff.AddVec(-1.0 / num_segments, old_dotprod);
+        KALDI_VLOG(1) << "By segment " << s << ", objf change is "
+                      << PrintVectorPerUpdatableComponent(nnet1, diff);
+      }
+      KALDI_LOG << "Total objf change per component is "
+                << PrintVectorPerUpdatableComponent(nnet1, diff);
+    }
+
+    { // Get info about magnitude of parameter change.
+      Nnet diff_nnet(nnet1);
+      AddNnet(nnet2, -1.0, &diff_nnet);
+      int32 num_updatable = NumUpdatableComponents(diff_nnet);
+      Vector<BaseFloat> dot_prod(num_updatable);
+      ComponentDotProducts(diff_nnet, diff_nnet, &dot_prod);
+      dot_prod.ApplyPow(0.5); // take sqrt to get l2 norm of diff
+      KALDI_LOG << "Parameter differences per layer are "
+                << PrintVectorPerUpdatableComponent(nnet1, dot_prod);
+
+      Vector<BaseFloat> baseline_prod(num_updatable);
+      ComponentDotProducts(nnet1, nnet1, &baseline_prod);
+      baseline_prod.ApplyPow(0.5);
+      dot_prod.DivElements(baseline_prod);
+      KALDI_LOG << "Relative parameter differences per layer are "
+                << PrintVectorPerUpdatableComponent(nnet1, dot_prod);
+    }
+#if HAVE_CUDA==1
+    CuDevice::Instantiate().PrintProfile();
+#endif
+    return 0;
+  } catch(const std::exception &e) {
+    std::cerr << e.what() << '\n';
+    return -1;
+  }
+}
+
+
diff --git a/src/xvectorbin/nnet3-xvector-train.cc b/src/xvectorbin/nnet3-xvector-train.cc
new file mode 100644
index 00000000000..a120879e72c
--- /dev/null
+++ b/src/xvectorbin/nnet3-xvector-train.cc
@@ -0,0 +1,94 @@
+// xvectorbin/nnet3-xvector-train.cc
+
+// Copyright 2015  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "nnet3/nnet-training.h"
+#include "xvector/nnet-xvector-training.h"
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace kaldi::nnet3;
+    typedef kaldi::int32 int32;
+    typedef kaldi::int64 int64;
+
+    const char *usage =
+        "Train xvector neural network parameters with backprop and stochastic\n"
+        "gradient descent. Minibatches are to be created by nnet3-merge-egs in\n"
+        "the input pipeline. This training program is single-threaded (best to\n"
+        "use it with a GPU); see nnet3-train-parallel for multi-threaded training\n"
+        "that is better suited to CPUs.\n"
+        "\n"
+        "Usage: nnet3-xvector-train [options] <raw-model-in> "
+        "<training-examples-in> <raw-model-out>\n"
+        "\n"
+        "e.g.:\n"
+        "nnet3-xvector-train 1.raw 'ark:nnet3-merge-egs ark:1.egs ark:-|' 2.raw\n";
+
+    bool binary_write = true;
+    std::string use_gpu = "yes";
+    NnetTrainerOptions train_config;
+
+    ParseOptions po(usage);
+    po.Register("binary", &binary_write, "Write output in binary mode");
+    po.Register("use-gpu", &use_gpu,
+                "yes|no|optional|wait, only has effect if compiled with CUDA");
+
+    train_config.Register(&po);
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 3) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+#if HAVE_CUDA==1
+    CuDevice::Instantiate().SelectGpuId(use_gpu);
+#endif
+
+    std::string nnet_rxfilename = po.GetArg(1),
+        examples_rspecifier = po.GetArg(2),
+        nnet_wxfilename = po.GetArg(3);
+
+    Nnet nnet;
+    ReadKaldiObject(nnet_rxfilename, &nnet);
+
+    NnetXvectorTrainer trainer(train_config, &nnet);
+
+    SequentialNnetExampleReader example_reader(examples_rspecifier);
+
+    for (; !example_reader.Done(); example_reader.Next())
+      trainer.Train(example_reader.Value());
+
+    bool ok = trainer.PrintTotalStats();
+
+#if HAVE_CUDA==1
+    CuDevice::Instantiate().PrintProfile();
+#endif
+    WriteKaldiObject(nnet, nnet_wxfilename, binary_write);
+    KALDI_LOG << "Wrote model to " << nnet_wxfilename;
+    return (ok ? 0 : 1);
+  } catch(const std::exception &e) {
+    std::cerr << e.what() << '\n';
+    return -1;
+  }
+}
+
+
diff --git a/tools/config/common_path.sh b/tools/config/common_path.sh
index 9a7ae2d9b29..4dba58ac929 100644
--- a/tools/config/common_path.sh
+++ b/tools/config/common_path.sh
@@ -22,4 +22,6 @@ ${KALDI_ROOT}/src/rnnlmbin:\
 ${KALDI_ROOT}/src/sgmm2bin:\
 ${KALDI_ROOT}/src/sgmmbin:\
 ${KALDI_ROOT}/src/tfrnnlmbin:\
+${KALDI_ROOT}/src/xvectorbin:\
+${KALDI_ROOT}/src/fvectorbin:\
 $PATH