"
+ exit 1;
+fi
+
+if [ ! -s `which normalizer_main` ] ; then
+ echo "Sparrowhawk normalizer was not found installed !"
+ echo "Go to $KALDI_ROOT/tools and execute install_sparrowhawk.sh and try again!"
+ exit 1
+fi
+
+txtdir=$1
+textdir=$(realpath $txtdir)
+outdir=$(realpath $2)
+
+workdir=$outdir/tmp
+if [ $stage -le 0 ]; then
+ rm -rf $outdir
+ mkdir -p $workdir
+ mkdir -p $textdir/splits
+ mkdir -p $outdir/data
+ split -l 1000000 $textdir/in.txt $textdir/splits/out
+ numsplits=0
+ for x in $textdir/splits/*; do
+ numsplits=$((numsplits+1))
+ ln -s $x $outdir/data/$numsplits
+ done
+ echo $numsplits
+ cp $SPARROWHAWK_ROOT/documentation/grammars/sentence_boundary_exceptions.txt .
+ $train_cmd --max_jobs_run 100 JOB=1:$numsplits $outdir/sparrowhawk/log/JOB.log \
+ local/run_norm.sh \
+ sparrowhawk_configuration.ascii_proto \
+ $SPARROWHAWK_ROOT/language-resources/esp/sparrowhawk/ \
+ $outdir/data \
+ JOB \
+ $outdir/sparrowhawk/
+ cat $outdir/sparrowhawk/*.txt | sed "/^$/d" > $outdir/text_normalized
+
+ # check if numbers are there in normalized output
+ awk '{for(i=1;i<=NF;i++) {if (!seen[$i]) {print $i; seen[$i]=1} }}' \
+ $outdir/text_normalized > $outdir/unique_words
+ grep "[0-9]" $outdir/unique_words | sort -u > $outdir/numbers
+fi
diff --git a/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/flatten_all_gigaword.sh b/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/flatten_all_gigaword.sh
new file mode 100755
index 00000000000..242359e7c28
--- /dev/null
+++ b/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/flatten_all_gigaword.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+set -e
+
+# Path to Gigaword corpus with all data files decompressed.
+export GIGAWORDDIR=$1
+# The directory to write output to
+export OUTPUTDIR=$2
+# The number of jobs to run at once
+export NUMJOBS=$3
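+# Illustrative usage (argument names are placeholders; run.sh calls this script with
+# the raw Gigaword path, an output directory and 24 jobs):
+#   local/flatten_gigaword/flatten_all_gigaword.sh <gigaword-dir> <output-dir> 24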
+
+echo "Flattening Gigaword with ${NUMJOBS} processes..."
+mkdir -p $OUTPUTDIR
+find ${GIGAWORDDIR}/data/*/* -type f -print -exec local/flatten_gigaword/run_flat.sh {} ${OUTPUTDIR} \;
+echo "Combining the flattened files into one..."
+cat ${OUTPUTDIR}/*.flat > ${OUTPUTDIR}/flattened_gigaword.txt
diff --git a/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/flatten_one_gigaword.py b/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/flatten_one_gigaword.py
new file mode 100644
index 00000000000..29f6766dd84
--- /dev/null
+++ b/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/flatten_one_gigaword.py
@@ -0,0 +1,61 @@
+# -*- coding: utf-8 -*-
+
+import logging
+import os
+import re
+import spacy
+import gzip
+
+from argparse import ArgumentParser
+from bs4 import BeautifulSoup
+
+en_nlp = spacy.load("es")
+
+
+def flatten_one_gigaword_file(file_path):
+ f = gzip.open(file_path)
+ html = f.read()
+ # Parse the text with BeautifulSoup
+ soup = BeautifulSoup(html, "html.parser")
+
+ # Iterate over all items and get the text for each.
+ all_paragraphs = []
+ for paragraph in soup("p"):
+ # Turn inter-paragraph newlines into spaces
+ paragraph = paragraph.get_text()
+ paragraph = re.sub(r"\n+", "\n", paragraph)
+ paragraph = paragraph.replace("\n", " ")
+ # Tokenize the paragraph into words
+ tokens = en_nlp.tokenizer(paragraph)
+ words = [str(token) for token in tokens if not
+ str(token).isspace()]
+ if len(words) < 3:
+ continue
+ all_paragraphs.append(words)
+ # Return a list of strings, where each string is a
+ # space-tokenized paragraph.
+ return [" ".join(paragraph) for paragraph in all_paragraphs]
+
+
+if __name__ == "__main__":
+ log_fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+ logging.basicConfig(level=logging.INFO, format=log_fmt)
+ logger = logging.getLogger(__name__)
+
+ parser = ArgumentParser(description=("Flatten a gigaword data file for "
+ "use in language modeling."))
+ parser.add_argument("--gigaword-path", required=True,
+ metavar="", type=str,
+ help=("Path to Gigaword directory, with "
+ "all .gz files unzipped."))
+ parser.add_argument("--output-dir", required=True, metavar="",
+ type=str, help=("Directory to write final flattened "
+ "Gigaword file."))
+
+ A = parser.parse_args()
+ all_paragraphs = flatten_one_gigaword_file(A.gigaword_path)
+ output_path = os.path.join(A.output_dir,
+ os.path.basename(A.gigaword_path) + ".flat")
+ with open(output_path, "w") as output_file:
+ for paragraph in all_paragraphs:
+ output_file.write("{}\n".format(paragraph))
diff --git a/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/run_flat.sh b/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/run_flat.sh
new file mode 100755
index 00000000000..6b236be0ab9
--- /dev/null
+++ b/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/run_flat.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+set -e
+
+. ./path_venv.sh
+
+# Path to Gigaword corpus with all data files decompressed.
+GIGAWORDPATH=$1
+# The directory to write output to
+OUTPUTDIR=$2
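+# Illustrative usage (placeholders only): flatten a single Gigaword data file, e.g.
+#   local/flatten_gigaword/run_flat.sh <gigaword-dir>/data/<source>/<datafile> <output-dir>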
+file=$(basename ${GIGAWORDPATH})
+if [ ! -e ${OUTPUTDIR}/${file}.flat ]; then
+ echo "flattening to ${OUTPUTDIR}/${file}.flat"
+ python local/flatten_gigaword/flatten_one_gigaword.py --gigaword-path ${GIGAWORDPATH} --output-dir ${OUTPUTDIR}
+else
+ echo "skipping ${file}.flat"
+fi
+
diff --git a/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh b/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh
index 11d65da3e95..22b98a6c9db 100755
--- a/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh
+++ b/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh
@@ -133,6 +133,7 @@ if [ $stage -le 2 ]; then
sed 's:::g' | \
sed 's:foreign>::g' | \
+ sed 's:\[noise\]:[noise] :g' | \
sed 's:>::g' | \
#How do you handle numbers?
grep -v '()' | \
diff --git a/egs/fisher_callhome_spanish/s5/local/get_data_weights.pl b/egs/fisher_callhome_spanish/s5/local/get_data_weights.pl
new file mode 100755
index 00000000000..ca5b2a46f8e
--- /dev/null
+++ b/egs/fisher_callhome_spanish/s5/local/get_data_weights.pl
@@ -0,0 +1,39 @@
+#!/usr/bin/env perl
+
+# Nagendra Kumar Goel
+
+# This takes two arguments:
+# 1) Pocolm training output folder
+# 2) rnnlm weights file name (for output)
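+#
+# Worked example (illustrative numbers only): if the pocolm metaparameters give
+# 0.7 to a "fisher" data source and 0.1 to a "gigaword" source, the minimum is 0.1,
+# so the weights written out are ceil(0.7/0.1)=7 for fisher and ceil(0.1/0.1)=1 for gigaword.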
+
+use POSIX;
+use List::Util qw[min max];
+
+if (@ARGV != 2) {
+ die "Usage: get_data_weights.pl \n";
+}
+
+$pdir = shift @ARGV;
+$out = shift @ARGV;
+
+open(P, "<$pdir/metaparameters") || die "Could not open $pdir/metaparameters";
+open(N, "<$pdir/names") || die "Could not open $pdir/names" ;
+open(O, ">$out") || die "Could not open $out for writing" ;
+
+my %scores = ();
+
+while(<N>) {
+ @n = split(/\s/,$_);
+ $name = $n[1];
+ $w = <P>;
+ @w = split(/\s/,$w);
+ $weight = $w[1];
+ $scores{$name} = $weight;
+}
+
+$min = min(values %scores);
+
+for(keys %scores) {
+ $weightout = POSIX::ceil($scores{$_} / $min);
+ print O "$_\t1\t$weightout\n";
+}
diff --git a/egs/fisher_callhome_spanish/s5/local/get_rnnlm_wordlist.py b/egs/fisher_callhome_spanish/s5/local/get_rnnlm_wordlist.py
new file mode 100755
index 00000000000..fc13a7af701
--- /dev/null
+++ b/egs/fisher_callhome_spanish/s5/local/get_rnnlm_wordlist.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# 2018 Saikiran Valluri, GoVivace inc.
+
+import os, sys
+
+if len(sys.argv) < 5:
+ print( "Usage: python get_rnnlm_wordlist.py ")
+ sys.exit()
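+
+# Illustrative example of the output: if the lexicon wordlist starts with "a" and "abajo",
+# and the pocolm wordlist additionally contains "abanico", the RNNLM wordlist is written as
+# "a 0", "abajo 1", "abanico 2", and "abanico" also goes to the OOV wordlist because it is
+# absent from the lexicon.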
+
+lexicon_words = open(sys.argv[1], 'r', encoding="utf-8")
+pocolm_words = open(sys.argv[2], 'r', encoding="utf-8")
+rnnlm_wordsout = open(sys.argv[3], 'w', encoding="utf-8")
+oov_wordlist = open(sys.argv[4], 'w', encoding="utf-8")
+
+line_count=0
+lexicon=[]
+
+for line in lexicon_words:
+ lexicon.append(line.split()[0])
+ rnnlm_wordsout.write(line.split()[0] + " " + str(line_count)+'\n')
+ line_count = line_count + 1
+
+for line in pocolm_words:
+ if not line.split()[0] in lexicon:
+ oov_wordlist.write(line.split()[0]+'\n')
+ rnnlm_wordsout.write(line.split()[0] + " " + str(line_count)+'\n')
+ line_count = line_count + 1
+
+lexicon_words.close()
+pocolm_words.close()
+rnnlm_wordsout.close()
+oov_wordlist.close()
diff --git a/egs/fisher_callhome_spanish/s5/local/get_unigram_weights_vocab.py b/egs/fisher_callhome_spanish/s5/local/get_unigram_weights_vocab.py
new file mode 100644
index 00000000000..3ecd16772d7
--- /dev/null
+++ b/egs/fisher_callhome_spanish/s5/local/get_unigram_weights_vocab.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# 2018 Saikiran Valluri, GoVivace inc.
+
+import os, sys
+
+if len(sys.argv) < 3:
+ print("Usage : python . ")
+ print(" Used for generating the unigram weights for second pass vocabulary from the first pass pocolm training metaparameters.")
+ sys.exit()
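+
+# Illustrative example (file formats inferred from the parsing below): if "names" contains
+# "1 fisher" and "2 gigaword", and the first two lines of "metaparameters" are e.g.
+# "count_scale_1 0.7" and "count_scale_2 0.1", this writes "fisher 0.7" and "gigaword 0.1".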
+
+pocolmdir=sys.argv[1]
+unigramwts=open(sys.argv[2], 'w')
+
+names = open(pocolmdir+"/names", 'r')
+metaparams = open(pocolmdir+"/metaparameters", 'r')
+
+name_mapper={}
+for line in names:
+ fields=line.split()
+ name_mapper[fields[0]] = fields[1]
+
+lns = metaparams.readlines()
+for lineno in range(len(name_mapper.keys())):
+ line = lns[lineno]
+ fileid = line.split()[0].split("_")[-1]
+ weight = line.split()[1]
+ unigramwts.write(name_mapper[fileid] + " " + weight + "\n")
+
+names.close()
+unigramwts.close()
+metaparams.close()
diff --git a/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py b/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py
index 864b76b671b..c7e0f140d2f 100755
--- a/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py
+++ b/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py
@@ -1,11 +1,14 @@
#!/usr/bin/env python
-# Copyright 2014 Gaurav Kumar. Apache 2.0
# -*- coding: utf-8 -*-
#
-# Merges unique words from Spanish Fisher, Gigaword and the LDC spanish lexicon
+# Copyright 2014 Gaurav Kumar. Apache 2.0
+# 2018 Nagendra Kumar Goel, Saikiran Valluri, GoVivace inc., Avaaya
+
+# Merges unique words from Spanish Fisher, Gigaword and the LDC spanish lexicon
from __future__ import print_function
import sys
+import re
import json
import codecs
import operator
@@ -17,6 +20,7 @@
uw_gigaword = tmpdir + "/es_wordlist.json"
uw_LDC = ldc_lexicon + "/callhome_spanish_lexicon_970908/preferences"
+filtered_letters = re.compile(u'[¡¥ª°º¿àçèëìîôö0123456789]')
merged_lexicon = []
# All three lexicons are in different formats
# First add the data from lexicon_fisher (A) into the dictionary
@@ -55,7 +59,8 @@
ltuples = sorted(merged_lexicon)
for item in ltuples:
- lf.write(item + "\n")
+ if not item==u'ñ' and not re.search(filtered_letters, item):
+ lf.write(item + "\n")
lf.close()
diff --git a/egs/fisher_callhome_spanish/s5/local/pocolm_cust.sh b/egs/fisher_callhome_spanish/s5/local/pocolm_cust.sh
new file mode 100755
index 00000000000..0a5649c2a79
--- /dev/null
+++ b/egs/fisher_callhome_spanish/s5/local/pocolm_cust.sh
@@ -0,0 +1,120 @@
+#!/usr/bin/env bash
+
+# this script generates Pocolm-estimated language models with various
+# data sources in data/text folder and places the output in data/lm.
+
+set -euo pipefail
+
+. ./path.sh
+
+export POCOLM_ROOT=$(cd $KALDI_ROOT/tools/pocolm/; pwd -P)
+export PATH=$PATH:$POCOLM_ROOT/scripts
+
+
+wordlist=None
+num_word=100000
+pocolm_stage=1
+ngram_order=3
+lm_dir=
+arpa_dir=
+textdir=
+max_memory='--max-memory=8G'
+
+. ./cmd.sh
+. ./utils/parse_options.sh
+
+
+# If you do not want to set memory limitation for "sort", you can use
+#max_memory=
+# Choices for the max-memory can be:
+# 1) integer + 'K', 'M', 'G', ...
+# 2) integer + 'b', meaning unit is byte and no multiplication
+# 3) integer + '%', meaning a percentage of memory
+# 4) integer, default unit is 'K'
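+# For example, max_memory='--max-memory=10%' lets "sort" use up to 10% of memory,
+# and max_memory='--max-memory=4G' caps it at 4 gigabytes.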
+
+fold_dev_opt=
+# If you want to fold the dev-set into the 'swbd1' set to produce the final
+# model, un-comment the following line. For use in the Kaldi example script for
+# ASR, this isn't suitable because the 'dev' set is the first 10k lines of the
+# switchboard data, which we also use as dev data for speech recognition
+# purposes.
+#fold_dev_opt="--fold-dev-into=swbd1"
+
+bypass_metaparam_optim_opt=
+# If you want to bypass the metaparameter optimization steps with specific metaparameters
+# un-comment the following line, and change the numbers to some appropriate values.
+# You can find the values from output log of train_lm.py.
+# These example metaparameter values are for a 3-gram model trained with train_lm.py.
+# the dev perplexity should be close to the non-bypassed model.
+#bypass_metaparam_optim_opt="--bypass-metaparameter-optimization=0.091,0.867,0.753,0.275,0.100,0.018,0.902,0.371,0.183,0.070"
+# Note: to use these example parameters, you may need to remove the .done files
+# to make sure that make_lm_dir.py is re-run and only the 3-gram model is trained.
+#for order in 3; do
+#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done
+
+#limit_unk_history_opt=
+# If you want to limit the use of <unk> on the left (in the history) of an n-gram,
+# un-comment the following line
+limit_unk_history_opt="--limit-unk-history=true"
+
+for order in ${ngram_order}; do
+ # decide on the vocabulary.
+ # Note: you'd use --wordlist if you had a previously determined word-list
+ # that you wanted to use.
+ lm_name="${num_word}_${order}"
+ min_counts=''
+ # Note: the following might be a more reasonable setting:
+ # min_counts='fisher=2 swbd1=1'
+ if [ -n "${min_counts}" ]; then
+ lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`"
+ fi
+ unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm
+ train_lm.py --num-words=${num_word} --num-splits=5 --warm-start-ratio=10 ${max_memory} \
+ --min-counts=${min_counts} \
+ --keep-int-data=true ${fold_dev_opt} ${bypass_metaparam_optim_opt} \
+ ${limit_unk_history_opt} ${textdir} ${order} ${lm_dir}/work ${unpruned_lm_dir}
+
+ if [ $pocolm_stage -eq 2 ];then
+ mkdir -p ${arpa_dir}
+ format_arpa_lm.py ${max_memory} ${unpruned_lm_dir} | gzip -c > ${arpa_dir}/${lm_name}_${order}gram_unpruned.arpa.gz
+
+ # example of pruning. note: the threshold can be less than or more than one.
+ get_data_prob.py ${max_memory} ${textdir}/dev.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity'
+ for threshold in 1.0 2.0 4.0; do
+ pruned_lm_dir=${lm_dir}/${lm_name}_prune${threshold}.pocolm
+ prune_lm_dir.py --final-threshold=${threshold} ${max_memory} ${unpruned_lm_dir} ${pruned_lm_dir} 2>&1 | tail -n 5 | head -n 3
+ get_data_prob.py ${max_memory} ${textdir}/dev.txt ${pruned_lm_dir} 2>&1 | grep -F '[perplexity'
+
+ format_arpa_lm.py ${max_memory} ${pruned_lm_dir} | gzip -c > ${arpa_dir}/${lm_name}_${order}gram_prune${threshold}.arpa.gz
+
+ done
+
+ # example of pruning by size.
+ size=1000000
+ pruned_lm_dir=${lm_dir}/${lm_name}_prune${size}.pocolm
+ prune_lm_dir.py --target-num-ngrams=${size} ${max_memory} ${unpruned_lm_dir} ${pruned_lm_dir} 2>&1 | tail -n 8 | head -n 6 | grep -v 'log-prob changes'
+ get_data_prob.py ${textdir}/dev.txt ${max_memory} ${pruned_lm_dir} 2>&1 | grep -F '[perplexity'
+
+ format_arpa_lm.py ${max_memory} ${pruned_lm_dir} | gzip -c > ${arpa_dir}/${lm_name}_${order}gram_prune${size}.arpa.gz
+ fi
+done
+
+# (run local/srilm_baseline.sh ${num_word} to see the following result e.g. local/srilm_baseline.sh 40000 )
+
+# the following does some self-testing, including
+# that the computed derivatives are accurate.
+# local/self_test.sh
+
+# perplexities from pocolm-estimated language models with pocolm's interpolation
+# method from orders 3, 4, and 5 are:
+# order 3: optimize_metaparameters.py: final perplexity without barrier function was -4.358818 (perplexity: 78.164689)
+# order 4: optimize_metaparameters.py: final perplexity without barrier function was -4.309507 (perplexity: 74.403797)
+# order 5: optimize_metaparameters.py: final perplexity without barrier function was -4.301741 (perplexity: 73.828181)
+
+# note, the perplexities from pocolm-estimated language models with SRILM's
+# interpolation from orders 3 and 4 are (from local/pocolm_with_srilm_combination.sh),
+# 78.8449 and 75.2202 respectively.
+
+# note, the perplexities from SRILM-estimated language models with SRILM's
+# interpolation tool from orders 3 and 4 are (from local/srilm_baseline.sh),
+# 78.9056 and 75.5528 respectively.
diff --git a/egs/fisher_callhome_spanish/s5/local/rnnlm.sh b/egs/fisher_callhome_spanish/s5/local/rnnlm.sh
new file mode 100755
index 00000000000..3850910f312
--- /dev/null
+++ b/egs/fisher_callhome_spanish/s5/local/rnnlm.sh
@@ -0,0 +1,83 @@
+#!/bin/bash
+
+# Copyright 2012 Johns Hopkins University (author: Daniel Povey)
+# 2015 Guoguo Chen
+# 2017 Hainan Xu
+# 2017 Xiaohui Zhang
+
+# This script trains an RNNLM on the Fisher Spanish + Spanish Gigaword LM-training text.
+
+# rnnlm/train_rnnlm.sh: best iteration (out of 35) was 34, linking it to final iteration.
+# rnnlm/train_rnnlm.sh: train/dev perplexity was 41.9 / 50.0.
+# Train objf: -5.07 -4.43 -4.25 -4.17 -4.12 -4.07 -4.04 -4.01 -3.99 -3.98 -3.96 -3.94 -3.92 -3.90 -3.88 -3.87 -3.86 -3.85 -3.84 -3.83 -3.82 -3.81 -3.80 -3.79 -3.78 -3.78 -3.77 -3.77 -3.76 -3.75 -3.74 -3.73 -3.73 -3.72 -3.71
+# Dev objf: -10.32 -4.68 -4.43 -4.31 -4.24 -4.19 -4.15 -4.13 -4.10 -4.09 -4.05 -4.03 -4.02 -4.00 -3.99 -3.98 -3.98 -3.97 -3.96 -3.96 -3.95 -3.94 -3.94 -3.94 -3.93 -3.93 -3.93 -3.92 -3.92 -3.92 -3.92 -3.91 -3.91 -3.91 -3.91
+
+
+dir=Spanish_gigawrd/rnnlm
+pocolm_dir=Spanish_gigawrd/work_pocolm/lm/110000_3.pocolm_pruned
+wordslist=
+embedding_dim=1024
+lstm_rpd=256
+lstm_nrpd=256
+stage=0
+train_stage=-30
+text_dir=Spanish_gigawrd/text_lm
+
+. ./cmd.sh
+. ./utils/parse_options.sh
+
+mkdir -p $dir/config
+set -e
+
+for f in $text_dir/dev.txt; do
+ [ ! -f $f ] && \
+ echo "$0: expected file $f to exist;" && exit 1
+done
+
+if [ $stage -le 0 ]; then
+ if [ -f $text_dir/unigram_weights ] ; then
+ mv $text_dir/unigram_weights $pocolm_dir/
+ fi
+ cp $wordslist $dir/config/words.txt
+ n=`cat $dir/config/words.txt | wc -l`
+ echo " $n" >> $dir/config/words.txt
+
+ # words that are not present in words.txt but are in the training or dev data, will be
+ # mapped to <unk> during training.
+ echo "<unk>" >$dir/config/oov.txt
+ local/get_data_weights.pl $pocolm_dir $dir/config/data_weights.txt
+ rnnlm/get_unigram_probs.py --vocab-file=$dir/config/words.txt \
+ --unk-word="" \
+ --data-weights-file=$dir/config/data_weights.txt \
+ $text_dir | awk 'NF==2' >$dir/config/unigram_probs.txt
+
+ # choose features
+ rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \
+ --use-constant-feature=true \
+ --special-words='<s>,</s>,<brk>,<unk>,[noise],[laughter]' \
+ $dir/config/words.txt > $dir/config/features.txt
+fi
+
+if [ $stage -le 1 ]; then
+ cat <<EOF >$dir/config/xconfig
+ input dim=$embedding_dim name=input
+ relu-renorm-layer name=tdnn1 dim=$embedding_dim input=Append(0, IfDefined(-1))
+ fast-lstmp-layer name=lstm1 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd
+ relu-renorm-layer name=tdnn2 dim=$embedding_dim input=Append(0, IfDefined(-3))
+ fast-lstmp-layer name=lstm2 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd
+ relu-renorm-layer name=tdnn3 dim=$embedding_dim input=Append(0, IfDefined(-3))
+ output-layer name=output include-log-softmax=false dim=$embedding_dim
+EOF
+ rnnlm/validate_config_dir.sh $text_dir $dir/config
+fi
+
+if [ $stage -le 2 ]; then
+ rnnlm/prepare_rnnlm_dir.sh $text_dir $dir/config $dir
+fi
+
+if [ $stage -le 3 ]; then
+ rnnlm/train_rnnlm.sh --num-jobs-initial 1 --num-jobs-final 2 \
+ --stage $train_stage --num-epochs 5 --cmd "$train_cmd" $dir
+fi
+
+exit 0
diff --git a/egs/fisher_callhome_spanish/s5/local/rnnlm/train_rnnlm.sh b/egs/fisher_callhome_spanish/s5/local/rnnlm/train_rnnlm.sh
deleted file mode 100755
index 3713fe228d6..00000000000
--- a/egs/fisher_callhome_spanish/s5/local/rnnlm/train_rnnlm.sh
+++ /dev/null
@@ -1,101 +0,0 @@
-#!/bin/bash
-
-# Copyright 2012 Johns Hopkins University (author: Daniel Povey) Tony Robinson
-# 2017 Hainan Xu
-# 2017 Ke Li
-
-# This script is similar to rnnlm_lstm_tdnn_a.sh except for adding L2 regularization.
-
-# local/rnnlm/train_rnnlm.sh: best iteration (out of 18) was 17, linking it to final iteration.
-# local/rnnlm/train_rnnlm.sh: train/dev perplexity was 45.6 / 68.7.
-# Train objf: -651.50 -4.44 -4.26 -4.15 -4.08 -4.03 -4.00 -3.97 -3.94 -3.92 -3.90 -3.89 -3.88 -3.86 -3.85 -3.84 -3.83 -3.82
-# Dev objf: -10.76 -4.68 -4.47 -4.38 -4.33 -4.29 -4.28 -4.27 -4.26 -4.26 -4.25 -4.24 -4.24 -4.24 -4.23 -4.23 -4.23 -4.23
-
-# Begin configuration section.
-dir=exp/rnnlm_lstm_tdnn_1b
-embedding_dim=200
-embedding_l2=0.005 # embedding layer l2 regularize
-comp_l2=0.005 # component-level l2 regularize
-output_l2=0.005 # output-layer l2 regularize
-epochs=90
-mic=
-stage=-10
-train_stage=0
-
-. ./cmd.sh
-. ./utils/parse_options.sh
-[ -z "$cmd" ] && cmd=$train_cmd
-
-train=data/train/text
-dev=data/dev2/text # We at no stage in run.sh should decode dev2 partition for results!
-wordlist=data/lang/words.txt
-text_dir=data/local/rnnlm/text
-mkdir -p $dir/config
-set -e
-
-for f in $train $dev $wordlist; do
- [ ! -f $f ] && \
- echo "$0: expected file $f to exist; search for run.sh and utils/prepare_lang.sh in run.sh" && exit 1
-done
-
-if [ $stage -le 0 ]; then
- mkdir -p $text_dir
- cat $train | cut -d ' ' -f2- > $text_dir/ami.txt
- cat $dev | cut -d ' ' -f2- > $text_dir/dev.txt
-fi
-
-if [ $stage -le 1 ]; then
- cp $wordlist $dir/config/
- n=`cat $dir/config/words.txt | wc -l`
- echo " $n" >> $dir/config/words.txt
-
- # words that are not present in words.txt but are in the training or dev data, will be
- # mapped to <unk> during training.
- echo "<unk>" >$dir/config/oov.txt
-
- cat > $dir/config/data_weights.txt <<EOF
-ami 1 1.0
-EOF
-
- rnnlm/get_unigram_probs.py --vocab-file=$dir/config/words.txt \
- --unk-word="<unk>" \
- --data-weights-file=$dir/config/data_weights.txt \
- $text_dir | awk 'NF==2' >$dir/config/unigram_probs.txt
-
- # choose features
- rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \
- --use-constant-feature=true \
- --top-word-features 10000 \
- --min-frequency 1.0e-03 \
- --special-words='<s>,</s>,<brk>,<unk>,[noise],[laughter]' \
- $dir/config/words.txt > $dir/config/features.txt
-
-lstm_opts="l2-regularize=$comp_l2"
-tdnn_opts="l2-regularize=$comp_l2"
-output_opts="l2-regularize=$output_l2"
-
- cat >$dir/config/xconfig < $dir/normalize/$job/substitute.sh
+
+bash $dir/normalize/$job/substitute.sh | \
+ sed "s: 's:'s:g" | sed "s: 'm:'m:g" | \
+ sed "s: \s*: :g" > $dir/normalize/$job/text
+
+local/clean_abbrevs_text.py $dir/normalize/$job/text $data/"$job"_processed
+tr 'A-ZÂÁÀÄÊÉÈËÏÍÎÖÓÔÖÚÙÛÑÇ' 'a-zâáàäêéèëïíîöóôöúùûñç' < $data/"$job"_processed > $dir/normalize/$job/text
+
+normalizer_main --config=$config --path_prefix=$path_prefix <$dir/normalize/$job/text >$dir/$job.txt
+
+exit 0;
diff --git a/egs/fisher_callhome_spanish/s5/local/train_pocolm.sh b/egs/fisher_callhome_spanish/s5/local/train_pocolm.sh
new file mode 100755
index 00000000000..b8b3ca35ef9
--- /dev/null
+++ b/egs/fisher_callhome_spanish/s5/local/train_pocolm.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+
+stage=-2
+num_words_pocolm=110000
+prune_size=1000000
+
+. ./path.sh
+. ./cmd.sh
+. ./utils/parse_options.sh
+
+set -euo pipefail
+
+export POCOLM_ROOT=$(cd $KALDI_ROOT/tools/pocolm/; pwd -P)
+export PATH=$PATH:$POCOLM_ROOT/scripts
+
+textdir=$1
+pocolm_dir=$2
+
+
+if [ $stage -le -2 ]; then
+ echo "****"
+ echo " POCOLM experiment : Running STAGE 1 : 2-gram Pocolm general closed vocabulary model"
+ echo " Will estimate the metaparams to be used as unigram weights for stage 2 ....."
+ echo "****"
+ if [ -e "$textdir"/unigram_weights ]; then
+ rm "$textdir"/unigram_weights
+ fi
+ if [ -e "$pocolm_dir" ]; then
+ rm -r "$pocolm_dir"
+ fi
+
+ bash local/pocolm_cust.sh --num-word 0 --ngram-order 2 --pocolm-stage 1 --lm-dir "$pocolm_dir"/lm \
+ --arpa-dir "$pocolm_dir"/arpa --textdir "$textdir"
+
+fi
+
+if [ $stage -le -1 ];then
+ echo "********"
+ echo "POCOLM experiment : RUNNING STAGE 2 : 3gram POCOLM using unigram wts estimates in 1st stage....."
+ echo "********"
+
+ echo " " > "$pocolm_dir"/lm/work/.unigram_weights.done
+ python local/get_unigram_weights_vocab.py "$pocolm_dir"/lm/0_2.pocolm/ "$textdir"/unigram_weights
+ bash local/pocolm_cust.sh --num-word "$num_words_pocolm" --lm-dir "$pocolm_dir"/lm \
+ --arpa-dir "$pocolm_dir"/arpa --textdir "$textdir"
+ prune_lm_dir.py --target-num-ngrams=$prune_size "$pocolm_dir"/lm/"$num_words_pocolm"_3.pocolm \
+ "$pocolm_dir"/lm/"$num_words_pocolm"_3.pocolm_pruned_"$prune_size"
+ mkdir -p "$pocolm_dir"/arpa
+ format_arpa_lm.py "$pocolm_dir"/lm/"$num_words_pocolm"_3.pocolm_pruned_"$prune_size" | \
+ gzip -c > "$pocolm_dir"/arpa/"$num_words_pocolm"_3_pruned_"$prune_size".arpa.gz
+fi
+
+
+exit 0;
diff --git a/egs/fisher_callhome_spanish/s5/path.sh b/egs/fisher_callhome_spanish/s5/path.sh
index 17ffb0369f8..201edd95876 100755
--- a/egs/fisher_callhome_spanish/s5/path.sh
+++ b/egs/fisher_callhome_spanish/s5/path.sh
@@ -1,6 +1,11 @@
-export KALDI_ROOT=`pwd`/../../..
+export KALDI_ROOT=`pwd`/../../../
+[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
. $KALDI_ROOT/tools/config/common_path.sh
-export LC_ALL=C
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/dpovey/libs
+
+export SPARROWHAWK_ROOT=$KALDI_ROOT/tools/sparrowhawk
+export PATH=$SPARROWHAWK_ROOT/bin:$PATH
+export LC_ALL=C.UTF-8
+export LANG=C.UTF-8
diff --git a/egs/fisher_callhome_spanish/s5/run.sh b/egs/fisher_callhome_spanish/s5/run.sh
index 6e2752a7b68..70d4d0555a4 100755
--- a/egs/fisher_callhome_spanish/s5/run.sh
+++ b/egs/fisher_callhome_spanish/s5/run.sh
@@ -4,14 +4,25 @@
# Copyright 2014 Gaurav Kumar. Apache 2.0
# Recipe for Fisher/Callhome-Spanish
-stage=0
-train_stage=-20
+stage=-1
+lmstage=-2
+
+# GIGAWORD RNNLM training based options below.
+# GIGAWORD RAW CORPUS DATA is assumed to be already downloaded in the gigaword_datapath.
+train_rnnlm=false
+start_textcleanup=false # WARNING : this starts from flattening the Gigaword corpus through preparing the text folder.
+                        # If you already have the normalised Gigaword text somewhere, you can bypass the
+                        # time-consuming text cleanup (~1 week) by leaving this option set to false.
+addtraintext=true       # If true, this option appends the Fisher train text to the Gigaword corpus text file, to
+                        # perform the A, A + G, Dev type POCOLM training configuration.
+                        # A = Fisher train text, G = Gigaword text.
+num_words_pocolm=100000
train_sgmm2=false
# call the next line with the directory where the Spanish Fisher data is
# (the values below are just an example).
sfisher_speech=/export/corpora/LDC/LDC2010S01
-sfisher_transcripts=/export/corpora/LDC/LDC2010T04
+sfisher_transcripts=/export/c03/svalluri//LDC2010T04
spanish_lexicon=/export/corpora/LDC/LDC96L16
split=local/splits/split_fisher
@@ -19,15 +30,17 @@ callhome_speech=/export/corpora/LDC/LDC96S35
callhome_transcripts=/export/corpora/LDC/LDC96T17
split_callhome=local/splits/split_callhome
+gigaword_datapath=/export/c03/svalluri/Spanish_gigaword_rawcorpus/data # GIGAWORD RAW CORPUS DATA DOWNLOAD PATH
+rnnlm_workdir=workdir_rnnlm_Spanish_gigaword
mfccdir=`pwd`/mfcc
. ./cmd.sh
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
-set -e
+set -eou pipefail
-if [ $stage -le 1 ]; then
+if [ $stage -le -1 ]; then
local/fsp_data_prep.sh $sfisher_speech $sfisher_transcripts
local/callhome_data_prep.sh $callhome_speech $callhome_transcripts
@@ -37,19 +50,14 @@ if [ $stage -le 1 ]; then
# ES gigaword corpus to bring the total to 64k words. The ES frequency sorted
# wordlist is downloaded if it is not available.
local/fsp_prepare_dict.sh $spanish_lexicon
+ # Let's keep the original dict copy for G2P training
+ cp -r data/local/dict data/local/dict_orig
+ (
+ steps/dict/train_g2p_seq2seq.sh data/local/dict_orig/lexicon.txt exp/g2p || touch exp/g2p/.error
+ ) &
# Added c,j, v to the non silences phones manually
- utils/prepare_lang.sh data/local/dict "<unk>" data/local/lang data/lang
-
- # Make sure that you do not use your test and your dev sets to train the LM
- # Some form of cross validation is possible where you decode your dev/set based on an
- # LM that is trained on everything but that that conversation
- # When in doubt about what your data partitions should be use local/fsp_ideal_data_partitions.pl
- # to get the numbers. Depending on your needs, you might have to change the size of
- # the splits within that file. The default paritions are based on the Kaldi + Joshua
- # requirements which means that I have very large dev and test sets
- local/fsp_train_lms.sh $split
- local/fsp_create_test_lang.sh
+ utils/prepare_lang.sh data/local/dict_orig "<unk>" data/local/lang_orig data/lang_orig
utils/fix_data_dir.sh data/local/data/train_all
@@ -70,34 +78,65 @@ if [ $stage -le 1 ]; then
cp -r data/local/data/callhome_train_all data/callhome_train_all
- # Creating data partitions for the pipeline
- # We need datasets for both the ASR and SMT system
- # We have 257455 utterances left, so the partitions are roughly as follows
- # ASR Train : 100k utterances
- # ASR Tune : 17455 utterances
- # ASR Eval : 20k utterances
- # MT Train : 100k utterances
- # MT Tune : Same as the ASR eval set (Use the lattices from here)
- # MT Eval : 20k utterances
- # The dev and the test sets need to be carefully chosen so that there is no conversation/speaker
- # overlap. This has been setup and the script local/fsp_ideal_data_partitions provides the numbers that are needed below.
- # As noted above, the LM has not been trained on the dev and the test sets.
- #utils/subset_data_dir.sh --first data/train_all 158126 data/dev_and_test
- #utils/subset_data_dir.sh --first data/dev_and_test 37814 data/asr_dev_and_test
- #utils/subset_data_dir.sh --last data/dev_and_test 120312 data/mt_train_and_test
- #utils/subset_data_dir.sh --first data/asr_dev_and_test 17662 data/dev
- #utils/subset_data_dir.sh --last data/asr_dev_and_test 20152 data/test
- #utils/subset_data_dir.sh --first data/mt_train_and_test 100238 data/mt_train
- #utils/subset_data_dir.sh --last data/mt_train_and_test 20074 data/mt_test
- #rm -r data/dev_and_test
- #rm -r data/asr_dev_and_test
- #rm -r data/mt_train_and_test
-
local/create_splits.sh $split
local/callhome_create_splits.sh $split_callhome
+
fi
+if [ $stage -le 0 ]; then
+ if $start_textcleanup; then
+ echo "WARNING : Starting from cleaning up and normalizing the Gigword text"
+ echo " This might take few days........... You can skip out this stage "
+ echo " by setting start_textcleanup=false, and having normalised_gigaword_corpus/text_normalized ready inside $rnnlm_workdir."
+
+ mkdir -p "$rnnlm_workdir"/gigaword_rawtext
+ local/flatten_gigaword/flatten_all_gigaword.sh "$gigaword_datapath" "$rnnlm_workdir"/flattened_gigaword_corpus 24
+ cat "$rnnlm_workdir"/flattened_gigaword_corpus/*.flat > "$rnnlm_workdir"/gigaword_rawtext/in.txt
+ local/clean_txt_dir.sh "$rnnlm_workdir"/gigaword_rawtext/ \
+ "$rnnlm_workdir"/normalised_gigaword_corpus/
+ fi
+ mkdir -p "$rnnlm_workdir"/text_lm
+ cut -d " " -f 2- data/train/text > "$rnnlm_workdir"/text_lm/train.txt
+ cut -d " " -f 2- data/dev2/text > "$rnnlm_workdir"/text_lm/dev.txt # For RNNLM and POCOLM training we use dev2/text as dev file.
+ cp "$rnnlm_workdir"/normalised_gigaword_corpus/text_normalized "$rnnlm_workdir"/text_lm/spanish_gigaword_normalised.txt
+ if $addtraintext; then
+ cat "$rnnlm_workdir"/text_lm/train.txt >> "$rnnlm_workdir"/text_lm/spanish_gigaword_normalised.txt
+ fi
+fi
+
+if [ $stage -le 1 ]; then
+ local/train_pocolm.sh --stage $lmstage --num-words-pocolm $num_words_pocolm "$rnnlm_workdir"/text_lm/ "$rnnlm_workdir"/pocolm
+ local/get_rnnlm_wordlist.py data/lang_orig/words.txt "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm/words.txt \
+ "$rnnlm_workdir"/rnnlm_wordlist "$rnnlm_workdir"/oov_pocolmwords
+ if $train_rnnlm; then
+ local/rnnlm.sh --stage $lmstage --dir "$rnnlm_workdir"/rnnlm --pocolm-dir "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm \
+ --wordslist "$rnnlm_workdir"/rnnlm_wordlist --text-dir "$rnnlm_workdir"/text_lm
+ fi
+fi
+
+
if [ $stage -le 2 ]; then
+ wait # wait till G2P training finishes
+ if [ -f exp/g2p/.error ]; then
+ rm exp/g2p/.error || true
+ echo "Fail to train the G2P model." && exit 1;
+ fi
+ steps/dict/apply_g2p_seq2seq.sh "$rnnlm_workdir"/oov_pocolmwords exp/g2p "$rnnlm_workdir"/oov_g2p.lex
+ cat "$rnnlm_workdir"/oov_g2p.lex/lexicon.lex data/local/dict/lexicon.txt | sed "/^[[:space:]]*$/d" | sort | uniq > "$rnnlm_workdir"/lexicon_extended.txt
+ cp "$rnnlm_workdir"/lexicon_extended.txt data/local/dict/lexicon.txt # Replacing original lexicon with extended version.
+
+ utils/prepare_lang.sh data/local/dict "<unk>" data/local/lang data/lang
+
+ # Make sure that you do not use your test and your dev sets to train the LM
+ # Some form of cross validation is possible where you decode your dev/test set based on an
+ # LM that is trained on everything but that conversation
+ # When in doubt about what your data partitions should be use local/fsp_ideal_data_partitions.pl
+ # to get the numbers. Depending on your needs, you might have to change the size of
+ # the splits within that file. The default partitions are based on the Kaldi + Joshua
+ # requirements which means that I have very large dev and test sets
+ local/fsp_train_lms.sh $split
+ local/fsp_create_test_lang.sh
+
# Now compute CMVN stats for the train, dev and test subsets
steps/compute_cmvn_stats.sh data/dev exp/make_mfcc/dev $mfccdir
steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test $mfccdir
@@ -264,8 +303,11 @@ for iter in 1 2 3 4; do
data/lang_test data/dev/ exp/sgmm5/decode_dev $decode
done
) &
-
fi
-local/chain/run_tdnn_1g.sh --stage $stage --train-stage $train_stage || exit 1;
+wait;
+
+if [ $stage -le 6 ]; then
+ local/chain/run_tdnn_1g.sh --stage 0 --gigaword-workdir $rnnlm_workdir || exit 1;
+fi
exit 0;
diff --git a/egs/fisher_english/s5/local/chain/run_tdnn.sh b/egs/fisher_english/s5/local/chain/run_tdnn.sh
index 14174e617c4..1fd0f1fdf3a 100755
--- a/egs/fisher_english/s5/local/chain/run_tdnn.sh
+++ b/egs/fisher_english/s5/local/chain/run_tdnn.sh
@@ -112,7 +112,7 @@ if [ $stage -le 12 ]; then
echo "$0: creating neural net configs using the xconfig parser";
num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}')
- learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python)
+ learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)
mkdir -p $dir/configs
cat <<EOF > $dir/configs/network.xconfig
diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_1a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_1a.sh
index e95de232304..b76efc4f1de 100644
--- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_1a.sh
+++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_1a.sh
@@ -231,7 +231,7 @@ if [ $stage -le 11 ]; then
echo "$0: creating neural net configs using the xconfig parser";
num_targets=$(tree-info $sup_tree_dir/tree |grep num-pdfs|awk '{print $2}')
- learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python)
+ learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)
mkdir -p $dir/configs
cat <<EOF > $dir/configs/network.xconfig
diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_1a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_1a.sh
index e76df666e8a..b1c133942ef 100755
--- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_1a.sh
+++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_1a.sh
@@ -142,7 +142,7 @@ if [ $stage -le 13 ]; then
echo "$0: creating neural net configs using the xconfig parser";
num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}')
- learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python)
+ learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)
mkdir -p $dir/configs
cat <<EOF > $dir/configs/network.xconfig
diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_1a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_1a.sh
index 2d5b2f8480e..53aac8c08ea 100755
--- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_1a.sh
+++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_1a.sh
@@ -250,7 +250,7 @@ if [ $stage -le 11 ]; then
echo "$0: creating neural net configs using the xconfig parser";
num_targets=$(tree-info $sup_tree_dir/tree |grep num-pdfs|awk '{print $2}')
- learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python)
+ learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)
mkdir -p $dir/configs
cat <<EOF > $dir/configs/network.xconfig
diff --git a/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh b/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh
index cbf0ef6cb6c..c12f604f26b 100755
--- a/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh
+++ b/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh
@@ -133,7 +133,7 @@ if [ $stage -le 12 ]; then
echo "$0: creating neural net configs using the xconfig parser";
num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}')
- learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python)
+ learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)
mkdir -p $dir/configs
cat <<EOF > $dir/configs/network.xconfig
diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_7c.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_7c.sh
index 12b3187a5fa..efcd1eced4a 100644
--- a/egs/fisher_swbd/s5/local/chain/run_tdnn_7c.sh
+++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_7c.sh
@@ -129,7 +129,7 @@ if [ $stage -le 12 ]; then
echo "$0: creating neural net configs using the xconfig parser";
num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}')
- learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python)
+ learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)
mkdir -p $dir/configs
cat <<EOF > $dir/configs/network.xconfig
diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_7d.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_7d.sh
index 7d640c3262a..e4a555abfdd 100644
--- a/egs/fisher_swbd/s5/local/chain/run_tdnn_7d.sh
+++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_7d.sh
@@ -134,7 +134,7 @@ if [ $stage -le 12 ]; then
echo "$0: creating neural net configs using the xconfig parser";
num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}')
- learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python)
+ learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)
opts="l2-regularize=0.002"
linear_opts="orthonormal-constraint=1.0"
output_opts="l2-regularize=0.0005 bottleneck-dim=256"
diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh
index 07e88b59ddc..5650cedca28 100755
--- a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh
+++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh
@@ -142,7 +142,7 @@ if [ $stage -le 12 ]; then
echo "$0: creating neural net configs using the xconfig parser";
num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}')
- learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python)
+ learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)
lstm_opts="decay-time=20"
mkdir -p $dir/configs
diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a_svd.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a_svd.sh
new file mode 100644
index 00000000000..5beb2e74a9a
--- /dev/null
+++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a_svd.sh
@@ -0,0 +1,448 @@
+#!/bin/bash
+#
+# Copyright 2018 Nagendra Kumar Goel,
+# Saikiran Valluri, Govivace.Inc - Apache 2.0
+
+# The script is organized as below.
+# First we train the baseline LSTMP-TDNN chain model for a few epochs on the (Fisher+swbd) English data.
+# Then we perform SVD-based refactoring of all the affine components in this baseline final.mdl,
+# in order to reduce the overall model parameter size,
+# as determined by the bottleneck-dim value or the energy and shrinkage threshold values.
+# Then we fine-tune the weight parameters of the refactored model on the entire Fisher + Switchboard data for a single epoch.
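+#
+# Schematically, the SVD step in stage 14 below applies an nnet3 "edits config" containing an
+# apply-svd directive to the baseline model and then fine-tunes the result, roughly:
+#   nnet3-am-copy --edits-config=<edits-config> <baseline final.mdl> <refactored input.raw>
+# (placeholders shown here; see stage 14 for the exact command and thresholds used).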
+
+# Command used for comparing WERs of decoding on different testsets using pre-SVD and SVD models:
+# ./local/chain/compare_wer_general.sh --looped tdnn_lstm_1a_sp tdnn_lstm_1a_svd_sp
+#
+# Please run this entire script till the end before running the above WER compare command...
+
+
+# System tdnn_lstm_1a_sp
+# WER on eval2000(tg) 12.3
+# [looped:] 12.2
+# WER on eval2000(fg) 12.1
+# [looped:] 12.1
+# WER on eval2000(fg)
+# [SVD retrained + looped] 12.1
+# WER on rt03(tg) 11.6
+# [looped:] 11.6
+# WER on rt03(tg)
+# [SVD retrained] 12
+# WER on rt03(fg) 11.3
+# [looped:] 11.3
+# Final train prob -0.074
+# Final valid prob -0.084
+# Final train prob (xent) -0.882
+# Final valid prob (xent) -0.9393
+
+# WER stats for eval2000 using tdnn_lstm_1a_sp
+# | #Snt #Wrd | Corr Sub Del Ins Err S.Err |
+# %WER 16.0 | 2628 21594 | 86.3 9.0 4.7 2.3 16.0 54.4 | exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_tg/score_7_0.0/eval2000_hires.ctm.callhm.filt.sys
+# %WER 12.3 | 4459 42989 | 89.4 7.1 3.5 1.7 12.3 49.8 | exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_tg/score_8_0.0/eval2000_hires.ctm.filt.sys
+# %WER 8.4 | 1831 21395 | 92.7 5.1 2.2 1.1 8.4 42.3 | exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_tg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys
+# %WER 15.9 | 2628 21594 | 86.4 8.9 4.7 2.3 15.9 54.3 | exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_fg/score_7_0.0/eval2000_hires.ctm.callhm.filt.sys
+# %WER 12.1 | 4459 42989 | 89.6 6.9 3.5 1.7 12.1 49.2 | exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_fg/score_8_0.0/eval2000_hires.ctm.filt.sys
+# %WER 8.2 | 1831 21395 | 93.1 5.1 1.8 1.3 8.2 41.7 | exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_fg/score_8_0.0/eval2000_hires.ctm.swbd.filt.sys
+
+# WER stats for rt03 using tdnn_lstm_1a_sp
+# %WER 9.6 | 3970 36721 | 91.5 5.5 3.0 1.1 9.6 41.2 | exp/chain/tdnn_lstm_1a_sp/decode_rt03_fsh_sw1_tg/score_7_0.0/rt03_hires.ctm.fsh.filt.sys
+# %WER 11.6 | 8420 76157 | 89.7 6.8 3.4 1.4 11.6 43.0 | exp/chain/tdnn_lstm_1a_sp/decode_rt03_fsh_sw1_tg/score_7_0.0/rt03_hires.ctm.filt.sys
+# %WER 13.3 | 4450 39436 | 88.0 7.4 4.6 1.3 13.3 44.5 | exp/chain/tdnn_lstm_1a_sp/decode_rt03_fsh_sw1_tg/score_9_0.0/rt03_hires.ctm.swbd.filt.sys
+# %WER 9.4 | 3970 36721 | 91.8 5.3 2.9 1.1 9.4 40.3 | exp/chain/tdnn_lstm_1a_sp/decode_rt03_fsh_sw1_fg/score_7_0.0/rt03_hires.ctm.fsh.filt.sys
+# %WER 11.3 | 8420 76157 | 89.9 6.4 3.7 1.2 11.3 42.4 | exp/chain/tdnn_lstm_1a_sp/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.filt.sys
+# %WER 13.1 | 4450 39436 | 88.3 7.5 4.2 1.4 13.1 44.0 | exp/chain/tdnn_lstm_1a_sp/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.swbd.filt.sys
+
+# WER stats for rt03 using tdnn_lstm_1a_svd_sp
+# %WER 9.7 | 3970 36721 | 91.3 5.9 2.8 1.0 9.7 40.0 | exp/chain/tdnn_lstm_1a_svd_sp/decode_rt03_fsh_sw1_tg/score_8_0.0/rt03_hires.ctm.fsh.filt.sys
+# %WER 12 | 8420 76157 | 89.3 7.3 3.4 1.3 12.0 42.0 | exp/chain/tdnn_lstm_1a_svd_sp/decode_rt03_fsh_sw1_tg/score_8_0.0/rt03_hires.ctm.filt.sys
+# %WER 14.1 | 4450 39436 | 87.4 8.2 4.3 1.5 14.1 44.6 | exp/chain/tdnn_lstm_1a_svd_sp/decode_rt03_fsh_sw1_tg/score_9_0.0/rt03_hires.ctm.swbd.filt.sys
+
+
+set -e
+
+# configs for 'chain'
+stage=0
+train_stage=-20
+get_egs_stage=-10
+speed_perturb=true
+dir=exp/chain/tdnn_lstm_1a # Note: _sp will get added to this if $speed_perturb == true.
+svd_dir=${dir}_svd # Note: _sp will get added to this if $speed_perturb == true.
+decode_iter=
+decode_dir_affix=
+
+# training options
+leftmost_questions_truncate=-1
+chunk_width=150
+chunk_left_context=40
+chunk_right_context=0
+xent_regularize=0.025
+self_repair_scale=0.00001
+label_delay=5
+# decode options
+extra_left_context=50
+extra_right_context=0
+frames_per_chunk=
+
+remove_egs=false
+common_egs_dir=
+
+affix=
+
+# config for svd
+apply_svd=true
+energy_threshold=0.81
+shrinkage_threshold=0.64
+primary_lr_factor=0.25
+
+# End configuration section.
+echo "$0 $@" # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+ cat <$lang/topo
+fi
+
+if [ $stage -le 11 ]; then
+ # Build a tree using our new topology.
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+ --leftmost-questions-truncate $leftmost_questions_truncate \
+ --context-opts "--context-width=2 --central-position=1" \
+ --cmd "$train_cmd" 11000 data/$build_tree_train_set $lang $build_tree_ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then
+ echo "$0: creating neural net configs using the xconfig parser";
+
+ num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}')
+ learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python)
+ lstm_opts="decay-time=20"
+
+ mkdir -p $dir/configs
+ cat <<EOF > $dir/configs/network.xconfig
+ input dim=100 name=ivector
+ input dim=40 name=input
+
+ # please note that it is important to have input layer with the name=input
+ # as the layer immediately preceding the fixed-affine-layer to enable
+ # the use of short notation for the descriptor
+ fixed-affine-layer name=lda input=Append(-2,-1,0,1,2, ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat
+
+ # the first splicing is moved before the lda layer, so no splicing here
+ relu-renorm-layer name=tdnn1 dim=1024
+ relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024
+ relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024
+
+ # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults
+ lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts
+ relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024
+ relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024
+ lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts
+ relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024
+ relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024
+ lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts
+
+ ## adding the layers for chain branch
+ output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5
+
+ # adding the layers for xent branch
+ # This block prints the configs for a separate output that will be
+ # trained with a cross-entropy objective in the 'chain' models... this
+ # has the effect of regularizing the hidden parts of the model. we use
+ # 0.5 / args.xent_regularize as the learning rate factor- the factor of
+ # 0.5 / args.xent_regularize is suitable as it means the xent
+ # final-layer learns at a rate independent of the regularization
+ # constant; and the 0.5 was tuned so as to make the relative progress
+ # similar in the xent and regular final layers.
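+ # For example, with the value configured above (xent_regularize=0.025), the factor
+ # works out to 0.5 / 0.025 = 20 for this xent output layer.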
+ output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5
+
+EOF
+ steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
+fi
+
+if [ $stage -le 13 ]; then
+ if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+ utils/create_split_dir.pl \
+ /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+ fi
+
+ steps/nnet3/chain/train.py --stage $train_stage \
+ --cmd "$decode_cmd" \
+ --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \
+ --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
+ --chain.xent-regularize $xent_regularize \
+ --chain.leaky-hmm-coefficient 0.1 \
+ --chain.l2-regularize 0.00005 \
+ --chain.apply-deriv-weights false \
+ --chain.lm-opts="--num-extra-lm-states=2000" \
+ --trainer.num-chunk-per-minibatch 64 \
+ --trainer.frames-per-iter 1200000 \
+ --trainer.max-param-change 2.0 \
+ --trainer.num-epochs 4 \
+ --trainer.optimization.shrink-value 0.99 \
+ --trainer.optimization.num-jobs-initial 3 \
+ --trainer.optimization.num-jobs-final 16 \
+ --trainer.optimization.initial-effective-lrate 0.001 \
+ --trainer.optimization.final-effective-lrate 0.0001 \
+ --trainer.optimization.momentum 0.0 \
+ --trainer.deriv-truncate-margin 8 \
+ --egs.stage $get_egs_stage \
+ --egs.opts "--frames-overlap-per-eg 0" \
+ --egs.chunk-width $chunk_width \
+ --egs.chunk-left-context $chunk_left_context \
+ --egs.chunk-right-context $chunk_right_context \
+ --egs.chunk-left-context-initial 0 \
+ --egs.chunk-right-context-final 0 \
+ --egs.dir "$common_egs_dir" \
+ --cleanup.remove-egs $remove_egs \
+ --feat-dir data/${train_set}_hires \
+ --tree-dir $treedir \
+ --lat-dir exp/tri5a_lats_nodup$suffix \
+ --dir $dir || exit 1;
+fi
+
+src_mdl=${dir}/final.mdl
+if $apply_svd && [ $stage -le 14 ]; then
+ # model compression using SVD
+
+ # threshold configs for tdnn layers
+ mkdir -p $svd_dir/configs
+ edits_config=$svd_dir/configs/final.config
+ common_egs_dir=$dir/egs
+ cat <<EOF > ${edits_config}
+ set-learning-rate-factor learning-rate-factor=$primary_lr_factor
+ apply-svd name=* energy-threshold=$energy_threshold shrinkage-threshold=$shrinkage_threshold
+EOF
+
+ # Copy files / directories from source directory
+ cp ${dir}/{cmvn_opts,tree,frame_subsampling_factor,0.trans_mdl,normalization.fst,den.fst} $svd_dir/.
+
+ # Generate initial model from trained model
+ $train_cmd $svd_dir/log/generate_input_mdl.log \
+ nnet3-am-copy --edits-config=$edits_config $src_mdl $svd_dir/input.raw
+
+ # Retrain the model for 1 epoch
+ steps/nnet3/chain/train.py --stage $train_stage \
+ --cmd "$decode_cmd" \
+ --trainer.input-model $svd_dir/input.raw \
+ --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \
+ --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
+ --chain.xent-regularize $xent_regularize \
+ --chain.leaky-hmm-coefficient 0.1 \
+ --chain.l2-regularize 0.00005 \
+ --chain.apply-deriv-weights false \
+ --chain.lm-opts="--num-extra-lm-states=2000" \
+ --trainer.num-chunk-per-minibatch 64 \
+ --trainer.frames-per-iter 1200000 \
+ --trainer.max-param-change 2.0 \
+ --trainer.num-epochs 1 \
+ --trainer.optimization.shrink-value 0.99 \
+ --trainer.optimization.num-jobs-initial 3 \
+ --trainer.optimization.num-jobs-final 16 \
+ --trainer.optimization.initial-effective-lrate 0.001 \
+ --trainer.optimization.final-effective-lrate 0.0001 \
+ --trainer.optimization.momentum 0.0 \
+ --trainer.deriv-truncate-margin 8 \
+ --egs.stage $get_egs_stage \
+ --egs.opts "--frames-overlap-per-eg 0" \
+ --egs.chunk-width $chunk_width \
+ --egs.chunk-left-context $chunk_left_context \
+ --egs.chunk-right-context $chunk_right_context \
+ --egs.chunk-left-context-initial 0 \
+ --egs.chunk-right-context-final 0 \
+ --egs.dir "$common_egs_dir" \
+ --cleanup.remove-egs $remove_egs \
+ --feat-dir data/${train_set}_hires \
+ --tree-dir $treedir \
+ --lat-dir exp/tri5a_lats_nodup$suffix \
+ --dir ${svd_dir} || exit 1;
+fi
+
+if [ $stage -le 15 ]; then
+ # Note: it might appear that this $lang directory is mismatched, and it is as
+ # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+ # the lang directory.
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg
+fi
+
+decode_suff=fsh_sw1_tg
+graph_dir=$dir/graph_fsh_sw1_tg
+
+if [ $stage -le 16 ]; then
+ [ -z $extra_left_context ] && extra_left_context=$chunk_left_context;
+ [ -z $extra_right_context ] && extra_right_context=$chunk_right_context;
+ [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width;
+ if [ ! -z $decode_iter ]; then
+ iter_opts=" --iter $decode_iter "
+ fi
+ for decode_set in rt03 eval2000; do
+ (
+ steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+ --nj 50 --cmd "$decode_cmd" $iter_opts \
+ --extra-left-context $extra_left_context \
+ --extra-right-context $extra_right_context \
+ --extra-left-context-initial 0 \
+ --extra-right-context-final 0 \
+ --frames-per-chunk "$frames_per_chunk" \
+ --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+ $graph_dir data/${decode_set}_hires \
+ $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1;
+ if $has_fisher; then
+ steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+ data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \
+ $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1;
+ fi
+ ) &
+ done
+fi
+
+test_online_decoding=true
+lang=data/lang_fsh_sw1_tg
+if $test_online_decoding && [ $stage -le 17 ]; then
+ # note: if the features change (e.g. you add pitch features), you will have to
+ # change the options of the following command line.
+ steps/online/nnet3/prepare_online_decoding.sh \
+ --mfcc-config conf/mfcc_hires.conf \
+ $lang exp/nnet3/extractor $dir ${dir}_online
+
+ rm $dir/.error 2>/dev/null || true
+ for decode_set in rt03 eval2000; do
+ (
+ # note: we just give it "$decode_set" as it only uses the wav.scp, the
+ # feature type does not matter.
+
+ steps/online/nnet3/decode.sh --nj 50 --cmd "$decode_cmd" $iter_opts \
+ --acwt 1.0 --post-decode-acwt 10.0 \
+ $graph_dir data/${decode_set}_hires \
+ ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1;
+ if $has_fisher; then
+ steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+ data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \
+ ${dir}_online/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1;
+ fi
+ ) || touch $dir/.error &
+ done
+ wait
+ if [ -f $dir/.error ]; then
+ echo "$0: something went wrong in online decoding"
+ exit 1
+ fi
+fi
+
+if $apply_svd; then
+ # Decoding the svd retrained model.
+ dir=$svd_dir
+fi
+
+if [ $stage -le 18 ]; then
+ [ -z $extra_left_context ] && extra_left_context=$chunk_left_context;
+ [ -z $extra_right_context ] && extra_right_context=$chunk_right_context;
+ [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width;
+ if [ ! -z $decode_iter ]; then
+ iter_opts=" --iter $decode_iter "
+ fi
+ for decode_set in rt03 eval2000; do
+ (
+ steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+ --nj 50 --cmd "$decode_cmd" $iter_opts \
+ --extra-left-context $extra_left_context \
+ --extra-right-context $extra_right_context \
+ --extra-left-context-initial 0 \
+ --extra-right-context-final 0 \
+ --frames-per-chunk "$frames_per_chunk" \
+ --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+ $graph_dir data/${decode_set}_hires \
+ $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1;
+ if $has_fisher; then
+ steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+ data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \
+ $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1;
+ fi
+ ) &
+ done
+fi
+
+test_online_decoding=true
+lang=data/lang_fsh_sw1_tg
+if $test_online_decoding && [ $stage -le 19 ]; then
+ # note: if the features change (e.g. you add pitch features), you will have to
+ # change the options of the following command line.
+ steps/online/nnet3/prepare_online_decoding.sh \
+ --mfcc-config conf/mfcc_hires.conf \
+ $lang exp/nnet3/extractor $dir ${dir}_online
+
+ rm $dir/.error 2>/dev/null || true
+ for decode_set in rt03 eval2000; do
+ (
+ # note: we just give it "$decode_set" as it only uses the wav.scp, the
+ # feature type does not matter.
+
+ steps/online/nnet3/decode.sh --nj 50 --cmd "$decode_cmd" $iter_opts \
+ --acwt 1.0 --post-decode-acwt 10.0 \
+ $graph_dir data/${decode_set}_hires \
+ ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1;
+ if $has_fisher; then
+ steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+ data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \
+ ${dir}_online/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1;
+ fi
+ ) || touch $dir/.error &
+ done
+ wait
+ if [ -f $dir/.error ]; then
+ echo "$0: something went wrong in online decoding"
+ exit 1
+ fi
+fi
+
+exit 0;
diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh
index c9d50d1f7bd..f3cc869e6de 100755
--- a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh
+++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh
@@ -151,7 +151,7 @@ if [ $stage -le 12 ]; then
echo "$0: creating neural net configs using the xconfig parser";
num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}')
- learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python)
+ learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)
lstm_opts="decay-time=20 dropout-proportion=0.0"
mkdir -p $dir/configs
diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh
index 1cce08abeee..059a81e15fc 100755
--- a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh
+++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh
@@ -148,7 +148,7 @@ if [ $stage -le 12 ]; then
echo "$0: creating neural net configs using the xconfig parser";
num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}')
- learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python)
+ learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)
gru_opts="dropout-per-frame=true dropout-proportion=0.0 "
mkdir -p $dir/configs
diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh
index 2334c6a1bc1..d86b699d6f6 100755
--- a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh
+++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh
@@ -149,7 +149,7 @@ if [ $stage -le 12 ]; then
echo "$0: creating neural net configs using the xconfig parser";
num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}')
- learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python)
+ learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)
gru_opts="dropout-per-frame=true dropout-proportion=0.0 "
mkdir -p $dir/configs
diff --git a/egs/formosa/README.txt b/egs/formosa/README.txt
new file mode 100644
index 00000000000..3b9d78dad92
--- /dev/null
+++ b/egs/formosa/README.txt
@@ -0,0 +1,22 @@
+### Welcome to the demo recipe of the Formosa Speech in the Wild (FSW) Project ###
+
+The language habits of Taiwanese people differ from those of other Mandarin speakers, in both accent and culture [1]. In particular, Taiwanese write in traditional Chinese characters (i.e., 繁體中文). To address this, a Taiwanese speech corpus collection project, "Formosa Speech in the Wild (FSW)", was initiated in 2017 to support the development of Taiwanese-specific speech recognition techniques.
+
+The FSW corpus will be a large-scale database of real-life, multi-genre Taiwanese spontaneous speech, collected and transcribed from various sources (radio, TV, open courses, etc.). To demonstrate that this database is a reasonable data resource for Taiwanese spontaneous speech recognition research, a baseline recipe is provided here so that everybody, especially students, can develop their own systems easily and quickly.
+
+This recipe is based on the "NER-Trs-Vol1" corpus (about 150 hours of broadcast radio speech selected from FSW). For more details, please visit:
+* Formosa Speech in the Wild (FSW) project (https://sites.google.com/speech.ntut.edu.tw/fsw)
+
+If you want to apply for the NER-Trs-Vol1 corpus, please contact Yuan-Fu Liao (廖元甫) at yfliao@mail.ntut.edu.tw. The corpus is for non-commercial research/education use only and will be distributed via our GitLab server at https://speech.nchc.org.tw.
+
+Any bug reports, errors, comments or suggestions are very welcome.
+
+Yuan-Fu Liao (廖元甫)
+Associate Professor
+Department of Electronic Engineering,
+National Taipei University of Technology
+http://www.ntut.edu.tw/~yfliao
+yfliao@mail.ntut.edu.tw
+
+............
+[1] The languages of Taiwan comprise several varieties belonging to the Austronesian and Sino-Tibetan language families. Taiwanese Mandarin, Hokkien, Hakka and the Formosan languages are used by 83.5%, 81.9%, 6.6% and 1.4% of the population respectively (2010). Given the prevalent use of Taiwanese Hokkien, the Mandarin spoken in Taiwan has been influenced by it to a great extent.
diff --git a/egs/formosa/s5/RESULTS b/egs/formosa/s5/RESULTS
new file mode 100644
index 00000000000..b047e5cefe4
--- /dev/null
+++ b/egs/formosa/s5/RESULTS
@@ -0,0 +1,43 @@
+#
+# Reference results
+#
+# Experimental settings:
+#
+# training set: show CS, BG, DA, QG, SR, SY and WK, in total 18977 utt., 1,088,948 words
+# test set: show JZ, GJ, KX and YX, in total 2112 utt., 135,972 words
+# eval set: show JX, TD and WJ, in total 2222 utt., 104,648 words
+#
+# lexicon: 274,036 words
+# phones (IPA): 196 (tonal)
+#
+
+# WER: test
+
+%WER 61.32 [ 83373 / 135972, 5458 ins, 19156 del, 58759 sub ] exp/mono/decode_test/wer_11_0.0
+%WER 41.00 [ 55742 / 135972, 6725 ins, 12763 del, 36254 sub ] exp/tri1/decode_test/wer_15_0.0
+%WER 40.41 [ 54948 / 135972, 7366 ins, 11505 del, 36077 sub ] exp/tri2/decode_test/wer_14_0.0
+%WER 38.67 [ 52574 / 135972, 6855 ins, 11250 del, 34469 sub ] exp/tri3a/decode_test/wer_15_0.0
+%WER 35.70 [ 48546 / 135972, 7197 ins, 9717 del, 31632 sub ] exp/tri4a/decode_test/wer_17_0.0
+%WER 32.11 [ 43661 / 135972, 6112 ins, 10185 del, 27364 sub ] exp/tri5a/decode_test/wer_17_0.5
+%WER 31.36 [ 42639 / 135972, 6846 ins, 8860 del, 26933 sub ] exp/tri5a_cleaned/decode_test/wer_17_0.5
+%WER 24.43 [ 33218 / 135972, 5524 ins, 7583 del, 20111 sub ] exp/nnet3/tdnn_sp/decode_test/wer_12_0.0
+%WER 23.95 [ 32568 / 135972, 4457 ins, 10271 del, 17840 sub ] exp/chain/tdnn_1a_sp/decode_test/wer_10_0.0
+%WER 23.54 [ 32006 / 135972, 4717 ins, 8644 del, 18645 sub ] exp/chain/tdnn_1b_sp/decode_test/wer_10_0.0
+%WER 20.64 [ 28067 / 135972, 4434 ins, 7946 del, 15687 sub ] exp/chain/tdnn_1c_sp/decode_test/wer_11_0.0
+%WER 20.98 [ 28527 / 135972, 4706 ins, 7816 del, 16005 sub ] exp/chain/tdnn_1d_sp/decode_test/wer_10_0.0
+
+# CER: test
+
+%WER 54.09 [ 116688 / 215718, 4747 ins, 24510 del, 87431 sub ] exp/mono/decode_test/cer_10_0.0
+%WER 32.61 [ 70336 / 215718, 5866 ins, 16282 del, 48188 sub ] exp/tri1/decode_test/cer_13_0.0
+%WER 32.10 [ 69238 / 215718, 6186 ins, 15772 del, 47280 sub ] exp/tri2/decode_test/cer_13_0.0
+%WER 30.40 [ 65583 / 215718, 6729 ins, 13115 del, 45739 sub ] exp/tri3a/decode_test/cer_12_0.0
+%WER 27.53 [ 59389 / 215718, 6311 ins, 13008 del, 40070 sub ] exp/tri4a/decode_test/cer_15_0.0
+%WER 24.21 [ 52232 / 215718, 6425 ins, 11543 del, 34264 sub ] exp/tri5a/decode_test/cer_15_0.0
+%WER 23.41 [ 50492 / 215718, 6645 ins, 10997 del, 32850 sub ] exp/tri5a_cleaned/decode_test/cer_17_0.0
+%WER 17.07 [ 36829 / 215718, 4734 ins, 9938 del, 22157 sub ] exp/nnet3/tdnn_sp/decode_test/cer_12_0.0
+%WER 16.83 [ 36305 / 215718, 4772 ins, 10810 del, 20723 sub ] exp/chain/tdnn_1a_sp/decode_test/cer_9_0.0
+%WER 16.44 [ 35459 / 215718, 4216 ins, 11278 del, 19965 sub ] exp/chain/tdnn_1b_sp/decode_test/cer_10_0.0
+%WER 13.72 [ 29605 / 215718, 4678 ins, 8066 del, 16861 sub ] exp/chain/tdnn_1c_sp/decode_test/cer_10_0.0
+%WER 14.08 [ 30364 / 215718, 5182 ins, 7588 del, 17594 sub ] exp/chain/tdnn_1d_sp/decode_test/cer_9_0.0
+
diff --git a/egs/formosa/s5/cmd.sh b/egs/formosa/s5/cmd.sh
new file mode 100755
index 00000000000..66ae9090820
--- /dev/null
+++ b/egs/formosa/s5/cmd.sh
@@ -0,0 +1,27 @@
+# "queue.pl" uses qsub. The options to it are
+# options to qsub. If you have GridEngine installed,
+# change this to a queue you have access to.
+# Otherwise, use "run.pl", which will run jobs locally
+# (make sure your --num-jobs options are no more than
+# the number of CPUs on your machine).
+
+# Run locally:
+#export train_cmd=run.pl
+#export decode_cmd=run.pl
+
+# JHU cluster (or most clusters using GridEngine, with a suitable
+# conf/queue.conf).
+export train_cmd="queue.pl"
+export decode_cmd="queue.pl --mem 4G"
+
+host=$(hostname -f)
+if [ ${host#*.} == "fit.vutbr.cz" ]; then
+ queue_conf=$HOME/queue_conf/default.conf # see example /homes/kazi/iveselyk/queue_conf/default.conf,
+ export train_cmd="queue.pl --config $queue_conf --mem 2G --matylda 0.2"
+ export decode_cmd="queue.pl --config $queue_conf --mem 3G --matylda 0.1"
+ export cuda_cmd="queue.pl --config $queue_conf --gpu 1 --mem 10G --tmp 40G"
+elif [ ${host#*.} == "cm.cluster" ]; then
+ # MARCC bluecrab cluster:
+ export train_cmd="slurm.pl --time 4:00:00 "
+ export decode_cmd="slurm.pl --mem 4G --time 4:00:00 "
+fi
diff --git a/egs/formosa/s5/conf/decode.config b/egs/formosa/s5/conf/decode.config
new file mode 100644
index 00000000000..d91f86183af
--- /dev/null
+++ b/egs/formosa/s5/conf/decode.config
@@ -0,0 +1,5 @@
+beam=11.0 # beam for decoding. Was 13.0 in the scripts.
+first_beam=8.0 # beam for 1st-pass decoding in SAT.
+
+
+
diff --git a/egs/formosa/s5/conf/mfcc.conf b/egs/formosa/s5/conf/mfcc.conf
new file mode 100644
index 00000000000..a1aa3d6c158
--- /dev/null
+++ b/egs/formosa/s5/conf/mfcc.conf
@@ -0,0 +1,2 @@
+--use-energy=false # only non-default option.
+--sample-frequency=16000
diff --git a/egs/formosa/s5/conf/mfcc_hires.conf b/egs/formosa/s5/conf/mfcc_hires.conf
new file mode 100644
index 00000000000..ca067e77b37
--- /dev/null
+++ b/egs/formosa/s5/conf/mfcc_hires.conf
@@ -0,0 +1,10 @@
+# config for high-resolution MFCC features, intended for neural network training.
+# Note: we keep all cepstra, so it has the same info as filterbank features,
+# but MFCC is more easily compressible (because less correlated) which is why
+# we prefer this method.
+--use-energy=false # use average of log energy, not energy.
+--sample-frequency=16000 # the NER-Trs-Vol1 audio is sampled at 16kHz
+--num-mel-bins=40 # similar to Google's setup.
+--num-ceps=40 # there is no dimensionality reduction.
+--low-freq=40 # low cutoff frequency for mel bins
+--high-freq=-200 # high cutoff frequency, relative to the Nyquist of 8000 (=7800)
diff --git a/egs/formosa/s5/conf/online_cmvn.conf b/egs/formosa/s5/conf/online_cmvn.conf
new file mode 100644
index 00000000000..591367e7ae9
--- /dev/null
+++ b/egs/formosa/s5/conf/online_cmvn.conf
@@ -0,0 +1 @@
+# configuration file for apply-cmvn-online, used when invoking online2-wav-nnet3-latgen-faster.
diff --git a/egs/formosa/s5/conf/pitch.conf b/egs/formosa/s5/conf/pitch.conf
new file mode 100644
index 00000000000..e959a19d5b8
--- /dev/null
+++ b/egs/formosa/s5/conf/pitch.conf
@@ -0,0 +1 @@
+--sample-frequency=16000
diff --git a/egs/formosa/s5/local/chain/run_tdnn.sh b/egs/formosa/s5/local/chain/run_tdnn.sh
new file mode 120000
index 00000000000..e1adaa9346d
--- /dev/null
+++ b/egs/formosa/s5/local/chain/run_tdnn.sh
@@ -0,0 +1 @@
+tuning/run_tdnn_1d.sh
\ No newline at end of file
diff --git a/egs/formosa/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/formosa/s5/local/chain/tuning/run_tdnn_1a.sh
new file mode 100755
index 00000000000..66c5ad3335f
--- /dev/null
+++ b/egs/formosa/s5/local/chain/tuning/run_tdnn_1a.sh
@@ -0,0 +1,181 @@
+#!/bin/bash
+
+# This script is based on run_tdnn_7h.sh in swbd chain recipe.
+
+set -e
+
+# configs for 'chain'
+affix=1a
+stage=0
+train_stage=-10
+get_egs_stage=-10
+dir=exp/chain/tdnn # Note: _sp will get added to this
+decode_iter=
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+max_param_change=2.0
+final_layer_normalize_target=0.5
+num_jobs_initial=2
+num_jobs_final=12
+minibatch_size=128
+frames_per_eg=150,110,90
+remove_egs=false
+common_egs_dir=
+xent_regularize=0.1
+
+# End configuration section.
+echo "$0 $@" # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+ cat <$lang/topo
+fi
+
+if [ $stage -le 9 ]; then
+ # Build a tree using our new topology. This is the critically different
+ # step compared with other recipes.
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+ --context-opts "--context-width=2 --central-position=1" \
+ --cmd "$train_cmd" 5000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 10 ]; then
+ echo "$0: creating neural net configs using the xconfig parser";
+
+ num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}')
+ learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)
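+  # e.g. with the default xent_regularize=0.1 this evaluates to 5.0; the parentheses
+  # keep the expression valid under both python2 and python3.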
+
+ mkdir -p $dir/configs
+  cat <<EOF > $dir/configs/network.xconfig
+ input dim=100 name=ivector
+ input dim=43 name=input
+
+ # please note that it is important to have input layer with the name=input
+ # as the layer immediately preceding the fixed-affine-layer to enable
+ # the use of short notation for the descriptor
+ fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat
+
+ # the first splicing is moved before the lda layer, so no splicing here
+ relu-batchnorm-layer name=tdnn1 dim=625
+ relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=625
+ relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=625
+ relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=625
+ relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=625
+ relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=625
+
+ ## adding the layers for chain branch
+ relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=625 target-rms=0.5
+ output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5
+
+ # adding the layers for xent branch
+ # This block prints the configs for a separate output that will be
+ # trained with a cross-entropy objective in the 'chain' models... this
+ # has the effect of regularizing the hidden parts of the model. we use
+ # 0.5 / args.xent_regularize as the learning rate factor- the factor of
+ # 0.5 / args.xent_regularize is suitable as it means the xent
+ # final-layer learns at a rate independent of the regularization
+ # constant; and the 0.5 was tuned so as to make the relative progress
+ # similar in the xent and regular final layers.
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=625 target-rms=0.5
+ output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5
+
+EOF
+ steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
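+  # the generated configs (e.g. $dir/configs/final.config) can be inspected before training starts.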
+fi
+
+if [ $stage -le 11 ]; then
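+  # Two options below worth noting: --egs.chunk-width takes the comma-separated
+  # list in $frames_per_eg (150,110,90), so training examples are cut into chunks
+  # of roughly those lengths; --use-gpu wait makes each job wait for a free GPU
+  # instead of failing when all GPUs are busy.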
+ steps/nnet3/chain/train.py --stage $train_stage \
+ --cmd "$decode_cmd" \
+ --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \
+ --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
+ --chain.xent-regularize $xent_regularize \
+ --chain.leaky-hmm-coefficient 0.1 \
+ --chain.l2-regularize 0.00005 \
+ --chain.apply-deriv-weights false \
+ --chain.lm-opts="--num-extra-lm-states=2000" \
+ --egs.dir "$common_egs_dir" \
+ --egs.stage $get_egs_stage \
+ --egs.opts "--frames-overlap-per-eg 0" \
+ --egs.chunk-width $frames_per_eg \
+ --trainer.num-chunk-per-minibatch $minibatch_size \
+ --trainer.frames-per-iter 1500000 \
+ --trainer.num-epochs $num_epochs \
+ --trainer.optimization.num-jobs-initial $num_jobs_initial \
+ --trainer.optimization.num-jobs-final $num_jobs_final \
+ --trainer.optimization.initial-effective-lrate $initial_effective_lrate \
+ --trainer.optimization.final-effective-lrate $final_effective_lrate \
+ --trainer.max-param-change $max_param_change \
+ --cleanup.remove-egs $remove_egs \
+ --feat-dir data/${train_set}_hires \
+ --tree-dir $treedir \
+ --lat-dir exp/tri5a_sp_lats \
+ --use-gpu wait \
+ --dir $dir || exit 1;
+fi
+
+if [ $stage -le 12 ]; then
+ # Note: it might appear that this $lang directory is mismatched, and it is as
+ # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+ # the lang directory.
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph
+fi
+
+graph_dir=$dir/graph
+if [ $stage -le 13 ]; then
+ for test_set in test eval; do
+ steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+ --nj 10 --cmd "$decode_cmd" \
+ --online-ivector-dir exp/nnet3/ivectors_$test_set \
+ $graph_dir data/${test_set}_hires $dir/decode_${test_set} || exit 1;
+ done
+ wait;
+fi
+
+exit 0;
diff --git a/egs/formosa/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/formosa/s5/local/chain/tuning/run_tdnn_1b.sh
new file mode 100755
index 00000000000..1981bb0530d
--- /dev/null
+++ b/egs/formosa/s5/local/chain/tuning/run_tdnn_1b.sh
@@ -0,0 +1,188 @@
+#!/bin/bash
+
+# This script shows improvement arising from data cleaning.
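+# (in this recipe the cleaned training data is produced by local/run_cleanup_segmentation.sh)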
+
+# CER:
+# %WER 16.83 [ 36305 / 215718, 4772 ins, 10810 del, 20723 sub ] exp/chain/tdnn_1a_sp/decode_test/cer_9_0.0
+# %WER 16.44 [ 35459 / 215718, 4216 ins, 11278 del, 19965 sub ] exp/chain/tdnn_1b_sp/decode_test/cer_10_0.0
+
+# steps/info/chain_dir_info.pl exp/chain/tdnn_1b_sp
+# exp/chain/tdnn_1b_sp: num-iters=133 nj=2..12 num-params=12.5M dim=43+100->4528 combine=-0.073->-0.073 (over 2) xent:train/valid[87,132,final]=(-1.05,-0.964,-0.963/-1.10,-1.06,-1.05) logprob:train/valid[87,132,final]=(-0.079,-0.065,-0.065/-0.094,-0.092,-0.092)
+
+set -e
+
+# configs for 'chain'
+affix=1b
+nnet3_affix=_1b
+stage=0
+train_stage=-10
+get_egs_stage=-10
+dir=exp/chain/tdnn # Note: _sp will get added to this
+decode_iter=
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+max_param_change=2.0
+final_layer_normalize_target=0.5
+num_jobs_initial=2
+num_jobs_final=12
+minibatch_size=128
+frames_per_eg=150,110,90
+remove_egs=false
+common_egs_dir=
+xent_regularize=0.1
+
+# End configuration section.
+echo "$0 $@" # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+ cat <$lang/topo
+fi
+
+if [ $stage -le 9 ]; then
+ # Build a tree using our new topology. This is the critically different
+ # step compared with other recipes.
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+ --context-opts "--context-width=2 --central-position=1" \
+ --cmd "$train_cmd" 5000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 10 ]; then
+ echo "$0: creating neural net configs using the xconfig parser";
+
+ num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}')
+ learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)
+
+ mkdir -p $dir/configs
+  cat <<EOF > $dir/configs/network.xconfig
+ input dim=100 name=ivector
+ input dim=43 name=input
+
+ # please note that it is important to have input layer with the name=input
+ # as the layer immediately preceding the fixed-affine-layer to enable
+ # the use of short notation for the descriptor
+ fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat
+
+ # the first splicing is moved before the lda layer, so no splicing here
+ relu-batchnorm-layer name=tdnn1 dim=625
+ relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=625
+ relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=625
+ relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=625
+ relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=625
+ relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=625
+
+ ## adding the layers for chain branch
+ relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=625 target-rms=0.5
+ output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5
+
+ # adding the layers for xent branch
+ # This block prints the configs for a separate output that will be
+ # trained with a cross-entropy objective in the 'chain' models... this
+ # has the effect of regularizing the hidden parts of the model. we use
+ # 0.5 / args.xent_regularize as the learning rate factor- the factor of
+ # 0.5 / args.xent_regularize is suitable as it means the xent
+ # final-layer learns at a rate independent of the regularization
+ # constant; and the 0.5 was tuned so as to make the relative progress
+ # similar in the xent and regular final layers.
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=625 target-rms=0.5
+ output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5
+
+EOF
+ steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
+fi
+
+if [ $stage -le 11 ]; then
+ steps/nnet3/chain/train.py --stage $train_stage \
+ --cmd "$decode_cmd" \
+ --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${train_set} \
+ --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
+ --chain.xent-regularize $xent_regularize \
+ --chain.leaky-hmm-coefficient 0.1 \
+ --chain.l2-regularize 0.00005 \
+ --chain.apply-deriv-weights false \
+ --chain.lm-opts="--num-extra-lm-states=2000" \
+ --egs.dir "$common_egs_dir" \
+ --egs.stage $get_egs_stage \
+ --egs.opts "--frames-overlap-per-eg 0" \
+ --egs.chunk-width $frames_per_eg \
+ --trainer.num-chunk-per-minibatch $minibatch_size \
+ --trainer.frames-per-iter 1500000 \
+ --trainer.num-epochs $num_epochs \
+ --trainer.optimization.num-jobs-initial $num_jobs_initial \
+ --trainer.optimization.num-jobs-final $num_jobs_final \
+ --trainer.optimization.initial-effective-lrate $initial_effective_lrate \
+ --trainer.optimization.final-effective-lrate $final_effective_lrate \
+ --trainer.max-param-change $max_param_change \
+ --cleanup.remove-egs $remove_egs \
+ --feat-dir data/${train_set}_hires \
+ --tree-dir $treedir \
+ --lat-dir exp/tri5a_sp_lats \
+ --use-gpu wait \
+ --dir $dir || exit 1;
+fi
+
+if [ $stage -le 12 ]; then
+ # Note: it might appear that this $lang directory is mismatched, and it is as
+ # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+ # the lang directory.
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph
+fi
+
+graph_dir=$dir/graph
+if [ $stage -le 13 ]; then
+ for test_set in test eval; do
+ steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+ --nj 10 --cmd "$decode_cmd" \
+ --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_$test_set \
+ $graph_dir data/${test_set}_hires $dir/decode_${test_set} || exit 1;
+ done
+ wait;
+fi
+exit 0;
diff --git a/egs/formosa/s5/local/chain/tuning/run_tdnn_1c.sh b/egs/formosa/s5/local/chain/tuning/run_tdnn_1c.sh
new file mode 100755
index 00000000000..6fa10344cfc
--- /dev/null
+++ b/egs/formosa/s5/local/chain/tuning/run_tdnn_1c.sh
@@ -0,0 +1,191 @@
+#!/bin/bash
+
+# CER:
+# %WER 16.44 [ 35459 / 215718, 4216 ins, 11278 del, 19965 sub ] exp/chain/tdnn_1b_sp/decode_test/cer_10_0.0
+# %WER 13.72 [ 29605 / 215718, 4678 ins, 8066 del, 16861 sub ] exp/chain/tdnn_1c_sp/decode_test/cer_10_0.0
+
+# steps/info/chain_dir_info.pl exp/chain/tdnn_1c_sp
+# exp/chain/tdnn_1c_sp: num-iters=147 nj=3..16 num-params=17.9M dim=43+100->4528 combine=-0.041->-0.041 (over 2) xent:train/valid[97,146,final]=(-0.845,-0.625,-0.618/-0.901,-0.710,-0.703) logprob:train/valid[97,146,final]=(-0.064,-0.040,-0.039/-0.072,-0.058,-0.057)
+
+set -e
+
+# configs for 'chain'
+affix=1c
+nnet3_affix=_1b
+stage=0
+train_stage=-10
+get_egs_stage=-10
+dir=exp/chain/tdnn # Note: _sp will get added to this
+decode_iter=
+
+# training options
+num_epochs=6
+initial_effective_lrate=0.00025
+final_effective_lrate=0.000025
+max_param_change=2.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=64
+frames_per_eg=150,110,90
+remove_egs=false
+common_egs_dir=
+xent_regularize=0.1
+dropout_schedule='0,0@0.20,0.5@0.50,0'
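+# dropout proportion is 0 until 20% of training, rises linearly to 0.5 at 50%, then falls back to 0 by the end.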
+
+# End configuration section.
+echo "$0 $@" # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+ cat <$lang/topo
+fi
+
+if [ $stage -le 9 ]; then
+ # Build a tree using our new topology. This is the critically different
+ # step compared with other recipes.
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+ --context-opts "--context-width=2 --central-position=1" \
+ --cmd "$train_cmd" 5000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 10 ]; then
+ echo "$0: creating neural net configs using the xconfig parser";
+
+ num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}')
+ learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)
+ affine_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true"
+ tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66"
+ linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0"
+ prefinal_opts="l2-regularize=0.01"
+ output_opts="l2-regularize=0.002"
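+  # these options configure the factorized TDNN (TDNN-F) layers below: each
+  # 1536-dim layer is factorized through a 160-dim linear bottleneck, and
+  # bypass-scale=0.66 adds a scaled skip connection from the previous layer.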
+
+ mkdir -p $dir/configs
+  cat <<EOF > $dir/configs/network.xconfig
+ input dim=100 name=ivector
+ input dim=43 name=input
+
+ # please note that it is important to have input layer with the name=input
+ # as the layer immediately preceding the fixed-affine-layer to enable
+ # the use of short notation for the descriptor
+ fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat
+ # the first splicing is moved before the lda layer, so no splicing here
+ relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536
+ tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1
+ tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1
+ tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1
+ tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0
+ tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+ tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+ tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+ tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+ tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+ tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+ tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+ tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+ tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+ tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+ linear-component name=prefinal-l dim=256 $linear_opts
+ prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256
+ output-layer name=output include-log-softmax=false dim=$num_targets $output_opts
+ prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256
+ output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts
+EOF
+ steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
+fi
+
+if [ $stage -le 11 ]; then
+ steps/nnet3/chain/train.py --stage $train_stage \
+ --cmd "$decode_cmd" \
+ --feat.online-ivector-dir exp/nnet3$nnet3_affix/ivectors_${train_set} \
+ --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
+ --chain.xent-regularize $xent_regularize \
+ --chain.leaky-hmm-coefficient 0.1 \
+ --chain.l2-regularize 0.0 \
+ --chain.apply-deriv-weights false \
+ --chain.lm-opts="--num-extra-lm-states=2000" \
+ --trainer.dropout-schedule $dropout_schedule \
+ --trainer.add-option="--optimization.memory-compression-level=2" \
+ --egs.dir "$common_egs_dir" \
+ --egs.stage $get_egs_stage \
+ --egs.opts "--frames-overlap-per-eg 0 --constrained false" \
+ --egs.chunk-width $frames_per_eg \
+ --trainer.num-chunk-per-minibatch $minibatch_size \
+ --trainer.frames-per-iter 1500000 \
+ --trainer.num-epochs $num_epochs \
+ --trainer.optimization.num-jobs-initial $num_jobs_initial \
+ --trainer.optimization.num-jobs-final $num_jobs_final \
+ --trainer.optimization.initial-effective-lrate $initial_effective_lrate \
+ --trainer.optimization.final-effective-lrate $final_effective_lrate \
+ --trainer.max-param-change $max_param_change \
+ --cleanup.remove-egs $remove_egs \
+ --feat-dir data/${train_set}_hires \
+ --tree-dir $treedir \
+ --lat-dir exp/tri5a_sp_lats \
+ --use-gpu wait \
+ --dir $dir || exit 1;
+fi
+
+if [ $stage -le 12 ]; then
+ # Note: it might appear that this $lang directory is mismatched, and it is as
+ # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+ # the lang directory.
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph
+fi
+
+graph_dir=$dir/graph
+if [ $stage -le 13 ]; then
+ for test_set in test eval; do
+ steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+ --nj 10 --cmd "$decode_cmd" \
+ --online-ivector-dir exp/nnet3${nnet3_affix:+_$nnet3_affix}/ivectors_$test_set \
+ $graph_dir data/${test_set}_hires $dir/decode_${test_set} || exit 1;
+ done
+ wait;
+fi
+
+exit 0;
diff --git a/egs/formosa/s5/local/chain/tuning/run_tdnn_1d.sh b/egs/formosa/s5/local/chain/tuning/run_tdnn_1d.sh
new file mode 100755
index 00000000000..1f4b7e12850
--- /dev/null
+++ b/egs/formosa/s5/local/chain/tuning/run_tdnn_1d.sh
@@ -0,0 +1,190 @@
+#!/bin/bash
+
+# CER:
+# 1a: %WER 16.83 [ 36305 / 215718, 4772 ins, 10810 del, 20723 sub ] exp/chain/tdnn_1a_sp/decode_test/cer_9_0.0
+# 1d: %WER 14.08 [ 30364 / 215718, 5182 ins, 7588 del, 17594 sub ] exp/chain/tdnn_1d_sp/decode_test/cer_9_0.0
+
+# steps/info/chain_dir_info.pl exp/chain/tdnn_1d_sp
+# exp/chain/tdnn_1d_sp: num-iters=157 nj=3..16 num-params=18.6M dim=43+100->5792 combine=-0.050->-0.050 (over 1) xent:train/valid[103,156,final]=(-0.977,-0.735,-0.725/-0.953,-0.772,-0.768) logprob:train/valid[103,156,final]=(-0.077,-0.052,-0.052/-0.079,-0.065,-0.066)
+
+set -e
+
+# configs for 'chain'
+affix=1d
+stage=0
+train_stage=-10
+get_egs_stage=-10
+dir=exp/chain/tdnn # Note: _sp will get added to this
+decode_iter=
+
+# training options
+num_epochs=6
+initial_effective_lrate=0.00025
+final_effective_lrate=0.000025
+max_param_change=2.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=64
+frames_per_eg=150,110,90
+remove_egs=false
+common_egs_dir=
+xent_regularize=0.1
+dropout_schedule='0,0@0.20,0.5@0.50,0'
+
+# End configuration section.
+echo "$0 $@" # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+ cat <$lang/topo
+fi
+
+if [ $stage -le 9 ]; then
+ # Build a tree using our new topology. This is the critically different
+ # step compared with other recipes.
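+  # 1d uses 7000 tree leaves here (1a-1c use 5000), which is why the chain_dir_info
+  # line above reports a larger output dimension (5792 pdfs vs 4528 in 1b/1c).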
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+ --context-opts "--context-width=2 --central-position=1" \
+ --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 10 ]; then
+ echo "$0: creating neural net configs using the xconfig parser";
+
+ num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}')
+ learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)
+ affine_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true"
+ tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66"
+ linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0"
+ prefinal_opts="l2-regularize=0.01"
+ output_opts="l2-regularize=0.002"
+
+ mkdir -p $dir/configs
+  cat <<EOF > $dir/configs/network.xconfig
+ input dim=100 name=ivector
+ input dim=43 name=input
+
+ # please note that it is important to have input layer with the name=input
+ # as the layer immediately preceding the fixed-affine-layer to enable
+ # the use of short notation for the descriptor
+ fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat
+ # the first splicing is moved before the lda layer, so no splicing here
+ relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536
+ tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1
+ tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1
+ tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1
+ tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0
+ tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+ tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+ tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+ tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+ tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+ tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+ tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+ tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+ tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+ tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+ linear-component name=prefinal-l dim=256 $linear_opts
+ prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256
+ output-layer name=output include-log-softmax=false dim=$num_targets $output_opts
+ prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256
+ output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts
+EOF
+ steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
+fi
+
+if [ $stage -le 11 ]; then
+ steps/nnet3/chain/train.py --stage $train_stage \
+ --cmd "$decode_cmd" \
+ --feat.online-ivector-dir exp/nnet3$nnet3_affix/ivectors_${train_set} \
+ --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
+ --chain.xent-regularize $xent_regularize \
+ --chain.leaky-hmm-coefficient 0.1 \
+ --chain.l2-regularize 0.0 \
+ --chain.apply-deriv-weights false \
+ --chain.lm-opts="--num-extra-lm-states=2000" \
+ --trainer.dropout-schedule $dropout_schedule \
+ --trainer.add-option="--optimization.memory-compression-level=2" \
+ --egs.dir "$common_egs_dir" \
+ --egs.stage $get_egs_stage \
+ --egs.opts "--frames-overlap-per-eg 0 --constrained false" \
+ --egs.chunk-width $frames_per_eg \
+ --trainer.num-chunk-per-minibatch $minibatch_size \
+ --trainer.frames-per-iter 1500000 \
+ --trainer.num-epochs $num_epochs \
+ --trainer.optimization.num-jobs-initial $num_jobs_initial \
+ --trainer.optimization.num-jobs-final $num_jobs_final \
+ --trainer.optimization.initial-effective-lrate $initial_effective_lrate \
+ --trainer.optimization.final-effective-lrate $final_effective_lrate \
+ --trainer.max-param-change $max_param_change \
+ --cleanup.remove-egs $remove_egs \
+ --feat-dir data/${train_set}_hires \
+ --tree-dir $treedir \
+ --lat-dir exp/tri5a_sp_lats \
+ --use-gpu wait \
+ --dir $dir || exit 1;
+fi
+
+if [ $stage -le 12 ]; then
+ # Note: it might appear that this $lang directory is mismatched, and it is as
+ # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+ # the lang directory.
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph
+fi
+
+graph_dir=$dir/graph
+if [ $stage -le 13 ]; then
+ for test_set in test eval; do
+ steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+ --nj 10 --cmd "$decode_cmd" \
+ --online-ivector-dir exp/nnet3${nnet3_affix:+_$nnet3_affix}/ivectors_$test_set \
+ $graph_dir data/${test_set}_hires $dir/decode_${test_set} || exit 1;
+ done
+ wait;
+fi
+
+exit 0;
diff --git a/egs/formosa/s5/local/nnet3/run_ivector_common.sh b/egs/formosa/s5/local/nnet3/run_ivector_common.sh
new file mode 100755
index 00000000000..723589ddd2e
--- /dev/null
+++ b/egs/formosa/s5/local/nnet3/run_ivector_common.sh
@@ -0,0 +1,145 @@
+#!/bin/bash
+
+set -euo pipefail
+
+# This script is adapted from mini_librispeech/s5/local/nnet3/run_ivector_common.sh
+
+# This script is called from local/nnet3/run_tdnn.sh and
+# local/chain/run_tdnn.sh (and may eventually be called by more
+# scripts). It contains the common feature preparation and
+# iVector-related parts of the script. See those scripts for examples
+# of usage.
+
+stage=0
+train_set=train
+test_sets="test eval"
+gmm=tri5a
+
+nnet3_affix=
+
+. ./cmd.sh
+. ./path.sh
+. utils/parse_options.sh
+
+gmm_dir=exp/${gmm}
+ali_dir=exp/${gmm}_sp_ali
+
+for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do
+ if [ ! -f $f ]; then
+ echo "$0: expected file $f to exist"
+ exit 1
+ fi
+done
+
+if [ $stage -le 1 ]; then
+  # Although the nnet will be trained on high-resolution data, we still have to
+  # perturb the normal data to get the alignments (_sp stands for speed-perturbed).
+ echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)"
+ utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp
+ echo "$0: making MFCC features for low-resolution speed-perturbed data"
+ steps/make_mfcc_pitch.sh --cmd "$train_cmd" --nj 70 data/${train_set}_sp \
+ exp/make_mfcc/${train_set}_sp mfcc_perturbed || exit 1;
+ steps/compute_cmvn_stats.sh data/${train_set}_sp \
+ exp/make_mfcc/${train_set}_sp mfcc_perturbed || exit 1;
+ utils/fix_data_dir.sh data/${train_set}_sp
+fi
+
+if [ $stage -le 2 ]; then
+ echo "$0: aligning with the perturbed low-resolution data"
+ steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
+ data/${train_set}_sp data/lang $gmm_dir $ali_dir || exit 1
+fi
+
+if [ $stage -le 3 ]; then
+ # Create high-resolution MFCC features (with 40 cepstra instead of 13).
+ # this shows how you can split across multiple file-systems.
+ echo "$0: creating high-resolution MFCC features"
+ mfccdir=mfcc_perturbed_hires
+
+ for datadir in ${train_set}_sp ${test_sets}; do
+ utils/copy_data_dir.sh data/$datadir data/${datadir}_hires
+ done
+
+ # do volume-perturbation on the training data prior to extracting hires
+ # features; this helps make trained nnets more invariant to test data volume.
+ utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires || exit 1;
+
+ for datadir in ${train_set}_sp ${test_sets}; do
+ steps/make_mfcc_pitch.sh --nj 10 --mfcc-config conf/mfcc_hires.conf \
+ --cmd "$train_cmd" data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1;
+ steps/compute_cmvn_stats.sh data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1;
+ utils/fix_data_dir.sh data/${datadir}_hires || exit 1;
+ # create MFCC data dir without pitch to extract iVector
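+    # (the hires features are 40 MFCCs + 3 pitch dims = 43; only the first 40 dims are used for iVector extraction)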
+ utils/data/limit_feature_dim.sh 0:39 data/${datadir}_hires data/${datadir}_hires_nopitch || exit 1;
+ steps/compute_cmvn_stats.sh data/${datadir}_hires_nopitch exp/make_hires/$datadir $mfccdir || exit 1;
+ done
+fi
+
+if [ $stage -le 4 ]; then
+ echo "$0: computing a subset of data to train the diagonal UBM."
+ # We'll use about a quarter of the data.
+ mkdir -p exp/nnet3${nnet3_affix}/diag_ubm
+ temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm
+
+ num_utts_total=$(wc -l $dir/configs/network.xconfig
+ input dim=100 name=ivector
+ input dim=43 name=input
+
+ # please note that it is important to have input layer with the name=input
+ # as the layer immediately preceding the fixed-affine-layer to enable
+ # the use of short notation for the descriptor
+ fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat
+
+ # the first splicing is moved before the lda layer, so no splicing here
+ relu-batchnorm-layer name=tdnn1 dim=850
+ relu-batchnorm-layer name=tdnn2 dim=850 input=Append(-1,0,2)
+ relu-batchnorm-layer name=tdnn3 dim=850 input=Append(-3,0,3)
+ relu-batchnorm-layer name=tdnn4 dim=850 input=Append(-7,0,2)
+ relu-batchnorm-layer name=tdnn5 dim=850 input=Append(-3,0,3)
+ relu-batchnorm-layer name=tdnn6 dim=850
+ output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5
+EOF
+ steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
+fi
+
+if [ $stage -le 8 ]; then
+ steps/nnet3/train_dnn.py --stage=$train_stage \
+ --cmd="$decode_cmd" \
+ --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \
+ --feat.cmvn-opts="--norm-means=false --norm-vars=false" \
+ --trainer.num-epochs $num_epochs \
+ --trainer.optimization.num-jobs-initial $num_jobs_initial \
+ --trainer.optimization.num-jobs-final $num_jobs_final \
+ --trainer.optimization.initial-effective-lrate $initial_effective_lrate \
+ --trainer.optimization.final-effective-lrate $final_effective_lrate \
+ --egs.dir "$common_egs_dir" \
+ --cleanup.remove-egs $remove_egs \
+ --cleanup.preserve-model-interval 500 \
+ --use-gpu wait \
+ --feat-dir=data/${train_set}_hires \
+ --ali-dir $ali_dir \
+ --lang data/lang \
+ --reporting.email="$reporting_email" \
+ --dir=$dir || exit 1;
+fi
+
+if [ $stage -le 9 ]; then
+ # this version of the decoding treats each utterance separately
+ # without carrying forward speaker information.
+
+ for decode_set in test eval; do
+ num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l`
+ decode_dir=${dir}/decode_$decode_set
+ steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \
+ --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+ $graph_dir data/${decode_set}_hires $decode_dir || exit 1;
+ done
+ wait;
+fi
+
+exit 0;
diff --git a/egs/formosa/s5/local/prepare_data.sh b/egs/formosa/s5/local/prepare_data.sh
new file mode 100755
index 00000000000..68f342e1549
--- /dev/null
+++ b/egs/formosa/s5/local/prepare_data.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+# Copyright 2015-2016 Sarah Flora Juan
+# Copyright 2016 Johns Hopkins University (Author: Yenda Trmal)
+# Copyright 2018 Yuan-Fu Liao, National Taipei University of Technology
+# AsusTek Computer Inc. (Author: Alex Hung)
+
+# Apache 2.0
+
+set -e -o pipefail
+
+train_dir=NER-Trs-Vol1/Train
+eval_dir=NER-Trs-Vol1-Eval
+eval_key_dir=NER-Trs-Vol1-Eval-Key
+
+. ./path.sh
+. parse_options.sh
+
+for x in $train_dir $eval_dir; do
+ if [ ! -d "$x" ] ; then
+ echo >&2 "The directory $x does not exist"
+ fi
+done
+
+if [ -z "$(command -v dos2unix 2>/dev/null)" ]; then
+ echo "dos2unix not found on PATH. Please install it manually."
+ exit 1;
+fi
+
+# have to remove previous files to avoid filtering speakers according to cmvn.scp and feats.scp
+rm -rf data/all data/train data/test data/eval data/local/train
+mkdir -p data/all data/train data/test data/eval data/local/train
+
+
+# make utt2spk, wav.scp and text
+find $train_dir -name "*.wav" -exec sh -c 'x={}; y=$(basename -s .wav $x); printf "%s %s\n" $y $y' \; | dos2unix > data/all/utt2spk
+find $train_dir -name "*.wav" -exec sh -c 'x={}; y=$(basename -s .wav $x); printf "%s %s\n" $y $x' \; | dos2unix > data/all/wav.scp
+find $train_dir -name "*.txt" -exec sh -c 'x={}; y=$(basename -s .txt $x); printf "%s " $y; cat $x' \; | dos2unix > data/all/text
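+# note: each utterance is treated as its own "speaker" here (utt2spk maps every utterance id to itself).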
+
+# fix_data_dir.sh fixes common mistakes (unsorted entries in wav.scp,
+# duplicate entries and so on). Also, it regenerates the spk2utt from
+# utt2spk
+utils/fix_data_dir.sh data/all
+
+echo "Preparing train and test data"
+# test set: JZ, GJ, KX, YX
+grep -E "(JZ|GJ|KX|YX)_" data/all/utt2spk | awk '{print $1}' > data/all/cv.spk
+utils/subset_data_dir_tr_cv.sh --cv-spk-list data/all/cv.spk data/all data/train data/test
+
+# for LM training
+echo "cp data/train/text data/local/train/text for language model training"
+cat data/train/text | awk '{$1=""}1;' | awk '{$1=$1}1;' > data/local/train/text
+
+# preparing EVAL set.
+find $eval_dir -name "*.wav" -exec sh -c 'x={}; y=$(basename -s .wav $x); printf "%s %s\n" $y $y' \; | dos2unix > data/eval/utt2spk
+find $eval_dir -name "*.wav" -exec sh -c 'x={}; y=$(basename -s .wav $x); printf "%s %s\n" $y $x' \; | dos2unix > data/eval/wav.scp
+find $eval_key_dir -name "*.txt" -exec sh -c 'x={}; y=$(basename -s .txt $x); printf "%s " $y; cat $x' \; | dos2unix > data/eval/text
+utils/fix_data_dir.sh data/eval
+
+echo "Data preparation completed."
+exit 0;
diff --git a/egs/formosa/s5/local/prepare_dict.sh b/egs/formosa/s5/local/prepare_dict.sh
new file mode 100755
index 00000000000..4e580f5f6e8
--- /dev/null
+++ b/egs/formosa/s5/local/prepare_dict.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+# Copyright 2015-2016 Sarah Flora Juan
+# Copyright 2016 Johns Hopkins University (Author: Yenda Trmal)
+# Copyright 2018 Yuan-Fu Liao, National Taipei University of Technology
+# Apache 2.0
+
+source_dir=NER-Trs-Vol1/Language
+dict_dir=data/local/dict
+rm -rf $dict_dir
+mkdir -p $dict_dir
+
+#
+#
+#
+rm -f $dict_dir/lexicon.txt
+touch $dict_dir/lexicon.txt
+cat $source_dir/lexicon.txt > $dict_dir/lexicon.txt
+echo " SIL" >> $dict_dir/lexicon.txt
+
+#
+# define silence phone
+#
+rm -f $dict_dir/silence_phones.txt
+touch $dict_dir/silence_phones.txt
+
+echo "SIL" > $dict_dir/silence_phones.txt
+
+#
+# find nonsilence phones
+#
+rm -f $dict_dir/nonsilence_phones.txt
+touch $dict_dir/nonsilence_phones.txt
+
+cat $source_dir/lexicon.txt | grep -v -F -f $dict_dir/silence_phones.txt | \
+ perl -ane 'print join("\n", @F[1..$#F]) . "\n"; ' | \
+ sort -u > $dict_dir/nonsilence_phones.txt
+
+#
+# add optional silence phones
+#
+
+rm -f $dict_dir/optional_silence.txt
+touch $dict_dir/optional_silence.txt
+echo "SIL" > $dict_dir/optional_silence.txt
+
+#
+# extra questions
+#
+rm -f $dict_dir/extra_questions.txt
+touch $dict_dir/extra_questions.txt
+cat $dict_dir/silence_phones.txt | awk '{printf("%s ", $1);} END{printf "\n";}' > $dict_dir/extra_questions.txt || exit 1;
+cat $dict_dir/nonsilence_phones.txt | awk '{printf("%s ", $1);} END{printf "\n";}' >> $dict_dir/extra_questions.txt || exit 1;
+
+echo "Dictionary preparation succeeded"
+exit 0;
diff --git a/egs/formosa/s5/local/prepare_lm.sh b/egs/formosa/s5/local/prepare_lm.sh
new file mode 100755
index 00000000000..59fe1529658
--- /dev/null
+++ b/egs/formosa/s5/local/prepare_lm.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+# Copyright 2015-2016 Sarah Flora Juan
+# Copyright 2016 Johns Hopkins University (Author: Yenda Trmal)
+# Apache 2.0
+
+set -e -o pipefail
+
+# To create G.fst from ARPA language model
+. ./path.sh || die "path.sh expected";
+
+local/train_lms_srilm.sh --train-text data/train/text data/ data/srilm
+
+#nl -nrz -w10 corpus/LM/iban-bp-2012.txt | utils/shuffle_list.pl > data/local/external_text
+local/train_lms_srilm.sh --train-text data/local/external_text data/ data/srilm_external
+
+# let's do ngram interpolation of the previous two LMs
+# the lm.gz is always symlink to the model with the best perplexity, so we use that
+
+mkdir -p data/srilm_interp
+for w in 0.9 0.8 0.7 0.6 0.5; do
+ ngram -lm data/srilm/lm.gz -mix-lm data/srilm_external/lm.gz \
+ -lambda $w -write-lm data/srilm_interp/lm.${w}.gz
+ echo -n "data/srilm_interp/lm.${w}.gz "
+ ngram -lm data/srilm_interp/lm.${w}.gz -ppl data/srilm/dev.txt | paste -s -
+done | sort -k15,15g > data/srilm_interp/perplexities.txt
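+# each output line is "<lm-file> <ngram -ppl output>"; the numeric sort on the perplexity
+# field ranks the interpolation weights best-first, so the head -n1 below picks the winner.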
+
+# for basic decoding, let's use only a trigram LM
+[ -d data/lang_test/ ] && rm -rf data/lang_test
+cp -R data/lang data/lang_test
+lm=$(cat data/srilm/perplexities.txt | grep 3gram | head -n1 | awk '{print $1}')
+local/arpa2G.sh $lm data/lang_test data/lang_test
+
+# for decoding using bigger LM let's find which interpolated gave the most improvement
+[ -d data/lang_big ] && rm -rf data/lang_big
+cp -R data/lang data/lang_big
+lm=$(cat data/srilm_interp/perplexities.txt | head -n1 | awk '{print $1}')
+local/arpa2G.sh $lm data/lang_big data/lang_big
+
+# for a really big LM, we should decode using the small LM
+# and rescore using the big LM
+utils/build_const_arpa_lm.sh $lm data/lang_big data/lang_big
+exit 0;
diff --git a/egs/formosa/s5/local/run_cleanup_segmentation.sh b/egs/formosa/s5/local/run_cleanup_segmentation.sh
new file mode 100755
index 00000000000..b72cd89b4d1
--- /dev/null
+++ b/egs/formosa/s5/local/run_cleanup_segmentation.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+
+# Copyright 2016 Vimal Manohar
+# 2016 Johns Hopkins University (author: Daniel Povey)
+# 2017 Nagendra Kumar Goel
+# 2019 AsusTek Computer Inc. (author: Alex Hung)
+# Apache 2.0
+
+# This script demonstrates how to re-segment training data selecting only the
+# "good" audio that matches the transcripts.
+# The basic idea is to decode with an existing in-domain acoustic model, and a
+# biased language model built from the reference, and then work out the
+# segmentation from a ctm like file.
+
+# For nnet3 and chain results after cleanup, see the scripts in
+# local/nnet3/run_tdnn.sh and local/chain/run_tdnn.sh
+
+# GMM Results for speaker-independent (SI) and speaker adaptive training (SAT) systems on dev and test sets
+# [will add these later].
+
+set -e
+set -o pipefail
+set -u
+
+stage=0
+cleanup_stage=0
+data=data/train
+cleanup_affix=cleaned
+srcdir=exp/tri5a
+langdir=data/lang_test
+nj=20
+decode_nj=20
+decode_num_threads=1
+
+. ./cmd.sh
+if [ -f ./path.sh ]; then . ./path.sh; fi
+. utils/parse_options.sh
+
+cleaned_data=${data}_${cleanup_affix}
+
+dir=${srcdir}_${cleanup_affix}_work
+cleaned_dir=${srcdir}_${cleanup_affix}
+
+if [ $stage -le 1 ]; then
+ # This does the actual data cleanup.
+ steps/cleanup/clean_and_segment_data.sh --stage $cleanup_stage \
+ --nj $nj --cmd "$train_cmd" \
+ $data $langdir $srcdir $dir $cleaned_data
+fi
+
+if [ $stage -le 2 ]; then
+ steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \
+ $cleaned_data $langdir $srcdir ${srcdir}_ali_${cleanup_affix}
+fi
+
+if [ $stage -le 3 ]; then
+ steps/train_sat.sh --cmd "$train_cmd" \
+ 3500 100000 $cleaned_data $langdir ${srcdir}_ali_${cleanup_affix} ${cleaned_dir}
+fi
+
+utils/data/get_utt2dur.sh ${data}
+utils/data/get_utt2dur.sh ${cleaned_data}
+ori_avg_dur=$(awk 'BEGIN{total=0}{total += $2}END{printf("%.2f", total/NR)}' ${data}/utt2dur)
+new_avg_dur=$(awk 'BEGIN{total=0}{total += $2}END{printf("%.2f", total/NR)}' ${cleaned_data}/utt2dur)
+echo "average duration was reduced from ${ori_avg_dur}s to ${new_avg_dur}s."
+# average duration was reduced from 21.68s to 10.97s.
+exit 0;
diff --git a/egs/formosa/s5/local/score.sh b/egs/formosa/s5/local/score.sh
new file mode 100755
index 00000000000..a9786169973
--- /dev/null
+++ b/egs/formosa/s5/local/score.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+set -e -o pipefail
+set -x
+steps/score_kaldi.sh "$@"
+steps/scoring/score_kaldi_cer.sh --stage 2 "$@"
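+# starting CER scoring at --stage 2 reuses the per-LMWT hypotheses already produced
+# by score_kaldi.sh above and only re-scores them at the character level.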
+
+echo "$0: Done"
diff --git a/egs/formosa/s5/local/train_lms.sh b/egs/formosa/s5/local/train_lms.sh
new file mode 100755
index 00000000000..efc5b92c573
--- /dev/null
+++ b/egs/formosa/s5/local/train_lms.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+
+
+# To be run from one directory above this script.
+. ./path.sh
+
+text=data/local/train/text
+lexicon=data/local/dict/lexicon.txt
+
+for f in "$text" "$lexicon"; do
+  [ ! -f "$f" ] && echo "$0: No such file $f" && exit 1;
+done
+
+# This script takes no arguments.  It assumes you have already run
+# local/prepare_data.sh and local/prepare_dict.sh.
+# It takes as input the files
+# data/local/train/text
+# data/local/dict/lexicon.txt
+dir=data/local/lm
+mkdir -p $dir
+
+kaldi_lm=`which train_lm.sh`
+if [ -z $kaldi_lm ]; then
+ echo "$0: train_lm.sh is not found. That might mean it's not installed"
+ echo "$0: or it is not added to PATH"
+ echo "$0: Use the script tools/extra/install_kaldi_lm.sh to install it"
+ exit 1
+fi
+
+cleantext=$dir/text.no_oov
+
+cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
+  {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf(" ");} } printf("\n");}' \
+  > $cleantext || exit 1;
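+# text.no_oov now holds the training transcripts with any word not in the lexicon dropped.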
+
+cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
+ sort -nr > $dir/word.counts || exit 1;
+
+# Get counts from acoustic training transcripts, and add one-count
+# for each word in the lexicon (but not silence, we don't want it
+# in the LM-- we'll add it optionally later).
+cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
+ cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
+ sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;
+
+# note: we probably won't really make use of as there aren't any OOVs
+cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "" "" "" > $dir/word_map \
+ || exit 1;
+
+# note: ignore 1st field of train.txt, it's the utterance-id.
+cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline<wmap)>0)map[$1]=$2;}
+  { for(n=2;n<=NF;n++) { printf map[$n]; if(n<NF){ printf(" "); } else { print ""; }}}' | gzip -c >$dir/train.gz \
+  || exit 1;
+
+train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1;
+
+# LM is small enough that we don't need to prune it (only about 0.7M N-grams).
+# Perplexity over 128254.000000 words is 90.446690
+
+# note: output is
+# data/local/lm/3gram-mincount/lm_unpruned.gz
+
+exit 0;
diff --git a/egs/formosa/s5/local/wer_hyp_filter b/egs/formosa/s5/local/wer_hyp_filter
new file mode 100755
index 00000000000..519d92ee80d
--- /dev/null
+++ b/egs/formosa/s5/local/wer_hyp_filter
@@ -0,0 +1,19 @@
+#!/usr/bin/env perl
+
+@filters=('');
+
+foreach $w (@filters) {
+ $bad{$w} = 1;
+}
+
+while (<>) {
+ @A = split(" ", $_);
+ $id = shift @A;
+ print "$id ";
+ foreach $a (@A) {
+ if (!defined $bad{$a}) {
+ print "$a ";
+ }
+ }
+ print "\n";
+}
diff --git a/egs/formosa/s5/local/wer_output_filter b/egs/formosa/s5/local/wer_output_filter
new file mode 100755
index 00000000000..06a99a43e34
--- /dev/null
+++ b/egs/formosa/s5/local/wer_output_filter
@@ -0,0 +1,25 @@
+#!/usr/bin/env perl
+# Copyright 2012-2014 Johns Hopkins University (Author: Yenda Trmal)
+# Apache 2.0
+use utf8;
+
+use open qw(:encoding(utf8));
+binmode STDIN, ":utf8";
+binmode STDOUT, ":utf8";
+binmode STDERR, ":utf8";
+
+while (<>) {
+ @F = split " ";
+ print $F[0] . " ";
+ foreach $s (@F[1..$#F]) {
+ if (($s =~ /\[.*\]/) || ($s =~ /\<.*\>/) || ($s =~ "")) {
+ print "";
+ } else {
+ print "$s"
+ }
+ print " ";
+ }
+ print "\n";
+}
+
+
diff --git a/egs/formosa/s5/local/wer_ref_filter b/egs/formosa/s5/local/wer_ref_filter
new file mode 100755
index 00000000000..519d92ee80d
--- /dev/null
+++ b/egs/formosa/s5/local/wer_ref_filter
@@ -0,0 +1,19 @@
+#!/usr/bin/env perl
+
+@filters=('');
+
+foreach $w (@filters) {
+ $bad{$w} = 1;
+}
+
+while (<>) {
+ @A = split(" ", $_);
+ $id = shift @A;
+ print "$id ";
+ foreach $a (@A) {
+ if (!defined $bad{$a}) {
+ print "$a ";
+ }
+ }
+ print "\n";
+}
diff --git a/egs/formosa/s5/path.sh b/egs/formosa/s5/path.sh
new file mode 100755
index 00000000000..2d17b17a84a
--- /dev/null
+++ b/egs/formosa/s5/path.sh
@@ -0,0 +1,6 @@
+export KALDI_ROOT=`pwd`/../../..
+[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
+export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
+[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
+. $KALDI_ROOT/tools/config/common_path.sh
+export LC_ALL=C
diff --git a/egs/formosa/s5/run.sh b/egs/formosa/s5/run.sh
new file mode 100755
index 00000000000..a4d0f2dcd1d
--- /dev/null
+++ b/egs/formosa/s5/run.sh
@@ -0,0 +1,217 @@
+#!/bin/bash
+#
+# Copyright 2018, Yuan-Fu Liao, National Taipei University of Technology, yfliao@mail.ntut.edu.tw
+#
+# Before you run this recipe, please apply for, download, and place (or symlink) the corpus under this folder (folder name: "NER-Trs-Vol1"); see the example command after the links below.
+# For more detail, please check:
+# 1. Formosa Speech in the Wild (FSW) project (https://sites.google.com/speech.ntut.edu.tw/fsw/home/corpus)
+# 2. Formosa Speech Recognition Challenge (FSW) 2018 (https://sites.google.com/speech.ntut.edu.tw/fsw/home/challenge)
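+#
+# For example (the source path is illustrative):
+#   ln -s /path/to/NER-Trs-Vol1 NER-Trs-Vol1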
+stage=-2
+num_jobs=20
+
+train_dir=NER-Trs-Vol1/Train
+eval_dir=NER-Trs-Vol1-Eval
+eval_key_dir=NER-Trs-Vol1-Eval-Key
+
+# shell options
+set -eo pipefail
+
+. ./cmd.sh
+. ./utils/parse_options.sh
+
+# Configure the number of jobs to run in parallel; adjust it according to your machine.
+# data preparation
+if [ $stage -le -2 ]; then
+ # Lexicon Preparation,
+ echo "$0: Lexicon Preparation"
+ local/prepare_dict.sh || exit 1;
+
+ # Data Preparation
+ echo "$0: Data Preparation"
+ local/prepare_data.sh --train-dir $train_dir --eval-dir $eval_dir --eval-key-dir $eval_key_dir || exit 1;
+
+ # Phone Sets, questions, L compilation
+ echo "$0: Phone Sets, questions, L compilation Preparation"
+ rm -rf data/lang
+ utils/prepare_lang.sh --position-dependent-phones false data/local/dict \
+ "" data/local/lang data/lang || exit 1;
+
+ # LM training
+ echo "$0: LM training"
+ rm -rf data/local/lm/3gram-mincount
+ local/train_lms.sh || exit 1;
+
+ # G compilation, check LG composition
+ echo "$0: G compilation, check LG composition"
+ utils/format_lm.sh data/lang data/local/lm/3gram-mincount/lm_unpruned.gz \
+ data/local/dict/lexicon.txt data/lang_test || exit 1;
+
+fi
+
+# Now make MFCC plus pitch features.
+# mfccdir should be some place with a largish disk where you
+# want to store MFCC features.
+mfccdir=mfcc
+
+# mfcc
+if [ $stage -le -1 ]; then
+ echo "$0: making mfccs"
+ for x in train test eval; do
+ steps/make_mfcc_pitch.sh --cmd "$train_cmd" --nj $num_jobs data/$x exp/make_mfcc/$x $mfccdir || exit 1;
+ steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1;
+ utils/fix_data_dir.sh data/$x || exit 1;
+ done
+fi
+
+# mono
+if [ $stage -le 0 ]; then
+ echo "$0: train mono model"
+ # Make some small data subsets for early system-build stages.
+ echo "$0: make training subsets"
+ utils/subset_data_dir.sh --shortest data/train 3000 data/train_mono
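+  # (--shortest picks the 3000 shortest utterances, which are easier to align when bootstrapping the monophone system)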
+
+ # train mono
+ steps/train_mono.sh --boost-silence 1.25 --cmd "$train_cmd" --nj $num_jobs \
+ data/train_mono data/lang exp/mono || exit 1;
+
+ # Get alignments from monophone system.
+ steps/align_si.sh --boost-silence 1.25 --cmd "$train_cmd" --nj $num_jobs \
+ data/train data/lang exp/mono exp/mono_ali || exit 1;
+
+ # Monophone decoding
+ (
+ utils/mkgraph.sh data/lang_test exp/mono exp/mono/graph || exit 1;
+ steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj $num_jobs \
+ exp/mono/graph data/test exp/mono/decode_test
+ )&
+fi
+
+# tri1
+if [ $stage -le 1 ]; then
+ echo "$0: train tri1 model"
+ # train tri1 [first triphone pass]
+ steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \
+ 2500 20000 data/train data/lang exp/mono_ali exp/tri1 || exit 1;
+
+ # align tri1
+ steps/align_si.sh --cmd "$train_cmd" --nj $num_jobs \
+ data/train data/lang exp/tri1 exp/tri1_ali || exit 1;
+
+ # decode tri1
+ (
+ utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph || exit 1;
+ steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj $num_jobs \
+ exp/tri1/graph data/test exp/tri1/decode_test
+ )&
+fi
+
+# tri2
+if [ $stage -le 2 ]; then
+ echo "$0: train tri2 model"
+ # train tri2 [delta+delta-deltas]
+ steps/train_deltas.sh --cmd "$train_cmd" \
+ 2500 20000 data/train data/lang exp/tri1_ali exp/tri2 || exit 1;
+
+  # align tri2
+ steps/align_si.sh --cmd "$train_cmd" --nj $num_jobs \
+ data/train data/lang exp/tri2 exp/tri2_ali || exit 1;
+
+ # decode tri2
+ (
+ utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph
+ steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj $num_jobs \
+ exp/tri2/graph data/test exp/tri2/decode_test
+ )&
+fi
+
+# tri3a
+if [ $stage -le 3 ]; then
+  echo "$0: train tri3a model"
+ # Train tri3a, which is LDA+MLLT,
+ steps/train_lda_mllt.sh --cmd "$train_cmd" \
+ 2500 20000 data/train data/lang exp/tri2_ali exp/tri3a || exit 1;
+
+ # decode tri3a
+ (
+ utils/mkgraph.sh data/lang_test exp/tri3a exp/tri3a/graph || exit 1;
+ steps/decode.sh --cmd "$decode_cmd" --nj $num_jobs --config conf/decode.config \
+ exp/tri3a/graph data/test exp/tri3a/decode_test
+ )&
+fi
+
+# tri4
+if [ $stage -le 4 ]; then
+ echo "$0: train tri4 model"
+ # From now, we start building a more serious system (with SAT), and we'll
+ # do the alignment with fMLLR.
+ steps/align_fmllr.sh --cmd "$train_cmd" --nj $num_jobs \
+ data/train data/lang exp/tri3a exp/tri3a_ali || exit 1;
+
+ steps/train_sat.sh --cmd "$train_cmd" \
+ 2500 20000 data/train data/lang exp/tri3a_ali exp/tri4a || exit 1;
+
+ # align tri4a
+ steps/align_fmllr.sh --cmd "$train_cmd" --nj $num_jobs \
+ data/train data/lang exp/tri4a exp/tri4a_ali
+
+ # decode tri4a
+ (
+ utils/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph
+ steps/decode_fmllr.sh --cmd "$decode_cmd" --nj $num_jobs --config conf/decode.config \
+ exp/tri4a/graph data/test exp/tri4a/decode_test
+ )&
+fi
+
+# tri5
+if [ $stage -le 5 ]; then
+ echo "$0: train tri5 model"
+ # Building a larger SAT system.
+ steps/train_sat.sh --cmd "$train_cmd" \
+ 3500 100000 data/train data/lang exp/tri4a_ali exp/tri5a || exit 1;
+
+ # align tri5a
+ steps/align_fmllr.sh --cmd "$train_cmd" --nj $num_jobs \
+ data/train data/lang exp/tri5a exp/tri5a_ali || exit 1;
+
+ # decode tri5
+ (
+ utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph || exit 1;
+ steps/decode_fmllr.sh --cmd "$decode_cmd" --nj $num_jobs --config conf/decode.config \
+ exp/tri5a/graph data/test exp/tri5a/decode_test || exit 1;
+ )&
+fi
+
+# nnet3 tdnn models
+# commented out by default, since the chain model is usually faster and better
+#if [ $stage -le 6 ]; then
+ # echo "$0: train nnet3 model"
+ # local/nnet3/run_tdnn.sh
+#fi
+
+# chain model
+if [ $stage -le 7 ]; then
+  # The iVector-extraction and feature-dumping parts could be skipped by setting "--train_stage 7"
+ echo "$0: train chain model"
+ local/chain/run_tdnn.sh
+fi
+
+# getting results (see RESULTS file)
+if [ $stage -le 8 ]; then
+ echo "$0: extract the results"
+ for test_set in test eval; do
+ echo "WER: $test_set"
+ for x in exp/*/decode_${test_set}*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done 2>/dev/null
+ for x in exp/*/*/decode_${test_set}*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done 2>/dev/null
+ echo
+
+ echo "CER: $test_set"
+ for x in exp/*/decode_${test_set}*; do [ -d $x ] && grep WER $x/cer_* | utils/best_wer.sh; done 2>/dev/null
+ for x in exp/*/*/decode_${test_set}*; do [ -d $x ] && grep WER $x/cer_* | utils/best_wer.sh; done 2>/dev/null
+ echo
+ done
+fi
+
+# finish
+echo "$0: all done"
+
+exit 0;
diff --git a/egs/formosa/s5/steps b/egs/formosa/s5/steps
new file mode 120000
index 00000000000..6e99bf5b5ad
--- /dev/null
+++ b/egs/formosa/s5/steps
@@ -0,0 +1 @@
+../../wsj/s5/steps
\ No newline at end of file
diff --git a/egs/formosa/s5/utils b/egs/formosa/s5/utils
new file mode 120000
index 00000000000..b240885218f
--- /dev/null
+++ b/egs/formosa/s5/utils
@@ -0,0 +1 @@
+../../wsj/s5/utils
\ No newline at end of file
diff --git a/egs/gale_arabic/s5/local/gale_format_data.sh b/egs/gale_arabic/s5/local/gale_format_data.sh
index 85a946a58d9..053323dc194 100755
--- a/egs/gale_arabic/s5/local/gale_format_data.sh
+++ b/egs/gale_arabic/s5/local/gale_format_data.sh
@@ -57,4 +57,4 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \
echo gale_format_data succeeded.
-exit 0
\ No newline at end of file
+exit 0
diff --git a/egs/gale_arabic/s5/local/gale_prep_dict.sh b/egs/gale_arabic/s5/local/gale_prep_dict.sh
index 74ef789eda7..f6fd83378d0 100755
--- a/egs/gale_arabic/s5/local/gale_prep_dict.sh
+++ b/egs/gale_arabic/s5/local/gale_prep_dict.sh
@@ -25,9 +25,8 @@ echo SIL > $dir/optional_silence.txt
cat $dir/lexicon.txt | cut -d ' ' -f2- | tr -s ' ' '\n' |\
sort -u > $dir/nonsilence_phones.txt || exit 1;
+perl -i -pe 'print "<UNK> SIL\n" if $.==1' $dir/lexicon.txt
- sed -i '1i <UNK> SIL' $dir/lexicon.txt
-
echo Dictionary preparation succeeded
exit 0
diff --git a/egs/gale_arabic/s5/local/gale_train_lms.sh b/egs/gale_arabic/s5/local/gale_train_lms.sh
index 1b5d4665a19..8f8e715390f 100755
--- a/egs/gale_arabic/s5/local/gale_train_lms.sh
+++ b/egs/gale_arabic/s5/local/gale_train_lms.sh
@@ -113,4 +113,4 @@ fi
echo train lm succeeded
-exit 0
\ No newline at end of file
+exit 0
diff --git a/egs/gale_arabic/s5b/RESULTS b/egs/gale_arabic/s5b/RESULTS
index 2260a106654..a485240ff6b 100644
--- a/egs/gale_arabic/s5b/RESULTS
+++ b/egs/gale_arabic/s5b/RESULTS
@@ -2,13 +2,7 @@
# This file is generated using local/split_wer.sh $galeData //galeData is a local folder to keep intermediate gale data
# look at the end of run.sh in the same folder
##
-##### RESULTS generated by amali at 2017-01-01-08-05-59
-
Report Results WER:
-%WER 9.50 [ 2124 / 22363, 160 ins, 275 del, 1689 sub ] exp/chain_cleaned/tdnn_lstm1a_sp_bi/decode/wer_report_9
-%WER 10.72 [ 2398 / 22363, 163 ins, 313 del, 1922 sub ] exp/chain_cleaned/tdnn1b_sp_bi/decode/wer_report_9
-%WER 12.04 [ 2693 / 22363, 226 ins, 271 del, 2196 sub ] exp/nnet3_cleaned/lstm_ld5_sp/decode/wer_report_9
-%WER 12.29 [ 2749 / 22363, 273 ins, 266 del, 2210 sub ] exp/nnet3_cleaned/tdnn_sp/decode/wer_report_10
%WER 17.82 [ 3986 / 22363, 315 ins, 618 del, 3053 sub ] exp/sgmm_5a_mmi_b0.1/decode/wer_report_12
%WER 18.15 [ 4059 / 22363, 335 ins, 589 del, 3135 sub ] exp/sgmm_5a_mmi_b0.1/decode4/wer_report_11
%WER 18.42 [ 4119 / 22363, 346 ins, 590 del, 3183 sub ] exp/sgmm_5a_mmi_b0.1/decode3/wer_report_11
@@ -27,10 +21,6 @@ Report Results WER:
%WER 25.66 [ 5738 / 22363, 478 ins, 838 del, 4422 sub ] exp/tri2a/decode/wer_report_14
%WER 26.38 [ 5900 / 22363, 435 ins, 929 del, 4536 sub ] exp/tri1/decode/wer_report_15
Conversational Results WER:
-%WER 21.59 [ 10213 / 47305, 944 ins, 3092 del, 6177 sub ] exp/chain_cleaned/tdnn_lstm1a_sp_bi/decode/wer_conversational_9
-%WER 24.77 [ 11716 / 47305, 1098 ins, 3579 del, 7039 sub ] exp/chain_cleaned/tdnn1b_sp_bi/decode/wer_conversational_9
-%WER 26.78 [ 12670 / 47305, 1741 ins, 2434 del, 8495 sub ] exp/nnet3_cleaned/lstm_ld5_sp/decode/wer_conversational_9
-%WER 27.55 [ 13032 / 47305, 1800 ins, 2666 del, 8566 sub ] exp/nnet3_cleaned/tdnn_sp/decode/wer_conversational_11
%WER 34.10 [ 16133 / 47305, 1903 ins, 3245 del, 10985 sub ] exp/sgmm_5a_mmi_b0.1/decode/wer_conversational_11
%WER 34.81 [ 16466 / 47305, 2077 ins, 3037 del, 11352 sub ] exp/sgmm_5a_mmi_b0.1/decode4/wer_conversational_10
%WER 35.19 [ 16648 / 47305, 1933 ins, 3264 del, 11451 sub ] exp/sgmm_5a_mmi_b0.1/decode3/wer_conversational_11
@@ -49,10 +39,6 @@ Conversational Results WER:
%WER 45.92 [ 21724 / 47305, 1995 ins, 5213 del, 14516 sub ] exp/tri2a/decode/wer_conversational_14
%WER 46.86 [ 22166 / 47305, 2212 ins, 4819 del, 15135 sub ] exp/tri1/decode/wer_conversational_13
Combined Results for Reports and Conversational WER:
-%WER 17.64 [ 12286 / 69668, 1310 ins, 2807 del, 8169 sub ] exp/chain_cleaned/tdnn_lstm1a_sp_bi/decode/wer_8
-%WER 20.26 [ 14114 / 69668, 1261 ins, 3892 del, 8961 sub ] exp/chain_cleaned/tdnn1b_sp_bi/decode/wer_9
-%WER 22.05 [ 15363 / 69668, 1967 ins, 2705 del, 10691 sub ] exp/nnet3_cleaned/lstm_ld5_sp/decode/wer_9
-%WER 22.66 [ 15786 / 69668, 2047 ins, 2955 del, 10784 sub ] exp/nnet3_cleaned/tdnn_sp/decode/wer_11
%WER 28.89 [ 20127 / 69668, 2244 ins, 3829 del, 14054 sub ] exp/sgmm_5a_mmi_b0.1/decode/wer_11
%WER 29.48 [ 20541 / 69668, 2243 ins, 3860 del, 14438 sub ] exp/sgmm_5a_mmi_b0.1/decode4/wer_11
%WER 29.81 [ 20767 / 69668, 2279 ins, 3854 del, 14634 sub ] exp/sgmm_5a_mmi_b0.1/decode3/wer_11
@@ -65,8 +51,30 @@ Combined Results for Reports and Conversational WER:
%WER 32.36 [ 22542 / 69668, 2156 ins, 4184 del, 16202 sub ] exp/tri2b_mmi/decode_it4/wer_11
%WER 32.50 [ 22640 / 69668, 2393 ins, 3956 del, 16291 sub ] exp/tri2b_mmi/decode_it3/wer_11
%WER 32.79 [ 22847 / 69668, 2407 ins, 4760 del, 15680 sub ] exp/tri2b_mpe/decode_it3/wer_13
+# WER with train_sat_basis
+%WER 33.35 [ 23233 / 69668, 2385 ins, 5274 del, 15574 sub ] exp/tri3b/decode/wer_16_0.5
+# WER with train_sat
%WER 33.61 [ 23413 / 69668, 2817 ins, 4577 del, 16019 sub ] exp/tri3b/decode/wer_17
%WER 35.73 [ 24894 / 69668, 2630 ins, 4944 del, 17320 sub ] exp/tri3b/decode.si/wer_15
%WER 36.17 [ 25196 / 69668, 2429 ins, 5393 del, 17374 sub ] exp/tri2b/decode/wer_16
%WER 39.42 [ 27462 / 69668, 2473 ins, 6051 del, 18938 sub ] exp/tri2a/decode/wer_14
%WER 40.35 [ 28113 / 69668, 2713 ins, 5635 del, 19765 sub ] exp/tri1/decode/wer_13
+
+
+# Effect of the GMM seed model (tri2b vs. tri3b). Using tri3b as the seed gives a slightly
+# better result than using tri2b.
+%WER 16.66 [ 11610 / 69668, 1233 ins, 2747 del, 7630 sub ] exp/chain/tdnn_1a_3b_sp/decode_test/wer_10_0.0
+%WER 16.71 [ 11642 / 69668, 1145 ins, 2908 del, 7589 sub ] exp/chain/tdnn_1a_2b_sp/decode_test/wer_9_0.0
+
+# Effect of Tree-size (3500, 4500, 7000, 11000)
+%WER 16.66 [ 11610 / 69668, 1233 ins, 2747 del, 7630 sub ] exp/chain/tdnn_1a_3500_sp/decode_test/wer_10_0.0
+%WER 16.59 [ 11557 / 69668, 1234 ins, 2646 del, 7677 sub ] exp/chain/tdnn_1a_4500_sp/decode_test/wer_10_0.0
+%WER 16.47 [ 11474 / 69668, 1421 ins, 2207 del, 7846 sub ] exp/chain/tdnn_1a_7000_sp/decode_test/wer_9_0.0
+%WER 16.62 [ 11580 / 69668, 1164 ins, 2789 del, 7627 sub ] exp/chain/tdnn_1a_11000_sp/decode_test/wer_10_0.0
+
+# Effect of l2-regularization on the output layer with tree-size=7000 (l2 values 0.005 and 0.002)
+%WER 16.54 [ 11522 / 69668, 1123 ins, 2739 del, 7660 sub ] exp/chain/tdnn_1a_7000_005_sp/decode_test/wer_9_0.5
+%WER 16.47 [ 11474 / 69668, 1421 ins, 2207 del, 7846 sub ] exp/chain/tdnn_1a_7000_002_sp/decode_test/wer_9_0.0
+
+#current best 'chain' models (see local/chain/tuning/run_tdnn_1a.sh)
+%WER 16.47 [ 11474 / 69668, 1421 ins, 2207 del, 7846 sub ] exp/chain/tdnn_1a_sp/decode_test/wer_9_0.0
diff --git a/egs/gale_arabic/s5b/cmd.sh b/egs/gale_arabic/s5b/cmd.sh
index 71dd849a93b..ea341c98d4a 100755
--- a/egs/gale_arabic/s5b/cmd.sh
+++ b/egs/gale_arabic/s5b/cmd.sh
@@ -10,6 +10,6 @@
# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
-export train_cmd="queue.pl --mem 2G"
-export decode_cmd="queue.pl --mem 4G"
-export mkgraph_cmd="queue.pl --mem 8G"
+export train_cmd="retry.pl queue.pl --mem 2G"
+export decode_cmd="retry.pl queue.pl --mem 4G"
+export mkgraph_cmd="retry.pl queue.pl --mem 8G"
diff --git a/egs/gale_arabic/s5b/local/chain/compare_wer.sh b/egs/gale_arabic/s5b/local/chain/compare_wer.sh
new file mode 100755
index 00000000000..1a40523355a
--- /dev/null
+++ b/egs/gale_arabic/s5b/local/chain/compare_wer.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+
+# this script is used for comparing decoding results between systems.
+# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b}
+
+# ./local/chain/compare_wer.sh exp/chain/cnn1a
+# System cnn1a
+# WER 0.61
+# CER 0.15
+# Final train prob -0.0377
+# Final valid prob -0.0380
+# Final train prob (xent) -0.0830
+# Final valid prob (xent) -0.0838
+
+if [ $# == 0 ]; then
+  echo "Usage: $0: <dir1> [<dir2> ... ]"
+ echo "e.g.: $0 exp/chain/cnn{1a,1b}"
+ exit 1
+fi
+
+echo "# $0 $*"
+used_epochs=false
+
+echo -n "# System "
+for x in $*; do printf "% 10s" " $(basename $x)"; done
+echo
+
+echo -n "# WER "
+for x in $*; do
+ wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}')
+ printf "% 10s" $wer
+done
+echo
+
+echo -n "# CER "
+for x in $*; do
+ cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}')
+ printf "% 10s" $cer
+done
+echo
+
+if $used_epochs; then
+ exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems.
+fi
+
+echo -n "# Final train prob "
+for x in $*; do
+ prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}')
+ printf "% 10s" $prob
+done
+echo
+
+echo -n "# Final valid prob "
+for x in $*; do
+ prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}')
+ printf "% 10s" $prob
+done
+echo
+
+echo -n "# Final train prob (xent) "
+for x in $*; do
+ prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}')
+ printf "% 10s" $prob
+done
+echo
+
+echo -n "# Final valid prob (xent) "
+for x in $*; do
+ prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}')
+ printf "% 10s" $prob
+done
+echo
diff --git a/egs/gale_arabic/s5b/local/chain/run_chain_common.sh b/egs/gale_arabic/s5b/local/chain/run_chain_common.sh
new file mode 100755
index 00000000000..da37e148441
--- /dev/null
+++ b/egs/gale_arabic/s5b/local/chain/run_chain_common.sh
@@ -0,0 +1,82 @@
+#!/bin/bash
+
+# this script has common stages shared across librispeech chain recipes.
+# It generates a new topology in a new lang directory, gets the alignments as
+# lattices, and builds a tree for the new topology
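+#
+# Example invocation (a sketch; the directory names below simply mirror the ones used by
+# local/chain/tuning/run_tdnn_1a.sh in this recipe):
+#   local/chain/run_chain_common.sh --stage 11 \
+#     --gmm-dir exp/tri3b --ali-dir exp/tri3b_ali_train_sp \
+#     --lores-train-data-dir data/train_sp --lang data/lang_chain \
+#     --lat-dir exp/chain/tri3b_train_sp_lats --num-leaves 7000 \
+#     --tree-dir exp/chain/tree_a_sp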
+set -e
+
+stage=11
+
+# input directory names. These options are actually compulsory, and they have
+# been named for convenience
+gmm_dir=
+ali_dir=
+lores_train_data_dir=
+
+num_leaves=6000
+
+# output directory names. They are also compulsory.
+lang=
+lat_dir=
+tree_dir=
+# End configuration section.
+echo "$0 $@" # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+[ -z $lang ] && echo "Set --lang, this specifies the new lang directory which will have the new topology" && exit 1;
+[ -z $lat_dir ] && echo "Set --lat-dir, this specifies the experiment directory to store lattice" && exit 1;
+[ -z $tree_dir ] && echo "Set --tree-dir, this specifies the directory to store new tree " && exit 1;
+
+for f in $gmm_dir/final.mdl $ali_dir/ali.1.gz $lores_train_data_dir/feats.scp; do
+ [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
+done
+
+if [ $stage -le 11 ]; then
+ echo "$0: creating lang directory with one state per phone."
+ # Create a version of the lang/ directory that has one state per phone in the
+ # topo file. [note, it really has two states.. the first one is only repeated
+ # once, the second one has zero or more repeats.]
+ if [ -d $lang ]; then
+ if [ $lang/L.fst -nt data/lang/L.fst ]; then
+ echo "$0: $lang already exists, not overwriting it; continuing"
+ else
+ echo "$0: $lang already exists and seems to be older than data/lang..."
+ echo " ... not sure what to do. Exiting."
+ exit 1;
+ fi
+ else
+ cp -r data/lang $lang
+ silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+ nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+ # Use our special topology... note that later on may have to tune this
+ # topology.
+ steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
+ fi
+fi
+
+if [ $stage -le 12 ]; then
+ # Get the alignments as lattices (gives the chain training more freedom).
+ # use the same num-jobs as the alignments
+ nj=$(cat ${ali_dir}/num_jobs) || exit 1;
+ steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" ${lores_train_data_dir} \
+ $lang $gmm_dir $lat_dir
+ rm $lat_dir/fsts.*.gz # save space
+fi
+
+if [ $stage -le 13 ]; then
+ # Build a tree using our new topology. We know we have alignments for the
+ # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use
+ # those.
+ if [ -f $tree_dir/final.mdl ]; then
+ echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it."
+ exit 1;
+ fi
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+ --context-opts "--context-width=2 --central-position=1" \
+ --cmd "$train_cmd" $num_leaves ${lores_train_data_dir} $lang $ali_dir $tree_dir
+fi
+
+exit 0;
diff --git a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh
index 7afafb31ff6..bf2e45c9914 100755
--- a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh
+++ b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh
@@ -1,31 +1,51 @@
#!/bin/bash
-#started from tedlium recipe with few edits
+# ./local/chain/compare_wer.sh exp/chain/tdnn_1a_sp
+# System tdnn_1a_sp
+# WER 16.47
+# CER 6.68
+# Final train prob -0.0652
+# Final valid prob -0.0831
+# Final train prob (xent) -0.8965
+# Final valid prob (xent) -0.9964
+# steps/info/chain_dir_info.pl exp/chain/tdnn_1a_sp/
+# exp/chain/tdnn_1a_sp/: num-iters=441 nj=3..16 num-params=18.6M dim=40+100->5816 combine=-0.063->-0.062 (over 6) xent:train/valid[293,440,final]=(-1.22,-0.912,-0.896/-1.29,-1.01,-0.996) logprob:train/valid[293,440,final]=(-0.097,-0.066,-0.065/-0.108,-0.084,-0.083)
-set -e -o pipefail
-# First the options that are passed through to run_ivector_common.sh
-# (some of which are also used in this script directly).
+set -e -o pipefail
stage=0
nj=30
-decode_nj=30
-min_seg_len=1.55
-xent_regularize=0.1
train_set=train
-gmm=tri2b # the gmm for the target data
+test_set=test
+gmm=tri3b # this is the source gmm-dir that we'll use for alignments; it
+ # should have alignments for the specified training data.
num_threads_ubm=32
-nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned
-
-# The rest are configs specific to this script. Most of the parameters
-# are just hardcoded at this level, in the commands below.
-train_stage=-10 #default -10
-tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration.
-tdnn_affix=1b #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration.
-common_egs_dir= # you can set this to use previously dumped egs.
+nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium.
+
+# Options which are not passed through to run_ivector_common.sh
+affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration.
+common_egs_dir=
+reporting_email=
+
+# LSTM/chain options
+train_stage=-10
+xent_regularize=0.1
+dropout_schedule='0,0@0.20,0.5@0.50,0'
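+# (roughly: a comma-separated list of dropout-proportion[@data-fraction] points that is
+# interpolated over training, so here dropout stays at 0 until ~20% of training, rises to
+# 0.5 by ~50%, and decays back to 0 by the end)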
+
+# training chunk-options
+chunk_width=150,110,100
+get_egs_stage=-10
+
+# training options
+srand=0
+remove_egs=true
+run_ivector_common=true
+run_chain_common=true
# End configuration section.
echo "$0 $@" # Print the command line for logging
+
. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh
@@ -39,169 +59,162 @@ where "nvcc" is installed.
EOF
fi
-local/nnet3/run_ivector_common.sh --stage $stage \
- --nj $nj \
- --min-seg-len $min_seg_len \
- --train-set $train_set \
- --gmm $gmm \
- --num-threads-ubm $num_threads_ubm \
- --nnet3-affix "$nnet3_affix"
-
-
-gmm_dir=exp/$gmm
-ali_dir=exp/${gmm}_ali_${train_set}_sp_comb
-tree_dir=exp/chain${nnet3_affix}/tree_bi${tree_affix}
-lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_comb_lats
-dir=exp/chain${nnet3_affix}/tdnn${tdnn_affix}_sp_bi
-train_data_dir=data/${train_set}_sp_hires_comb
-lores_train_data_dir=data/${train_set}_sp_comb
-train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires_comb
-
+if $run_ivector_common; then
+ local/nnet3/run_ivector_common.sh \
+ --stage $stage --nj $nj \
+ --train-set $train_set --gmm $gmm \
+ --num-threads-ubm $num_threads_ubm \
+ --nnet3-affix "$nnet3_affix"
+fi
-for f in $gmm_dir/final.mdl $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \
- $lores_train_data_dir/feats.scp $ali_dir/ali.1.gz $gmm_dir/final.mdl; do
+gmm_dir=exp/${gmm}
+ali_dir=exp/${gmm}_ali_${train_set}_sp
+lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats
+dir=exp/chain${nnet3_affix}/tdnn${affix}_sp
+train_data_dir=data/${train_set}_sp_hires
+train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires
+lores_train_data_dir=data/${train_set}_sp
+
+# note: you don't necessarily have to change the treedir name
+# each time you do a new experiment-- only if you change the
+# configuration in a way that affects the tree.
+tree_dir=exp/chain${nnet3_affix}/tree_a_sp
+# the 'lang' directory is created by this script.
+# If you create such a directory with a non-standard topology
+# you should probably name it differently.
+lang=data/lang_chain
+
+for f in $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \
+ $lores_train_data_dir/feats.scp $gmm_dir/final.mdl \
+ $ali_dir/ali.1.gz $gmm_dir/final.mdl; do
[ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
done
-if [ $stage -le 14 ]; then
- echo "$0: creating lang directory with one state per phone."
- # Create a version of the lang/ directory that has one state per phone in the
- # topo file. [note, it really has two states.. the first one is only repeated
- # once, the second one has zero or more repeats.]
- if [ -d data/lang_chain ]; then
- if [ data/lang_chain/L.fst -nt data/lang/L.fst ]; then
- echo "$0: data/lang_chain already exists, not overwriting it; continuing"
- else
- echo "$0: data/lang_chain already exists and seems to be older than data/lang..."
- echo " ... not sure what to do. Exiting."
- exit 1;
- fi
- else
- cp -r data/lang data/lang_chain
- silphonelist=$(cat data/lang_chain/phones/silence.csl) || exit 1;
- nonsilphonelist=$(cat data/lang_chain/phones/nonsilence.csl) || exit 1;
- # Use our special topology... note that later on may have to tune this
- # topology.
- steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >data/lang_chain/topo
- fi
+# Please take this as a reference on how to specify all the options of
+# local/chain/run_chain_common.sh
+if $run_chain_common; then
+ local/chain/run_chain_common.sh --stage $stage \
+ --gmm-dir $gmm_dir \
+ --ali-dir $ali_dir \
+ --lores-train-data-dir ${lores_train_data_dir} \
+ --lang $lang \
+ --lat-dir $lat_dir \
+ --num-leaves 7000 \
+ --tree-dir $tree_dir || exit 1;
fi
if [ $stage -le 15 ]; then
- # Get the alignments as lattices (gives the chain training more freedom).
- # use the same num-jobs as the alignments
- steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \
- data/lang $gmm_dir $lat_dir
- rm $lat_dir/fsts.*.gz # save space
-fi
-
-if [ $stage -le 16 ]; then
- # Build a tree using our new topology. We know we have alignments for the
- # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use
- # those.
- if [ -f $tree_dir/final.mdl ]; then
- echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it."
- exit 1;
- fi
- steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
- --context-opts "--context-width=2 --central-position=1" \
- --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir
-fi
-
-if [ $stage -le 17 ]; then
mkdir -p $dir
-
echo "$0: creating neural net configs using the xconfig parser";
num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}')
- learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python)
+ learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)
+ affine_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true"
+ tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66"
+ linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0"
+ prefinal_opts="l2-regularize=0.01"
+ output_opts="l2-regularize=0.002"
mkdir -p $dir/configs
+
cat < $dir/configs/network.xconfig
input dim=100 name=ivector
input dim=40 name=input
-
# please note that it is important to have input layer with the name=input
# as the layer immediately preceding the fixed-affine-layer to enable
# the use of short notation for the descriptor
fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat
-
# the first splicing is moved before the lda layer, so no splicing here
- relu-renorm-layer name=tdnn1 dim=450
- relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=450
- relu-renorm-layer name=tdnn3 input=Append(-1,0,1,2) dim=450
- relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=450
- relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=450
- relu-renorm-layer name=tdnn6 input=Append(-6,-3,0) dim=450
-
- ## adding the layers for chain branch
- relu-renorm-layer name=prefinal-chain input=tdnn6 dim=450 target-rms=0.5
- output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5
-
- # adding the layers for xent branch
- # This block prints the configs for a separate output that will be
- # trained with a cross-entropy objective in the 'chain' models... this
- # has the effect of regularizing the hidden parts of the model. we use
- # 0.5 / args.xent_regularize as the learning rate factor- the factor of
- # 0.5 / args.xent_regularize is suitable as it means the xent
- # final-layer learns at a rate independent of the regularization
- # constant; and the 0.5 was tuned so as to make the relative progress
- # similar in the xent and regular final layers.
- relu-renorm-layer name=prefinal-xent input=tdnn6 dim=450 target-rms=0.5
- output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5
-
+ relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536
+ tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1
+ tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1
+ tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1
+ tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0
+ tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+ tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+ tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+ tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+ tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+ tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+ tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+ tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+ tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+ tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+ linear-component name=prefinal-l dim=256 $linear_opts
+ prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256
+ output-layer name=output include-log-softmax=false dim=$num_targets $output_opts
+ prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256
+ output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts
EOF
steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
-
fi
-if [ $stage -le 18 ]; then
+
+if [ $stage -le 16 ]; then
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
utils/create_split_dir.pl \
- /export/b0{5,6,7,8}/$USER/kaldi-data/egs/gale_arabic-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage
+ /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage
fi
- steps/nnet3/chain/train.py --stage $train_stage \
+ steps/nnet3/chain/train.py --stage $train_stage \
--cmd "$decode_cmd" \
--feat.online-ivector-dir $train_ivector_dir \
--feat.cmvn-opts "--norm-means=false --norm-vars=false" \
--chain.xent-regularize $xent_regularize \
--chain.leaky-hmm-coefficient 0.1 \
- --chain.l2-regularize 0.00005 \
+ --chain.l2-regularize 0.0 \
--chain.apply-deriv-weights false \
--chain.lm-opts="--num-extra-lm-states=2000" \
- --egs.dir "$common_egs_dir" \
- --egs.opts "--frames-overlap-per-eg 0" \
- --egs.chunk-width 150 \
- --trainer.num-chunk-per-minibatch 128 \
+ --trainer.dropout-schedule $dropout_schedule \
+ --trainer.srand=$srand \
+ --trainer.max-param-change=2.0 \
+ --trainer.num-epochs 6 \
--trainer.frames-per-iter 1500000 \
- --trainer.num-epochs 4 \
- --trainer.optimization.num-jobs-initial 2 \
- --trainer.optimization.num-jobs-final 2 \
- --trainer.optimization.initial-effective-lrate 0.001 \
- --trainer.optimization.final-effective-lrate 0.0001 \
- --trainer.max-param-change 2.0 \
- --cleanup.remove-egs true \
- --feat-dir $train_data_dir \
+ --trainer.optimization.num-jobs-initial 3 \
+ --trainer.optimization.num-jobs-final 16 \
+ --trainer.optimization.initial-effective-lrate 0.00025 \
+ --trainer.optimization.final-effective-lrate 0.000025 \
+ --trainer.num-chunk-per-minibatch=64,32 \
+ --trainer.add-option="--optimization.memory-compression-level=2" \
+ --egs.chunk-width=$chunk_width \
+ --egs.dir="$common_egs_dir" \
+ --egs.opts "--frames-overlap-per-eg 0 --constrained false" \
+ --egs.stage $get_egs_stage \
+ --reporting.email="$reporting_email" \
+ --cleanup.remove-egs=$remove_egs \
+ --feat-dir=$train_data_dir \
--tree-dir $tree_dir \
- --lat-dir $lat_dir \
- --dir $dir
-fi
-
+ --lat-dir=$lat_dir \
+ --dir $dir || exit 1;
+fi
-if [ $stage -le 19 ]; then
- # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
- # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
- # the lang directory.
- utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_test $dir $dir/graph
+if [ $stage -le 17 ]; then
+ # The reason we are using data/lang here, instead of $lang, is just to
+ # emphasize that it's not actually important to give mkgraph.sh the
+ # lang directory with the matched topology (since it gets the
+ # topology file from the model). So you could give it a different
+ # lang directory, one that contained a wordlist and LM of your choice,
+ # as long as phones.txt was compatible.
+
+ utils/lang/check_phones_compatible.sh \
+ data/lang_test/phones.txt $lang/phones.txt
+ utils/mkgraph.sh \
+ --self-loop-scale 1.0 data/lang_test \
+ $tree_dir $tree_dir/graph || exit 1;
fi
-if [ $stage -le 20 ]; then
+if [ $stage -le 18 ]; then
+ frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
rm $dir/.error 2>/dev/null || true
- steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \
- --acwt 1.0 --post-decode-acwt 10.0 \
- --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_test_hires \
- --scoring-opts "--min-lmwt 5 " \
- $dir/graph data/test_hires $dir/decode || exit 1;
+
+ steps/nnet3/decode.sh \
+ --acwt 1.0 --post-decode-acwt 10.0 \
+ --extra-left-context 0 --extra-right-context 0 \
+ --extra-left-context-initial 0 \
+ --extra-right-context-final 0 \
+ --frames-per-chunk $frames_per_chunk \
+ --nj $nj --cmd "$decode_cmd" --num-threads 4 \
+ --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${test_set}_hires \
+ $tree_dir/graph data/${test_set}_hires ${dir}/decode_${test_set} || exit 1
fi
-exit 0
diff --git a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh
index 604f32a1de4..deebafc95e4 100755
--- a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh
+++ b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh
@@ -120,7 +120,7 @@ if [ $stage -le 17 ]; then
echo "$0: creating neural net configs using the xconfig parser";
num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}')
- learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python)
+ learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)
mkdir -p $dir/configs
   cat <<EOF > $dir/configs/network.xconfig
diff --git a/egs/gale_arabic/s5b/local/gale_data_prep_audio.sh b/egs/gale_arabic/s5b/local/gale_data_prep_audio.sh
deleted file mode 100755
index 0125272d06c..00000000000
--- a/egs/gale_arabic/s5b/local/gale_data_prep_audio.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/bin/bash
-
-# Copyright 2014 QCRI (author: Ahmed Ali)
-# Apache 2.0
-
-
-galeData=$(utils/make_absolute.sh "${@: -1}" ); # last argument: the local folder
-audio_dvds=${@:1:${#}-1} # all the audio DVDs for the GALE corpus; check audio=( in ../run.sh
-
-mkdir -p $galeData
-
-# check that sox is installed
-which sox &>/dev/null
-if [[ $? != 0 ]]; then
- echo "sox is not installed"; exit 1
-fi
-
-for dvd in $audio_dvds; do
- dvd_full_path=$(utils/make_absolute.sh $dvd)
- if [[ ! -e $dvd_full_path ]]; then
- echo missing $dvd_full_path; exit 1;
- fi
- find $dvd_full_path \( -name "*.wav" -o -name "*.flac" \) | while read file; do
- id=$(basename $file | awk '{gsub(".wav","");gsub(".flac","");print}')
- echo "$id sox $file -r 16000 -t wav - |"
- done
-done | sort -u > $galeData/wav.scp
-
-echo data prep audio succeded
-
-exit 0
-
diff --git a/egs/gale_arabic/s5b/local/gale_data_prep_split.sh b/egs/gale_arabic/s5b/local/gale_data_prep_split.sh
deleted file mode 100755
index b18a4e5b105..00000000000
--- a/egs/gale_arabic/s5b/local/gale_data_prep_split.sh
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/bin/bash
-
-# Copyright 2014 QCRI (author: Ahmed Ali)
-# Apache 2.0
-
-if [ $# -ne 1 ]; then
- echo "Arguments should be the "; exit 1
-fi
-
-
-#data will data/local
-
-galeData=$(utils/make_absolute.sh $1)
-mkdir -p data/local
-dir=$(utils/make_absolute.sh data/local)
-
-
-grep -f local/test_list $galeData/all | grep -v -f local/bad_segments > $galeData/all.test
-grep -v -f local/test_list $galeData/all | grep -v -f local/bad_segments > $galeData/all.train
-
-for x in test train; do
- outdir=$dir/$x
- file=$galeData/all.$x
- mkdir -p $outdir
- awk '{print $2 " " $2}' $file | sort -u > $outdir/utt2spk
- cp -pr $outdir/utt2spk $outdir/spk2utt
- awk '{print $2 " " $1 " " $3 " " $4}' $file | sort -u > $outdir/segments
- awk '{printf $2 " "; for (i=5; i<=NF; i++) {printf $i " "} printf "\n"}' $file | sort -u > $outdir/text
-done
-
-
-grep -f local/test_list $galeData/wav.scp > $dir/test/wav.scp
-
-cat $galeData/wav.scp | awk -v seg=$dir/train/segments 'BEGIN{while((getline<seg)>0) {seen[$2]=1;}}
- {if (seen[$1]) { print $0}}' > $dir/train/wav.scp
-
-echo data prep split succeeded
-
-exit 0
diff --git a/egs/gale_arabic/s5b/local/gale_data_prep_txt.sh b/egs/gale_arabic/s5b/local/gale_data_prep_txt.sh
deleted file mode 100755
index 04529d88ac0..00000000000
--- a/egs/gale_arabic/s5b/local/gale_data_prep_txt.sh
+++ /dev/null
@@ -1,60 +0,0 @@
-#!/bin/bash
-
-# Copyright 2014 QCRI (author: Ahmed Ali)
-# Apache 2.0
-
-galeData=$(utils/make_absolute.sh "${@: -1}" ); # last argument: the local folder
-txt_dvds=${@:1:${#}-1} # all the txt CDs corresponding to the audio corpus; check text=( in ../run.sh
-
-
-top_pwd=`pwd`
-txtdir=$galeData/txt
-mkdir -p $txtdir; cd $txtdir
-
-for cdx in $txt_dvds; do
- echo "Preparing $cdx"
- if [[ $cdx == *.tgz ]] ; then
- tar -xvf $cdx
- elif [ -d "$cdx" ]; then
- ln -s $cdx `basename $cdx`
- else
- echo "I don't really know what I shall do with $cdx " >&2
- fi
-done
-
-find -L . -type f -name "*.tdf" | while read file; do
-sed '1,3d' $file # delete the first 3 lines
-done > all.tmp$$
-
-perl -e '
- ($inFile,$idFile,$txtFile)= split /\s+/, $ARGV[0];
- open(IN, "$inFile");
- open(ID, ">$idFile");
- open(TXT, ">$txtFile");
-  while (<IN>) {
- @arr= split /\t/,$_;
- $start=sprintf ("%0.3f",$arr[2]);$rStart=$start;$start=~s/\.//; $start=~s/^0+$/0/; $start=~s/^0+([^0])/$1/; # remove zeros at the beginning
- $end=sprintf ("%0.3f",$arr[3]);$rEnd=$end;$end=~s/^0+([^0])/$1/;$end=~s/\.//;
- if ( ($arr[11] !~ m/report/) && ($arr[11] !~ m/conversational/) ){$arr[11]="UNK";}
- $id="$arr[11] $arr[0] $arr[0]_${start}_${end} $rStart $rEnd\n";
- next if ($rStart == $rEnd);
- $id =~ s/.sph//g;
- print ID $id;
- print TXT "$arr[7]\n";
- }' "all.tmp$$ allid.tmp$$ contentall.tmp$$"
-
-
-perl ${top_pwd}/local/normalize_transcript_BW.pl contentall.tmp$$ contentall.buck.tmp$$
-
-paste allid.tmp$$ contentall.buck.tmp$$ | sed 's: $::' | awk '{if (NF>5) {print $0}}' > all_1.tmp$$
-
-awk '{$1="";print $0}' all_1.tmp$$ | sed 's:^ ::' > $galeData/all
-awk '{if ($1 == "report") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' > $galeData/report
-awk '{if ($1 == "conversational") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' > $galeData/conversational
-
-cd ..;
-rm -fr $txtdir
-cd $top_pwd
-echo data prep text succeeded
-
-exit 0
diff --git a/egs/gale_arabic/s5b/local/gale_prep_grapheme_dict.sh b/egs/gale_arabic/s5b/local/gale_prep_grapheme_dict.sh
deleted file mode 100755
index 5f101f8245b..00000000000
--- a/egs/gale_arabic/s5b/local/gale_prep_grapheme_dict.sh
+++ /dev/null
@@ -1,41 +0,0 @@
-#!/bin/bash
-
-# Copyright 2017 QCRI (author: Ahmed Ali)
-# Apache 2.0
-
-
-# run this from ../
-dir=$(utils/make_absolute.sh data/local/dict)
-mkdir -p $dir
-
-
-# (1) Get all available dictionaries; since this is a grapheme model, we mainly need the most frequent word lists
-wget http://alt.qcri.org//resources/speech/dictionary/ar-ar_grapheme_lexicon_2016-02-09.bz2 || exit 1;
-wget http://alt.qcri.org//resources/speech/dictionary/ar-ar_lexicon_2014-03-17.txt.bz2 || exit 1;
-bzcat ar-ar_grapheme_lexicon_2016-02-09.bz2 | sed '1,3d' | awk '{print $1}' > tmp$$
-bzcat ar-ar_lexicon_2014-03-17.txt.bz2 | sed '1,3d' | awk '{print $1}' >> tmp$$
-# (2) Now we add all the words appeared in the training data
-cat data/local/train/text | cut -d ' ' -f 2- | tr -s " " "\n" | sort -u >> tmp$$
-grep -v [0-9] tmp$$ | sed -e 's:[FNKaui\~o\`]::g' -e 's:{:}:g' | sort -u > tmp1.$$ # remove vowels and rare alef wasla
-cat tmp1.$$ | sed 's:\(\):\1 :g' | sed -e 's: : :g' -e 's: : :g' -e 's:\s*: :g' -e 's:\*:V:g' > tmp2.$$
-paste -d ' ' tmp1.$$ tmp2.$$ > $dir/lexicon.txt
-
-#(2) Dictionary preparation:
-
-# silence phones, one per line.
-echo SIL > $dir/silence_phones.txt
-echo SIL > $dir/optional_silence.txt
-
-# nonsilence phones; on each line is a list of phones that correspond
-# really to the same base phone.
-cat tmp2.$$ | tr -s ' ' '\n' | grep -v ^$ | sort -u > $dir/nonsilence_phones.txt || exit 1;
-
-sed -i '1i <UNK> SIL' $dir/lexicon.txt # insert the <UNK> word with phone SIL at the beginning of the dictionary
-
-rm -fr ar-ar_lexicon_2014-03-17.txt.bz2 ar-ar_grapheme_lexicon_2016-02-09.bz2 tmp$$ tmp1.$$ tmp2.$$
-echo Dictionary preparation succeeded
-
-# The script is still missing dates and numbers
-
-exit 0
-
diff --git a/egs/gale_arabic/s5b/local/gale_train_lms.sh b/egs/gale_arabic/s5b/local/gale_train_lms.sh
deleted file mode 100755
index 3988ec3818f..00000000000
--- a/egs/gale_arabic/s5b/local/gale_train_lms.sh
+++ /dev/null
@@ -1,81 +0,0 @@
-#!/bin/bash
-
-
-# To be run from one directory above this script.
-
-
-lexicon=data/local/dict/lexicon.txt
-[ ! -f $lexicon ] && echo "$0: No such file $lexicon" && exit 1;
-
-
-# This script takes no arguments. It assumes you have already run
-# previous steps successfully
-# It takes as input the files
-#data/local/train.*/text
-#data/local/dict/lexicon.txt
-
-
-export LC_ALL=C # You'll get errors about things being not sorted, if you
-# have a different locale.
-export PATH=$PATH:./../../../tools/kaldi_lm
-( # First make sure the kaldi_lm toolkit is installed.
- cd $KALDI_ROOT/tools || exit 1;
- if [ -d kaldi_lm ]; then
- echo Not installing the kaldi_lm toolkit since it is already there.
- else
- echo Downloading and installing the kaldi_lm tools
- if [ ! -f kaldi_lm.tar.gz ]; then
- wget http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz || exit 1;
- fi
- tar -xvzf kaldi_lm.tar.gz || exit 1;
- cd kaldi_lm
- make || exit 1;
- echo Done making the kaldi_lm tools
- fi
-) || exit 1;
-
-
-dir=data/local/lm
- mkdir -p $dir
- text=data/local/train/text
- [ ! -f $text ] && echo "$0: No such file $text" && exit 1;
-
- cleantext=$dir/text.no_oov
-
- cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex)>0){ seen[$1]=1; } }
- {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf(" ",$n);} } printf("\n");}' \
- > $cleantext || exit 1;
-
-
- cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
- sort -nr > $dir/word.counts || exit 1;
-
-
-# Get counts from acoustic training transcripts, and add one-count
-# for each word in the lexicon (but not silence, we don't want it
-# in the LM-- we'll add it optionally later).
- cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
- cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
- sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;
-
-# note: we probably won't really make use of as there aren't any OOVs
- cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "" "" "" > $dir/word_map \
- || exit 1;
-
-# note: ignore 1st field of train.txt, it's the utterance-id.
- cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline<wmap)>0)map[$1]=$2;}
-  { for(n=2;n<=NF;n++) { printf map[$n]; if(n<NF){ printf " "; } else { print ""; }}}' | gzip -c >$dir/train.gz \
- || exit 1;
-
- train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1;
-
-# LM is small enough that we don't need to prune it (only about 0.7M N-grams).
-# Perplexity over 128254.000000 words is 90.446690
-
-# note: output is
-# data/local/lm/3gram-mincount/lm_unpruned.gz
-
-
-echo train lm succeeded
-
-exit 0
diff --git a/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh b/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh
index f14c8441869..a03cc5b2fa3 100755
--- a/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh
+++ b/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh
@@ -2,31 +2,29 @@
set -e -o pipefail
-# This script is called from local/nnet3/run_tdnn.sh and local/chain/run_tdnn.sh (and may eventually
-# be called by more scripts). It contains the common feature preparation and iVector-related parts
-# of the script. See those scripts for examples of usage.
+# This script is called from scripts like local/nnet3/run_tdnn.sh and
+# local/chain/run_tdnn.sh (and may eventually be called by more scripts). It
+# contains the common feature preparation and iVector-related parts of the
+# script. See those scripts for examples of usage.
stage=0
nj=100
-min_seg_len=1.55 # min length in seconds... we do this because chain training
- # will discard segments shorter than 1.5 seconds. Must remain in sync
- # with the same option given to prepare_lores_feats_and_alignments.sh
train_set=train # you might set this to e.g. train.
-gmm=tri2b # This specifies a GMM-dir from the features of the type you're training the system on;
+test_sets="test"
+gmm=tri3b # This specifies a GMM-dir from the features of the type you're training the system on;
# it should contain alignments for 'train_set'.
num_threads_ubm=32
-nnet3_affix=_cleaned # affix for exp/nnet3 directory to put iVector stuff in, so it
- # becomes exp/nnet3_cleaned or whatever.
+nnet3_affix= # affix for exp/nnet3 directory to put iVector stuff
. ./cmd.sh
. ./path.sh
-. ./utils/parse_options.sh
+. utils/parse_options.sh
gmm_dir=exp/${gmm}
-ali_dir=exp/${gmm}_ali_${train_set}_sp_comb
+ali_dir=exp/${gmm}_ali_${train_set}_sp
for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do
if [ ! -f $f ]; then
@@ -61,7 +59,7 @@ if [ $stage -le 2 ]; then
utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/mfcc/gale_arabic-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage
fi
- for datadir in ${train_set}_sp test; do
+ for datadir in ${train_set}_sp ${test_sets}; do
utils/copy_data_dir.sh data/$datadir data/${datadir}_hires
done
@@ -69,7 +67,7 @@ if [ $stage -le 2 ]; then
# features; this helps make trained nnets more invariant to test data volume.
utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires
- for datadir in ${train_set}_sp test; do
+ for datadir in ${train_set}_sp ${test_sets}; do
steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \
--cmd "$train_cmd" data/${datadir}_hires
steps/compute_cmvn_stats.sh data/${datadir}_hires
@@ -78,75 +76,33 @@ if [ $stage -le 2 ]; then
fi
if [ $stage -le 3 ]; then
- echo "$0: combining short segments of speed-perturbed high-resolution MFCC training data"
- # we have to combine short segments or we won't be able to train chain models
- # on those segments.
- utils/data/combine_short_segments.sh \
- data/${train_set}_sp_hires $min_seg_len data/${train_set}_sp_hires_comb
-
- # just copy over the CMVN to avoid having to recompute it.
- cp data/${train_set}_sp_hires/cmvn.scp data/${train_set}_sp_hires_comb/
- utils/fix_data_dir.sh data/${train_set}_sp_hires_comb/
-fi
-
-if [ $stage -le 4 ]; then
- echo "$0: selecting segments of hires training data that were also present in the"
- echo " ... original training data."
-
- # note, these data-dirs are temporary; we put them in a sub-directory
- # of the place where we'll make the alignments.
- temp_data_root=exp/nnet3${nnet3_affix}/tri5
- mkdir -p $temp_data_root
-
- utils/data/subset_data_dir.sh --utt-list data/${train_set}/feats.scp \
- data/${train_set}_sp_hires $temp_data_root/${train_set}_hires
-
- # note: essentially all the original segments should be in the hires data.
-  n1=$(wc -l <data/${train_set}_sp_hires/feats.scp)
+which sox &>/dev/null
+if [[ $? != 0 ]]; then
+ echo "$0: sox is not installed"; exit 1
+fi
+
+for dvd in $dir1 $dir2 $dir3; do
+ dvd_full_path=$(utils/make_absolute.sh $dvd)
+ if [[ ! -e $dvd_full_path ]]; then
+ echo "$0: missing $dvd_full_path"; exit 1;
+ fi
+ find $dvd_full_path \( -name "*.wav" -o -name "*.flac" \) | while read file; do
+ id=$(basename $file | awk '{gsub(".wav","");gsub(".flac","");print}')
+ echo "$id sox $file -r 16000 -t wav - |"
+ done
+done | sort -u > $gale_data/wav.scp
+echo "$0: data prep audio succeeded"
+
+gale_data=$(utils/make_absolute.sh "GALE" );
+top_pwd=`pwd`
+txtdir=$gale_data/txt
+mkdir -p $txtdir; cd $txtdir
+
+for cdx in $text1 $text2 $text3; do
+ echo "$0:Preparing $cdx"
+ if [[ $cdx == *.tgz ]] ; then
+ tar -xvf $cdx
+ elif [ -d "$cdx" ]; then
+ ln -s $cdx `basename $cdx`
+ else
+ echo "$0:I don't really know what I shall do with $cdx " >&2
+ fi
+done
+
+find -L . -type f -name "*.tdf" | while read file; do
+sed '1,3d' $file # delete the first 3 lines
+done > all.tmp$$
+
+perl -e '
+ ($inFile,$idFile,$txtFile)= split /\s+/, $ARGV[0];
+ open(IN, "$inFile");
+ open(ID, ">$idFile");
+ open(TXT, ">$txtFile");
+  while (<IN>) {
+ @arr= split /\t/,$_;
+ $start=sprintf ("%0.3f",$arr[2]);$rStart=$start;$start=~s/\.//; $start=~s/^0+$/0/; $start=~s/^0+([^0])/$1/; # remove zeros at the beginning
+ $end=sprintf ("%0.3f",$arr[3]);$rEnd=$end;$end=~s/^0+([^0])/$1/;$end=~s/\.//;
+ if ( ($arr[11] !~ m/report/) && ($arr[11] !~ m/conversational/) ){$arr[11]="UNK";}
+ $id="$arr[11] $arr[0] $arr[0]_${start}_${end} $rStart $rEnd\n";
+ next if ($rStart == $rEnd);
+ $id =~ s/.sph//g;
+ print ID $id;
+ print TXT "$arr[7]\n";
+ }' "all.tmp$$ allid.tmp$$ contentall.tmp$$"
+
+perl ${top_pwd}/local/normalize_transcript_BW.pl contentall.tmp$$ contentall.buck.tmp$$
+paste allid.tmp$$ contentall.buck.tmp$$ | sed 's: $::' | awk '{if (NF>5) {print $0}}' > all_1.tmp$$
+
+
+awk '{$1="";print $0}' all_1.tmp$$ | sed 's:^ ::' > $gale_data/all
+awk '{if ($1 == "report") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' > $gale_data/report
+awk '{if ($1 == "conversational") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' > $gale_data/conversational
+
+cd ..;
+rm -fr $txtdir
+cd $top_pwd
+echo "$0: data prep text succeeded"
+
+mkdir -p data
+dir=$(utils/make_absolute.sh data/)
+grep -f local/test_list $gale_data/all | grep -v -f local/bad_segments > $gale_data/all.test
+grep -v -f local/test_list $gale_data/all | grep -v -f local/bad_segments > $gale_data/all.train
+
+for x in test train; do
+ outdir=data/$x
+ file=$gale_data/all.$x
+ mkdir -p $outdir
+ awk '{print $2 " " $2}' $file | sort -u > $outdir/utt2spk
+ cp -pr $outdir/utt2spk $outdir/spk2utt
+ awk '{print $2 " " $1 " " $3 " " $4}' $file | sort -u > $outdir/segments
+ awk '{printf $2 " "; for (i=5; i<=NF; i++) {printf $i " "} printf "\n"}' $file | sort -u > $outdir/text
+done
+
+grep -f local/test_list $gale_data/wav.scp > $dir/test/wav.scp
+
+cat $gale_data/wav.scp | awk -v seg=$dir/train/segments 'BEGIN{while((getline<seg)>0) {seen[$2]=1;}}
+ {if (seen[$1]) { print $0}}' > $dir/train/wav.scp
+
+echo "$0: data prep split succeeded"
+exit 0
diff --git a/egs/gale_arabic/s5b/local/prepare_dict.sh b/egs/gale_arabic/s5b/local/prepare_dict.sh
new file mode 100755
index 00000000000..47b5869fdf1
--- /dev/null
+++ b/egs/gale_arabic/s5b/local/prepare_dict.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+
+# Copyright 2017 QCRI (author: Ahmed Ali)
+# Apache 2.0
+# This script prepares the dictionary.
+
+set -e
+dir=data/local/dict
+lexicon_url1="http://alt.qcri.org//resources/speech/dictionary/ar-ar_grapheme_lexicon_2016-02-09.bz2";
+lexicon_url2="http://alt.qcri.org//resources/speech/dictionary/ar-ar_lexicon_2014-03-17.txt.bz2";
+stage=0
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh || exit 1;
+mkdir -p $dir data/local/lexicon_data
+
+if [ $stage -le 0 ]; then
+ echo "$0: Downloading text for lexicon... $(date)."
+ wget -P data/local/lexicon_data $lexicon_url1
+ wget -P data/local/lexicon_data $lexicon_url2
+ bzcat data/local/lexicon_data/ar-ar_grapheme_lexicon_2016-02-09.bz2 | sed '1,3d' | awk '{print $1}' > data/local/lexicon_data/grapheme_lexicon
+ bzcat data/local/lexicon_data/ar-ar_lexicon_2014-03-17.txt.bz2 | sed '1,3d' | awk '{print $1}' >> data/local/lexicon_data/grapheme_lexicon
+ cat data/train/text | cut -d ' ' -f 2- | tr -s " " "\n" | sort -u >> data/local/lexicon_data/grapheme_lexicon
+fi
+
+
+if [ $stage -le 0 ]; then
+ echo "$0: processing lexicon text and creating lexicon... $(date)."
+ # remove vowels and rare alef wasla
+ grep -v [0-9] data/local/lexicon_data/grapheme_lexicon | sed -e 's:[FNKaui\~o\`]::g' -e 's:{:}:g' | sort -u > data/local/lexicon_data/processed_lexicon
+ local/prepare_lexicon.py
+fi
+
+cut -d' ' -f2- $dir/lexicon.txt | sed 's/SIL//g' | tr ' ' '\n' | sort -u | sed '/^$/d' >$dir/nonsilence_phones.txt || exit 1;
+
+sed -i '1i <UNK> UNK' $dir/lexicon.txt
+
+echo UNK >> $dir/nonsilence_phones.txt
+
+echo '<sil> SIL' >> $dir/lexicon.txt
+
+echo SIL > $dir/silence_phones.txt
+
+echo SIL >$dir/optional_silence.txt
+
+echo -n "" >$dir/extra_questions.txt
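+
+# At this point $dir (data/local/dict) should contain lexicon.txt, nonsilence_phones.txt,
+# silence_phones.txt, optional_silence.txt and extra_questions.txt, i.e. the usual inputs
+# expected by utils/prepare_lang.sh.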
+
+echo "$0: Dictionary preparation succeeded"
diff --git a/egs/gale_arabic/s5b/local/prepare_lexicon.py b/egs/gale_arabic/s5b/local/prepare_lexicon.py
new file mode 100755
index 00000000000..215541585eb
--- /dev/null
+++ b/egs/gale_arabic/s5b/local/prepare_lexicon.py
@@ -0,0 +1,26 @@
+#!/usr/bin/env python3
+
+# Copyright 2018 Ashish Arora
+# Apache 2.0
+
+# This script prepares lexicon.
+
+import argparse
+import os
+
+parser = argparse.ArgumentParser(description="""Creates the list of characters and words in lexicon""")
+args = parser.parse_args()
+
+### main ###
+lex = {}
+text_path = os.path.join('data','local', 'lexicon_data', 'processed_lexicon')
+with open(text_path, 'r', encoding='utf-8') as f:
+ for line in f:
+ line = line.strip()
+ characters = list(line)
+ characters = " ".join(['V' if char == '*' else char for char in characters])
+ lex[line] = characters
+
+with open(os.path.join('data','local','dict', 'lexicon.txt'), 'w', encoding='utf-8') as fp:
+ for key in sorted(lex):
+ fp.write(key + " " + lex[key] + "\n")
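+
+# Example (illustrative): an input line "ktAb" produces the lexicon entry
+# "ktAb k t A b", while a line containing '*', e.g. "kt*A", produces "kt*A k t V A".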
diff --git a/egs/gale_arabic/s5b/local/prepare_lm.sh b/egs/gale_arabic/s5b/local/prepare_lm.sh
new file mode 100755
index 00000000000..6fdf35f471a
--- /dev/null
+++ b/egs/gale_arabic/s5b/local/prepare_lm.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+# Copyright 2012 Vassil Panayotov
+# 2017 Ewald Enzinger
+# Apache 2.0
+
+. ./path.sh || exit 1
+
+echo "=== Building a language model ..."
+
+dir=data/local/lm/
+text=data/train/text
+lexicon=data/local/dict/lexicon.txt
+# Language model order
+order=3
+
+. utils/parse_options.sh
+
+# Prepare a LM training corpus from the transcripts
+mkdir -p $dir
+
+for f in "$text" "$lexicon"; do
+ [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
+done
+
+loc=`which ngram-count`;
+if [ -z $loc ]; then
+ if uname -a | grep 64 >/dev/null; then # some kind of 64 bit...
+ sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64
+ else
+ sdir=$KALDI_ROOT/tools/srilm/bin/i686
+ fi
+ if [ -f $sdir/ngram-count ]; then
+ echo Using SRILM tools from $sdir
+ export PATH=$PATH:$sdir
+ else
+ echo You appear to not have SRILM tools installed, either on your path,
+ echo or installed in $sdir. See tools/install_srilm.sh for installation
+ echo instructions.
+ exit 1
+ fi
+fi
+
+cat data/train/text | cut -d " " -f 2- > $dir/text.txt
+cut -d' ' -f1 $lexicon > $dir/wordlist
+
+ngram-count -text $dir/text.txt -order $order -limit-vocab -vocab $dir/wordlist \
+  -unk -map-unk "<UNK>" -kndiscount -interpolate -lm $dir/lm.gz
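+
+# Informal notes on the flags above: -limit-vocab/-vocab restrict the counts to the word
+# list taken from the lexicon, -unk builds an open-vocabulary LM with an explicit unknown
+# word, and -kndiscount -interpolate select interpolated (modified) Kneser-Ney smoothing.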
+
+#ngram -lm $dir/lm.gz -ppl $dir/dev.txt
+echo "*** Finished building the LM model!"
diff --git a/egs/gale_arabic/s5b/local/score.sh b/egs/gale_arabic/s5b/local/score.sh
index 83366f7c7fc..1d84815fc69 100755
--- a/egs/gale_arabic/s5b/local/score.sh
+++ b/egs/gale_arabic/s5b/local/score.sh
@@ -1,60 +1,6 @@
-#!/bin/bash
-# Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
-# Apache 2.0
-
-[ -f ./path.sh ] && . ./path.sh
-
-# begin configuration section.
-cmd=run.pl
-stage=0
-decode_mbr=true
-word_ins_penalty=0.0
-min_lmwt=7
-max_lmwt=17
-iter= #some of the scripts from steps/ seem to use it
-#end configuration section.
-
-echo "$0 $#"
-
-[ -f ./path.sh ] && . ./path.sh
-. parse_options.sh || exit 1;
-
-if [ $# -ne 3 ]; then
- echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)]