diff --git a/egs/aspire/s5/conf/mfcc_hires_bp.conf b/egs/aspire/s5/conf/mfcc_hires_bp.conf new file mode 100644 index 00000000000..14f5c2d10a7 --- /dev/null +++ b/egs/aspire/s5/conf/mfcc_hires_bp.conf @@ -0,0 +1,15 @@ +# config for high-resolution MFCC features, intended for neural network training. +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +# This config is defined only on the frequencies from 330 Hz to +# 3000 Hz corresponding to the telephone bandwidth. +--use-energy=false # use average of log energy, not energy. +--sample-frequency=8000 # Switchboard is sampled at 8kHz +--num-mel-bins=28 +--num-ceps=28 +--cepstral-lifter=0 +--low-freq=330 # low cutoff frequency for mel bins +--high-freq=-1000 # high cutoff frequency, relative to Nyquist of 4000 (=3000) + + diff --git a/egs/aspire/s5/conf/segmentation_music.conf b/egs/aspire/s5/conf/segmentation_music.conf new file mode 100644 index 00000000000..83687e98c55 --- /dev/null +++ b/egs/aspire/s5/conf/segmentation_music.conf @@ -0,0 +1,14 @@ +# General segmentation options for segmentation on music / non-music +pad_length=-1 # Pad speech segments by this many frames on either side +max_blend_length=-1 # Maximum duration of speech that will be removed as part + # of smoothing process. This is only if there are no other + # speech segments nearby. +max_intersegment_length=0 # Merge nearby speech segments if the silence + # between them is less than this many frames. +post_pad_length=-1 # Pad speech segments by this many frames on either side + # after the merging process using max_intersegment_length +max_segment_length=1000 # Segments that are longer than this are split into + # overlapping frames. +overlap_length=250 # Overlapping frames when segments are split. + # See the above option. 
+min_silence_length=100000 # Min silence length at which to split very long segments diff --git a/egs/aspire/s5/conf/segmentation_speech.conf b/egs/aspire/s5/conf/segmentation_speech.conf new file mode 100644 index 00000000000..062edade265 --- /dev/null +++ b/egs/aspire/s5/conf/segmentation_speech.conf @@ -0,0 +1,14 @@ +# General segmentation options for SAD +pad_length=20 # Pad speech segments by this many frames on either side +max_relabel_length=10 # Maximum duration of speech that will be removed as part + # of smoothing process. This is only if there are no other + # speech segments nearby. +max_intersegment_length=30 # Merge nearby speech segments if the silence + # between them is less than this many frames. +post_pad_length=10 # Pad speech segments by this many frames on either side + # after the merging process using max_intersegment_length +max_segment_length=1000 # Segments that are longer than this are split into + # overlapping frames. +overlap_length=250 # Overlapping frames when segments are split. + # See the above option. +min_silence_length=20 # Min silence length at which to split very long segments diff --git a/egs/aspire/s5/conf/segmentation_speech_simple.conf b/egs/aspire/s5/conf/segmentation_speech_simple.conf new file mode 100644 index 00000000000..6e7085a0065 --- /dev/null +++ b/egs/aspire/s5/conf/segmentation_speech_simple.conf @@ -0,0 +1,15 @@ +# A simple segmentation post-processing options for SAD +pad_length=20 # Pad speech segments by this many frames on either side +max_relabel_length=-1 # Maximum duration of speech that will be removed as part + # of smoothing process. This is only if there are no other + # speech segments nearby. -1 is to disable this step. +max_intersegment_length=30 # Merge nearby speech segments if the silence + # between them is less than this many frames. 
+post_pad_length=-1 # Pad speech segments by this many frames on either side + # after the merging process using max_intersegment_length + # -1 is to disable this step. +max_segment_length=1000 # Segments that are longer than this are split into + # overlapping frames. +overlap_length=250 # Overlapping frames when segments are split. + # See the above option. +min_silence_length=20 # Min silence length at which to split very long segments diff --git a/egs/aspire/s5/local/multi_condition/get_ctm.sh b/egs/aspire/s5/local/multi_condition/get_ctm.sh index f67a1191544..3260fc157d3 100755 --- a/egs/aspire/s5/local/multi_condition/get_ctm.sh +++ b/egs/aspire/s5/local/multi_condition/get_ctm.sh @@ -7,8 +7,7 @@ decode_mbr=true filter_ctm_command=cp glm= stm= -window=10 -overlap=5 +resolve_overlaps=true [ -f ./path.sh ] && . ./path.sh . parse_options.sh || exit 1; @@ -62,7 +61,13 @@ lattice-align-words-lexicon --output-error-lats=true --output-if-empty=true --ma lattice-to-ctm-conf $frame_shift_opt --decode-mbr=$decode_mbr ark:- $decode_dir/score_$LMWT/penalty_$wip/ctm.overlapping || exit 1; # combine the segment-wise ctm files, while resolving overlaps -python local/multi_condition/resolve_ctm_overlaps.py --overlap $overlap --window-length $window $data_dir/utt2spk $decode_dir/score_$LMWT/penalty_$wip/ctm.overlapping $decode_dir/score_$LMWT/penalty_$wip/ctm.merged || exit 1; +if $resolve_overlaps; then + steps/resolve_ctm_overlaps.py $data_dir/segments \ + $decode_dir/score_$LMWT/penalty_$wip/ctm.overlapping \ + $decode_dir/score_$LMWT/penalty_$wip/ctm.merged || exit 1; +else + cp $decode_dir/score_$LMWT/penalty_$wip/ctm.overlapping $decode_dir/score_$LMWT/penalty_$wip/ctm.merged || exit 1; +fi merged_ctm=$decode_dir/score_$LMWT/penalty_$wip/ctm.merged cat $merged_ctm | utils/int2sym.pl -f 5 $lang/words.txt | \ diff --git a/egs/aspire/s5/local/nnet3/prep_test_aspire_segmentation.sh b/egs/aspire/s5/local/nnet3/prep_test_aspire_segmentation.sh new file mode 100755 index 
00000000000..020714ae1cc --- /dev/null +++ b/egs/aspire/s5/local/nnet3/prep_test_aspire_segmentation.sh @@ -0,0 +1,153 @@ +#!/bin/bash + +# Copyright Johns Hopkins University (Author: Daniel Povey, Vijayaditya Peddinti) 2016. Apache 2.0. +# This script generates the ctm files for dev_aspire, test_aspire and eval_aspire +# for scoring with ASpIRE scoring server. +# It also provides the WER for dev_aspire data. + +set -e +set -o pipefail +set -u + +# general opts +iter=final +stage=0 +decode_num_jobs=30 +num_jobs=30 +affix= + +# ivector opts +max_count=75 # parameter for extract_ivectors.sh +sub_speaker_frames=6000 +ivector_scale=0.75 +filter_ctm=true +weights_file= +silence_weight=0.00001 + +# decode opts +pass2_decode_opts="--min-active 1000" +lattice_beam=8 +extra_left_context=0 # change for (B)LSTM +extra_right_context=0 # change for BLSTM +frames_per_chunk=50 # change for (B)LSTM +acwt=0.1 # important to change this when using chain models +post_decode_acwt=1.0 # important to change this when using chain models + +. ./cmd.sh +[ -f ./path.sh ] && . ./path.sh +. utils/parse_options.sh || exit 1; + +if [ $# -ne 5 ]; then + echo "Usage: $0 [options] " + echo " Options:" + echo " --stage (0|1|2) # start scoring script from part-way through." 
+ echo "e.g.:" + echo "$0 dev_aspire data/lang exp/tri5a/graph_pp exp/nnet3/tdnn" + exit 1; +fi + +data_set=$1 +seg_data_dir=$2 +lang=$3 # data/lang +graph=$4 #exp/tri5a/graph_pp +dir=$5 # exp/nnet3/tdnn + +model_affix=`basename $dir` +ivector_dir=exp/nnet3 +ivector_affix=${affix:+_$affix}_chain_${model_affix}_iter$iter +affix=_${affix}_iter${iter} +act_data_set=${data_set} # we will modify the data dir, when segmenting it + # so we will keep track of original data dir for the glm and stm files + +if [[ "$data_set" =~ "test_aspire" ]]; then + out_file=single_dev_test${affix}_$model_affix.ctm +elif [[ "$data_set" =~ "eval_aspire" ]]; then + out_file=single_eval${affix}_$model_affix.ctm +elif [[ "$data_set" =~ "dev_aspire" ]]; then + # we will just decode the directory without oracle segments file + # as we would like to operate in the actual evaluation condition + out_file=single_dev${affix}_${model_affix}.ctm +else + exit 1 +fi + +# uniform segmentation script would have created this dataset +# so update that script if you plan to change this variable +segmented_data_set=${data_set}${affix}_seg + +if [ $stage -le 1 ]; then + utils/copy_data_dir.sh $seg_data_dir data/${segmented_data_set} +fi + +if [ $stage -le 2 ]; then + mfccdir=mfcc_reverb + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $mfccdir/storage ]; then + date=$(date +'%m_%d_%H_%M') + utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/aspire-$date/s5/$mfccdir/storage $mfccdir/storage + fi + + utils/copy_data_dir.sh data/${segmented_data_set} data/${segmented_data_set}_hires + steps/make_mfcc.sh --nj 30 --cmd "$train_cmd" \ + --mfcc-config conf/mfcc_hires.conf data/${segmented_data_set}_hires \ + exp/make_reverb_hires/${segmented_data_set} $mfccdir + steps/compute_cmvn_stats.sh data/${segmented_data_set}_hires \ + exp/make_reverb_hires/${segmented_data_set} $mfccdir + utils/fix_data_dir.sh data/${segmented_data_set}_hires + utils/validate_data_dir.sh --no-text data/${segmented_data_set}_hires +fi + +decode_dir=$dir/decode_${segmented_data_set}_pp +if [ $stage -le 5 ]; then + echo "Extracting i-vectors, stage 2" + # this does offline decoding, except we estimate the iVectors per + # speaker, excluding silence (based on alignments from a DNN decoding), with a + # different script. This is just to demonstrate that script. + # the --sub-speaker-frames is optional; if provided, it will divide each speaker + # up into "sub-speakers" of at least that many frames... can be useful if + # acoustic conditions drift over time within the speaker's data. 
+ steps/online/nnet2/extract_ivectors.sh --cmd "$train_cmd" --nj 20 \ + --sub-speaker-frames $sub_speaker_frames --max-count $max_count \ + data/${segmented_data_set}_hires $lang $ivector_dir/extractor \ + $ivector_dir/ivectors_${segmented_data_set}${ivector_affix}; +fi + +if [ $stage -le 6 ]; then + echo "Generating lattices, stage 2 with --acwt $acwt" + rm -f ${decode_dir}_tg/.error + steps/nnet3/decode.sh --nj $decode_num_jobs --cmd "$decode_cmd" --config conf/decode.config $pass2_decode_opts \ + --acwt $acwt --post-decode-acwt $post_decode_acwt \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --skip-scoring true --iter $iter --lattice-beam $lattice_beam \ + --online-ivector-dir $ivector_dir/ivectors_${segmented_data_set}${ivector_affix} \ + $graph data/${segmented_data_set}_hires ${decode_dir}_tg || touch ${decode_dir}_tg/.error + [ -f ${decode_dir}_tg/.error ] && echo "$0: Error decoding" && exit 1; +fi + +if [ $stage -le 7 ]; then + echo "Rescoring lattices" + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + --skip-scoring true \ + ${lang}_pp_test{,_fg} data/${segmented_data_set}_hires \ + ${decode_dir}_{tg,fg}; +fi + +decode_dir=${decode_dir}_fg + +if [ $stage -le 8 ]; then + local/score_aspire.sh --cmd "$decode_cmd" \ + --min-lmwt 1 --max-lmwt 20 \ + --word-ins-penalties "0.0,0.25,0.5,0.75,1.0" \ + --ctm-beam 6 \ + --iter $iter \ + --decode-mbr true \ + --resolve-overlaps false \ + --tune-hyper true \ + $lang $decode_dir $act_data_set $segmented_data_set $out_file +fi + +# Two-pass decoding baseline +# %WER 27.8 | 2120 27217 | 78.2 13.6 8.2 6.0 27.8 75.9 | -0.613 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iterfinal_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# Using automatic segmentation +# %WER 28.2 | 2120 27214 | 76.5 12.4 11.1 4.7 28.2 75.2 | -0.522 | 
exp/chain/tdnn_7b/decode_dev_aspire_seg_v7_n_stddev_iterfinal_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys diff --git a/egs/aspire/s5/local/score_aspire.sh b/egs/aspire/s5/local/score_aspire.sh index 3e35b6d3dae..9c08a6c85d1 100755 --- a/egs/aspire/s5/local/score_aspire.sh +++ b/egs/aspire/s5/local/score_aspire.sh @@ -14,10 +14,9 @@ word_ins_penalties=0.0,0.25,0.5,0.75,1.0 default_wip=0.0 ctm_beam=6 decode_mbr=true -window=30 -overlap=5 cmd=run.pl stage=1 +resolve_overlaps=true tune_hyper=true # if true: # if the data set is "dev_aspire" we check for the # best lmwt and word_insertion_penalty, @@ -89,7 +88,7 @@ if $tune_hyper ; then # or use the default values if [ $stage -le 1 ]; then - if [ "$act_data_set" == "dev_aspire" ]; then + if [[ "$act_data_set" =~ "dev_aspire" ]]; then wip_string=$(echo $word_ins_penalties | sed 's/,/ /g') temp_wips=($wip_string) $cmd WIP=1:${#temp_wips[@]} $decode_dir/scoring/log/score.wip.WIP.log \ @@ -98,8 +97,8 @@ if $tune_hyper ; then echo \$wip \&\& \ $cmd LMWT=$min_lmwt:$max_lmwt $decode_dir/scoring/log/score.LMWT.\$wip.log \ local/multi_condition/get_ctm.sh --filter-ctm-command "$filter_ctm_command" \ - --window $window --overlap $overlap \ --beam $ctm_beam --decode-mbr $decode_mbr \ + --resolve-overlaps $resolve_overlaps \ --glm data/${act_data_set}/glm --stm data/${act_data_set}/stm \ LMWT \$wip $lang data/${segmented_data_set}_hires $model $decode_dir || exit 1; @@ -124,7 +123,7 @@ wipfile.close() fi - if [ "$act_data_set" == "test_aspire" ] || [ "$act_data_set" == "eval_aspire" ]; then + if [[ "$act_data_set" =~ "test_aspire" ]] || [[ "$act_data_set" =~ "eval_aspire" ]]; then # check for the best values from dev_aspire decodes dev_decode_dir=$(echo $decode_dir|sed "s/test_aspire/dev_aspire_whole/g; s/eval_aspire/dev_aspire_whole/g") if [ -f $dev_decode_dir/scoring/bestLMWT ]; then diff --git a/egs/aspire/s5/local/segmentation/do_corruption_data_dir.sh b/egs/aspire/s5/local/segmentation/do_corruption_data_dir.sh new file mode 
100755 index 00000000000..85fc89ddd73 --- /dev/null +++ b/egs/aspire/s5/local/segmentation/do_corruption_data_dir.sh @@ -0,0 +1,147 @@ +#! /bin/bash + +# Copyright 2016 Vimal Manohar +# Apache 2.0 + +# This script is deprecated in favor of do_corruption_data_dir_snr.sh + +set -e +set -u +set -o pipefail + +. path.sh + +# The following are the main parameters to modify +data_dir=data/train_si284 # Expecting whole data directory. +vad_dir= # Output of prepare_unsad_data.sh. + # If provided, the speech labels and deriv weights will be + # copied into the output data directory. + +num_data_reps=5 # Number of corrupted versions +foreground_snrs="20:10:15:5:0:-5" +background_snrs="20:10:15:5:2:0:-2:-5" + +stage=0 + +# Parallel options +nj=4 +cmd=run.pl + +# Options for feature extraction +mfcc_config=conf/mfcc_hires_bp.conf +feat_suffix=hires_bp + +# Data options +corrupt_only=false +speed_perturb=true +speeds="0.9 1.0 1.1" +resample_data_dir=false + + + +. utils/parse_options.sh + +if [ $# -ne 0 ]; then + echo "Usage: $0" + exit 1 +fi + +data_id=`basename ${data_dir}` + +rvb_opts=() +# This is the config for the system using simulated RIRs and point-source noises +rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list") +rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list") +rvb_opts+=(--noise-set-parameters "0.1, RIRS_NOISES/pointsource_noises/background_noise_list") +rvb_opts+=(--noise-set-parameters "0.9, RIRS_NOISES/pointsource_noises/foreground_noise_list") + +if $resample_data_dir; then + sample_frequency=`cat $mfcc_config | perl -ne 'if (m/--sample-frequency=(\S+)/) { print $1; }'` + if [ -z "$sample_frequency" ]; then + sample_frequency=16000 + fi + + utils/data/resample_data_dir.sh $sample_frequency ${data_dir} || exit 1 + data_id=`basename ${data_dir}` + rvb_opts+=(--source-sampling-rate=$sample_frequency) +fi + +corrupted_data_id=${data_id}_corrupted + +if [ $stage -le 1 ]; then + python 
steps/data/reverberate_data_dir.py \ + "${rvb_opts[@]}" \ + --prefix="rev" \ + --foreground-snrs=$foreground_snrs \ + --background-snrs=$background_snrs \ + --speech-rvb-probability=1 \ + --pointsource-noise-addition-probability=1 \ + --isotropic-noise-addition-probability=1 \ + --num-replications=$num_data_reps \ + --max-noises-per-minute=2 \ + data/${data_id} data/${corrupted_data_id} +fi + +corrupted_data_dir=data/${corrupted_data_id} + +if $speed_perturb; then + if [ $stage -le 2 ]; then + ## Assuming whole data directories + for x in $corrupted_data_dir; do + cp $x/reco2dur $x/utt2dur + utils/data/perturb_data_dir_speed_random.sh --speeds "$speeds" $x ${x}_spr + done + fi + + corrupted_data_dir=${corrupted_data_dir}_spr + corrupted_data_id=${corrupted_data_id}_spr + + if [ $stage -le 3 ]; then + utils/data/perturb_data_dir_volume.sh --scale-low 0.03125 --scale-high 2 \ + ${corrupted_data_dir} + fi +fi + +if $corrupt_only; then + echo "$0: Got corrupted data directory in ${corrupted_data_dir}" + exit 0 +fi + +mfccdir=`basename $mfcc_config` +mfccdir=${mfccdir%%.conf} + +if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage +fi + +if [ $stage -le 4 ]; then + utils/copy_data_dir.sh $corrupted_data_dir ${corrupted_data_dir}_$feat_suffix + corrupted_data_dir=${corrupted_data_dir}_$feat_suffix + steps/make_mfcc.sh --mfcc-config $mfcc_config \ + --cmd "$cmd" --nj $reco_nj --write-utt2num-frames true \ + $corrupted_data_dir exp/make_${feat_suffix}/${corrupted_data_id} $mfccdir + steps/compute_cmvn_stats.sh --fake \ + $corrupted_data_dir exp/make_${feat_suffix}/${corrupted_data_id} $mfccdir +else + corrupted_data_dir=${corrupted_data_dir}_$feat_suffix +fi + +if [ $stage -le 8 ]; then + if [ ! -z "$vad_dir" ]; then + if [ ! 
-f $vad_dir/speech_labels.scp ]; then + echo "$0: Could not find file $vad_dir/speech_labels.scp" + exit 1 + fi + + cat $vad_dir/speech_labels.scp | \ + steps/segmentation/get_reverb_scp.pl -f 1 $num_data_reps | \ + sort -k1,1 > ${corrupted_data_dir}/speech_labels.scp + + cat $vad_dir/deriv_weights.scp | \ + steps/segmentation/get_reverb_scp.pl -f 1 $num_data_reps | \ + sort -k1,1 > ${corrupted_data_dir}/deriv_weights.scp + fi +fi + +exit 0 diff --git a/egs/aspire/s5/local/segmentation/do_corruption_data_dir_music.sh b/egs/aspire/s5/local/segmentation/do_corruption_data_dir_music.sh new file mode 100755 index 00000000000..c1f3a66b5bf --- /dev/null +++ b/egs/aspire/s5/local/segmentation/do_corruption_data_dir_music.sh @@ -0,0 +1,307 @@ +#!/bin/bash + +# Copyright 2016 Vimal Manohar +# Apache 2.0 + +# This script adds music to speech waveforms and creates music_labels.scp +# for music / non-music detection and speech_labels.scp for speech activity +# detection. + +set -e +set -u +set -o pipefail + +. path.sh + +# The following are the main parameters to modify. These are required! +data_dir=data/train_si284 +vad_dir= # Location of directory with VAD labels (speech_labels.scp) + # This is created by the script prepare_unsad_labels.sh and + # the archive must be indexed by the utterance-id of the + # input data. + +num_data_reps=5 # Number of corrupted versions +foreground_snrs="5:2:1:0:-2:-5:-10:-20" +background_snrs="5:2:1:0:-2:-5:-10:-20" + +stage=0 + +# Parallel options +nj=4 +cmd=run.pl + +# Options for feature extraction +mfcc_config=conf/mfcc_hires_bp.conf # Band-passed config for telephone speech +feat_suffix=hires_bp + +corrupt_only=false # If true, exits after creating the corrupted directory +speed_perturb=true # Do speed perturbation by randomly perturbing the + # recordings at speeds specified by the --speeds + # option. 
+speeds="0.9 1.0 1.1" +resample_data_dir=false # If true, the input data is resampled at the + # sampling-rate specified in the mfcc-config. + # Usually applicable when the input data is 8kHz + # and needs to be upsampled to 16kHz. + +label_dir=music_labels # Directory to dump music labels + +. utils/parse_options.sh + +if [ $# -ne 0 ]; then + echo "Usage: $0" + exit 1 +fi + +data_id=`basename ${data_dir}` + +if [ ! -d RIRS_NOISES/ ]; then + # Prepare MUSAN rirs and noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip +fi + +if [ ! -d RIRS_NOISES/music ]; then + wget --no-check-certificate http://www.openslr.org/resources/17/musan.tar.gz + tar -xvf musan.tar.gz + + # Prepare MUSAN music + local/segmentation/prepare_musan_music.sh musan RIRS_NOISES/music +fi + +rvb_opts=() +# This is the config for the system using simulated RIRs and point-source noises +rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list") +rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list") +rvb_opts+=(--noise-set-parameters RIRS_NOISES/music/music_list) + +music_utt2num_frames=RIRS_NOISES/music/split_utt2num_frames + +for f in RIRS_NOISES/simulated_rirs/smallroom/rir_list \ + RIRS_NOISES/simulated_rirs/mediumroom/rir_list \ + RIRS_NOISES/music/music_list \ + RIRS_NOISES/music/split_utt2num_frames \ + $data_dir/wav.scp; do + [ ! -f $f ] && echo "$0: Could not find $f" && exit 1 +done + +if $resample_data_dir; then + # Resample input data directory at a different sampling rate. + # It is assumed that the noise and impulse responses are at 8kHz. 
+ sample_frequency=`cat $mfcc_config | perl -ne 'if (m/--sample-frequency=(\S+)/) { print $1; }'` + if [ -z "$sample_frequency" ]; then + sample_frequency=16000 + fi + + utils/data/resample_data_dir.sh $sample_frequency ${data_dir} || exit 1 + data_id=`basename ${data_dir}` + rvb_opts+=(--source-sampling-rate=$sample_frequency) +fi + +corrupted_data_id=${data_id}_music_corrupted +orig_corrupted_data_id=$corrupted_data_id + +if [ $stage -le 1 ]; then + python steps/data/reverberate_data_dir.py \ + "${rvb_opts[@]}" \ + --prefix="music" \ + --foreground-snrs=$foreground_snrs \ + --background-snrs=$background_snrs \ + --speech-rvb-probability=1 \ + --pointsource-noise-addition-probability=1 \ + --isotropic-noise-addition-probability=1 \ + --num-replications=$num_data_reps \ + --max-noises-per-minute=5 \ + data/${data_id} data/${corrupted_data_id} +fi + +corrupted_data_dir=data/${corrupted_data_id} +# Data dir without speed perturbation +orig_corrupted_data_dir=$corrupted_data_dir + +if $speed_perturb; then + if [ $stage -le 2 ]; then + for x in $corrupted_data_dir; do + utils/data/perturb_data_dir_speed_random.sh --speeds "$speeds" $x ${x}_spr + done + fi + + corrupted_data_dir=${corrupted_data_dir}_spr + corrupted_data_id=${corrupted_data_id}_spr + + if [ $stage -le 3 ]; then + utils/data/perturb_data_dir_volume.sh --scale-low 0.03125 --scale-high 2 \ + ${corrupted_data_dir} + fi +fi + +if $corrupt_only; then + echo "$0: Got corrupted data directory in ${corrupted_data_dir}" + exit 0 +fi + +mfccdir=`basename $mfcc_config` +mfccdir=${mfccdir%%.conf} + +if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage +fi + +if [ $stage -le 4 ]; then + if [ ! 
-z $feat_suffix ]; then + utils/copy_data_dir.sh $corrupted_data_dir ${corrupted_data_dir}_$feat_suffix + corrupted_data_dir=${corrupted_data_dir}_$feat_suffix + fi + steps/make_mfcc.sh --mfcc-config $mfcc_config \ + --cmd "$cmd" --nj $nj --write-utt2num-frames true \ + $corrupted_data_dir exp/make_${mfccdir}/${corrupted_data_id} $mfccdir + steps/compute_cmvn_stats.sh --fake \ + $corrupted_data_dir exp/make_${mfccdir}/${corrupted_data_id} $mfccdir +else + if [ ! -z $feat_suffix ]; then + corrupted_data_dir=${corrupted_data_dir}_$feat_suffix + fi +fi + +if [ $stage -le 5 ]; then + if [ ! -z "$vad_dir" ]; then + if [ ! -f $vad_dir/speech_labels.scp ]; then + echo "$0: Could not find file $vad_dir/speech_labels.scp." + echo "$0: Run script prepare_unsad_data.sh or similar to create this file." + exit 1 + fi + + # Get speech labels for music-corrupted and reverberated data + cat $vad_dir/speech_labels.scp | \ + steps/segmentation/get_reverb_scp.pl -f 1 $num_data_reps "music" | \ + sort -k1,1 > ${corrupted_data_dir}/speech_labels.scp + + if [ -f $vad_dir/deriv_weights.scp ]; then + cat $vad_dir/deriv_weights.scp | \ + steps/segmentation/get_reverb_scp.pl -f 1 $num_data_reps "music" | \ + sort -k1,1 > ${corrupted_data_dir}/deriv_weights.scp + fi + fi +fi + +# music_dir is without speed perturbation +music_dir=exp/make_music_labels/${orig_corrupted_data_id} +mkdir -p $music_dir + +cp $music_utt2num_frames $music_dir/music_utt2num_frames + +if [ $stage -le 6 ]; then + if [ ! -f $orig_corrupted_data_dir/additive_signals_info.txt ]; then + echo "$0: Could not find $orig_corrupted_data_dir/additive_signals_info.txt." + echo "$0: It is expected to be created by the script reverberate_data_dir.py" + exit 1 + fi + + splits= + for n in `seq $nj`; do + splits="$splits $music_dir/additive_signals_info.$n.$nj.txt" + done + utils/split_scp.pl $orig_corrupted_data_dir/additive_signals_info.txt $splits + + # additive_signals_info.txt is created by the script reverberate_data_dir.py. 
+ # additive_signals_info.txt is indexed by the recording-id and has the format: + # list-of-space-separated-tuples + # where each tuple is written in the format :: + # It specifies the location where the noise (music) is added and the duration + # of the noise added. + # Note that if the end time of the noise is beyond the duration of the + # recording, then it will be truncated. + utils/data/get_reco2dur.sh $orig_corrupted_data_dir + + awk -v fs=`utils/data/get_frame_shift.sh $corrupted_data_dir` '{print $1" "int($2 / fs)}' \ + $orig_corrupted_data_dir/reco2dur > $orig_corrupted_data_dir/reco2num_frames + + if [ -f $orig_corrupted_data_dir/segments ]; then + $cmd JOB=1:$nj $music_dir/log/get_music_seg.JOB.log \ + segmentation-init-from-additive-signals-info \ + --lengths-rspecifier=ark,t:$orig_corrupted_data_dir/reco2num_frames \ + --additive-signals-segmentation-rspecifier="ark:segmentation-init-from-lengths ark,t:$music_dir/music_utt2num_frames ark:- |" \ + ark,t:$music_dir/additive_signals_info.JOB.${nj}.txt ark:- \| \ + segmentation-to-ali --lengths-rspecifier=ark,t:$orig_corrupted_data_dir/reco2num_frames ark:- ark,t:- \| \ + steps/segmentation/convert_ali_to_vec.pl \| \ + vector-to-feat ark,t:- ark:- \| \ + extract-feature-segments ark:- $orig_corrupted_data_dir/segments \ + ark:- \| extract-column ark:- ark,t:- \| \ + steps/segmentation/quantize_vector.pl \| \ + segmentation-init-from-ali ark:- ark:- \| \ + segmentation-post-process --merge-adjacent-segments \ + ark:- ark,scp:$music_dir/music_segmentation.JOB.ark,$music_dir/music_segmentation.JOB.scp + else + $cmd JOB=1:$nj $music_dir/log/get_music_seg.JOB.log \ + segmentation-init-from-additive-signals-info \ + --lengths-rspecifier=ark,t:$orig_corrupted_data_dir/reco2num_frames \ + --additive-signals-segmentation-rspecifier="ark:segmentation-init-from-lengths ark,t:$music_dir/music_utt2num_frames ark:- |" \ + ark,t:$music_dir/additive_signals_info.JOB.${nj}.txt ark:- \| \ + segmentation-to-ali 
--lengths-rspecifier=ark,t:$orig_corrupted_data_dir/reco2num_frames ark:- ark:- \| \ + segmentation-init-from-ali ark:- ark:- \| \ + segmentation-post-process --merge-adjacent-segments \ + ark:- ark,scp:$music_dir/music_segmentation.JOB.ark,$music_dir/music_segmentation.JOB.scp + fi + + for n in `seq $nj`; do + cat $music_dir/music_segmentation.$n.scp + done > $music_dir/music_segmentation.scp +fi + +# Convert label_dir to absolute pathname +mkdir -p $label_dir +label_dir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $label_dir ${PWD}` + +if [ $stage -le 7 ]; then + utils/split_data.sh $corrupted_data_dir $nj + utils/data/get_utt2num_frames.sh $orig_corrupted_data_dir + + awk '{print $1" "$2; print "sp0.9-"$1" "int($2 / 0.9); print "sp1.1-"$1" "int($2 / 1.1)}' $orig_corrupted_data_dir/utt2num_frames > \ + $music_dir/utt2num_frames_sp + + if $speed_perturb; then + $cmd JOB=1:$nj $music_dir/log/get_music_labels.JOB.log \ + segmentation-speed-perturb --speeds=0.9:1.0:1.1 ark:$music_dir/music_segmentation.JOB.ark ark:- \| \ + segmentation-to-ali --lengths-rspecifier=ark,t:$music_dir/utt2num_frames_sp ark:- \ + ark,scp:$label_dir/music_labels_${corrupted_data_id}.JOB.ark,$label_dir/music_labels_${corrupted_data_id}.JOB.scp + else + $cmd JOB=1:$nj $music_dir/log/get_music_labels.JOB.log \ + segmentation-to-ali --lengths-rspecifier=ark,t:$corrupted_data_dir/utt2num_frames \ + ark:$music_dir/music_segmentation.JOB.ark \ + ark,scp:$label_dir/music_labels_${corrupted_data_id}.JOB.ark,$label_dir/music_labels_${corrupted_data_id}.JOB.scp + fi + + for n in `seq $nj`; do + cat $label_dir/music_labels_${corrupted_data_id}.$n.scp + done | \ + utils/filter_scp.pl ${corrupted_data_dir}/utt2spk | sort -k1,1 > ${corrupted_data_dir}/music_labels.scp + + if [ ! 
-s $corrupted_data_dir/music_labels.scp ]; then + echo "$0: $corrupted_data_dir/music_labels.scp is empty" && exit 1 + fi + +fi + +if [ $stage -le 8 ]; then + utils/split_data.sh --per-utt ${corrupted_data_dir} $nj + + cat < $music_dir/speech_music_map +0 0 0 +0 1 3 +1 0 1 +1 1 2 +EOF + + $cmd JOB=1:$nj $music_dir/log/get_speech_music_labels.JOB.log \ + intersect-int-vectors --mapping-in=$music_dir/speech_music_map --length-tolerance=2 \ + "scp:utils/filter_scp.pl ${corrupted_data_dir}/split${nj}utt/JOB/utt2spk ${corrupted_data_dir}/speech_labels.scp |" \ + "scp:utils/filter_scp.pl ${corrupted_data_dir}/split${nj}utt/JOB/utt2spk ${corrupted_data_dir}/music_labels.scp |" \ + ark,scp:$label_dir/speech_music_labels_${corrupted_data_id}.JOB.ark,$label_dir/speech_music_labels_${corrupted_data_id}.JOB.scp + + for n in `seq $nj`; do + cat $label_dir/speech_music_labels_${corrupted_data_id}.$n.scp + done > $corrupted_data_dir/speech_music_labels.scp +fi + +exit 0 diff --git a/egs/aspire/s5/local/segmentation/do_corruption_data_dir_snr.sh b/egs/aspire/s5/local/segmentation/do_corruption_data_dir_snr.sh new file mode 100755 index 00000000000..855cfeef580 --- /dev/null +++ b/egs/aspire/s5/local/segmentation/do_corruption_data_dir_snr.sh @@ -0,0 +1,258 @@ +#! /bin/bash + +# Copyright 2016 Vimal Manohar +# Apache 2.0 + +# This script adds reverberation and noise to speech waveforms and +# creates speech_labels.scp for speech activity detection, along with +# irm_targets.scp for auxiliary subband-level targets for training neural network. + +set -e +set -u +set -o pipefail + +. path.sh + +# The following are the main parameters to modify. +data_dir=data/train_si284 # Expecting whole data directory. +vad_dir= # Output of prepare_unsad_data.sh containing speech_labels.scp, + # deriv_weights.scp and deriv_weights_manual_seg.scp. + # The archives are all to be indexed by the utterance-id of the input data. 
+ # If provided, these archives will be copied into the output data directory. + +num_data_reps=5 # Number of corrupted versions +foreground_snrs="20:10:15:5:0:-5" +background_snrs="20:10:15:5:2:0:-2:-5" + +stage=0 + +# Parallel options +nj=4 +cmd=run.pl + +# Options for feature extraction +mfcc_config=conf/mfcc_hires_bp.conf # Band-passed config for telephone speech +feat_suffix=hires_bp + +# Data options +corrupt_only=false # If true, exits after creating the corrupted directory +speed_perturb=true # Do speed perturbation by randomly perturbing the + # recordings at speeds specified by the --speeds + # option. +speeds="0.9 1.0 1.1" +resample_data_dir=false # If true, the input data is resampled at the + # sampling-rate specified in the mfcc-config. + # Usually applicable when the input data is 8kHz + # and needs to be upsampled to 16kHz. + +targets_dir=irm_targets # Directory to dump irm_targets + +. utils/parse_options.sh + +if [ $# -ne 0 ]; then + echo "Usage: $0" + exit 1 +fi + +data_id=`basename ${data_dir}` + +if [ ! -d RIRS_NOISES/ ]; then + # Prepare MUSAN rirs and noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip +fi + +rvb_opts=() +# This is the config for the system using simulated RIRs and point-source noises +rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list") +rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list") +rvb_opts+=(--noise-set-parameters "0.1, RIRS_NOISES/pointsource_noises/background_noise_list") +rvb_opts+=(--noise-set-parameters "0.9, RIRS_NOISES/pointsource_noises/foreground_noise_list") + +for f in RIRS_NOISES/simulated_rirs/smallroom/rir_list \ + RIRS_NOISES/simulated_rirs/mediumroom/rir_list \ + $data_dir/wav.scp; do + [ ! -f $f ] && echo "$0: Could not find $f" && exit 1 +done + +if $resample_data_dir; then + # Resample input data directory at a different sampling rate. 
+ # It is assumed that the noise and impulse responses are at 8kHz. + sample_frequency=`cat $mfcc_config | perl -ne 'if (m/--sample-frequency=(\S+)/) { print $1; }'` + if [ -z "$sample_frequency" ]; then + sample_frequency=16000 + fi + + utils/data/resample_data_dir.sh $sample_frequency ${data_dir} || exit 1 + data_id=`basename ${data_dir}` + rvb_opts+=(--source-sampling-rate=$sample_frequency) +fi + +corrupted_data_id=${data_id}_corrupted +clean_data_id=${data_id}_clean +noise_data_id=${data_id}_noise + +if [ $stage -le 1 ]; then + python steps/data/reverberate_data_dir.py \ + "${rvb_opts[@]}" \ + --prefix="rev" \ + --foreground-snrs=$foreground_snrs \ + --background-snrs=$background_snrs \ + --speech-rvb-probability=1 \ + --pointsource-noise-addition-probability=1 \ + --isotropic-noise-addition-probability=1 \ + --num-replications=$num_data_reps \ + --max-noises-per-minute=2 \ + --output-additive-noise-dir=data/${noise_data_id} \ + --output-reverb-dir=data/${clean_data_id} \ + data/${data_id} data/${corrupted_data_id} +fi + +corrupted_data_dir=data/${corrupted_data_id} +clean_data_dir=data/${clean_data_id} +noise_data_dir=data/${noise_data_id} + +if $speed_perturb; then + if [ $stage -le 2 ]; then + ## Assuming whole data directories + for x in $corrupted_data_dir $clean_data_dir $noise_data_dir; do + cp $x/reco2dur $x/utt2dur + utils/data/perturb_data_dir_speed_random.sh --speeds "$speeds" $x ${x}_spr + done + fi + + corrupted_data_dir=${corrupted_data_dir}_spr + clean_data_dir=${clean_data_dir}_spr + noise_data_dir=${noise_data_dir}_spr + corrupted_data_id=${corrupted_data_id}_spr + clean_data_id=${clean_data_id}_spr + noise_data_id=${noise_data_id}_spr + + if [ $stage -le 3 ]; then + utils/data/perturb_data_dir_volume.sh --scale-low 0.03125 --scale-high 2 \ + ${corrupted_data_dir} + utils/data/perturb_data_dir_volume.sh --reco2vol ${corrupted_data_dir}/reco2vol ${clean_data_dir} + utils/data/perturb_data_dir_volume.sh --reco2vol ${corrupted_data_dir}/reco2vol 
${noise_data_dir} + fi +fi + +if $corrupt_only; then + echo "$0: Got corrupted data directory in ${corrupted_data_dir}" + exit 0 +fi + +mfccdir=`basename $mfcc_config` +mfccdir=${mfccdir%%.conf} + +if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage +fi + +if [ $stage -le 4 ]; then + utils/copy_data_dir.sh $corrupted_data_dir ${corrupted_data_dir}_$feat_suffix + corrupted_data_dir=${corrupted_data_dir}_$feat_suffix + steps/make_mfcc.sh --mfcc-config $mfcc_config \ + --cmd "$cmd" --nj $nj --write-utt2num-frames true \ + $corrupted_data_dir exp/make_${feat_suffix}/${corrupted_data_id} $mfccdir + steps/compute_cmvn_stats.sh --fake \ + $corrupted_data_dir exp/make_${feat_suffix}/${corrupted_data_id} $mfccdir +else + corrupted_data_dir=${corrupted_data_dir}_$feat_suffix +fi + +if [ $stage -le 5 ]; then + utils/copy_data_dir.sh $clean_data_dir ${clean_data_dir}_$feat_suffix + clean_data_dir=${clean_data_dir}_$feat_suffix + steps/make_mfcc.sh --mfcc-config $mfcc_config \ + --cmd "$cmd" --nj $nj \ + $clean_data_dir exp/make_${feat_suffix}/${clean_data_id} $mfccdir + steps/compute_cmvn_stats.sh --fake \ + $clean_data_dir exp/make_${feat_suffix}/${clean_data_id} $mfccdir +else + clean_data_dir=${clean_data_dir}_$feat_suffix +fi + +if [ $stage -le 6 ]; then + utils/copy_data_dir.sh $noise_data_dir ${noise_data_dir}_$feat_suffix + noise_data_dir=${noise_data_dir}_$feat_suffix + steps/make_mfcc.sh --mfcc-config $mfcc_config \ + --cmd "$cmd" --nj $nj \ + $noise_data_dir exp/make_${feat_suffix}/${noise_data_id} $mfccdir + steps/compute_cmvn_stats.sh --fake \ + $noise_data_dir exp/make_${feat_suffix}/${noise_data_id} $mfccdir +else + noise_data_dir=${noise_data_dir}_$feat_suffix +fi + +if [ $stage -le 7 ]; then + mkdir -p exp/make_log_snr/${corrupted_data_id} + + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $targets_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5/$targets_dir/storage $targets_dir/storage + fi + + idct_params=`cat $mfcc_config | perl -e ' + $num_mel_bins = 23; $num_ceps = 13; $cepstral_lifter = 22.0; + while (<>) { + chomp; + s/#.+//g; + if (m/^\s*$/) { next; } + if (m/--num-mel-bins=(\S+)/) { + $num_mel_bins = $1; + } elsif (m/--num-ceps=(\S+)/) { + $num_ceps = $1; + } elsif (m/--cepstral-lifter=(\S+)/) { + $cepstral_lifter = $1; + } + } + print "$num_mel_bins $num_ceps $cepstral_lifter";'` + + num_filters=`echo $idct_params | awk '{print $1}'` + num_ceps=`echo $idct_params | awk '{print $2}'` + cepstral_lifter=`echo $idct_params | awk '{print $3}'` + echo "$num_filters $num_ceps $cepstral_lifter" + + mkdir -p exp/make_irm_targets/$corrupted_data_id + utils/data/get_dct_matrix.py --get-idct-matrix=true \ + --num-filters=$num_filters --num-ceps=$num_ceps \ + --cepstral-lifter=$cepstral_lifter \ + exp/make_irm_targets/$corrupted_data_id/idct_matrix + + # Get log-IRM targets + steps/segmentation/make_snr_targets.sh \ + --nj $nj --cmd "$cmd" \ + --target-type Irm --compress false \ + --transform-matrix exp/make_irm_targets/$corrupted_data_id/idct_matrix \ + ${clean_data_dir} ${noise_data_dir} ${corrupted_data_dir} \ + exp/make_irm_targets/${corrupted_data_id} $targets_dir +fi + +if [ $stage -le 8 ]; then + if [ ! -z "$vad_dir" ]; then + if [ ! 
-f $vad_dir/speech_labels.scp ]; then + echo "$0: Could not find file $vad_dir/speech_labels.scp" + exit 1 + fi + + # Get speech labels for corrupted data + cat $vad_dir/speech_labels.scp | \ + steps/segmentation/get_reverb_scp.pl -f 1 $num_data_reps | \ + sort -k1,1 > ${corrupted_data_dir}/speech_labels.scp + + if [ -f $vad_dir/deriv_weights.scp ]; then + cat $vad_dir/deriv_weights.scp | \ + steps/segmentation/get_reverb_scp.pl -f 1 $num_data_reps | \ + sort -k1,1 > ${corrupted_data_dir}/deriv_weights.scp + fi + + if [ -f $vad_dir/deriv_weights_manual_seg.scp ]; then + cat $vad_dir/deriv_weights_manual_seg.scp | \ + steps/segmentation/get_reverb_scp.pl -f 1 $num_data_reps | \ + sort -k1,1 > ${corrupted_data_dir}/deriv_weights_for_irm_targets.scp + fi + fi +fi + +exit 0 diff --git a/egs/aspire/s5/local/segmentation/make_musan_music.py b/egs/aspire/s5/local/segmentation/make_musan_music.py new file mode 100755 index 00000000000..5c4ccf3fd58 --- /dev/null +++ b/egs/aspire/s5/local/segmentation/make_musan_music.py @@ -0,0 +1,81 @@ +#! /usr/bin/env python + +# Copyright 2017 Vimal Manohar +# Apache 2.0. 
+ +"""This script prepares MUSAN music corpus for perturbing data directory.""" + +from __future__ import print_function +import argparse +import os + + +def _get_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--use-vocals", type=str, default="false", + choices=["true", "false"], + help="If true, also add music with vocals in the " + "output music-set-parameters") + parser.add_argument("root_dir", type=str, + help="Root directory of MUSAN corpus") + parser.add_argument("music_list", type=argparse.FileType('w'), + help="Convert music list into noise-set-paramters " + "for steps/data/reverberate_data_dir.py") + + args = parser.parse_args() + + args.use_vocals = True if args.use_vocals == "true" else False + return args + + +def read_vocals(annotations): + vocals = {} + for line in open(annotations): + parts = line.strip().split() + if parts[2] == "Y": + vocals[parts[0]] = True + return vocals + + +def write_music(utt, file_path, music_list): + """Write music file to list""" + print ('{utt} {file_path}'.format( + utt=utt, file_path=file_path), file=music_list) + + +def prepare_music_set(root_dir, use_vocals, music_list): + """The main function that goes through the music part of the MUSAN corpus + and writes out the files to a table indexed by the recording-id.""" + vocals = {} + music_dir = os.path.join(root_dir, "music") + num_done = 0 + for root, dirs, files in os.walk(music_dir): + if os.path.exists(os.path.join(root, "ANNOTATIONS")): + vocals = read_vocals(os.path.join(root, "ANNOTATIONS")) + + for f in files: + file_path = os.path.join(root, f) + if f.endswith(".wav"): + utt = str(f).replace(".wav", "") + if not use_vocals and utt in vocals: + continue + num_done += 1 + write_music(utt, file_path, music_list) + if num_done == 0: + raise RuntimeError("Failed to get any music files") + music_list.close() + + +def main(): + args = _get_args() + + try: + prepare_music_set(args.root_dir, args.use_vocals, + args.music_list) + finally: + 
args.music_list.close() + + +if __name__ == '__main__': + main() diff --git a/egs/aspire/s5/local/segmentation/prepare_babel_data.sh b/egs/aspire/s5/local/segmentation/prepare_babel_data.sh new file mode 100755 index 00000000000..44a80974fc6 --- /dev/null +++ b/egs/aspire/s5/local/segmentation/prepare_babel_data.sh @@ -0,0 +1,115 @@ +#! /bin/bash + +# Copyright 2016 Vimal Manohar +# Apache 2.0. + +# This script prepares Babel data for training speech activity detection, +# music detection. + +. path.sh +. cmd.sh + +set -e +set -o pipefail +set -u + +lang_id=assamese # An arbitrary name added as a suffix to the created + # directories. +subset= # Number of recordings to keep before speed perturbation and corruption. + # In limitedLP, this is about 120. + +# The path below can be modified to any absolute path containing the +# Babel system. +ROOT_DIR=/home/vimal/workspace_waveform/egs/babel/s5c_assamese/ + +stage=-10 +prepare_stage=-10 + +. utils/parse_options.sh + +if [ $# -ne 0 ]; then + echo "Usage: $0" + echo "This script is to serve as an example recipe." + echo "Edit the script to change variables if needed." + exit 1 +fi + +dir=exp/unsad/make_unsad_babel_${lang_id}_train # Work dir + +model_dir=$ROOT_DIR/exp/tri4 # Model directory used for decoding +sat_model_dir=$ROOT_DIR/exp/tri5 # Model directory used for getting alignments +lang=$ROOT_DIR/data/lang # Language directory +lang_test=$ROOT_DIR/data/lang # Language directory used to build graph + +mkdir -p $dir + +# Hard code the mapping from phones to SAD labels +# 0 for silence, 1 for speech, 2 for noise, 3 for unk +cat < $dir/babel_sad.map + 3 +_B 3 +_E 3 +_I 3 +_S 3 + 2 +_B 2 +_E 2 +_I 2 +_S 2 + 2 +_B 2 +_E 2 +_I 2 +_S 2 +SIL 0 +SIL_B 0 +SIL_E 0 +SIL_I 0 +SIL_S 0 +EOF + +utils/copy_data_dir.sh $ROOT_DIR/data/train data/babel_${lang_id}_train +train_data_dir=data/babel_${lang_id}_train + +if [ $stage -le 0 ]; then + # The original data directory which will be converted to a whole (recording-level) directory. 
+ # Expecting the user to have done run-1-main.sh to have $model_dir, + # $sat_model_dir, $lang, $lang_test, $train_data_dir. + # The default config using plp + pitch is assumed. + local/segmentation/prepare_unsad_data.sh \ + --sad-map $dir/babel_sad.map --stage $prepare_stage \ + --config-dir $ROOT_DIR/conf --feat-type plp --add-pitch true \ + --reco-nj 40 --nj 100 --cmd "$train_cmd" \ + --sat-model-dir $sat_model_dir \ + --lang-test $lang_test \ + $train_data_dir $lang $model_dir $dir +fi + +data_dir=${train_data_dir}_whole + +if [ ! -z $subset ]; then + # Work on a subset of recordings. + utils/subset_data_dir.sh --speakers ${data_dir} $subset \ + ${data_dir}_$subset + data_dir=${data_dir}_$subset +fi + +reco_vad_dir=$dir/`basename $model_dir`_reco_vad_`basename $train_data_dir`_sp + +if [ $stage -le 1 ]; then + # Add noise from MUSAN corpus to data directory and create a new data directory + local/segmentation/do_corruption_data_dir_snr.sh \ + --cmd "$train_cmd" --nj 40 --stage $prepare_stage \ + --data-dir $data_dir \ + --vad-dir $reco_vad_dir \ + --feat-suffix hires_bp --mfcc-config conf/mfcc_hires_bp.conf +fi + +if [ $stage -le 2 ]; then + # Add music from MUSAN corpus to data directory and create a new data directory + local/segmentation/do_corruption_data_dir_music.sh \ + --cmd "$train_cmd" --nj 40 --stage $prepare_stage \ + --data-dir $data_dir \ + --vad-dir $reco_vad_dir \ + --feat-suffix hires_bp --mfcc-config conf/mfcc_hires_bp.conf +fi diff --git a/egs/aspire/s5/local/segmentation/prepare_babel_data_flp.sh b/egs/aspire/s5/local/segmentation/prepare_babel_data_flp.sh new file mode 100644 index 00000000000..2bce05d9f9e --- /dev/null +++ b/egs/aspire/s5/local/segmentation/prepare_babel_data_flp.sh @@ -0,0 +1,116 @@ +#! /bin/bash + +# Copyright 2016 Vimal Manohar +# Apache 2.0. + +# This script prepares Babel data for training speech activity detection, +# music detection. +# This script is similar to prepare_babel_data.sh, but uses FullLP systems. + +. 
path.sh +. cmd.sh + +set -e +set -o pipefail +set -u + +lang_id=cantonese_flp # An arbitrary name added as a suffix to the created + # directories. +subset= # Number of recordings to keep before speed perturbation and corruption. + # In limitedLP, this is about 120. So subset, if specified, must be lower that that. + +# All the paths below can be modified to any absolute path. +ROOT_DIR=/export/b17/jtrmal/babel/101-cantonese-flp-p-basic + +stage=-10 +prepare_stage=-10 + +. utils/parse_options.sh + +if [ $# -ne 0 ]; then + echo "Usage: $0" + echo "This script is to serve as an example recipe." + echo "Edit the script to change variables if needed." + exit 1 +fi + +# The path below can be modified to any absolute path containing the +# Babel system. +dir=exp/unsad/make_unsad_babel_${lang_id}_train # Work dir + +model_dir=$ROOT_DIR/exp/tri4 # Model directory used for decoding +sat_model_dir=$ROOT_DIR/exp/tri5 # Model directory used for getting alignments +lang=$ROOT_DIR/data/lang # Language directory +lang_test=$ROOT_DIR/data/lang # Language directory used to build graph + +mkdir -p $dir + +# Hard code the mapping from phones to SAD labels +# 0 for silence, 1 for speech, 2 for noise, 3 for unk +cat < $dir/babel_sad.map + 3 +_B 3 +_E 3 +_I 3 +_S 3 + 2 +_B 2 +_E 2 +_I 2 +_S 2 + 2 +_B 2 +_E 2 +_I 2 +_S 2 +SIL 0 +SIL_B 0 +SIL_E 0 +SIL_I 0 +SIL_S 0 +EOF + +utils/copy_data_dir.sh $ROOT_DIR/data/train data/babel_${lang_id}_train +train_data_dir=data/babel_${lang_id}_train + +if [ $stage -le 0 ]; then + # The original data directory which will be converted to a whole (recording-level) directory. 
+ # Expecting the user to have done run.sh to have $model_dir, + # $sat_model_dir, $lang, $lang_test, $train_data_dir + local/segmentation/prepare_unsad_data.sh \ + --sad-map $dir/babel_sad.map --stage $prepare_stage \ + --config-dir $ROOT_DIR/conf --feat-type plp --add-pitch true \ + --reco-nj 40 --nj 100 --cmd "$train_cmd" \ + --sat-model-dir $sat_model_dir \ + --lang-test $lang_test \ + $train_data_dir $lang $model_dir $dir +fi + +data_dir=${train_data_dir}_whole + +if [ ! -z $subset ]; then + # Work on a subset + utils/subset_data_dir.sh --speakers ${data_dir} $subset \ + ${data_dir}_$subset + data_dir=${data_dir}_$subset +fi + +reco_vad_dir=$dir/`basename $model_dir`_reco_vad_`basename $train_data_dir`_sp + +if [ $stage -le 1 ]; then + # Add noise from MUSAN corpus to data directory and create a new data directory + local/segmentation/do_corruption_data_dir_snr.sh \ + --cmd "$train_cmd" --nj 40 --stage $prepare_stage \ + --data-dir $data_dir \ + --vad-dir $reco_vad_dir \ + --feat-suffix hires_bp --mfcc-config conf/mfcc_hires_bp.conf +fi + +if [ $stage -le 2 ]; then + # Add music from MUSAN corpus to data directory and create a new data directory + local/segmentation/do_corruption_data_dir_music.sh \ + --cmd "$train_cmd" --nj 40 --stage $prepare_stage \ + --data-dir $data_dir \ + --vad-dir $reco_vad_dir \ + --feat-suffix hires_bp --mfcc-config conf/mfcc_hires_bp.conf +fi diff --git a/egs/aspire/s5/local/segmentation/prepare_babel_data_simple.sh b/egs/aspire/s5/local/segmentation/prepare_babel_data_simple.sh new file mode 100755 index 00000000000..1d615d42eb0 --- /dev/null +++ b/egs/aspire/s5/local/segmentation/prepare_babel_data_simple.sh @@ -0,0 +1,142 @@ +#! /bin/bash + +# Copyright 2016 Vimal Manohar +# Apache 2.0. + +# This script prepares Babel data for training speech activity detection, +# music detection. 
+# This script is similar to prepare_babel_data.sh, but is a simpler version +# that create perturbed data at utterance-level using only manual segment +# regions. + +. path.sh +. cmd.sh + +set -e +set -o pipefail +set -u + +lang_id=cantonese_flp_simple # An arbitrary name added as a suffix to the created + # directories. +subset_fraction=0.1 # Fraction of utterances to keep before perturbation and + # corruption. +realign=false # If true, the speed-perturbed data is realigned using + # the SAT model. Otherwise, the existing alignment is + # warped to the required speed. + +# The path below can be modified to any absolute path containing the +# Babel system. +ROOT_DIR=/export/b17/jtrmal/babel/101-cantonese-flp-p-basic + +stage=-10 +prepare_stage=-10 + +. utils/parse_options.sh + +if [ $# -ne 0 ]; then + echo "Usage: $0" + echo "This script is to serve as an example recipe." + echo "Edit the script to change variables if needed." + exit 1 +fi + +dir=exp/unsad_simple/make_unsad_babel_${lang_id}_train_cleaned_pitch_sp # Work dir + +train_data_dir=$ROOT_DIR/data/train_cleaned_pitch_sp # Input data +unperturbed_data_dir=$ROOT_DIR/data/train_cleaned_pitch # Unperturbed data directory +model_dir=$ROOT_DIR/exp/tri5_cleaned # Model directory # SAT model used for getting alignments, if --realign is true +lang=$ROOT_DIR/data/lang # Language directory + +mkdir -p $dir + +# Hard code the mapping from phones to SAD labels +# 0 for silence, 1 for speech, 2 for noise, 3 for unk +cat < $dir/babel_sad.map + 3 +_B 3 +_E 3 +_I 3 +_S 3 + 2 +_B 2 +_E 2 +_I 2 +_S 2 + 2 +_B 2 +_E 2 +_I 2 +_S 2 +SIL 0 +SIL_B 0 +SIL_E 0 +SIL_I 0 +SIL_S 0 +EOF + +utils/copy_data_dir.sh $train_data_dir data/babel_${lang_id}_train_sp +train_data_dir=data/babel_${lang_id}_train_sp + +utils/copy_data_dir.sh $unperturbed_data_dir data/babel_${lang_id}_train +unperturbed_data_dir=data/babel_${lang_id}_train + +if $realign; then + ali_dir=$dir/`basename $model_dir`_ali_$(basename $train_data_dir) + + if [ $stage 
-le 0 ]; then + steps/align_fmllr.sh --nj 32 --cmd "$train_cmd" \ + $train_data_dir $lang $model_dir $ali_dir + fi + + if [ $stage -le 1 ]; then + # Expecting the user to have done run-1-main.sh to have $model_dir, + # $sat_model_dir, $lang, $lang_test, $train_data_dir. + local/segmentation/prepare_unsad_data_simple.sh \ + --sad-map $dir/babel_sad.map --cmd "$train_cmd" \ + $train_data_dir $lang $ali_dir $dir + fi + + vad_dir=$dir/`basename $ali_dir`_vad_$(basename $train_data_dir) +else + if [ $stage -le 1 ]; then + # Expecting the user to have done run-1-main.sh to have $model_dir, + # $sat_model_dir, $lang, $lang_test, $train_data_dir. + local/segmentation/prepare_unsad_data_simple.sh --speed-perturb true \ + --sad-map $dir/babel_sad.map --cmd "$train_cmd" \ + $unperturbed_data_dir $lang $model_dir $dir + fi + + vad_dir=$dir/`basename $model_dir`_vad_$(basename $unperturbed_data_dir) +fi + +data_dir=${unperturbed_data_dir} + +if [ ! -z "$subset_fraction" ]; then + # Work on a subset of utterances + num_utts=`cat $unperturbed_data_dir/utt2spk | wc -l` + subset=`python -c "n=int($num_utts * $subset_fraction / 1000.0) * 1000; print (n if n > 4000 else 4000)"` + subset_affix=`echo $subset | perl -pe 's/000/k/g'` + utils/subset_data_dir.sh --speakers ${unperturbed_data_dir} $subset \ + ${unperturbed_data_dir}_${subset_affix} + data_dir=${unperturbed_data_dir}_${subset_affix} +fi + +if [ $stage -le 2 ]; then + # Add noise from MUSAN corpus to data directory and create a new data directory + local/segmentation/do_corruption_data_dir_snr.sh \ + --cmd "$train_cmd" --nj 40 --stage $prepare_stage \ + --data-dir $data_dir \ + --vad-dir $vad_dir \ + --feat-suffix hires_bp --mfcc-config conf/mfcc_hires_bp.conf +fi + +if [ $stage -le 3 ]; then + # Add music from MUSAN corpus to data directory and create a new data directory + local/segmentation/do_corruption_data_dir_music.sh \ + --cmd "$train_cmd" --nj 40 --stage $prepare_stage \ + --data-dir $data_dir \ + --vad-dir $vad_dir 
\ + --feat-suffix hires_bp --mfcc-config conf/mfcc_hires_bp.conf +fi + +utils/fix_data_dir.sh --utt-extra-files "irm_targets.scp speech_labels.scp music_labels.scp speech_music_labels.scp deriv_weights.scp deriv_weights_manual_seg.scp deriv_weights_for_irm_targets.scp" ${data_dir}_corrupted_spr_hires_bp diff --git a/egs/aspire/s5/local/segmentation/prepare_fisher_data.sh b/egs/aspire/s5/local/segmentation/prepare_fisher_data.sh new file mode 100644 index 00000000000..8a6352742b0 --- /dev/null +++ b/egs/aspire/s5/local/segmentation/prepare_fisher_data.sh @@ -0,0 +1,112 @@ +#! /bin/bash + +# This script prepares Fisher data for training a speech activity detection +# and music detection system + +# Copyright 2016 Vimal Manohar +# Apache 2.0. + +. path.sh +. cmd.sh + +set -e -o pipefail + +if [ $# -ne 0 ]; then + echo "Usage: $0" + echo "This script is to serve as an example recipe." + echo "Edit the script to change variables if needed." + exit 1 +fi + +dir=exp/unsad/make_unsad_fisher_train_100k # Work dir +subset=900 # Number of recordings to keep before speed perturbation and corruption. + +stage=-10 +prepare_stage=-10 + +# All the paths below can be modified to any absolute path. + +# The original data directory which will be converted to a whole (recording-level) directory. +train_data_dir=data/fisher_train_100k + +model_dir=exp/tri3a # Model directory used for decoding +sat_model_dir=exp/tri4a # Model directory used for getting alignments +lang=data/lang # Language directory +lang_test=data/lang_test # Language directory used to build graph + +# Hard code the mapping from phones to SAD labels +# 0 for silence, 1 for speech, 2 for noise, 3 for unk +cat < $dir/fisher_sad.map +sil 0 +sil_B 0 +sil_E 0 +sil_I 0 +sil_S 0 +laughter 2 +laughter_B 2 +laughter_E 2 +laughter_I 2 +laughter_S 2 +noise 2 +noise_B 2 +noise_E 2 +noise_I 2 +noise_S 2 +oov 3 +oov_B 3 +oov_E 3 +oov_I 3 +oov_S 3 +EOF + +if [ ! 
-d RIRS_NOISES/ ]; then + # Prepare MUSAN rirs and noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip +fi + +if [ ! -d RIRS_NOISES/music ]; then + # Prepare MUSAN music + local/segmentation/prepare_musan_music.sh /export/corpora/JHU/musan RIRS_NOISES/music +fi + +if [ $stage -le 0 ]; then + # Expecting the user to have done run.sh to have $model_dir, + # $sat_model_dir, $lang, $lang_test, $train_data_dir + local/segmentation/prepare_unsad_data.sh \ + --sad-map $dir/fisher_sad.map --stage $prepare_stage \ + --config-dir conf \ + --reco-nj 40 --nj 100 --cmd "$train_cmd" \ + --sat-model-dir $sat_model_dir \ + --lang-test $lang_test \ + $train_data_dir $lang $model_dir $dir +fi + +data_dir=${train_data_dir}_whole + +if [ ! -z $subset ]; then + # Work on a subset + false && utils/subset_data_dir.sh ${data_dir} $subset \ + ${data_dir}_$subset + data_dir=${data_dir}_$subset +fi + +reco_vad_dir=$dir/`basename $model_dir`_reco_vad_`basename $train_data_dir`_sp + +if [ $stage -le 1 ]; then + # Add noise from MUSAN corpus to data directory and create a new data directory + local/segmentation/do_corruption_data_dir_snr.sh \ + --cmd "$train_cmd" --nj 40 --stage $prepare_stage \ + --data-dir $data_dir \ + --vad-dir $reco_vad_dir \ + --feat-suffix hires_bp --mfcc-config conf/mfcc_hires_bp.conf +fi + +if [ $stage -le 2 ]; then + # Add music from MUSAN corpus to data directory and create a new data directory + local/segmentation/do_corruption_data_dir_music.sh \ + --cmd "$train_cmd" --nj 40 --stage $prepare_stage \ + --data-dir $data_dir \ + --vad-dir $reco_vad_dir \ + --feat-suffix hires_bp --mfcc-config conf/mfcc_hires_bp.conf +fi diff --git a/egs/aspire/s5/local/segmentation/prepare_fisher_data_simple.sh b/egs/aspire/s5/local/segmentation/prepare_fisher_data_simple.sh new file mode 100755 index 00000000000..d48824b1a5c --- /dev/null +++ b/egs/aspire/s5/local/segmentation/prepare_fisher_data_simple.sh @@ -0,0 +1,150 @@ 
+#! /bin/bash + +# This script prepares Fisher data for training a speech activity detection +# and music detection system +# This script is similar to prepare_fisher_data.sh, but is a simpler version +# that create perturbed data at utterance-level using only manual segment +# regions. + +# Copyright 2016 Vimal Manohar +# Apache 2.0. + +. path.sh +. cmd.sh + +set -e -o pipefail + +if [ $# -ne 0 ]; then + echo "Usage: $0" + echo "This script is to serve as an example recipe." + echo "Edit the script to change variables if needed." + exit 1 +fi + +subset_fraction=0.15 # Fraction of utterances to keep before perturbation and + # corruption. +realign=false # If true, the speed-perturbed data is realigned using + # the SAT model. Otherwise, the existing alignment is + # warped to the required speed. + +# All the paths below can be modified to any absolute path. +ROOT_DIR=/export/a15/vmanoha1/workspace_snr/egs/aspire/s5 + +stage=-10 +prepare_stage=-10 + +. utils/parse_options.sh + +if [ $# -ne 0 ]; then + echo "Usage: $0" + echo "This script is to serve as an example recipe." + echo "Edit the script to change variables if needed." + exit 1 +fi + +dir=exp/unsad_simple/make_unsad_fisher_train_100k_sp # Work dir +train_data_dir=$ROOT_DIR/data/train_100k_sp +unperturbed_data_dir=$ROOT_DIR/data/train_100k +model_dir=$ROOT_DIR/exp/tri4a +lang=$ROOT_DIR/data/lang # Language directory + +mkdir -p $dir + +# Hard code the mapping from phones to SAD labels +# 0 for silence, 1 for speech, 2 for noise, 3 for unk +cat < $dir/fisher_sad.map +sil 0 +sil_B 0 +sil_E 0 +sil_I 0 +sil_S 0 +laughter 2 +laughter_B 2 +laughter_E 2 +laughter_I 2 +laughter_S 2 +noise 2 +noise_B 2 +noise_E 2 +noise_I 2 +noise_S 2 +oov 3 +oov_B 3 +oov_E 3 +oov_I 3 +oov_S 3 +EOF + +if [ ! -d RIRS_NOISES/ ]; then + # Prepare MUSAN rirs and noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip +fi + +if [ ! 
-d RIRS_NOISES/music ]; then + # Prepare MUSAN music + local/segmentation/prepare_musan_music.sh /export/corpora/JHU/musan RIRS_NOISES/music +fi + +utils/copy_data_dir.sh $train_data_dir data/fisher_train_100k_simple_sp +train_data_dir=data/fisher_train_100k_simple_sp + +utils/copy_data_dir.sh $unperturbed_data_dir data/fisher_train_100k_simple +unperturbed_data_dir=data/fisher_train_100k_simple + +# Expecting the user to have done run.sh to have $model_dir, +# $sat_model_dir, $lang, $lang_test, $train_data_dir +if $realign; then + ali_dir=$dir/`basename $model_dir`_ali_$(basename $train_data_dir) + + if [ $stage -le 0 ]; then + steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \ + $train_data_dir $lang $model_dir $ali_dir + fi + + if [ $stage -le 1 ]; then + local/segmentation/prepare_unsad_data_simple.sh \ + --sad-map $dir/fisher_sad.map --cmd "$train_cmd" \ + $train_data_dir $lang $ali_dir $dir + fi + + vad_dir=$dir/`basename $ali_dir`_vad_$(basename $train_data_dir) +else + if [ $stage -le 1 ]; then + local/segmentation/prepare_unsad_data_simple.sh --speed-perturb true \ + --sad-map $dir/fisher_sad.map --cmd "$train_cmd" \ + $unperturbed_data_dir $lang $model_dir $dir + fi + + vad_dir=$dir/`basename $model_dir`_vad_$(basename $unperturbed_data_dir) +fi + +data_dir=${unperturbed_data_dir} + +if [ ! 
-z "$subset_fraction" ]; then + # Work on a subset + num_utts=`cat $unperturbed_data_dir/utt2spk | wc -l` + subset=`python -c "n=int($num_utts * $subset_fraction / 1000.0) * 1000; print (n if n > 4000 else 4000)"` + subset_affix=`echo $subset | perl -pe 's/000/k/g'` + utils/subset_data_dir.sh --speakers ${unperturbed_data_dir} $subset \ + ${unperturbed_data_dir}_${subset_affix} + data_dir=${unperturbed_data_dir}_${subset_affix} +fi + +if [ $stage -le 2 ]; then + # Add noise from MUSAN corpus to data directory and create a new data directory + local/segmentation/do_corruption_data_dir_snr.sh \ + --cmd "$train_cmd" --nj 40 --stage $prepare_stage \ + --data-dir $data_dir \ + --vad-dir $vad_dir \ + --feat-suffix hires_bp --mfcc-config conf/mfcc_hires_bp.conf +fi + +if [ $stage -le 3 ]; then + # Add music from MUSAN corpus to data directory and create a new data directory + local/segmentation/do_corruption_data_dir_music.sh \ + --cmd "$train_cmd" --nj 40 --stage $prepare_stage \ + --data-dir $data_dir \ + --vad-dir $vad_dir \ + --feat-suffix hires_bp --mfcc-config conf/mfcc_hires_bp.conf +fi diff --git a/egs/aspire/s5/local/segmentation/prepare_musan_music.sh b/egs/aspire/s5/local/segmentation/prepare_musan_music.sh new file mode 100755 index 00000000000..47c83ca29fd --- /dev/null +++ b/egs/aspire/s5/local/segmentation/prepare_musan_music.sh @@ -0,0 +1,26 @@ +#! /bin/bash + +# Copyright 2016 Vimal Manohar +# Apache 2.0 + +# This script prepares MUSAN music corpus for data perturbation. 
+ +if [ $# -ne 2 ]; then + echo "Usage: $0 " + echo " e.g.: $0 /export/corpora/JHU/musan RIRS_NOISES/music" + exit 1 +fi + +SRC_DIR=$1 +dir=$2 + +mkdir -p $dir + +local/segmentation/make_musan_music.py $SRC_DIR $dir/wav.scp + +wav-to-duration scp:$dir/wav.scp ark,t:$dir/reco2dur +steps/data/split_wavs_randomly.py $dir/wav.scp $dir/reco2dur \ + $dir/split_utt2dur $dir/split_wav.scp + +awk '{print $1" "int($2*100)}' $dir/split_utt2dur > $dir/split_utt2num_frames +steps/data/wav_scp2noise_list.py $dir/split_wav.scp $dir/music_list diff --git a/egs/aspire/s5/local/segmentation/prepare_unsad_data.sh b/egs/aspire/s5/local/segmentation/prepare_unsad_data.sh new file mode 100755 index 00000000000..7f565f6f3d6 --- /dev/null +++ b/egs/aspire/s5/local/segmentation/prepare_unsad_data.sh @@ -0,0 +1,517 @@ +#!/bin/bash + +# Copyright 2016 Vimal Manohar +# Apache 2.0. + +# This script prepares speech labels and deriv weights for +# training unsad network for speech activity detection and music detection. + +set -u +set -o pipefail +set -e + +. path.sh + +stage=-3 +cmd=queue.pl +reco_nj=40 # Number of jobs to work at recording-level +nj=100 # Number of jobs to work at utterance-level + +# Options to be passed to get_sad_map.py +map_noise_to_sil=true # Map noise phones to silence label (0) +map_unk_to_speech=true # Map unk phones to speech label (1) +sad_map= # Initial mapping from phones to speech/non-speech labels. + # Overrides the default mapping using phones/silence.txt + # and phones/nonsilence.txt + # The format is