diff --git a/egs/wsj/s5/steps/nnet3/fvector/add_noise.sh b/egs/wsj/s5/steps/nnet3/fvector/add_noise.sh new file mode 100755 index 00000000000..205c775e730 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/fvector/add_noise.sh @@ -0,0 +1,201 @@ +#!/bin/bash +# Copyright 2016 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 + +# The script is used to generate the egs which will be used in fvector framework. +# So far, the script achieves the duration files of train dataset and noise +# dataset seperately. Then, with the duration files, it will generate the range +# file which is used to control the process about adding additive noise. +# At the same time, it will generate the mapping between wav and perturbedwav. + +# Begin Configuration section. +stage=0 +cmd=run.pl +nj=8 +# Begain Configuration. +min_additive_noise_len=2.0 # the minimum duration of each noise file in seconds. +num_ranges_per_wav=4 # the number of noise ranges for each wav. +min_snr=-5 # the minimum snr value in dB. +max_snr=-15 # the maximum snr value in dB. +seed=-1 # set the random seed. +variable_len_additive_noise=true #If true, generate the variable-length range files. + #If false, generate the fixed-length range files. +# Begin Configuration of section 6 +# for the details, please see steps/nnet3/fvector/get_egs.sh +frames_per_chunk=200 +frames_per_iter=1000000 +frames_per_iter_diagnostic=1000000 +num_diagnostic_archives=3 +num_heldout_utts=500 +# End Configuration options. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh # source the path. +. 
parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "usage: steps/nnet3/fvector/add_noise.sh " + echo "e.g.: steps/nnet3/fvector/add_noise.sh data/train data/noise data/perturbed exp/fvector_a/egs" + echo "main options (for others, see top of script file)" + echo " --min-additive-noise-len # limit the minimum length of noise" + echo " --num-ranges-per-wav # number of noise range kinds" + echo " --variable-len-additive-noise (true|false) # decide fixed/variable version" + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs" + exit 1 +fi + +data=$1 # contain wav.scp +noise=$2 # contain noise.scp +dir=$3 # eg: data/perturbed +exp=$4 # the directory used to store the egs + +if [ $stage -le 0 ];then + echo "The 1st stage: generating the duration file for each recording" + # remove the segments so that the duration corresponding to recording-id + if [ -f $data/segments ]; then + mv $data/segments $data/segments_backup + if [ -f $data/utt2dur ]; then + mv $data/utt2dur $data/utt2dur.backup + utils/data/get_utt2dur.sh $data + else + utils/data/get_utt2dur.sh $data + fi + mv $data/segments_backup $data/segments + else + if [ ! -f $data/utt2dur ]; then + # get original clean wav's duration + utils/data/get_utt2dur.sh $data + fi + fi + + # remove the segments so that the duration corresponding to recording-id + if [ -f $noise/segments ]; then + mv $noise/segments $noise/segments_backup + if [ -f $noise/utt2dur ]; then + mv $noise/utt2dur $noise/utt2dur.backup + utils/data/get_utt2dur.sh $noise + else + utils/data/get_utt2dur.sh $noise + fi + mv $noise/segments_backup $noise/segments + else + if [ ! 
-f $noise/utt2dur ]; then + # get original clean wav's duration + utils/data/get_utt2dur.sh $noise + fi + fi +fi + +mkdir -p $dir/log +if [ $stage -le 1 ]; then + echo "The 2nd stage: generate $num_kind_rage kinds of noise range for each original wav" + $cmd $dir/log/generate_noise_range.log \ + steps/nnet3/fvector/generate_noise_range.py \ + --num-ranges-per-wav=$num_ranges_per_wav \ + --min-additive-noise-len=$min_additive_noise_len \ + --min-snr=$min_snr \ + --max-snr=$max_snr \ + --variable-len-additive-noise $variable_len_additive_noise \ + --seed=$seed \ + $data/utt2dur $noise/utt2dur $dir/ranges $dir/wav2perturbedwav +fi + +if [ $stage -le 2 ]; then + echo "The 3rd stage: generate perturbed_wav_specifier" + $cmd $dir/log/generate_perturb_wav_specifier.log \ + steps/nnet3/fvector/generate_perturb_wav_specifier.py \ + --noise=$noise/wav.scp \ + $data/wav.scp $dir/ranges $dir/wav2perturbedwav $dir/tmp.wav.scp + cat $dir/tmp.wav.scp | sort > $dir/wav.scp + rm -f $dir/tmp.wav.scp +fi + +if [ $stage -le 3 ]; then + echo "The 4th stage: generate other files in data directory" + #reco2file_and_channel + cat $dir/wav2perturbedwav | cut -d ' ' -f 1 | paste -d ' ' - $dir/wav2perturbedwav > $dir/.perturb_recording_map + steps/nnet3/fvector/apply_map_one2mult.pl -f 1 $dir/.perturb_recording_map <$data/reco2file_and_channel >$dir/reco2file_and_channel + if [ -f $data/segments ]; then + awk -v num=$num_ranges_per_wav '{ + printf("%s %s",$1, $1); + for(i=1; i<= num; i++){ printf(" %s-%s%s", $1, "perturbed", i); } + printf("\n"); + }' <$data/segments > $dir/.perturb_utt_map + cat $dir/.perturb_recording_map > $dir/.perturb_map + cat $dir/.perturb_utt_map >> $dir/.perturb_map + cp $dir/.perturb_utt_map $dir/uniq2utt + #segments + steps/nnet3/fvector/apply_map_one2mult.pl -f 1 $dir/.perturb_map <$data/segments | \ + awk '{ + uttid=$1; start_time=$3; end_time=$4; + split(uttid,S,"[_]"); + if ( S[2] ~/.*-perturb.*$/ ) { + split(S[2],S1,"[-]"); + recordingid=(S[1]"-"S1[3]); + } 
else { + recordingid=S[1]; + } + print uttid " " recordingid " " start_time " " end_time + }' >$dir/segments + #text + steps/nnet3/fvector/apply_map_one2mult.pl -f 1 $dir/.perturb_map <$data/text | sort >$dir/text + #utt2spk + steps/nnet3/fvector/apply_map_one2mult.pl -f 1 $dir/.perturb_map <$data/utt2spk | sort >$dir/utt2spk + #spk2utt + utt2spk_to_spk2utt.pl <$dir/utt2spk | sort > $dir/spk2utt + else #no segments->wav indexed by utterence-id/ is equal to + cp $dir/.perturb_recording_map $dir/.perturb_map + cp $dir/.perturb_map $dir/uniq2utt + #segments + steps/nnet3/fvector/apply_map_one2mult.pl -f 1 $dir/.perturb_map <$data/segments | \ + awk '{ + uttid=$1; start_time=$3; end_time=$4; + split(uttid,S,"[_]"); + if ( S[2] ~/.*-perturb.*$/ ) { + split(S[2],S1,"[-]"); + recordingid=(S[1]"-"S1[3]); + } else { + recordingid=S[1]; + } + print uttid " " recordingid " " start_time " " end_time + }' | sort >$dir/segments + #text + steps/nnet3/fvector/apply_map_one2mult.pl -f 1 $dir/.perturb_map <$data/text | sort >$dir/text + #utt2spk + steps/nnet3/fvector/apply_map_one2mult.pl -f 1 $dir/.perturb_map <$data/utt2spk | sort >$dir/utt2spk + #spk2utt + utt2spk_to_spk2utt.pl <$dir/utt2spk | sort > $dir/spk2utt + fi +fi +#Now, we have already finished generating the perturbed data directory. + +if [ $stage -le 4 ]; then + echo "The 5th stage: make features." + mfccdir=mfcc + dir_basename=$(basename $dir) + steps/make_mfcc.sh --nj 50 --cmd "$cmd" \ + $dir exp/make_mfcc/$dir_basename $mfccdir + steps/compute_cmvn_stats.sh $dir exp/make_mfcc/$dir_basename $mfccdir +fi + +#restore +if [ -f $data/utt2dur.backup ]; then + mv $data/utt2dur.backup $data/utt2dur +fi +if [ -f $noise/utt2dur.backup ]; then + mv $noise/utt2dur.backup $noise/utt2dur +fi + +if [ $stage -le 5 ]; then + echo "The 6th stage: generate egs." 
+ steps/nnet3/fvector/get_egs.sh \ + --frames-per-chunk $frames_per_chunk \ + --frames-per-iter $frames_per_iter \ + --frames-per-iter-diagnostic $frames_per_iter_diagnostic \ + --num-diagnostic-archives $num_diagnostic_archives \ + --num-heldout-utts $num_heldout_utts \ + $dir $exp +fi +exit 0 diff --git a/egs/wsj/s5/steps/nnet3/fvector/allocate_examples.py b/egs/wsj/s5/steps/nnet3/fvector/allocate_examples.py new file mode 100755 index 00000000000..219356e7388 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/fvector/allocate_examples.py @@ -0,0 +1,234 @@ +#!/usr/bin/env python + +# This script, for use when training fvectors, decides for you which examples +# will come from which utterances, and at what point. + +# You call it as (e.g.) +# +# allocate_examples.py --frames-per-chunk=200 --frames-per-iter=1000000 \ +# --num-archives=169 --num-jobs=24 exp/fvector_a/egs/temp/utt2len.train exp/fvector_a/egs +# +# and this program outputs certain things to the temp directory (exp/xvector_a/egs/temp in this case) +# that will enable you to dump the chunks for xvector training. What we'll eventually be doing is invoking +# the following program with something like the following args: +# +# nnet3-fvector-get-egs [options] exp/fvector_a/temp/ranges.1 scp:data/train/feats.scp \ +# ark:exp/fvector_a/egs/egs_temp.1.ark ark:exp/fvector_a/egs/egs_temp.2.ark \ +# ark:exp/fvector_a/egs/egs_temp.3.ark +# +# where exp/fvector_a/temp/ranges.1 contains something like the following: +# +# 0 1 50 200 +# +# where each line is interpreted as follows: +# +# +# Note: is the zero-based offset of the archive-index +# within the subset of archives that a particular ranges file corresponds to; +# and is the 1-based numeric index of the destination +# archive among the entire list of archives, which will form part of the +# archive's filename (e.g. egs/egs..ark); +# is only kept for debug purposes so you can see which +# archive each line corresponds to. 
+# +# The list of archives corresponding to ranges.n will be written to output.n, +# so in exp/fvector_a/temp/outputs.1 we'd have: +# +# ark:exp/fvector_a/egs/egs_temp.1.ark ark:exp/fvector_a/egs/egs_temp.2.ark ark:exp/fvector_a/egs/egs_temp.3.ark +# +# The number of these files will equal 'num-jobs'. If you add up the word-counts of +# all the outputs.* files you'll get 'num-archives'. The number of frames in each archive +# will be about the --frames-per-iter. +# + +# we're using python 3.x style print but want it to work in python 2.x, +from __future__ import print_function +import re, os, argparse, sys, math, warnings, random + + +parser = argparse.ArgumentParser(description="Writes ranges.*, outputs.* and archive_chunk_lengths files " + "in preparation for dumping egs for xvector training.", + epilog="Called by steps/nnet3/xvector/get_egs.sh") +parser.add_argument("--prefix", type=str, default="", + help="Adds a prefix to the output files. This is used to distinguish between the train " + "and diagnostic files.") +parser.add_argument("--frames-per-chunk", type=int, default=100, + help="The number of frames-per-chunk used for any archive") +parser.add_argument("--frames-per-iter", type=int, default=1000000, + help="Target number of frames for each archive") +parser.add_argument("--num-archives", type=int, default=-1, + help="Number of archives to write") +parser.add_argument("--num-jobs", type=int, default=-1, + help="Number of jobs we're going to use to write the archives; the ranges.* " + "and outputs.* files are indexed by job. Must be <= the --num-archives option.") +parser.add_argument("--seed", type=int, default=1, + help="Seed for random number generator") + +# now the positional arguments +parser.add_argument("utt2len", + help="utt2len file of the features to be used as input (format is: " + " )") +parser.add_argument("oriutt2allutt", + help="oriutt2allutt to be used as input (format is: " + " ... 
)") +parser.add_argument("egs_dir", + help="Name of egs directory, e.g. exp/xvector_a/egs") + +print(' '.join(sys.argv)) + +args = parser.parse_args() + +if not os.path.exists(args.egs_dir + "/temp"): + os.makedirs(args.egs_dir + "/temp") + +## Check arguments. +if args.frames_per_chunk <= 1: + sys.exit("--frames-per-chunk is invalid.") +if args.frames_per_iter < 1000: + sys.exit("--frames-per-iter is invalid.") +if args.num_archives < 1: + sys.exit("--num-archives is invalid") +if args.num_jobs > args.num_archives: + sys.exit("--num-jobs is invalid (must not exceed num-archives)") + +random.seed(args.seed) + +f = open(args.utt2len, "r"); +if f is None: + sys.exit("Error opening utt2len file " + str(args.utt2len)); +utt_ids = [] +lengths = [] +for line in f: + a = line.split() + if len(a) != 2: + sys.exit("bad line in utt2len file " + line); + utt_ids.append(a[0]) + lengths.append(int(a[1])) +f.close() + +num_utts = len(utt_ids) +max_length = max(lengths) + +if args.frames_per_chunk * 3 > max_length: + sys.exit("--max-frames-per-chunk={0} is not valid: it must be no more " + "than a third of the maximum length {1} from the utt2len file ".format( + args.max_frames_per_chunk, max_length)) + +# create the map form ori-utt-id to all kinds of utt-id. The ori-utt-id is the +# index, which is same with the elements in utt_ids[] +f = open(args.oriutt2allutt, "r"); +if f is None: + sys.exit("Error opening oriutt2allutt file " + str(args.oriutt2allutt)); +utt_map = {} +for line in f: + a = line.split() + if len(a) < 3: + sys.exit("bad line in oriutt2allutt file " + line); + tmp_list = [] + for i in range(1, len(a)): + tmp_list.append(a[i]) + tuple_list = tuple(tmp_list) + utt_map[a[0]]=tuple_list +f.close() + + +# this function returns a random integer utterance index, limited to utterances +# above a minimum length in frames, with probability proportional to its length. 
def RandomUttAtLeastThisLong(min_length):
    """Return the index (into utt_ids / lengths) of a randomly chosen utterance.

    Only utterances strictly longer than min_length frames are eligible, and
    an eligible utterance is accepted with probability lengths[i] / max_length,
    so longer utterances are drawn proportionally more often.
    """
    while True:
        i = random.randrange(0, num_utts)
        # read the next line as 'with probability lengths[i] / max_length'.
        # this allows us to draw utterances with probability proportional
        # to their length.
        if lengths[i] > min_length and random.random() < lengths[i] / float(max_length):
            return i


def GetRandomOffsets(utt_length, length):
    """Pick a random start frame for a chunk of `length` frames.

    utt_length is the utterance length in frames; `length` must be
    <= utt_length.  The returned offset is uniform over
    [0, utt_length - length], so the chunk always fits in the utterance.
    """
    if length > utt_length:
        sys.exit("code error: tot-length > utt-length")
    free_length = utt_length - length
    offset = random.randrange(0, free_length + 1)
    return offset


def ChoosePairs(ori_utt_id):
    """Randomly choose two different entries of utt_map[ori_utt_id].

    Returns a 2-tuple (utt_a, utt_b) taken from two distinct positions of
    the tuple stored for ori_utt_id.
    """
    this_tuple = utt_map[ori_utt_id]
    if len(this_tuple) < 2:
        # With fewer than two candidates the rejection loop below could
        # never terminate.
        sys.exit("code error: fewer than two utterances are mapped to " + str(ori_utt_id))
    while True:
        first_index = random.randint(0, len(this_tuple) - 1)
        second_index = random.randint(0, len(this_tuple) - 1)
        if first_index != second_index:
            break
    utt_a = this_tuple[first_index]
    utt_b = this_tuple[second_index]
    return (utt_a, utt_b)


# each element of all_egs (one per archive) is
# an array of 2-tuples (utterance-index, offset)
all_egs = []

prefix = ""
if args.prefix != "":
    prefix = args.prefix + "_"

# Each example consumes two chunks, hence 2 * frames_per_chunk frames.
# Use floor division: under Python 3 '/' would yield a float and range()
# below would raise TypeError.  Both values are the same for every archive,
# so they are computed once.
tot_length = 2 * args.frames_per_chunk
this_num_egs = (args.frames_per_iter // tot_length) + 1

for archive_index in range(args.num_archives):
    this_egs = []  # this will be an array of 2-tuples (utterance-index, start-frame).
    for n in range(this_num_egs):
        utt_index = RandomUttAtLeastThisLong(args.frames_per_chunk)
        utt_len = lengths[utt_index]
        offset = GetRandomOffsets(utt_len, args.frames_per_chunk)
        this_egs.append((utt_index, offset))
    all_egs.append(this_egs)

# work out how many archives we assign to each job in an equitable way.
+num_archives_per_job = [ 0 ] * args.num_jobs +for i in range(0, args.num_archives): + num_archives_per_job[i % args.num_jobs] = num_archives_per_job[i % args.num_jobs] + 1 + + +cur_archive = 0 +for job in range(args.num_jobs): + this_ranges = [] + this_archives_for_job = [] + this_num_archives = num_archives_per_job[job] + + for i in range(0, this_num_archives): + this_archives_for_job.append(cur_archive) + for (utterance_index, offset) in all_egs[cur_archive]: + this_ranges.append( (utterance_index, i, offset) ) + cur_archive = cur_archive + 1 + f = open(args.egs_dir + "/temp/" + prefix + "ranges." + str(job + 1), "w") + if f is None: + sys.exit("Error opening file " + args.egs_dir + "/temp/" + prefix + "ranges." + str(job + 1)) + for (utterance_index, i, offset) in sorted(this_ranges): + archive_index = this_archives_for_job[i] + this_utt_id = utt_ids[utterance_index] + #Random select two utt-id + (utt_a, utt_b) = ChoosePairs(this_utt_id) + print("{0} {1} {2} {3} {4} {5}".format(utt_a, + utt_b, + i, + archive_index + 1, + offset, + args.frames_per_chunk), + file=f) + f.close() + + f = open(args.egs_dir + "/temp/" + prefix + "outputs." + str(job + 1), "w") + if f is None: + sys.exit("Error opening file " + args.egs_dir + "/temp/" + prefix + "outputs." + str(job + 1)) + print( " ".join([ str("{0}/" + prefix + "egs_temp.{1}.ark").format(args.egs_dir, n + 1) for n in this_archives_for_job ]), + file=f) + f.close() + + +print("allocate_examples.py: finished generating " + prefix + "ranges.* and " + prefix + "outputs.* files") + diff --git a/egs/wsj/s5/steps/nnet3/fvector/apply_map_one2mult.pl b/egs/wsj/s5/steps/nnet3/fvector/apply_map_one2mult.pl new file mode 100755 index 00000000000..fbf92e10331 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/fvector/apply_map_one2mult.pl @@ -0,0 +1,111 @@ +#!/usr/bin/env perl +use warnings; #sed replacement for -w perl parameter +# Copyright 2016 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0. 
+ +# This program try to slove the following problem: +# Assume the map is A A1 A2 A3 A4 +# The input is A B C D +# The output is A1 B C D \n A2 B C D \n A3 B C D \n A4 B C D \n +# This is a one2multiple mapping. + +# Attentation: Use ":" to join the post-map. + + +if (@ARGV > 0 && $ARGV[0] eq "-f") { + shift @ARGV; + $field_spec = shift @ARGV; + if ($field_spec =~ m/^\d+$/) { + $field_begin = $field_spec - 1; $field_end = $field_spec - 1; + } + if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesty (properly, 1-10) + if ($1 ne "") { + $field_begin = $1 - 1; # Change to zero-based indexing. + } + if ($2 ne "") { + $field_end = $2 - 1; # Change to zero-based indexing. + } + } + if (!defined $field_begin && !defined $field_end) { + die "Bad argument to -f option: $field_spec"; + } +} + +# Mapping is obligatory +$permissive = 0; +if (@ARGV > 0 && $ARGV[0] eq '--permissive') { + shift @ARGV; + # Mapping is optional (missing key is printed to output) + $permissive = 1; +} + +if(@ARGV != 1) { + print STDERR "Invalid usage: " . join(" ", @ARGV) . "\n"; + print STDERR "Usage: apply_map_one2mult.pl [options] map output\n" . + "options: [-f ]\n" . + "Applies the map 'map' to all input text, where each line of the map\n" . + "is interpreted as a map from the first field to the list of the other fields\n" . + "Note: can look like 4-5, or 4-, or 5-, or 1, it means the field\n" . + "range in the input to apply the map to.\n" . + "e.g.: echo A B | apply_map.pl a.txt\n" . + "where a.txt is:\n" . + "A A1 A2\n" . + "B B1\n" . + "will produce:\n" . + "A1 B1\n" . + "A2 B1\n"; + exit(1); +} + +($map) = @ARGV; +open(M, "<$map") || die "Error opening map file $map: $!"; + +while () { + @A = split(" ", $_); + @A >= 1 || die "apply_map.pl: empty line."; + $i = shift @A; + $o = join(":", @A); + $map{$i} = $o; +} + +sub printcontent { + (my $start, my @string)=@_; + + if ( $start == @string ) { print join(" ",@string) . 
"\n"; + } else { + my $tmp = $string[$start]; + my @Word = split(":", $tmp); + if ( @Word != 1) { + foreach(@Word) { + $string[$start] = $_; + $start++; + &printcontent($start, @string); + $start--; + } + } else { + $start++; + &printcontent($start, @string); + } + } +} + +while() { + @A = split(" ", $_); + for ($x = 0; $x < @A; $x++) { + if ( (!defined $field_begin || $x >= $field_begin) + && (!defined $field_end || $x <= $field_end)) { + $a = $A[$x]; + if (!defined $map{$a}) { + if (!$permissive) { + die "apply_map.pl: undefined key $a\n"; + } else { + print STDERR "apply_map.pl: warning! missing key $a\n"; + } + } else { + $A[$x] = $map{$a}; + } + } + } + # print the content + &printcontent(0,@A); +} diff --git a/egs/wsj/s5/steps/nnet3/fvector/generate_noise_range.py b/egs/wsj/s5/steps/nnet3/fvector/generate_noise_range.py new file mode 100755 index 00000000000..e29359b8e9b --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/fvector/generate_noise_range.py @@ -0,0 +1,339 @@ +#!/usr/bin/env python + +# The function use to generate range-file for fvector +# Each line of the range-file corrsponds to a kind of perturbed wav. In each +# line, there is a in the beginning of the line and then +# we use comma to seperate different addnoise range. The format of each addnoise +# range is ::::: +# The line which starts with the asterisk(*) is the differences between two versions. + +# For the fixed-length version: +# In the beginning of the line, there is a +# *For +# *Except the last fragement, the length will be a fixed value T. +# For +# It is randomly selected from noise list, which is longer than --min-additive-noise-len +# *For +# *If the noise file is longer than fixed value. We randomly select the start point and +# *the length will be fixed value T. +# *If the noise file is shorter than T. We select the whole noise. +# The control the rate of signal and noise. In the other word, scale the amplitude of noise. 
+# The snr will be randomly selected form the range (max-snr, min-snr). + +# For the variable-length version: +# In the beginning of the line, there is a +# *For +# *Except the last fragement, the length will be random. +# For +# It is randomly selected from noise list, which is longer than --min-additive-noise-len +# *For +# *If the noise file is longer than wav length. We randomly select the start point and +# *the length will be the same as wav length. +# *If the noise file is shorter than T. We select the whole noise. +# For , it was used to control the amplitude of noise +# It will be randomly selected from the range (max-snr, min-snr) + +# At the same time, the function will generate the mapping of wav and perturbedwav +# Each line contains a mapping. (e.g.: wav1 wav1-perturbed-1 wav1-perturbed-2 ...) +from __future__ import print_function +import re, os, argparse, sys, math, warnings, random + +parser = argparse.ArgumentParser(description="Generate a noise range-file which contains " + "N lines corresponding to the number of kinds for each original wav. " + "The file which created by this python code will be supplied to " + "add additive noise program.", + epilog="Called by steps/nnet3/fvector/add_noise.sh") +parser.add_argument("--num-ranges-per-wav", type=int, default=4, + help="the number of expected addnoise kinds") +parser.add_argument("--min-additive-noise-len", type=float, default=2.0, + help="the minimum duration/length of each noise file in seconds") +parser.add_argument("--min-snr", type=int, default=-5, + help="the minimum Signal-to-Noise Rate, the default=0") +parser.add_argument("--max-snr", type=int, default=-15, + help="the maximum Signal-to-Noise Rate, the default=-10") +parser.add_argument("--seed", type=int, default=-1, + help="Seed for random number generator") +parser.add_argument("--variable-len-additive-noise", type=str, + help="If true, generate the variable-length range files for each original wavform file." 
+ "If false, generate the fixed-length range files for each original wavform file.", + default="false", choices = ["false", "true"]) + +# now the positional arguments +parser.add_argument("wav2dur", + help="wav2dur file of the original wav to be used as input (format is: " + " ") +parser.add_argument("noise2dur", + help="noise2dur file of the noise wav to be used as input (format is: " + " ") +parser.add_argument("range_file", + help="Name of range file, e.g.: exp/fxvector/ranges") +parser.add_argument("wav2perturbedwav", + help="This file is used to store the mapping between wav and perturbedwav" + "(e.g.: wav1 wav1-perturbed-1 wav1-perturbed-2 ...") + +print(' '.join(sys.argv)) + +args = parser.parse_args() + +## Check arguments +if args.min_snr < args.max_snr: + sys.exit("For SNR, the less numerical value is, the larger noise is. So --min-snr bigger " + "than --max-snr in numerical value.") + +random.seed(args.seed) + +# This function extract the information from the file--wav2dur. Its outputs will +# be ids[] and lengths[] +def WavToDuration(duration_file, ids, lengths, strict): + f = open(duration_file, "r") + if f is None: + sys.exit("Error opening wav2dur file " + str(duration_file)) + num_error = 0 + num_done = 0 + for line in f: + a = line.split() + if len(a) != 2: + sys.exit("Bad line \"" + line.strip() +"\" in file: " + str(duration_file)) + if float(a[1]) < args.min_additive_noise_len: + if strict: + sys.exit("ERROR: The wav length \"" + line.strip()+ "\" is shorter than --min-additive-noise-len") + else: + num_error += 1 + continue + ids.append(a[0]) + lengths.append(float(a[1])) + num_done += 1 + f.close() + if num_error is not 0: + warning_str ="Warning: There are " + str(num_error) + " utterances whose length smaller than " + \ + "--min-additive-noise-len, we remove it from the list. Now, there are " + \ + str(num_done) + " utterances in the list." 
+ sys.stdout.write( warning_str + '\n') + return + +# This function generates the fixed-length range files +def GenerateFixedLengthRangeFile(): + num_fixed_error = 0 + num_fixed_done = 0 + num_wav = len(wav_ids) + num_noise = len(noise_ids) + + # create a file to record the ranges + f = open(args.range_file, "w") + if f is None: + sys.exit("Error open file " + args.range_file) + + # create a file to record the wav2perturbedwav + g = open(args.wav2perturbedwav, "w") + if g is None: + sys.exit("Error open file " + args.wav2perturbedwav) + + for i in range(0, num_wav): + # decide the number of noises which will be add to + current_wav_len = wav_lengths[i] + max_num_additive_noise = int(current_wav_len / args.min_additive_noise_len) + + if max_num_additive_noise > num_noise: + print( "Warning: The number of noise files or the --min-additive-noise-len is too small" ) + num_fixed_error += 1 + continue + + # print the wav_id + print("{0}".format(wav_ids[i]), end="", file=g) + + # We generate $num_ranges_per_wav ranges + for j in range(0, args.num_ranges_per_wav): + # print the perturbed wav id in the beginning of line + print("{0}-{1}".format(wav_ids[i], "perturbed"+str(j+1)), end=" ", file=f) + + # print the perturbedwav_id + print(" {0}-{1}".format(wav_ids[i], "perturbed"+str(j+1)), end="", file=g) + + # select a number from [1 ... 
max_num_additive_noise] + num_additive_noise = random.randint(1, max_num_additive_noise) + + # decide the length of each noise, minus 0.01 to prevent overstep + additive_noise_len = float('{:.2f}'.format(current_wav_len / num_additive_noise)) - 0.01 + + # generate one line of file + # format: wav_t_start:wav_t_end:noise_name:noise_t_start:noise_t_end:snr, + for k in range(0, num_additive_noise - 1): + wav_t_start = float('{:.2f}'.format(k * additive_noise_len)) + wav_t_end = wav_t_start + additive_noise_len + + noise_index = random.randrange(0, num_noise) + current_noise_name = noise_ids[noise_index] + current_noise_len = noise_lengths[noise_index] + if current_noise_len <= additive_noise_len: + noise_t_start = 0.0 + noise_t_end = current_noise_len + else : + noise_start_bound = float('{:.2f}'.format(current_noise_len - additive_noise_len)) + noise_t_start = float('{:.2f}'.format(random.uniform(0, noise_start_bound))) + noise_t_end = noise_t_start + additive_noise_len + + current_snr = random.randrange(args.max_snr, args.min_snr) + + print("{0}:{1}:{2}:{3}:{4}:{5}".format(wav_t_start, + wav_t_end, + current_noise_name, + noise_t_start, + noise_t_end, + current_snr), + end=",",file=f) + # deal with the last noise, which cover the rest + k = num_additive_noise - 1 + wav_t_start = float('{:.2f}'.format(k * additive_noise_len)) + wav_t_end = float('{:.2f}'.format(current_wav_len)) + + noise_index = random.randrange(0, num_noise) + current_noise_name = noise_ids[noise_index] + current_noise_len = noise_lengths[noise_index] + + if current_noise_len <= (wav_t_end - wav_t_start): + noise_t_start = 0.0 + noise_t_end = current_noise_len + else : + noise_start_bound = float('{:.2f}'.format(current_noise_len - wav_t_end + wav_t_start)) + noise_t_start = float('{:.2f}'.format(random.uniform(0, noise_start_bound))) + noise_t_end = noise_t_start + wav_t_end - wav_t_start + + current_snr = random.randrange(args.max_snr, args.min_snr) + + 
print("{0}:{1}:{2}:{3}:{4}:{5}".format(wav_t_start, + wav_t_end, + current_noise_name, + noise_t_start, + noise_t_end, + current_snr), + file=f) + num_fixed_done += 1 + # print the "\n" + print("\n", end="", file=g) + f.close() + g.close() + print('''Finished generating fixed_length range-file for all wav. Compare with our expect, it lacks %d ranges. Now we totally have %d noise ranges in the range-file.''' %(num_fixed_error, num_fixed_done) ) + +# This function generates the variable-length range files +def GenerateVariableLengthRangeFile(): + num_variable_error = 0 + num_variable_done = 0 + + # create a file to record the ranges + f = open(args.range_file, "w") + if f is None: + sys.exit("Error open file " + args.range_file) + + # create a file to record the wav2perturbedwav + g = open(args.wav2perturbedwav, "w") + if g is None: + sys.exit("Error open file " + args.wav2perturbedwav) + + for i in range(0, num_wav): + + # check the noise list has enough sample or not + current_wav_len = wav_lengths[i] + max_num_additive_noise = int(current_wav_len / args.min_additive_noise_len) + + if max_num_additive_noise > num_noise: + print( "Warning: The number of noise files or the --min-additive-noise-len is too small" ) + num_variable_error += 1 + continue + + # print the wav_id + print("{0}".format(wav_ids[i]), end="", file=g) + + # We generate $num_ranges_per_wav ranges + for j in range(0, args.num_ranges_per_wav): + # print the perturbed wav id in the beginning of line + print("{0}-{1}".format(wav_ids[i], "perturbed"+str(j+1)), end=" ", file=f) + + # print the perturbedwav_id + print(" {0}-{1}".format(wav_ids[i], "perturbed"+str(j+1)), end="", file=g) + + # generate range file + # format: wav_t_start:wav_t_end:noise_name:noise_t_start:noise_t_end:snr, + the_rest = float('{:.2f}'.format(current_wav_len)) + wav_t_start = 0.0 + wav_t_end = 0.0 + while (the_rest > float(args.min_additive_noise_len)): + # firstly, we randomly choose a kind of noise and snr + noise_index = 
random.randrange(0, num_noise) + current_noise_name = noise_ids[noise_index] + current_noise_len = noise_lengths[noise_index] + current_snr = random.randrange(args.max_snr, args.min_snr) + + # Secondly, we randomly select a fragement of the noise file. + noise_start_bound = float('{:.2f}'.format(current_noise_len - float(args.min_additive_noise_len))) + noise_t_start = float('{:.2f}'.format(random.uniform(0, noise_start_bound))) + noise_end_upperbound = float('{:.2f}'.format(noise_t_start + float(args.min_additive_noise_len))) + noise_end_lowerbound = float('{:.2f}'.format(min((noise_t_start + the_rest), current_noise_len))) + noise_t_end = float('{:.2f}'.format(random.uniform(noise_end_upperbound, noise_end_lowerbound))) + current_noise_length = noise_t_end - noise_t_start + + # Thirdly, we generate the start and end point of wav + wav_t_start = wav_t_end #the new start is the end of the last. + wav_t_end = wav_t_start + current_noise_length + + # Forthly, update the_rest + the_rest = the_rest - current_noise_length + + # Fifthly, print + print("{0}:{1}:{2}:{3}:{4}:{5}".format(wav_t_start, + wav_t_end, + current_noise_name, + noise_t_start, + noise_t_end, + current_snr), + end=",",file=f) + # deal with the bit of wav + # firstly, we randomly choose a kind of noise and snr + noise_index = random.randrange(0, num_noise) + current_noise_name = noise_ids[noise_index] + current_noise_len = noise_lengths[noise_index] + current_snr = random.randrange(args.max_snr, args.min_snr) + + # Secondly, we randomly select a fragement of the noise file. + noise_start_bound = float('{:.2f}'.format(current_noise_len - the_rest)) + noise_t_start = float('{:.2f}'.format(random.uniform(0, noise_start_bound))) + noise_t_end = noise_t_start + the_rest + current_noise_length = noise_t_end - noise_t_start + + # Thirdly, we generate the start and end point of wav + wav_t_start = wav_t_end #the new start is the end of the last. 
+ wav_t_end = wav_t_start + current_noise_length + + # Forthly, print + print("{0}:{1}:{2}:{3}:{4}:{5}".format(wav_t_start, + wav_t_end, + current_noise_name, + noise_t_start, + noise_t_end, + current_snr), + file=f) + num_variable_done += 1 + print("\n", end="", file=g) + f.close() + g.close() + print('''Finished generating variable_length range-file for all wav. Compare with our expect, it lacks %d ranges. Now we totally have %d noise ranges in the range-file.''' %(num_variable_error, num_variable_done) ) + +if __name__ == "__main__": + # deal with the original wav utt2dur + # the information was stored in wav_ids[], wav_lengths[] and num_wav + wav_ids = [] + wav_lengths = [] + WavToDuration(args.wav2dur, wav_ids, wav_lengths, True) + num_wav = len(wav_ids) + + # deal with the noise wav utt2dur + # remove the noise whose length < --min-additive-noise-len + noise_ids = [] + noise_lengths = [] + WavToDuration(args.noise2dur, noise_ids, noise_lengths, False) + num_noise = len(noise_ids) + + # generate the range file + if args.variable_len_additive_noise == "true": + GenerateVariableLengthRangeFile() + else: + GenerateFixedLengthRangeFile() diff --git a/egs/wsj/s5/steps/nnet3/fvector/generate_perturb_wav_specifier.py b/egs/wsj/s5/steps/nnet3/fvector/generate_perturb_wav_specifier.py new file mode 100755 index 00000000000..fc49a6bc4df --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/fvector/generate_perturb_wav_specifier.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python + +# This function is used to generate the perturbed_wav.scp with the inputs as +# wav.scp, wav2perturbedwav, ranges + +# The final format is : +# wav1 sph2pipe -f wav -p -c 1 $path/wav1.sph | +# wav1-p1 sph2pipe -f wav -p -c 1 $path/wav1.sph | nnet3-fvector-perturb-signal +# --noise-scp=scp:noise.scp noise-range="range-p1-for-wav1" - | + +from __future__ import print_function +import re, os, argparse, sys, math, warnings, random + +parser = argparse.ArgumentParser(description="Generate a mapping file which use to 
map the wav to " + "Corresponding pertrubedwav", + epilog="Called by steps/nnet3/fvector/add_noise.sh") +parser.add_argument("--noise", type=str, + help="To assign the noise.scp. You must make sure it is same with " + "the noise.scp which is used to generate range_file.") +# now the positional arguments +parser.add_argument("wav_scp", + help="The orginial wav.scp which contains all the original wav " + "The format is: .") +parser.add_argument("range_file", + help="The file contains the range information which is used to " + "control the process of adding noise. The format is : " + " .") +parser.add_argument("wav2perturbedwav", + help="This file contains the mapping between wav and perturbedwav.") +parser.add_argument("perturbed_wav_scp", + help="The file is used to store the perturbed wav sperifier.") + +print(' '.join(sys.argv)) + +args = parser.parse_args() + +# Extract the information form the wav_scprding_ids = [] +wav_recording_ids = [] +wav_extended_files = [] +f = open(args.wav_scp, "r") +if f is None: + sys.exit("Error opening wav.scp file") +for line in f: + # remove the "\n" in the end of each line + line.split("\n") + a = line.split() + wav_recording_ids.append(a[0]) + del a[0] + wav_extended_files.append(' '.join(a)) +f.close() + +# Extract the infromation from the range_file +perturbed_range_ids = [] +perturbed_range_contents = [] +f = open(args.range_file, "r") +if f is None: + sys.exit("Error opening range_file") +for line in f: + # remove the "\n" in the end of each line + line.split("\n") + a = line.split() + if len(a) != 2: + sys.exit("Bad line \"" + line + "\" in file: " + str(args.range_file)) + perturbed_range_ids.append(a[0]) + perturbed_range_contents.append(a[1]) +f.close() + +# generate the mapping file through iterating all terms in the wav2perturbedwav +f = open(args.wav2perturbedwav, "r") +if f is None: + sys.exit("Error opening wav2perturbedwav") +# make a store file. 
+g = open(args.perturbed_wav_scp, "w") +if g is None: + sys.exit("Error opening perturbed_wav_specifier") + +# start the loop +for line in f: + # remove the "\n" in the end of each line + line.split("\n") + wav_list = line.split() + current_wav_id = wav_list[0] + current_wav_index = wav_recording_ids.index(current_wav_id) + + # print the original wav + print("{0} {1}".format(current_wav_id, wav_extended_files[current_wav_index]), file=g) + + for i in range(1, len(wav_list)): + current_perturbed_wav_id = wav_list[i] + current_perturbed_wav_index = perturbed_range_ids.index(current_perturbed_wav_id) + print('''{0} {1} nnet3-fvector-perturb-signal --noise=scp:{2} --noise-range=\"{3}\" - - |'''.format( + current_perturbed_wav_id, + wav_extended_files[current_wav_index], + args.noise, + perturbed_range_contents[current_perturbed_wav_index]),file=g) +g.close() +f.close() +print("Finished generating the perturb_wav.scp") diff --git a/src/feat/signal-distort.cc b/src/feat/signal-distort.cc index 788860ba769..c71f8e967db 100644 --- a/src/feat/signal-distort.cc +++ b/src/feat/signal-distort.cc @@ -72,5 +72,87 @@ void TimeStretch(const MatrixBase &input_egs, perturb_egs->CopyFromMat(out_mat); } +PerturbXvectorSignal::PerturbXvectorSignal(XvectorPerturbOptions opts): opts_(opts) { + if (!opts_.add_noise.empty()) { + // initialize the noise_list_ + SequentialBaseFloatMatrixReader noise_seq_reader(opts_.add_noise); + for (; !noise_seq_reader.Done(); noise_seq_reader.Next()) { + std::string key = noise_seq_reader.Key(); + noise_list_.push_back(key); + } + noise_seq_reader.Close(); + } +} +// This function add the noise to the orginial signal. We should not normalize +// the signal level of the orginial signal. According to SNR, we rescale the noise +// and add it. So that the perturbed signal is created. 
+void PerturbXvectorSignal::ApplyAdditiveNoise(const MatrixBase &input_eg, + const Matrix &noise_mat, + Matrix *perturbed_eg) { + // In the version, we ask the noise_cols == input_cols. + int32 input_rows = input_eg.NumRows(), input_cols = input_eg.NumCols(); + KALDI_ASSERT(noise_mat.NumCols() == input_cols); + + // As the noise_mat is very huge and the input_eg is small normally, + // so we'd better not reload the "noise_mat" matrix + // select the noise range + + Matrix selected_noise_mat; + selected_noise_mat.Resize(input_rows, input_cols); + + int32 noise_rows = noise_mat.NumRows(); + int32 start_row_ind = RandInt(0, noise_rows - input_rows); + + if (noise_mat.NumRows() < input_rows) { + int32 indices[input_rows]; + for (int32 i=0; i < input_rows; ++i) { + indices[i] = (start_row_ind + i) % noise_mat.NumRows(); + } + selected_noise_mat.CopyRows(noise_mat, indices); + } else { + selected_noise_mat.AddMat(1.0, noise_mat.Range(start_row_ind, input_rows, + 0, input_cols)); + } + + // compute the energy of noise and input + Matrix input_energy_mat(input_rows, input_cols); + input_energy_mat.AddMatMatElements(1.0, input_eg, input_eg, 0.0); + double input_energy = input_energy_mat.Sum(); + Matrix noise_energy_mat(input_rows, input_cols); + noise_energy_mat.AddMatMatElements(1.0, selected_noise_mat, selected_noise_mat, 0.0); + double noise_energy = noise_energy_mat.Sum(); + + // In Energy domain, SNR=20log10(S/N). 
+ // 10^(SNR/20) = input_energy / (scale^2 * noise_energy) + double scale = input_energy / noise_energy / (pow(10,opts_.snr/20)); + scale = sqrt(scale); + + // Add noise mat to input_eg mat + perturbed_eg->Resize(input_rows, input_cols); + perturbed_eg->CopyFromMat(input_eg); + perturbed_eg->AddMat(scale, selected_noise_mat); +} + +void PerturbXvectorSignal::ApplyDistortion(const MatrixBase &input_eg, + Matrix *perturbed_eg) { + // we random choose an noise example + int32 num_noises = noise_list_.size(); + int32 noise_index = RandInt(0, num_noises - 1); + std::string noise_name = noise_list_[noise_index]; + RandomAccessBaseFloatMatrixReader noise_random_reader(opts_.add_noise); + Matrix noise_mat = noise_random_reader.Value(noise_name); + + // conduct ApplyAdditiveNoise + ApplyAdditiveNoise(input_eg, noise_mat, perturbed_eg); + // conduct others + // TODO +} + +// This function calls ApplyDistortion to apply different type of perturbations. +void PerturbExample(PerturbXvectorSignal &eg_perturber, + const Matrix &input_eg, + Matrix *perturbed_eg) { + eg_perturber.ApplyDistortion(input_eg, perturbed_eg); +} } // end of namespace kaldi diff --git a/src/feat/signal-distort.h b/src/feat/signal-distort.h index b3faad96554..b8fc1542b4d 100644 --- a/src/feat/signal-distort.h +++ b/src/feat/signal-distort.h @@ -42,36 +42,47 @@ struct XvectorPerturbOptions { int32 frame_dim; int32 negation_prop; bool rand_distort; - std::string noise_egs; + std::string add_noise; + BaseFloat snr; + XvectorPerturbOptions(): max_shift(0.2), max_time_stretch(0.2), frame_dim(80), negation_prop(0.0), - rand_distort(false) { } + rand_distort(false), + snr(10.0) { } void Register(OptionsItf *opts) { - opts->Register("max-shift", &max_shift, "Maximum random shift relative" - "to frame length applied to egs."); + opts->Register("max-shift", &max_shift, "Maximum random shift relative " + "to frame length applied to egs."); opts->Register("max-speed-perturb", &max_time_stretch, "Max speed perturbation 
applied on egs."); opts->Register("frame-dim", &frame_dim, "The numebr of samples in input frame as product of frame_length by samp_freq."); opts->Register("negation-prop", &negation_prop, "This proportion of the input value is randomly negated."); - opts->Register("noise-egs", &noise_egs, "If supplied, the additive noise is added to input signal."); opts->Register("rand_distort", &rand_distort, "If true, the signal is slightly changes" "using some designed FIR filter with no zeros."); + opts->Register("add-noise", &add_noise, "Noise rspecifier for additive noises, if " + "nonempty, the additive noise randomly selected and added to input egs."); + opts->Register("SNR",&snr,"Specify a Signal to Noise Ration. We will scale the noise according " + "to the original signal and SNR. Normally, it's a non-zero number between -30 and 30" + "default=10"); } }; class PerturbXvectorSignal { public: - PerturbXvectorSignal(XvectorPerturbOptions opts): opts_(opts) { }; - - void ApplyDistortion(const MatrixBase &input_egs, - Matrix *perturb_egs); + PerturbXvectorSignal(XvectorPerturbOptions opts); + void ApplyDistortion(const MatrixBase &input_eg, + Matrix *perturbed_eg); + void ApplyAdditiveNoise(const MatrixBase &input_eg, + const Matrix &noise_mat, + Matrix *perturbed_eg); private: XvectorPerturbOptions opts_; + std::vector noise_list_; }; + // randomly disturb the input signal using a band-pass filter with no zeros. 
void ComputeAndApplyRandDistortion(const MatrixBase &input_egs, Matrix *perturb_egs); @@ -89,5 +100,9 @@ void TimeStretch(const MatrixBase &input_egs, BaseFloat max_time_stretch, Matrix *perturb_egs); +void PerturbExample(PerturbXvectorSignal &eg_perturber, + const Matrix &input_eg, + Matrix *perturbed_eg); + } // end of namespace kaldi #endif // KALDI_SIGNAL_DISTORT_H_ diff --git a/src/fvectorbin/Makefile b/src/fvectorbin/Makefile new file mode 100644 index 00000000000..48709027de1 --- /dev/null +++ b/src/fvectorbin/Makefile @@ -0,0 +1,25 @@ + +all: +EXTRA_CXXFLAGS = -Wno-sign-compare +include ../kaldi.mk + +LDFLAGS += $(CUDA_LDFLAGS) +LDLIBS += $(CUDA_LDLIBS) + +BINFILES = nnet3-fvector-get-egs nnet3-fvector-perturb-signal nnet3-fvector-get-egs-simple + +OBJFILES = + +# Add this dependency to force cuda-compiled.o to be rebuilt when we reconfigure. +cuda-compiled.o: ../kaldi.mk + +TESTFILES = + +ADDLIBS = ../xvector/kaldi-xvector.a ../nnet3/kaldi-nnet3.a ../gmm/kaldi-gmm.a \ + ../decoder/kaldi-decoder.a ../lat/kaldi-lat.a ../hmm/kaldi-hmm.a \ + ../transform/kaldi-transform.a ../tree/kaldi-tree.a \ + ../thread/kaldi-thread.a ../feat/kaldi-feat.a ../cudamatrix/kaldi-cudamatrix.a \ + ../matrix/kaldi-matrix.a ../fstext/kaldi-fstext.a \ + ../util/kaldi-util.a ../base/kaldi-base.a + +include ../makefiles/default_rules.mk diff --git a/src/fvectorbin/nnet3-fvector-get-egs.cc b/src/fvectorbin/nnet3-fvector-get-egs.cc new file mode 100644 index 00000000000..4e3179eb2d5 --- /dev/null +++ b/src/fvectorbin/nnet3-fvector-get-egs.cc @@ -0,0 +1,216 @@ +// fvectorbin/nnet3-fvector-get-egs.cc + +// Copyright 2012-2016 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "util/common-utils.h" +#include "nnet3/nnet-example.h" + +namespace kaldi { +namespace nnet3 { + +// A struct for holding information about the position and +// duration of each pair of chunks. +struct FvectorChunkPairInfo { + std::string pair_name; + std::string utt_a; + std::string utt_b; + int32 output_archive_id; + int32 start_frame; + int32 num_frames; +}; + +// Process the range input file and store it as a map from utterance +// name to vector of ChunkPairInfo structs. +static void ProcessRangeFile(const std::string &range_rxfilename, + std::vector *pairs) { + Input range_input(range_rxfilename); + if (!range_rxfilename.empty()) { + std::string line; + while (std::getline(range_input.Stream(), line)) { + FvectorChunkPairInfo *pair = new FvectorChunkPairInfo(); + std::vector fields; + SplitStringToVector(line, " \t\n\r", true, &fields); + if (fields.size() != 6) { + KALDI_ERR << "Expected 6 fields in line of range file, got " + << fields.size() << " instead."; + } + + std::string utt_a = fields[0], + utt_b = fields[1], + start_frame_str = fields[4], + num_frames_str = fields[5]; + + if (!ConvertStringToInteger(fields[2], &(pair->output_archive_id)) || + !ConvertStringToInteger(start_frame_str, &(pair->start_frame)) || + !ConvertStringToInteger(num_frames_str, &(pair->num_frames))) { + KALDI_ERR << "Expected integer for output archive in range file."; + } + pair->pair_name = utt_a + "-" + utt_b + "-" + start_frame_str + "-" + + num_frames_str; + pair->utt_a = 
utt_a; + pair->utt_b = utt_b; + + pairs->push_back(pair); + } + } +} + +// Delete the dynamically allocated memory. +static void Cleanup(std::vector *pairs, + std::vector *writers) { + for (std::vector::iterator + it = writers->begin(); it != writers->end(); ++it) { + delete *it; + } + for (std::vector::iterator it = pairs->begin(); + it != pairs->end(); ++it) { + delete *it; + } +} + +} // namespace nnet3 +} // namespace kaldi + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + typedef kaldi::int32 int32; + + const char *usage = + "Get examples for training an nnet3 neural network for the fvector\n" + "system. Each output example contains a pair of feature chunks from\n" + "the specified utterance. The location and length of the feature chunks\n" + "are specified in the 'ranges' file. Each line is interpreted as\n" + "follows:\n" + " " + " " + " \n" + "where is interpreted as a zero-based\n" + "index into the wspecifiers specified on the command line (\n" + "and so on), and is ignored by this program.\n" + "For example:\n" + " utt1-p1 utt1-p2 3 13 5 65\n" + " utt2 utt2-pn 0 10 160 50\n" + "\n" + "Usage: nnet3-fvector-get-egs [options] " + " ... 
\n" + "\n" + "For example:\n" + "nnet3-fvector-get-egs ranges.1 \"$feats\" ark:egs_temp.1.ark" + " ark:egs_temp.2.ark ark:egs_temp.3.ark\n"; + + bool compress = true; + + ParseOptions po(usage); + po.Register("compress", &compress, "If true, write egs in " + "compressed format."); + + po.Read(argc, argv); + + if (po.NumArgs() < 3) { + po.PrintUsage(); + exit(1); + } + + std::string range_rspecifier = po.GetArg(1); + std::string feature_rspecifier = po.GetArg(2); + std::vector example_writers; + + for (int32 i = 3; i <= po.NumArgs(); i++) { + example_writers.push_back(new NnetExampleWriter(po.GetArg(i))); + } + + std::vector pairs; + // deal with the ranges file and initalize the vector + ProcessRangeFile(range_rspecifier, &pairs); + + RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier); + + int32 num_error = 0, + num_egs_written = 0; + + for (std::vector::iterator iter = pairs.begin(); + iter != pairs.end(); iter++) { + + FvectorChunkPairInfo *pair = *iter; + // get the features + if (!feature_reader.HasKey(pair->utt_a) || !feature_reader.HasKey(pair->utt_b)) { + num_error++; + KALDI_WARN << "The feature " << pair->utt_a << " or " << pair->utt_b + << " is not found."; + continue; + } + const Matrix feats_a = feature_reader.Value(pair->utt_a); + const Matrix feats_b = feature_reader.Value(pair->utt_b); + int32 num_rows = feats_a.NumRows(), + feat_dim = feats_a.NumCols(); + if (num_rows < pair->num_frames) { + num_error++; + KALDI_WARN << "Unable to create examples for utterance " << pair->pair_name + << ". Requested chunk size is " + << pair->num_frames + << ", but utterance has only " << num_rows << " frames."; + continue; + } else { + // As the utt2len file is not the exact frames of a utterance, so the + // requested chunk positions are approximate. It's possible that they + // slightly exceed the number of frames in the utterance. + // If that occurs, we can shift the chunks location back slightly. 
+ int32 shift = std::min(0, num_rows - pair->start_frame - pair->num_frames); + SubMatrix chunk1(feats_a, pair->start_frame + shift, + pair->num_frames, 0, feat_dim), + chunk2(feats_b, pair->start_frame + shift, + pair->num_frames, 0, feat_dim); + NnetIo nnet_io1 = NnetIo("input", 0, chunk1), + nnet_io2 = NnetIo("input", 0, chunk2); + for (std::vector::iterator indx_it = nnet_io1.indexes.begin(); + indx_it != nnet_io1.indexes.end(); ++indx_it) { + indx_it->n = 0; + } + for (std::vector::iterator indx_it = nnet_io2.indexes.begin(); + indx_it != nnet_io2.indexes.end(); ++indx_it) { + indx_it->n = 1; + } + NnetExample eg; + eg.io.push_back(nnet_io1); + eg.io.push_back(nnet_io2); + if (compress) + eg.Compress(); + + if (pair->output_archive_id >= example_writers.size()) { + KALDI_ERR << "Requested output index exceeds number of specified " + << "output files."; + } + example_writers[pair->output_archive_id]->Write(pair->pair_name, eg); + num_egs_written += 1; + } + } + Cleanup(&pairs, &example_writers); + + KALDI_LOG << "Finished generating examples, " + << "successfully wrote " << num_egs_written << " examples; " + << num_error << " files had errors."; + return (num_egs_written == 0); + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} diff --git a/src/fvectorbin/nnet3-fvector-perturb-signal.cc b/src/fvectorbin/nnet3-fvector-perturb-signal.cc new file mode 100644 index 00000000000..a4459ce6d50 --- /dev/null +++ b/src/fvectorbin/nnet3-fvector-perturb-signal.cc @@ -0,0 +1,236 @@ +// fvector/nnet3-fvector-perturb-signal.cc + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "feat/wave-reader.h" +#include "feat/signal.h" + +namespace kaldi { + +struct AdditiveNoiseRange{ + BaseFloat wav_t_start; + BaseFloat wav_t_end; + std::string noise_uttid; + BaseFloat noise_t_start; + BaseFloat noise_t_end; + BaseFloat snr; + + AdditiveNoiseRange(BaseFloat wav_t_start, BaseFloat wav_t_end, std::string noise_uttid, + BaseFloat noise_t_start, BaseFloat noise_t_end, BaseFloat snr): + wav_t_start(wav_t_start), wav_t_end(wav_t_end), noise_uttid(noise_uttid), + noise_t_start(noise_t_start), noise_t_end(noise_t_end), snr(snr) { } +}; + +void GenerateController(std::vector &segments, + std::vector *controller) { + BaseFloat wav_t_start; + BaseFloat wav_t_end; + std::string noise_uttid; + BaseFloat noise_t_start; + BaseFloat noise_t_end; + BaseFloat snr; + for(int i=0; i < segments.size(); ++i) { + std::vector split_string; + SplitStringToVector(segments[i], ":", true, &split_string); + KALDI_ASSERT(split_string.size() == 6); + ConvertStringToReal(split_string[0], &wav_t_start); + ConvertStringToReal(split_string[1], &wav_t_end); + noise_uttid = split_string[2]; + ConvertStringToReal(split_string[3], &noise_t_start); + ConvertStringToReal(split_string[4], &noise_t_end); + ConvertStringToReal(split_string[5], &snr); + + controller->push_back(AdditiveNoiseRange(wav_t_start, wav_t_end, noise_uttid, + noise_t_start, noise_t_end, snr)); + } +} + +void ApplyNoise(std::string &noise_scp, const std::vector 
&controller, + const VectorBase &input_wav, const int &samp_freq_input, + VectorBase *perturbed_wav) { + // about noise list + RandomAccessTableReader noise_reader(noise_scp); + + // add noise + + for (int i=0; i < controller.size(); ++i) { + const WaveData &noise_wav = noise_reader.Value(controller[i].noise_uttid); + BaseFloat samp_freq_noise = noise_wav.SampFreq(); + KALDI_ASSERT(samp_freq_input == samp_freq_noise); + + const Matrix &noise_matrix = noise_wav.Data(); + int32 num_samp_noise = noise_matrix.NumCols(); + Vector noise(num_samp_noise); + noise.CopyRowFromMat(noise_matrix, 0); + + int32 input_start_point = samp_freq_input * controller[i].wav_t_start; + int32 input_end_point = samp_freq_input * controller[i].wav_t_end - 1; + int32 noise_start_point = samp_freq_noise * controller[i].noise_t_start; + int32 noise_end_point = samp_freq_noise * controller[i].noise_t_end - 1; + BaseFloat snr = controller[i].snr; + // This part is used to deal with the precise problem. + // e.g. If the wav_t_start = 259.49, the sample frequency is 8000. In theroy, + // the wav_start_point is 2075920, however, it will be 2075919 in practise. 
+ int32 input_length = input_end_point - input_start_point + 1; + int32 noise_length = noise_end_point - noise_start_point + 1; + if (input_length != noise_length) { + int32 delta = (input_length > noise_length?(input_length - noise_length) + :(noise_length-input_length)); + if (delta < 0.01*samp_freq_input) { + if (input_length > noise_length) { + input_end_point = input_end_point - delta; + } else { + noise_end_point = noise_end_point - delta; + } + } else { + KALDI_ERR << "There is a problem about input length does not match noise length" + << " where the noise-id is: " << controller[i].noise_uttid + << ", the input length is: " << input_length + << ", the noise length is: " << noise_length << std::endl; + } + } + + // End sample must be less than total number + if ((input_end_point > input_wav.Dim()-1) || (noise_end_point > noise.Dim()-1)) { + int32 over_boundary = ((input_end_point - input_wav.Dim() + 1) > (noise_end_point - noise.Dim() + 1) ? + (input_end_point - input_wav.Dim() + 1) : (noise_end_point - noise.Dim() + 1)); + input_end_point = input_end_point - over_boundary; + noise_end_point = noise_end_point - over_boundary; + } + // The input vector and noise vector contain the whole content of utt seperately. + // According to the AdditiveNoiseRange, we stepwise add the additive noise to input. + // To save the space, we use Subvector, because it returns the pointer. + SubVector input_part(input_wav, input_start_point, + input_end_point - input_start_point + 1); + SubVector noise_part(noise, noise_start_point, + noise_end_point - noise_start_point + 1); + Vector selected_noise(input_part.Dim()); + + // When encounter the situation where noise_part_length is shorter than input_part_length, + // We pad recursively until the selected_noise_length equal to input_part_length. 
+ // Otherwise, selected_noise = noise_part + if (noise_part.Dim() < input_part.Dim()) { + int32 the_rest_length = selected_noise.Dim(); + while (the_rest_length > noise_part.Dim()) { + selected_noise.Range(selected_noise.Dim()-the_rest_length, + noise_part.Dim()).CopyFromVec(noise_part); + the_rest_length = the_rest_length - noise_part.Dim(); + } + selected_noise.Range(selected_noise.Dim()-the_rest_length, the_rest_length).CopyFromVec( + noise_part.Range(0, the_rest_length)); + } else { + selected_noise.CopyFromVec(noise_part); + } + + BaseFloat input_energy = VecVec(input_part, input_part); + BaseFloat noise_energy = VecVec(selected_noise, selected_noise); + BaseFloat scale_factor = sqrt(input_energy/ noise_energy/ (pow(10, snr/20)) ); + perturbed_wav->Range(input_start_point, input_part.Dim()).AddVec(scale_factor, selected_noise); + } +} + +} + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + + const char *usage = + "Perturb the wave files supplied via the specified noise-range\n" + "Usage: nnet3-fvector-perturb-signal [options...] " + "\n" + "e.g.\n" + "nnet3-fvector-perturb-signal --noise=scp:noise.scp --noise-range=" + "wav1-perturbed-1 0.0:1.0:noise1:3.5:4.5:-8,... --input-channel=0 " + "input.wav perturbed_input.wav\n"; + + ParseOptions po(usage); + + std::string noise; + std::string noise_range; + int32 input_channel = 0; + + po.Register("noise",&noise, + "There is a list of optional noise. It need to match the --noise-range."); + po.Register("noise-range",&noise_range, + "Provide a range file. We use the content in this file to control " + "the process of adding noise. 
For each line, the format is " + ":::::,...," + ":::::"); + po.Register("input-channel",&input_channel, + "Specifies the channel to be used in input file"); + + po.Read(argc, argv); + if (po.NumArgs() != 2) { + po.PrintUsage(); + exit(1); + } + + std::string input_wave_file = po.GetArg(1); + std::string output_wave_file = po.GetArg(2); + + // Generate the Noise Controller list + std::vector controller; + if (!noise_range.empty()) { + //int index = noise_range.find_first_of(" "); + //std::string perturbed_utt_id = noise_range.substr(0, index); + //std::string noise_range_content = noise_range.substr(index+1); + std::vector segments; + SplitStringToVector(noise_range, ",", true, &segments); + GenerateController(segments, &controller); + } + + bool binary = true; + WaveData input_wave; + { + WaveHolder waveholder; + Input ki(input_wave_file, &binary); + waveholder.Read(ki.Stream()); + input_wave = waveholder.Value(); + } + + // about input wav + const Matrix &input_matrix = input_wave.Data(); + BaseFloat samp_freq_input = input_wave.SampFreq(); + int32 num_samp_input = input_matrix.NumCols(), // #samples in the input + num_input_channel = input_matrix.NumRows(); // #channels in the input + KALDI_VLOG(1) << "Sampling frequency of input: " << samp_freq_input + << "the number of samples: " << num_samp_input + << "the number of channels: " << num_input_channel; + KALDI_ASSERT(input_channel < num_input_channel); + Vector input(num_samp_input); + input.CopyRowFromMat(input_matrix, input_channel); + + // new output vector and add noise + Vector output(input); + ApplyNoise(noise, controller, input, samp_freq_input, &output); + + Matrix out_matrix(1, num_samp_input); + out_matrix.CopyRowsFromVec(output); + + WaveData out_wave(samp_freq_input, out_matrix); + Output ko(output_wave_file, binary, false); + WaveHolder::Write(ko.Stream(), true, out_wave); + + return 0; + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} + diff --git 
a/src/xvectorbin/nnet3-xvector-signal-perturb-egs.cc b/src/xvectorbin/nnet3-xvector-signal-perturb-egs.cc index f78c9c6a03a..d32c9a66d60 100644 --- a/src/xvectorbin/nnet3-xvector-signal-perturb-egs.cc +++ b/src/xvectorbin/nnet3-xvector-signal-perturb-egs.cc @@ -22,70 +22,6 @@ #include "feat/signal-distort.h" #include "nnet3/nnet-example.h" #include "nnet3/nnet-example-utils.h" -namespace kaldi { -namespace nnet3 { - -// This function applies different type of perturbation to input_egs. -// random distortion of inputs, random shifts, adding additive noise, -// random time stretch and random negations are different type of -// distortions used in this function. -void ApplyPerturbation(XvectorPerturbOptions opts, - const Matrix &input_egs, - Matrix *noise_egs, - Matrix *perturb_egs) { - - PerturbXvectorSignal perturb_xvector(opts); - - Matrix shifted_egs(input_egs); - // Generate random shift samples to shift egs. - if (opts.max_shift != 0.0) { - int32 max_shift_int = static_cast(opts.max_shift * opts.frame_dim); - // shift input_egs using random shift. - int32 eg_dim = input_egs.NumCols() - opts.frame_dim, - shift = RandInt(0, max_shift_int); - shifted_egs.CopyFromMat(input_egs.Range(0, input_egs.NumRows(), shift, eg_dim)); - } - - Matrix rand_distort_shifted_egs(shifted_egs); - if (opts.rand_distort) { - // randomly generate an zero-phase FIR filter with no zeros. - // In future, we can select trucated part of room impluse response - // and convolve it with input_egs. - perturb_xvector.ComputeAndApplyRandDistortion(shifted_egs, - &rand_distort_shifted_egs); - } - - if (noise_egs) { - // select random block of noise egs and add to input_egs - // number of additive noises should be larger than number of input-egs. - KALDI_ASSERT(noise_egs->NumRows() >= input_egs.NumRows()); - if (noise_egs->NumRows() < input_egs.NumRows()) { - // repeat the noise_egs_mat blocks to have same length block - // and randomly perturb the rows. 
- } else { - // Select random submatrix out of noise_egs and add it to perturb_egs. - // we should shuffle noise_egs before passing them to this binary. - int32 start_row_ind = RandInt(0, noise_egs->NumRows() - input_egs.NumRows()), - start_col_ind = RandInt(0, noise_egs->NumCols() - input_egs.NumCols()); - rand_distort_shifted_egs.AddMat(1.0, noise_egs->Range(start_row_ind, input_egs.NumRows(), - start_col_ind, input_egs.NumCols())); - } - } - // Perturb speed of signal egs - Matrix warped_distorted_shifted_egs(rand_distort_shifted_egs); - if (opts.max_time_stretch != 0.0) - perturb_xvector.TimeStretch(rand_distort_shifted_egs, - &warped_distorted_shifted_egs); - - // If nagation is true, the sample values are randomly negated - // with some probability. - if (opts.negation) { - - } -} - -} // end of namespace nnet3 -} // end of namespace kaldi int main(int argc, char *argv[]) { try { @@ -100,11 +36,11 @@ int main(int argc, char *argv[]) { "such as additive noise, negation, random time shifts or random distortion.\n" "Usage: nnet3-xvector-signal-perturb-egs [options...] \n" "e.g.\n" - "nnet3-xvector-signal-perturb-egs --noise-egs=noise.egs\n" - "--max-shift=0.2 --max-speed-perturb=0.1 --negation=true\n" + "nnet3-xvector-signal-perturb-egs --max-shift=0.2" + " --max-speed-perturb=0.1 --negation=true --add-noise=noise.scp --snr=10\n" "ark:input.egs akr:distorted.egs\n"; - ParseOptions po(usage); + ParseOptions po(usage); XvectorPerturbOptions perturb_opts; perturb_opts.Register(&po); @@ -123,16 +59,7 @@ int main(int argc, char *argv[]) { int64 num_read = 0, num_written = 0; - Matrix *noise_mat = NULL; - // read additive noise egs if it is specified. 
- if (!perturb_opts.noise_egs.empty()) { - SequentialNnetExampleReader noise_reader(perturb_opts.noise_egs); - const NnetExample &noise_egs = noise_reader.Value(); - const NnetIo &noise_io = noise_egs.io[0]; - noise_io.features.CopyToMat(noise_mat); - - } - + PerturbXvectorSignal eg_perturber(perturb_opts); for (; !example_reader.Done(); example_reader.Next(), num_read++) { std::string key = example_reader.Key(); const NnetExample &input_eg = example_reader.Value(); @@ -140,8 +67,10 @@ int main(int argc, char *argv[]) { NnetExample *perturb_eg = new NnetExample(); Matrix perturb_eg_mat, input_eg_mat; - input_eg_io.features.CopyToMat(&input_eg_mat); - ApplyPerturbation(perturb_opts, input_eg_mat, noise_mat, &perturb_eg_mat); + input_eg_io.features.CopyToMat(&input_eg_mat); + + PerturbExample(eg_perturber, input_eg_mat, &perturb_eg_mat); + perturb_eg->io.resize(1.0); perturb_eg->io[0].features.SwapFullMatrix(&perturb_eg_mat); example_writer.Write(key, *perturb_eg); diff --git a/tools/config/common_path.sh b/tools/config/common_path.sh index 5534cf2d13b..f95c5acf8e6 100644 --- a/tools/config/common_path.sh +++ b/tools/config/common_path.sh @@ -21,4 +21,5 @@ ${KALDI_ROOT}/src/onlinebin:\ ${KALDI_ROOT}/src/sgmm2bin:\ ${KALDI_ROOT}/src/sgmmbin:\ ${KALDI_ROOT}/src/xvectorbin:\ +${KALDI_ROOT}/src/fvectorbin:\ $PATH